Updated. Now uses VariantContext and is truly "variants" to vcf (i.e. not just GELI to vcf).
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3074 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
2373a4618f
commit
4398a8b370
|
|
@ -1,202 +1,72 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf;
|
||||
|
||||
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.RodGeliText;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.gatk.walkers.RefWalker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||
import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype;
|
||||
import org.broadinstitute.sting.utils.genotype.Variation;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.RMD;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Converts variants from other file formats (anything that implements the Variation interface) to VCF format.
|
||||
* Converts variants from other file formats to VCF format.
|
||||
*/
|
||||
public class VariantsToVCF extends RefWalker<Integer, Integer> {
|
||||
@Argument(fullName="vcfout", shortName="VO", doc="The output VCF file") public File VCF_OUT;
|
||||
@Argument(fullName="verbose", shortName="V", doc="Show extended output", required=false) public boolean VERBOSE = false;
|
||||
@Argument(fullName="suppress_multistate_alleles", shortName="SMA", doc="When multi-sample genotypes imply something other than a biallele state, suppress it.", required=false) public boolean SUPPRESS_MULTISTATE = false;
|
||||
@Requires(value={},referenceMetaData=@RMD(name="variant",type= ReferenceOrderedDatum.class))
|
||||
public class VariantsToVCF extends RodWalker<Integer, Integer> {
|
||||
|
||||
@Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod (for data like GELI with genotypes)", required=false)
|
||||
protected String sampleName = null;
|
||||
|
||||
private VCFWriter vcfwriter = null;
|
||||
private VCFHeader vcfheader = null;
|
||||
private TreeMap<String, String> sampleNames = null;
|
||||
private static String format = "GT:GQ:DP";
|
||||
|
||||
public void initialize() {
|
||||
sampleNames = new TreeMap<String, String>();
|
||||
GATKArgumentCollection args = this.getToolkit().getArguments();
|
||||
// Don't allow mixed types for now
|
||||
private EnumSet<VariantContext.Type> ALLOWED_VARIANT_CONTEXT_TYPES = EnumSet.of(VariantContext.Type.SNP, VariantContext.Type.NO_VARIATION, VariantContext.Type.INDEL);
|
||||
|
||||
for (String rodName : args.RODBindings) {
|
||||
//out.println(rodName);
|
||||
String[] rodPieces = rodName.split(",");
|
||||
String sampleName = rodPieces[0];
|
||||
|
||||
if (! ( sampleName.equals("dbSNP") || sampleName.equals("interval") ) ) {
|
||||
sampleNames.put(sampleName, sampleName);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
vcfheader = getHeader(args, sampleNames.keySet());
|
||||
vcfwriter = new VCFWriter(VCF_OUT);
|
||||
vcfwriter.writeHeader(vcfheader);
|
||||
}
|
||||
|
||||
public static VCFHeader getHeader(GATKArgumentCollection args, Set<String> sampleNames) {
|
||||
|
||||
// Don't output the data for now because it kills our unit test MD5s and is optional
|
||||
// TODO - figure out what to do here
|
||||
//Calendar cal = Calendar.getInstance();
|
||||
//metaData.put("fileDate", String.format("%d%02d%02d", cal.get(Calendar.YEAR), cal.get(Calendar.MONTH), cal.get(Calendar.DAY_OF_MONTH)));
|
||||
|
||||
Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
|
||||
metaData.add(new VCFHeaderLine("source", "VariantsToVCF"));
|
||||
metaData.add(new VCFHeaderLine("reference", args.referenceFile.getAbsolutePath()));
|
||||
|
||||
Set<String> additionalColumns = new HashSet<String>();
|
||||
additionalColumns.add("FORMAT");
|
||||
additionalColumns.addAll(sampleNames);
|
||||
|
||||
return new VCFHeader(metaData, additionalColumns);
|
||||
}
|
||||
|
||||
public boolean filter(RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
if (BaseUtils.simpleBaseToBaseIndex(ref) > -1) {
|
||||
return true;
|
||||
}
|
||||
|
||||
for (ReferenceOrderedDatum rod : tracker.getAllRods()) {
|
||||
if (rod != null && sampleNames.keySet().contains(rod.getName().toUpperCase())) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
private String[] ALLOWED_FORMAT_FIELDS = {"GT"};
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
List<VCFGenotypeRecord> gt = new ArrayList<VCFGenotypeRecord>();
|
||||
Map<VCFHeader.HEADER_FIELDS, String> map = new HashMap<VCFHeader.HEADER_FIELDS, String>();
|
||||
VCFRecord rec = generateVCFRecord(tracker, ref, context, vcfheader, gt, map, sampleNames, out, SUPPRESS_MULTISTATE, VERBOSE);
|
||||
if (rec != null) {
|
||||
vcfwriter.addRecord(rec);
|
||||
return 1;
|
||||
if ( tracker == null )
|
||||
return 0;
|
||||
|
||||
rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null));
|
||||
|
||||
Collection<VariantContext> contexts = tracker.getVariantContexts("variant", ALLOWED_VARIANT_CONTEXT_TYPES, context.getLocation(), true, false);
|
||||
for ( VariantContext vc : contexts ) {
|
||||
VCFRecord vcf = VariantContextAdaptors.toVCF(vc, ref.getBase(), Arrays.asList(ALLOWED_FORMAT_FIELDS), false, false);
|
||||
if ( dbsnp != null )
|
||||
vcf.setID(dbsnp.getRS_ID());
|
||||
if ( sampleName != null && vcf.hasGenotypeData() && vcf.getGenotype("variant") != null )
|
||||
vcf.getGenotype("variant").setSampleName(sampleName);
|
||||
writeRecord(vcf);
|
||||
}
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
public static VCFRecord generateVCFRecord(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context,
|
||||
VCFHeader vcfheader, List<VCFGenotypeRecord> gt, Map<VCFHeader.HEADER_FIELDS, String> map,
|
||||
Map<String, String> sampleNamesToRods, PrintStream out, boolean SUPPRESS_MULTISTATE, boolean VERBOSE) {
|
||||
int[] alleleCounts = new int[4];
|
||||
int numSNPs = 0;
|
||||
int numRefs = 0;
|
||||
int[] alleleNames = {0, 1, 2, 3};
|
||||
double snpQual = 0.0;
|
||||
int refbase = BaseUtils.simpleBaseToBaseIndex(ref.getBase());
|
||||
List<VCFGenotypeEncoding> alts = new ArrayList<VCFGenotypeEncoding>();
|
||||
for (String name : vcfheader.getGenotypeSamples()) {
|
||||
ReferenceOrderedDatum rod = tracker.lookup(sampleNamesToRods.get(name), null);
|
||||
if (rod != null) {
|
||||
Variation av = (Variation) rod;
|
||||
String lod = String.format("%d", av.getNegLog10PError() > 99 ? 99 : (int) av.getNegLog10PError());
|
||||
int depth = 0;
|
||||
private void writeRecord(VCFRecord rec) {
|
||||
if ( vcfwriter == null ) {
|
||||
// setup the header fields
|
||||
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||
hInfo.add(new VCFHeaderLine("source", "VariantFiltration"));
|
||||
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
|
||||
|
||||
if (rod instanceof RodGeliText) {
|
||||
RodGeliText rv = (RodGeliText) rod;
|
||||
depth = rv.depth;
|
||||
}
|
||||
if (!(rod instanceof VariantBackedByGenotype))
|
||||
throw new IllegalArgumentException("The passed in variant type must be backed by genotype data");
|
||||
Genotype genotype = ((VariantBackedByGenotype) rod).getCalledGenotype();
|
||||
List<VCFGenotypeEncoding> alleles = new ArrayList<VCFGenotypeEncoding>();
|
||||
for (char base : genotype.getBases().toCharArray()) {
|
||||
alleles.add(new VCFGenotypeEncoding(String.valueOf(base)));
|
||||
if (base != ref.getBase() && !alts.contains(String.valueOf(base))) alts.add(new VCFGenotypeEncoding(String.valueOf(base)));
|
||||
}
|
||||
int allele1 = BaseUtils.simpleBaseToBaseIndex(genotype.getBases().charAt(0));
|
||||
int allele2 = BaseUtils.simpleBaseToBaseIndex(genotype.getBases().charAt(1));
|
||||
if (allele1 >= 0 && allele1 != refbase) {
|
||||
alleleCounts[allele1]++;
|
||||
}
|
||||
if (allele2 >= 0 && allele2 != refbase) {
|
||||
alleleCounts[allele2]++;
|
||||
}
|
||||
Map<String, String> str = new HashMap<String, String>();
|
||||
str.put("GQ", lod);
|
||||
if (depth > 0) str.put("DP", String.valueOf(depth));
|
||||
vcfwriter = new VCFWriter(out);
|
||||
|
||||
VCFGenotypeRecord rec = new VCFGenotypeRecord(name, alleles, VCFGenotypeRecord.PHASE.UNPHASED);
|
||||
for ( Map.Entry<String, String> entry : str.entrySet() )
|
||||
rec.setField(entry.getKey(), entry.getValue());
|
||||
gt.add(rec);
|
||||
TreeSet<String> samples = new TreeSet<String>();
|
||||
if ( sampleName != null )
|
||||
samples.add(sampleName);
|
||||
else
|
||||
samples.addAll(Arrays.asList(rec.getSampleNames()));
|
||||
|
||||
numSNPs++;
|
||||
snpQual += av.getNegLog10PError();
|
||||
} else if (BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1) {
|
||||
List<VCFGenotypeEncoding> alleles = new ArrayList<VCFGenotypeEncoding>();
|
||||
alleles.add(new VCFGenotypeEncoding(String.valueOf(ref.getBase())));
|
||||
alleles.add(new VCFGenotypeEncoding(String.valueOf(ref.getBase())));
|
||||
gt.add(new VCFGenotypeRecord(name, alleles, VCFGenotypeRecord.PHASE.UNPHASED));
|
||||
vcfwriter.writeHeader(new VCFHeader(hInfo, samples));
|
||||
|
||||
numRefs++;
|
||||
}
|
||||
}
|
||||
|
||||
if (numSNPs == 0)
|
||||
return null;
|
||||
|
||||
|
||||
Integer[] perm = Utils.SortPermutation(alleleCounts);
|
||||
int[] sortedCounts = Utils.PermuteArray(alleleCounts, perm);
|
||||
int[] sortedNames = Utils.PermuteArray(alleleNames, perm);
|
||||
|
||||
rodDbSNP dbsnp = (rodDbSNP) tracker.lookup("dbsnp", null);
|
||||
|
||||
String infoString = String.format("locus=%s ref=%c allele_count=( %c:%d %c:%d %c:%d %c:%d )",
|
||||
context.getLocation(),
|
||||
ref.getBase(),
|
||||
BaseUtils.baseIndexToSimpleBase(sortedNames[0]), sortedCounts[0],
|
||||
BaseUtils.baseIndexToSimpleBase(sortedNames[1]), sortedCounts[1],
|
||||
BaseUtils.baseIndexToSimpleBase(sortedNames[2]), sortedCounts[2],
|
||||
BaseUtils.baseIndexToSimpleBase(sortedNames[3]), sortedCounts[3]
|
||||
);
|
||||
|
||||
if (SUPPRESS_MULTISTATE && sortedCounts[2] > 0) {
|
||||
out.println("[multistate] " + infoString);
|
||||
return null;
|
||||
} else {
|
||||
if (VERBOSE) {
|
||||
out.println("[locus_info] " + infoString);
|
||||
}
|
||||
}
|
||||
|
||||
Map<String,String> info = new HashMap<String,String>();
|
||||
if (dbsnp != null) info.put("DB","1");
|
||||
if (dbsnp != null && dbsnp.isHapmap()) info.put("H2","1");
|
||||
|
||||
return new VCFRecord(Character.toString(ref.getBase()),
|
||||
context.getContig(),
|
||||
(int) context.getPosition(),
|
||||
(dbsnp == null) ? "." : dbsnp.getRS_ID(),
|
||||
alts,
|
||||
snpQual > 99 ? 99 : (int) snpQual,
|
||||
"0",
|
||||
info,
|
||||
format,
|
||||
gt);
|
||||
|
||||
vcfwriter.addRecord(rec);
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
|
|
@ -208,8 +78,6 @@ public class VariantsToVCF extends RefWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
public void onTraversalDone(Integer sum) {
|
||||
out.println("Processed " + sum + " variants.");
|
||||
|
||||
vcfwriter.close();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,7 +5,6 @@ import org.junit.Test;
|
|||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.io.File;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -18,62 +17,38 @@ import java.io.File;
|
|||
public class VariantsToVCFIntegrationTest extends WalkerTest {
|
||||
|
||||
|
||||
//@Test
|
||||
@Test
|
||||
public void testVariantsToVCFUsingGeliInput() {
|
||||
List<String> md5 = new ArrayList<String>();
|
||||
md5.add("a94c15f2e8905fd3e98301375cf0f42a");
|
||||
md5.add("8f32f0efed5d0233cf9292198f4f01d8");
|
||||
|
||||
/**
|
||||
* the above MD5 was calculated from running the following command:
|
||||
*
|
||||
* java -jar ./dist/GenomeAnalysisTK.jar \
|
||||
* -R /broad/1KG/reference/human_b36_both.fasta \
|
||||
* -T VariantEval \
|
||||
* --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
|
||||
* -L 1:10,000,000-11,000,000 \
|
||||
* --outerr myVariantEval \
|
||||
* --supressDateInformation \
|
||||
* --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls
|
||||
*
|
||||
*/
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
||||
" --rodBind NA123AB,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" +
|
||||
" -B variant,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" +
|
||||
" -T VariantsToVCF" +
|
||||
" -L 1:10,000,000-11,000,000" +
|
||||
" --vcfout %s",
|
||||
" -sample NA123AB" +
|
||||
" -o %s",
|
||||
1, // just one output file
|
||||
md5);
|
||||
List<File> result = executeTest("testVariantsToVCFUsingGeliInput", spec).getFirst();
|
||||
executeTest("testVariantsToVCFUsingGeliInput #1", spec).getFirst();
|
||||
}
|
||||
|
||||
//@Test
|
||||
@Test
|
||||
public void testGenotypesToVCFUsingGeliInput() {
|
||||
List<String> md5 = new ArrayList<String>();
|
||||
md5.add("6b18f33e25edbd2154c17a949656644b");
|
||||
md5.add("40cc4d04d9a50043ce1322ea2650c453");
|
||||
|
||||
/**
|
||||
* the above MD5 was calculated from running the following command:
|
||||
*
|
||||
* java -jar ./dist/GenomeAnalysisTK.jar \
|
||||
* -R /broad/1KG/reference/human_b36_both.fasta \
|
||||
* -T VariantEval \
|
||||
* --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \
|
||||
* -L 1:10,000,000-11,000,000 \
|
||||
* --outerr myVariantEval \
|
||||
* --supressDateInformation \
|
||||
* --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls
|
||||
*
|
||||
*/
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + oneKGLocation + "reference/human_b36_both.fasta" +
|
||||
" --rodBind NA123AB,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.genotypes.geli.calls" +
|
||||
" -B variant,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.genotypes.geli.calls" +
|
||||
" -T VariantsToVCF" +
|
||||
" -L 1:10,000,000-11,000,000" +
|
||||
" --vcfout %s",
|
||||
" -sample NA123AB" +
|
||||
" -o %s",
|
||||
1, // just one output file
|
||||
md5);
|
||||
List<File> result = executeTest("testVariantsToVCFUsingGeliInput", spec).getFirst();
|
||||
executeTest("testVariantsToVCFUsingGeliInput #2", spec).getFirst();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue