From 4398a8b3703954606e298d7f922f964baac53ce2 Mon Sep 17 00:00:00 2001 From: ebanks Date: Thu, 25 Mar 2010 04:53:31 +0000 Subject: [PATCH] Updated. Now uses VariantContext and is truly "variants" to vcf (i.e. not just GELI to vcf). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3074 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/variantstovcf/VariantsToVCF.java | 222 ++++-------------- .../VariantsToVCFIntegrationTest.java | 49 +--- 2 files changed, 57 insertions(+), 214 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCF.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCF.java index 7b548ce15..1052cc017 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCF.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCF.java @@ -1,202 +1,72 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantstovcf; -import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.RodGeliText; -import org.broadinstitute.sting.gatk.refdata.rodDbSNP; -import org.broadinstitute.sting.gatk.walkers.RefWalker; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.genotype.Genotype; -import org.broadinstitute.sting.utils.genotype.VariantBackedByGenotype; -import org.broadinstitute.sting.utils.genotype.Variation; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.RMD; import org.broadinstitute.sting.utils.genotype.vcf.*; +import org.broadinstitute.sting.utils.cmdLine.Argument; -import java.io.File; -import java.io.PrintStream; import java.util.*; /** - * Converts variants from other file formats (anything that implements the Variation interface) to VCF format. + * Converts variants from other file formats to VCF format. */ -public class VariantsToVCF extends RefWalker { - @Argument(fullName="vcfout", shortName="VO", doc="The output VCF file") public File VCF_OUT; - @Argument(fullName="verbose", shortName="V", doc="Show extended output", required=false) public boolean VERBOSE = false; - @Argument(fullName="suppress_multistate_alleles", shortName="SMA", doc="When multi-sample genotypes imply something other than a biallele state, suppress it.", required=false) public boolean SUPPRESS_MULTISTATE = false; +@Requires(value={},referenceMetaData=@RMD(name="variant",type= ReferenceOrderedDatum.class)) +public class VariantsToVCF extends RodWalker { + + @Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod (for data like GELI with genotypes)", required=false) + protected String sampleName = null; private VCFWriter vcfwriter = null; - private VCFHeader vcfheader = null; - private TreeMap sampleNames = null; - private static String format = "GT:GQ:DP"; - public void initialize() { - sampleNames = new TreeMap(); - GATKArgumentCollection args = this.getToolkit().getArguments(); + // Don't allow mixed types for now + private EnumSet ALLOWED_VARIANT_CONTEXT_TYPES = EnumSet.of(VariantContext.Type.SNP, VariantContext.Type.NO_VARIATION, VariantContext.Type.INDEL); - for (String rodName : args.RODBindings) { - //out.println(rodName); - String[] rodPieces = rodName.split(","); - String sampleName = rodPieces[0]; - - if (! ( sampleName.equals("dbSNP") || sampleName.equals("interval") ) ) { - sampleNames.put(sampleName, sampleName); - } - - } - - vcfheader = getHeader(args, sampleNames.keySet()); - vcfwriter = new VCFWriter(VCF_OUT); - vcfwriter.writeHeader(vcfheader); - } - - public static VCFHeader getHeader(GATKArgumentCollection args, Set sampleNames) { - - // Don't output the data for now because it kills our unit test MD5s and is optional - // TODO - figure out what to do here - //Calendar cal = Calendar.getInstance(); - //metaData.put("fileDate", String.format("%d%02d%02d", cal.get(Calendar.YEAR), cal.get(Calendar.MONTH), cal.get(Calendar.DAY_OF_MONTH))); - - Set metaData = new HashSet(); - metaData.add(new VCFHeaderLine("source", "VariantsToVCF")); - metaData.add(new VCFHeaderLine("reference", args.referenceFile.getAbsolutePath())); - - Set additionalColumns = new HashSet(); - additionalColumns.add("FORMAT"); - additionalColumns.addAll(sampleNames); - - return new VCFHeader(metaData, additionalColumns); - } - - public boolean filter(RefMetaDataTracker tracker, char ref, AlignmentContext context) { - if (BaseUtils.simpleBaseToBaseIndex(ref) > -1) { - return true; - } - - for (ReferenceOrderedDatum rod : tracker.getAllRods()) { - if (rod != null && sampleNames.keySet().contains(rod.getName().toUpperCase())) { - return true; - } - } - - return false; - } + private String[] ALLOWED_FORMAT_FIELDS = {"GT"}; public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - List gt = new ArrayList(); - Map map = new HashMap(); - VCFRecord rec = generateVCFRecord(tracker, ref, context, vcfheader, gt, map, sampleNames, out, SUPPRESS_MULTISTATE, VERBOSE); - if (rec != null) { - vcfwriter.addRecord(rec); - return 1; + if ( tracker == null ) + return 0; + + rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null)); + + Collection contexts = tracker.getVariantContexts("variant", ALLOWED_VARIANT_CONTEXT_TYPES, context.getLocation(), true, false); + for ( VariantContext vc : contexts ) { + VCFRecord vcf = VariantContextAdaptors.toVCF(vc, ref.getBase(), Arrays.asList(ALLOWED_FORMAT_FIELDS), false, false); + if ( dbsnp != null ) + vcf.setID(dbsnp.getRS_ID()); + if ( sampleName != null && vcf.hasGenotypeData() && vcf.getGenotype("variant") != null ) + vcf.getGenotype("variant").setSampleName(sampleName); + writeRecord(vcf); } - return 0; + + return 1; } - public static VCFRecord generateVCFRecord(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, - VCFHeader vcfheader, List gt, Map map, - Map sampleNamesToRods, PrintStream out, boolean SUPPRESS_MULTISTATE, boolean VERBOSE) { - int[] alleleCounts = new int[4]; - int numSNPs = 0; - int numRefs = 0; - int[] alleleNames = {0, 1, 2, 3}; - double snpQual = 0.0; - int refbase = BaseUtils.simpleBaseToBaseIndex(ref.getBase()); - List alts = new ArrayList(); - for (String name : vcfheader.getGenotypeSamples()) { - ReferenceOrderedDatum rod = tracker.lookup(sampleNamesToRods.get(name), null); - if (rod != null) { - Variation av = (Variation) rod; - String lod = String.format("%d", av.getNegLog10PError() > 99 ? 99 : (int) av.getNegLog10PError()); - int depth = 0; + private void writeRecord(VCFRecord rec) { + if ( vcfwriter == null ) { + // setup the header fields + Set hInfo = new HashSet(); + hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); + hInfo.add(new VCFHeaderLine("source", "VariantFiltration")); + hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); - if (rod instanceof RodGeliText) { - RodGeliText rv = (RodGeliText) rod; - depth = rv.depth; - } - if (!(rod instanceof VariantBackedByGenotype)) - throw new IllegalArgumentException("The passed in variant type must be backed by genotype data"); - Genotype genotype = ((VariantBackedByGenotype) rod).getCalledGenotype(); - List alleles = new ArrayList(); - for (char base : genotype.getBases().toCharArray()) { - alleles.add(new VCFGenotypeEncoding(String.valueOf(base))); - if (base != ref.getBase() && !alts.contains(String.valueOf(base))) alts.add(new VCFGenotypeEncoding(String.valueOf(base))); - } - int allele1 = BaseUtils.simpleBaseToBaseIndex(genotype.getBases().charAt(0)); - int allele2 = BaseUtils.simpleBaseToBaseIndex(genotype.getBases().charAt(1)); - if (allele1 >= 0 && allele1 != refbase) { - alleleCounts[allele1]++; - } - if (allele2 >= 0 && allele2 != refbase) { - alleleCounts[allele2]++; - } - Map str = new HashMap(); - str.put("GQ", lod); - if (depth > 0) str.put("DP", String.valueOf(depth)); + vcfwriter = new VCFWriter(out); - VCFGenotypeRecord rec = new VCFGenotypeRecord(name, alleles, VCFGenotypeRecord.PHASE.UNPHASED); - for ( Map.Entry entry : str.entrySet() ) - rec.setField(entry.getKey(), entry.getValue()); - gt.add(rec); + TreeSet samples = new TreeSet(); + if ( sampleName != null ) + samples.add(sampleName); + else + samples.addAll(Arrays.asList(rec.getSampleNames())); - numSNPs++; - snpQual += av.getNegLog10PError(); - } else if (BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1) { - List alleles = new ArrayList(); - alleles.add(new VCFGenotypeEncoding(String.valueOf(ref.getBase()))); - alleles.add(new VCFGenotypeEncoding(String.valueOf(ref.getBase()))); - gt.add(new VCFGenotypeRecord(name, alleles, VCFGenotypeRecord.PHASE.UNPHASED)); + vcfwriter.writeHeader(new VCFHeader(hInfo, samples)); - numRefs++; - } } - - if (numSNPs == 0) - return null; - - - Integer[] perm = Utils.SortPermutation(alleleCounts); - int[] sortedCounts = Utils.PermuteArray(alleleCounts, perm); - int[] sortedNames = Utils.PermuteArray(alleleNames, perm); - - rodDbSNP dbsnp = (rodDbSNP) tracker.lookup("dbsnp", null); - - String infoString = String.format("locus=%s ref=%c allele_count=( %c:%d %c:%d %c:%d %c:%d )", - context.getLocation(), - ref.getBase(), - BaseUtils.baseIndexToSimpleBase(sortedNames[0]), sortedCounts[0], - BaseUtils.baseIndexToSimpleBase(sortedNames[1]), sortedCounts[1], - BaseUtils.baseIndexToSimpleBase(sortedNames[2]), sortedCounts[2], - BaseUtils.baseIndexToSimpleBase(sortedNames[3]), sortedCounts[3] - ); - - if (SUPPRESS_MULTISTATE && sortedCounts[2] > 0) { - out.println("[multistate] " + infoString); - return null; - } else { - if (VERBOSE) { - out.println("[locus_info] " + infoString); - } - } - - Map info = new HashMap(); - if (dbsnp != null) info.put("DB","1"); - if (dbsnp != null && dbsnp.isHapmap()) info.put("H2","1"); - - return new VCFRecord(Character.toString(ref.getBase()), - context.getContig(), - (int) context.getPosition(), - (dbsnp == null) ? "." : dbsnp.getRS_ID(), - alts, - snpQual > 99 ? 99 : (int) snpQual, - "0", - info, - format, - gt); - + vcfwriter.addRecord(rec); } public Integer reduceInit() { @@ -208,8 +78,6 @@ public class VariantsToVCF extends RefWalker { } public void onTraversalDone(Integer sum) { - out.println("Processed " + sum + " variants."); - vcfwriter.close(); } } diff --git a/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCFIntegrationTest.java b/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCFIntegrationTest.java index b623c25c6..0516eedca 100755 --- a/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCFIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/playground/gatk/walkers/variantstovcf/VariantsToVCFIntegrationTest.java @@ -5,7 +5,6 @@ import org.junit.Test; import java.util.List; import java.util.ArrayList; -import java.io.File; /** @@ -18,62 +17,38 @@ import java.io.File; public class VariantsToVCFIntegrationTest extends WalkerTest { - //@Test + @Test public void testVariantsToVCFUsingGeliInput() { List md5 = new ArrayList(); - md5.add("a94c15f2e8905fd3e98301375cf0f42a"); + md5.add("8f32f0efed5d0233cf9292198f4f01d8"); - /** - * the above MD5 was calculated from running the following command: - * - * java -jar ./dist/GenomeAnalysisTK.jar \ - * -R /broad/1KG/reference/human_b36_both.fasta \ - * -T VariantEval \ - * --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \ - * -L 1:10,000,000-11,000,000 \ - * --outerr myVariantEval \ - * --supressDateInformation \ - * --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls - * - */ WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + oneKGLocation + "reference/human_b36_both.fasta" + - " --rodBind NA123AB,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" + + " -B variant,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls" + " -T VariantsToVCF" + " -L 1:10,000,000-11,000,000" + - " --vcfout %s", + " -sample NA123AB" + + " -o %s", 1, // just one output file md5); - List result = executeTest("testVariantsToVCFUsingGeliInput", spec).getFirst(); + executeTest("testVariantsToVCFUsingGeliInput #1", spec).getFirst(); } - //@Test + @Test public void testGenotypesToVCFUsingGeliInput() { List md5 = new ArrayList(); - md5.add("6b18f33e25edbd2154c17a949656644b"); + md5.add("40cc4d04d9a50043ce1322ea2650c453"); - /** - * the above MD5 was calculated from running the following command: - * - * java -jar ./dist/GenomeAnalysisTK.jar \ - * -R /broad/1KG/reference/human_b36_both.fasta \ - * -T VariantEval \ - * --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod \ - * -L 1:10,000,000-11,000,000 \ - * --outerr myVariantEval \ - * --supressDateInformation \ - * --rodBind eval,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls - * - */ WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + oneKGLocation + "reference/human_b36_both.fasta" + - " --rodBind NA123AB,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.genotypes.geli.calls" + + " -B variant,Variants," + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.genotypes.geli.calls" + " -T VariantsToVCF" + " -L 1:10,000,000-11,000,000" + - " --vcfout %s", + " -sample NA123AB" + + " -o %s", 1, // just one output file md5); - List result = executeTest("testVariantsToVCFUsingGeliInput", spec).getFirst(); + executeTest("testVariantsToVCFUsingGeliInput #2", spec).getFirst(); } }