From 934d4b93a2097191ef03414210f5dafc709def5f Mon Sep 17 00:00:00 2001 From: depristo Date: Tue, 9 Feb 2010 19:02:25 +0000 Subject: [PATCH] VariantContext to VCF converter. BeagleROD, and phasing of VCF calls. Integration tests galore :-) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2814 348d0f76-0448-11de-a6fe-93d51630548a --- .../contexts/variantcontext/Genotype.java | 1 + .../gatk/refdata/VariantContextAdaptors.java | 18 ++- .../walkers/TestVariantContextWalker.java | 26 ++++ .../MendelianViolationEvaluator.java | 6 +- .../varianteval2/VariantEval2Walker.java | 3 + .../vcftools/BeagleTrioToVCFWalker.java | 129 ++++++++++++++++++ .../walkers/vcftools/VCFToBeagleWalker.java | 114 ++++++++++++++++ .../org/broadinstitute/sting/utils/Utils.java | 10 ++ .../VariantContextIntegrationTest.java | 18 ++- 9 files changed, 318 insertions(+), 7 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/BeagleTrioToVCFWalker.java create mode 100755 java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/VCFToBeagleWalker.java diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Genotype.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Genotype.java index c6c2e7405..224ec6493 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Genotype.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/Genotype.java @@ -108,6 +108,7 @@ public class Genotype { * @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF) */ public boolean isNoCall() { return getType() == Type.NO_CALL; } + public boolean isCalled() { return getType() != Type.NO_CALL; } public void validate() { // todo -- add validation checking here diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index c9d26ad04..502e139b8 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -1,8 +1,6 @@ package org.broadinstitute.sting.gatk.refdata; -import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord; -import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeEncoding; -import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord; +import org.broadinstitute.sting.utils.genotype.vcf.*; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; @@ -182,6 +180,16 @@ public class VariantContextAdaptors { return null; // can't handle anything else } + + public static VCFHeader createVCFHeader(Set hInfo, VariantContext vc) { + HashSet names = new LinkedHashSet(); + for ( Genotype g : vc.getGenotypesSortedByName() ) { + names.add(g.getSampleName()); + } + + return new VCFHeader(hInfo == null ? new HashSet() : hInfo, names); + } + public static VCFRecord toVCF(VariantContext vc) { // deal with the reference char referenceBase = 'N'; // by default we'll use N @@ -189,6 +197,10 @@ public class VariantContextAdaptors { referenceBase = (char)vc.getReference().getBases()[0]; } + if ( ! vc.isSNP() ) + // todo -- update the code so it works correctly with indels + throw new StingException("VariantContext -> VCF converter currently doesn't support indels; complain to the GATK team"); + String contig = vc.getLocation().getContig(); long position = vc.getLocation().getStart(); String ID = vc.hasAttribute("ID") ? vc.getAttributeAsString("ID") : "."; diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestVariantContextWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestVariantContextWalker.java index 665367ee4..f12a4dbe9 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestVariantContextWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/TestVariantContextWalker.java @@ -5,10 +5,16 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.vcf.*; import org.broadinstitute.sting.utils.cmdLine.Argument; import java.util.EnumSet; +import java.util.Set; +import java.util.HashSet; +import java.util.List; +import java.io.File; /** * Test routine for new VariantContext object @@ -26,6 +32,17 @@ public class TestVariantContextWalker extends RodWalker { @Argument(fullName="printPerLocus", doc="If true, we'll print the variant contexts, in addition to counts", required=false) boolean printContexts = false; + @Argument(fullName="outputVCF", doc="If provided, we'll convert the first input context into a VCF", required=false) + String outputVCF = null; + + private VCFWriter writer = null; + private boolean wroteHeader = false; + + public void initialize() { + if ( outputVCF != null ) + writer = new VCFWriter(new File(outputVCF)); + } + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( ref == null ) return 0; @@ -34,6 +51,15 @@ public class TestVariantContextWalker extends RodWalker { int n = 0; for (VariantContext vc : tracker.getAllVariantContexts(allowedTypes, context.getLocation(), onlyContextsStartinAtCurrentPosition, takeFirstOnly) ) { + if ( writer != null && n == 0 ) { + if ( ! wroteHeader ) { + writer.writeHeader(VariantContextAdaptors.createVCFHeader(null, vc)); + wroteHeader = true; + } + + writer.addRecord(VariantContextAdaptors.toVCF(vc)); + } + n++; if ( printContexts ) out.printf(" %s%n", vc); } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/MendelianViolationEvaluator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/MendelianViolationEvaluator.java index a1b35c90d..647a20826 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/MendelianViolationEvaluator.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/MendelianViolationEvaluator.java @@ -98,7 +98,7 @@ public class MendelianViolationEvaluator extends VariantEvaluator { Genotype dadG = vc.getGenotype(trio.dad); Genotype childG = vc.getGenotype(trio.child); - if ( momG.getNegLog10PError() > getQThreshold() && dadG.getNegLog10PError() > getQThreshold() && childG.getNegLog10PError() > getQThreshold() ) { + if ( includeGenotype(momG) && includeGenotype(dadG) && includeGenotype(childG) ) { // all genotypes are good, so let's see if child is a violation if ( isViolation(vc, momG, dadG, childG) ) { @@ -152,6 +152,10 @@ public class MendelianViolationEvaluator extends VariantEvaluator { } } + private boolean includeGenotype(Genotype g) { + return g.getNegLog10PError() > getQThreshold() && g.isCalled(); + } + private enum ViolationType { UNDER_CALL, OVER_CALL } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java index 0296170ab..37c10b488 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java @@ -196,6 +196,9 @@ public class VariantEval2Walker extends RodWalker { public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { //System.out.printf("map at %s with %d skipped%n", context.getLocation(), context.getSkippedBases()); + if ( ref == null ) + return 0; + Map comps = getCompVariantContexts(tracker, context); // to enable walking over pairs where eval or comps have no elements diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/BeagleTrioToVCFWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/BeagleTrioToVCFWalker.java new file mode 100755 index 000000000..15c590e71 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/BeagleTrioToVCFWalker.java @@ -0,0 +1,129 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.vcftools; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; +import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.RMD; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord; +import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter; +import org.broadinstitute.sting.utils.genotype.vcf.VCFHeader; +import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderLine; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.oneoffprojects.walkers.varianteval2.MendelianViolationEvaluator; + +import java.util.*; +import java.util.regex.Pattern; +import java.util.regex.Matcher; +import java.io.File; +import java.io.FileNotFoundException; + +/** + * Test routine for new VariantContext object + */ +@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="variants",type=ReferenceOrderedDatum.class), @RMD(name="beagle",type=BeagleROD.class)}) +public class BeagleTrioToVCFWalker extends RodWalker { + @Argument(shortName="trio", doc="If provide, treats the input VCF as a single record containing genotypes for a single trio; String formatted as dad+mom=child", required=false) + protected String TRIO_STRUCTURE; + + @Argument(shortName="eth", fullName="excludeTripleHets", doc="If provide, sites that are triple hets calls will not be phased, regardless of Beagle's value", required=false) + protected boolean dontPhaseTripleHets = false; + + private MendelianViolationEvaluator.TrioStructure trio = null; + + private VCFWriter writer; + private boolean headerWritten = false; + private final static String TRACK_NAME = "variants"; + private final static String BEAGLE_NAME = "beagle"; + + public void initialize() { + trio = MendelianViolationEvaluator.parseTrioDescription(TRIO_STRUCTURE); + writer = new VCFWriter(out); + } + + public VariantContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + VariantContext vc = null; + + if ( ref != null ) { + vc = tracker.getVariantContext(TRACK_NAME, null, context.getLocation(), false); + BeagleROD beagle = (BeagleROD)tracker.lookup(BEAGLE_NAME, null); + + if ( vc != null ) { + if ( ! headerWritten ) { + RodVCF vcfrod = (RodVCF)tracker.lookup(TRACK_NAME, null); + writer.writeHeader(vcfrod.getHeader()); + headerWritten = true; + } + + //System.out.printf("VCF: %s%n", tracker.lookup(TRACK_NAME, null)); + vc = maybePhaseVC(vc, beagle); + } + } + + return vc; + } + + private VariantContext maybePhaseVC(VariantContext unphased, BeagleROD beagle) { + if ( beagle == null ) { + return unphased; + } else { + Map> bglData = beagle.getGenotypes(); + List momBgl = bglData.get(trio.mom); + List dadBgl = bglData.get(trio.dad); + + Genotype unphasedMom = unphased.getGenotype(trio.mom); + Genotype unphasedDad = unphased.getGenotype(trio.dad); + Genotype unphasedKid = unphased.getGenotype(trio.child); + + if ( dontPhaseTripleHets && unphasedMom.isHet() && unphasedDad.isHet() && unphasedKid.isHet() ) + return unphased; + else { + Allele momTrans = unphased.getAllele(momBgl.get(0)); + Allele momUntrans = unphased.getAllele(momBgl.get(1)); + Allele dadTrans = unphased.getAllele(dadBgl.get(0)); + Allele dadUntrans = unphased.getAllele(dadBgl.get(1)); + + Genotype momG = phaseGenotype(unphasedMom, Arrays.asList(momTrans, momUntrans)); + Genotype dadG = phaseGenotype(unphasedDad, Arrays.asList(dadTrans, dadUntrans)); + Genotype kidG = phaseGenotype(unphasedKid, Arrays.asList(momTrans, dadTrans)); + + return new VariantContext(unphased.getName(), unphased.getLocation(), unphased.getAlleles(), + Arrays.asList(momG, dadG, kidG), unphased.getNegLog10PError(), unphased.getFilters(), unphased.getAttributes()); + } + } + } + + private Genotype phaseGenotype(Genotype base, List alleles) { + return new Genotype(base.getSampleName(), alleles, base.getNegLog10PError(), base.getFilters(), base.getAttributes(), true); + } + + public Long reduceInit() { + return 0L; + } + + public Integer reduce(VariantContext point, Integer sum) { + return sum; + } + + public void onTraversalDone(Integer result) { + //logger.info(String.format("Saw %d raw SNPs", result.nVariants)); + //logger.info(String.format("Converted %d (%.2f%%) of these sites", result.nConverted, (100.0 * result.nConverted) / result.nVariants)); + } + + public Long reduce(VariantContext vc, Long prevReduce) { + if ( vc == null ) { + return prevReduce; + } else { + writer.addRecord(VariantContextAdaptors.toVCF(vc)); + prevReduce++; + } + + return prevReduce; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/VCFToBeagleWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/VCFToBeagleWalker.java new file mode 100755 index 000000000..0acfb475c --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/vcftools/VCFToBeagleWalker.java @@ -0,0 +1,114 @@ +package org.broadinstitute.sting.oneoffprojects.walkers.vcftools; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype; +import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.RMD; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.oneoffprojects.walkers.varianteval2.MendelianViolationEvaluator; + +import java.util.EnumSet; +import java.util.Arrays; + +/** + * Test routine for new VariantContext object + */ +@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="variants",type=ReferenceOrderedDatum.class)}) +public class VCFToBeagleWalker extends RodWalker { + @Argument(shortName="trio", doc="If provide, treats the input VCF as a single record containing genotypes for a single trio; String formatted as dad+mom=child", required=false) + protected String TRIO_STRUCTURE; + + private MendelianViolationEvaluator.TrioStructure trio = null; + + public class Result { + int nVariants, nConverted; + } + + public void initialize() { + if ( TRIO_STRUCTURE != null ) { + trio = MendelianViolationEvaluator.parseTrioDescription(TRIO_STRUCTURE); + out.printf("I id %s%n", Utils.join(" ", Arrays.asList(trio.mom, trio.mom, trio.dad, trio.dad, trio.child, trio.child))); + } + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( ref != null ) { + EnumSet allowedTypes = EnumSet.of(VariantContext.Type.SNP); + + VariantContext vc = tracker.getVariantContext("variants", allowedTypes, context.getLocation(), false); + + if ( vc != null && vc.isBiallelic() && vc.isNotFiltered() ) { + if ( trio != null ) { // we are emitting a trio file + if ( ! vc.hasGenotypes() || vc.getGenotypes().size() != 3 ) + throw new StingException("Convertion exception: Trio conversion requires exactly three genotypes at every locus: " + vc); + + if ( genotypesAreGood(vc) ) { + if ( ! genotypesAreGoodForTrios(vc, trio) ) { + logger.debug("VC excluded due to poor trio genotyping " + vc); + } else { + Genotype mom = vc.getGenotype(trio.mom); + Genotype dad = vc.getGenotype(trio.dad); + Genotype child = vc.getGenotype(trio.child); + + // beagle format looks like: + // + // I id 1001 1001 1002 1002 1003 1003 + // A diabetes 1 1 2 2 2 2 + // M rs2289311 A G G G A G + String loc = "c" + vc.getLocation().getContig() + "_p" + vc.getLocation().getStart(); + out.printf("M %s %s %s %s%n", loc, genotype2BeagleString(mom), genotype2BeagleString(dad), genotype2BeagleString(child)); + return 1; + } + } + } else { + throw new IllegalArgumentException("VCFToBeagle currently only supports conversion of trios. Complain to mark"); + } + } + } + + return 0; + } + + private String genotype2BeagleString(Genotype g) { + return allele2BeagleString(g.getAllele(0)) + " " + allele2BeagleString(g.getAllele(1)); + } + + private String allele2BeagleString(Allele a) { + return new String(a.getBases()); + } + + private static boolean genotypesAreGood(VariantContext vc) { + for ( Genotype g : vc.getGenotypes().values() ) { + if ( g.isFiltered() ) + return false; + } + + return true; + } + + private static boolean genotypesAreGoodForTrios(VariantContext vc, MendelianViolationEvaluator.TrioStructure trio) { + return ! MendelianViolationEvaluator.isViolation(vc, vc.getGenotype(trio.mom), vc.getGenotype(trio.dad), vc.getGenotype(trio.child)); + } + + public Result reduceInit() { + return new Result(); + } + + public Result reduce(Integer point, Result sum) { + sum.nVariants++; + sum.nConverted += point; + return sum; + } + + public void onTraversalDone(Result result) { + logger.info(String.format("Saw %d raw SNPs", result.nVariants)); + logger.info(String.format("Converted %d (%.2f%%) of these sites", result.nConverted, (100.0 * result.nConverted) / result.nVariants)); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/Utils.java b/java/src/org/broadinstitute/sting/utils/Utils.java index 8d44c08fc..527e68146 100755 --- a/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/java/src/org/broadinstitute/sting/utils/Utils.java @@ -568,14 +568,24 @@ public class Utils { } public static > List sorted(Collection c) { + return sorted(c, false); + } + + public static > List sorted(Collection c, boolean reverse) { List l = new ArrayList(c); Collections.sort(l); + if ( reverse ) Collections.reverse(l); return l; } public static , V> List sorted(Map c) { + return sorted(c, false); + } + + public static , V> List sorted(Map c, boolean reverse) { List t = new ArrayList(c.keySet()); Collections.sort(t); + if ( reverse ) Collections.reverse(t); List l = new ArrayList(); for ( T k : t ) { diff --git a/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextIntegrationTest.java index 0bc6c97eb..545f68974 100755 --- a/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextIntegrationTest.java @@ -10,11 +10,13 @@ import java.util.List; import java.io.File; public class VariantContextIntegrationTest extends WalkerTest { - private static String root = "-T TestVariantContext" + - " -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" + - " -B vcf,VCF,/humgen/gsa-hpprojects/GATK/data/Validation_Data/yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" + + private static String cmdRoot = "-T TestVariantContext" + " -R " + oneKGLocation + "reference/human_b36_both.fasta"; + private static String root = cmdRoot + + " -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" + + " -B vcf,VCF,/humgen/gsa-hpprojects/GATK/data/Validation_Data/yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf"; + static HashMap expectations = new HashMap(); static { expectations.put("-L 1:1-10000 --printPerLocus", "eb5802e7e615fa79b788a9447b7e824c"); @@ -42,6 +44,16 @@ public class VariantContextIntegrationTest extends WalkerTest { } } + @Test + public void testToVCF() { + // this really just tests that we are seeing the same number of objects over all of chr1 + + WalkerTestSpec spec = new WalkerTestSpec( cmdRoot + " -B vcf,VCF," + validationDataLocation + "/yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.vcf -L 1:1-1000000 -o %s --outputVCF %s", + 2, // just one output file + Arrays.asList("e3c35d0c4b5d4935c84a270f9df0951f", "461960b26ee1f8998ccc47da9bd3913c")); + executeTest("testToVCF", spec); + } + @Test public void testLargeScaleConversion() { // this really just tests that we are seeing the same number of objects over all of chr1