From ccac77d888f3a0c2cd5e3d8d2c0e7c13c21596af Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 1 Aug 2012 11:57:16 -0400 Subject: [PATCH] Bugfix for incorrect allele counting in IndelSummary -- Previous version would count all alt alleles as present in a sample, even if only one was present, because of the way VariantEval subsetted VCs -- Updated code for subsetting VCs by sample to be clearer about how it handles rederiving alleles -- Update a few pieces of code to get previous correct behavior -- Updated a few MD5s as now ref calls at sites in dbSNP are counted as having a comp site, and therefore show up in known sites when Novelty strat is on (which I think is correct) -- Walkers that used old subsetting function with true are now using clearer version that does rederive alleles by default --- .../walkers/phasing/ReadBackedPhasing.java | 2 +- .../GLBasedSampleSelector.java | 2 +- .../GTBasedSampleSelector.java | 2 +- .../gatk/walkers/varianteval/VariantEval.java | 5 ++++- .../evaluators/TiTvVariantEvaluator.java | 9 ++++++--- .../varianteval/stratifications/Novelty.java | 2 +- .../varianteval/util/VariantEvalUtils.java | 12 +++++------- .../utils/variantcontext/VariantContext.java | 18 +++++++++++++++--- .../VariantEvalIntegrationTest.java | 6 +++--- .../VariantContextBenchmark.java | 2 +- 10 files changed, 38 insertions(+), 22 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java index f16deb701..f49e8f8c0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java @@ -288,7 +288,7 @@ public class ReadBackedPhasing extends RodWalker samplesToPhase) { // for ( String sample : samplesToPhase ) // logger.debug(String.format(" Sample %s has genotype %s, het = %s", sample, 
vc.getGenotype(sample), vc.getGenotype(sample).isHet() )); - VariantContext subvc = vc.subContextFromSamples(samplesToPhase, true); + VariantContext subvc = vc.subContextFromSamples(samplesToPhase); // logger.debug("original VC = " + vc); // logger.debug("sub VC = " + subvc); return VariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java index e54dc6388..3e48520a7 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java @@ -43,7 +43,7 @@ public class GLBasedSampleSelector extends SampleSelector { return true; // want to include a site in the given samples if it is *likely* to be variant (via the EXACT model) // first subset to the samples - VariantContext subContext = vc.subContextFromSamples(samples, true); + VariantContext subContext = vc.subContextFromSamples(samples); // now check to see (using EXACT model) whether this should be variant // do we want to apply a prior? maybe user-spec? 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java index 0f55524a6..de832b108 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java @@ -45,7 +45,7 @@ public class GTBasedSampleSelector extends SampleSelector{ if ( samples == null || samples.isEmpty() ) return true; - VariantContext subContext = vc.subContextFromSamples(samples, false); + VariantContext subContext = vc.subContextFromSamples(samples); if ( subContext.isPolymorphicInSamples() ) { return true; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java index 0b395bc62..58cd14737 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java @@ -500,7 +500,10 @@ public class VariantEval extends RodWalker implements TreeRedu @Requires({"eval != null", "comp != null"}) private EvalCompMatchType doEvalAndCompMatch(final VariantContext eval, final VariantContext comp, boolean requireStrictAlleleMatch) { - // find all of the matching comps + if ( comp.getType() == VariantContext.Type.NO_VARIATION || eval.getType() == VariantContext.Type.NO_VARIATION ) + // if either of these are NO_VARIATION they are LENIENT matches + return EvalCompMatchType.LENIENT; + if ( comp.getType() != eval.getType() ) return EvalCompMatchType.NO_MATCH; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java index 6c4fcd26d..fe2437976 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java @@ -57,9 +57,12 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv } } - public void update2(VariantContext vc1, VariantContext vc2, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (vc1 != null) updateTiTv(vc1, false); - if (vc2 != null) updateTiTv(vc2, true); + @Override + public void update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if (eval != null) + updateTiTv(eval, false); + if (comp != null) + updateTiTv(comp, true); } @Override diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java index 693bdf198..2ad08d806 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java @@ -28,7 +28,7 @@ public class Novelty extends VariantStratifier implements StandardStratification final Collection knownComps = tracker.getValues(knowns, ref.getLocus()); for ( final VariantContext c : knownComps ) { // loop over sites, looking for something that matches the type eval - if ( eval.getType() == c.getType() ) { + if ( eval.getType() == c.getType() || eval.getType() == VariantContext.Type.NO_VARIATION ) { return KNOWN_STATES; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index 3dcc1f85f..e84b0b10e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -197,7 +197,9 @@ public class VariantEvalUtils { * @return a new VariantContext with just the requested samples */ public VariantContext getSubsetOfVariantContext(VariantContext vc, Set sampleNames) { - return ensureAnnotations(vc, vc.subContextFromSamples(sampleNames, false)); + // if we want to preserve AC0 sites as polymorphic we need to not rederive alleles + final boolean deriveAlleles = variantEvalWalker.ignoreAC0Sites(); + return ensureAnnotations(vc, vc.subContextFromSamples(sampleNames, deriveAlleles)); } public VariantContext ensureAnnotations(final VariantContext vc, final VariantContext vcsub) { @@ -262,12 +264,8 @@ public class VariantEvalUtils { // First, filter the VariantContext to represent only the samples for evaluation VariantContext vcsub = vc; - if (subsetBySample && vc.hasGenotypes()) { - if ( variantEvalWalker.isSubsettingToSpecificSamples() ) - vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation()); - else - vcsub = ensureAnnotations(vc, vc); - } + if (subsetBySample && vc.hasGenotypes()) + vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation()); if ((byFilter || !vcsub.isFiltered())) { addMapping(mapping, VariantEval.getAllSampleName(), vcsub); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 979400350..2211cfe5e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -334,12 +334,14 @@ public 
class VariantContext implements Feature { // to enable tribble integratio * in this VC is returned as the set of alleles in the subContext, even if * some of those alleles aren't in the samples * + * WARNING: BE CAREFUL WITH rederiveAllelesFromGenotypes UNLESS YOU KNOW WHAT YOU ARE DOING? + * * @param sampleNames the sample names - * @param rederiveAllelesFromGenotypes if true, returns the alleles to just those in use by the samples + * @param rederiveAllelesFromGenotypes if true, returns the alleles to just those in use by the samples, true should be default * @return new VariantContext subsetting to just the given samples */ public VariantContext subContextFromSamples(Set sampleNames, final boolean rederiveAllelesFromGenotypes ) { - if ( sampleNames.containsAll(getSampleNames()) ) { + if ( sampleNames.containsAll(getSampleNames()) && ! rederiveAllelesFromGenotypes ) { return this; // fast path when you don't have any work to do } else { VariantContextBuilder builder = new VariantContextBuilder(this); @@ -355,8 +357,18 @@ public class VariantContext implements Feature { // to enable tribble integratio } } + /** + * @see #subContextFromSamples(java.util.Set, boolean) with rederiveAllelesFromGenotypes = true + * + * @param sampleNames + * @return + */ + public VariantContext subContextFromSamples(final Set sampleNames) { + return subContextFromSamples(sampleNames, true); + } + public VariantContext subContextFromSample(String sampleName) { - return subContextFromSamples(Collections.singleton(sampleName), true); + return subContextFromSamples(Collections.singleton(sampleName)); } /** diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index d9a91c4c2..94e52c2b9 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -34,7 +34,7 @@ import java.util.Arrays; import java.util.List; public class VariantEvalIntegrationTest extends WalkerTest { - private static String variantEvalTestDataRoot = validationDataLocation + "VariantEval/"; + private static String variantEvalTestDataRoot = privateTestDir + "VariantEval/"; private static String fundamentalTestVCF = variantEvalTestDataRoot + "FundamentalsTest.annotated.db.subset.snps_and_indels.vcf"; private static String fundamentalTestSNPsVCF = variantEvalTestDataRoot + "FundamentalsTest.annotated.db.subset.final.vcf"; private static String fundamentalTestSNPsWithMLEVCF = variantEvalTestDataRoot + "FundamentalsTest.annotated.db.subset.final.withMLE.vcf"; @@ -122,7 +122,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("e62a3bd9914d48e2bb2fb4f5dfc5ebc0") + Arrays.asList("40abbc9be663aed8ee1158f832463ca8") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNovelty", spec); } @@ -144,7 +144,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("087a2d9943c53e7f49663667c3305c7e") + Arrays.asList("106a0e8753e839c0a2c030eb4b165fa9") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNoveltyAndFilter", spec); } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java index 7c522eadf..0e5522e3a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java @@ -152,7 +152,7 @@ public class VariantContextBenchmark extends SimpleBenchmark { public void run(final VariantContext vc) { if ( samples == null ) samples = new HashSet(new 
ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); - VariantContext sub = vc.subContextFromSamples(samples, true); + VariantContext sub = vc.subContextFromSamples(samples); sub.getNSamples(); } };