Bugfix for incorrect allele counting in IndelSummary

-- Previous version would count all alt alleles as present in a sample, even if only 1 were present, because of the way VariantEval subsetted VCs
-- Updated code for subsetting VCs by sample to be clearer about how it handles rederiving alleles
-- Update a few pieces of code to get previous correct behavior
-- Updated a few MD5s as now ref calls at sites in dbSNP are counted as having a comp sites, and therefore show up in known sites when Novelty strat is on (which I think is correct)
-- Walkers that used old subsetting function with true are now using clearer version that does rederive alleles by default
This commit is contained in:
Mark DePristo 2012-08-01 11:57:16 -04:00
parent 2b25df3d53
commit ccac77d888
10 changed files with 38 additions and 22 deletions

View File

@ -288,7 +288,7 @@ public class ReadBackedPhasing extends RodWalker<PhasingStatsAndOutput, PhasingS
private VariantContext reduceVCToSamples(VariantContext vc, Set<String> samplesToPhase) { private VariantContext reduceVCToSamples(VariantContext vc, Set<String> samplesToPhase) {
// for ( String sample : samplesToPhase ) // for ( String sample : samplesToPhase )
// logger.debug(String.format(" Sample %s has genotype %s, het = %s", sample, vc.getGenotype(sample), vc.getGenotype(sample).isHet() )); // logger.debug(String.format(" Sample %s has genotype %s, het = %s", sample, vc.getGenotype(sample), vc.getGenotype(sample).isHet() ));
VariantContext subvc = vc.subContextFromSamples(samplesToPhase, true); VariantContext subvc = vc.subContextFromSamples(samplesToPhase);
// logger.debug("original VC = " + vc); // logger.debug("original VC = " + vc);
// logger.debug("sub VC = " + subvc); // logger.debug("sub VC = " + subvc);
return VariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF); return VariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF);

View File

@ -43,7 +43,7 @@ public class GLBasedSampleSelector extends SampleSelector {
return true; return true;
// want to include a site in the given samples if it is *likely* to be variant (via the EXACT model) // want to include a site in the given samples if it is *likely* to be variant (via the EXACT model)
// first subset to the samples // first subset to the samples
VariantContext subContext = vc.subContextFromSamples(samples, true); VariantContext subContext = vc.subContextFromSamples(samples);
// now check to see (using EXACT model) whether this should be variant // now check to see (using EXACT model) whether this should be variant
// do we want to apply a prior? maybe user-spec? // do we want to apply a prior? maybe user-spec?

View File

@ -45,7 +45,7 @@ public class GTBasedSampleSelector extends SampleSelector{
if ( samples == null || samples.isEmpty() ) if ( samples == null || samples.isEmpty() )
return true; return true;
VariantContext subContext = vc.subContextFromSamples(samples, false); VariantContext subContext = vc.subContextFromSamples(samples);
if ( subContext.isPolymorphicInSamples() ) { if ( subContext.isPolymorphicInSamples() ) {
return true; return true;
} }

View File

@ -500,7 +500,10 @@ public class VariantEval extends RodWalker<Integer, Integer> implements TreeRedu
@Requires({"eval != null", "comp != null"}) @Requires({"eval != null", "comp != null"})
private EvalCompMatchType doEvalAndCompMatch(final VariantContext eval, final VariantContext comp, boolean requireStrictAlleleMatch) { private EvalCompMatchType doEvalAndCompMatch(final VariantContext eval, final VariantContext comp, boolean requireStrictAlleleMatch) {
// find all of the matching comps if ( comp.getType() == VariantContext.Type.NO_VARIATION || eval.getType() == VariantContext.Type.NO_VARIATION )
// if either of these are NO_VARIATION they are LENIENT matches
return EvalCompMatchType.LENIENT;
if ( comp.getType() != eval.getType() ) if ( comp.getType() != eval.getType() )
return EvalCompMatchType.NO_MATCH; return EvalCompMatchType.NO_MATCH;

View File

@ -57,9 +57,12 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv
} }
} }
public void update2(VariantContext vc1, VariantContext vc2, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { @Override
if (vc1 != null) updateTiTv(vc1, false); public void update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if (vc2 != null) updateTiTv(vc2, true); if (eval != null)
updateTiTv(eval, false);
if (comp != null)
updateTiTv(comp, true);
} }
@Override @Override

View File

@ -28,7 +28,7 @@ public class Novelty extends VariantStratifier implements StandardStratification
final Collection<VariantContext> knownComps = tracker.getValues(knowns, ref.getLocus()); final Collection<VariantContext> knownComps = tracker.getValues(knowns, ref.getLocus());
for ( final VariantContext c : knownComps ) { for ( final VariantContext c : knownComps ) {
// loop over sites, looking for something that matches the type eval // loop over sites, looking for something that matches the type eval
if ( eval.getType() == c.getType() ) { if ( eval.getType() == c.getType() || eval.getType() == VariantContext.Type.NO_VARIATION ) {
return KNOWN_STATES; return KNOWN_STATES;
} }
} }

View File

@ -197,7 +197,9 @@ public class VariantEvalUtils {
* @return a new VariantContext with just the requested samples * @return a new VariantContext with just the requested samples
*/ */
public VariantContext getSubsetOfVariantContext(VariantContext vc, Set<String> sampleNames) { public VariantContext getSubsetOfVariantContext(VariantContext vc, Set<String> sampleNames) {
return ensureAnnotations(vc, vc.subContextFromSamples(sampleNames, false)); // if we want to preserve AC0 sites as polymorphic we need to not rederive alleles
final boolean deriveAlleles = variantEvalWalker.ignoreAC0Sites();
return ensureAnnotations(vc, vc.subContextFromSamples(sampleNames, deriveAlleles));
} }
public VariantContext ensureAnnotations(final VariantContext vc, final VariantContext vcsub) { public VariantContext ensureAnnotations(final VariantContext vc, final VariantContext vcsub) {
@ -262,12 +264,8 @@ public class VariantEvalUtils {
// First, filter the VariantContext to represent only the samples for evaluation // First, filter the VariantContext to represent only the samples for evaluation
VariantContext vcsub = vc; VariantContext vcsub = vc;
if (subsetBySample && vc.hasGenotypes()) { if (subsetBySample && vc.hasGenotypes())
if ( variantEvalWalker.isSubsettingToSpecificSamples() ) vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation());
vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation());
else
vcsub = ensureAnnotations(vc, vc);
}
if ((byFilter || !vcsub.isFiltered())) { if ((byFilter || !vcsub.isFiltered())) {
addMapping(mapping, VariantEval.getAllSampleName(), vcsub); addMapping(mapping, VariantEval.getAllSampleName(), vcsub);

View File

@ -334,12 +334,14 @@ public class VariantContext implements Feature { // to enable tribble integratio
* in this VC is returned as the set of alleles in the subContext, even if * in this VC is returned as the set of alleles in the subContext, even if
* some of those alleles aren't in the samples * some of those alleles aren't in the samples
* *
* WARNING: BE CAREFUL WITH rederiveAllelesFromGenotypes UNLESS YOU KNOW WHAT YOU ARE DOING?
*
* @param sampleNames the sample names * @param sampleNames the sample names
* @param rederiveAllelesFromGenotypes if true, returns the alleles to just those in use by the samples * @param rederiveAllelesFromGenotypes if true, returns the alleles to just those in use by the samples, true should be default
* @return new VariantContext subsetting to just the given samples * @return new VariantContext subsetting to just the given samples
*/ */
public VariantContext subContextFromSamples(Set<String> sampleNames, final boolean rederiveAllelesFromGenotypes ) { public VariantContext subContextFromSamples(Set<String> sampleNames, final boolean rederiveAllelesFromGenotypes ) {
if ( sampleNames.containsAll(getSampleNames()) ) { if ( sampleNames.containsAll(getSampleNames()) && ! rederiveAllelesFromGenotypes ) {
return this; // fast path when you don't have any work to do return this; // fast path when you don't have any work to do
} else { } else {
VariantContextBuilder builder = new VariantContextBuilder(this); VariantContextBuilder builder = new VariantContextBuilder(this);
@ -355,8 +357,18 @@ public class VariantContext implements Feature { // to enable tribble integratio
} }
} }
/**
* @see #subContextFromSamples(java.util.Set, boolean) with rederiveAllelesFromGenotypes = true
*
* @param sampleNames
* @return
*/
public VariantContext subContextFromSamples(final Set<String> sampleNames) {
return subContextFromSamples(sampleNames, true);
}
public VariantContext subContextFromSample(String sampleName) { public VariantContext subContextFromSample(String sampleName) {
return subContextFromSamples(Collections.singleton(sampleName), true); return subContextFromSamples(Collections.singleton(sampleName));
} }
/** /**

View File

@ -34,7 +34,7 @@ import java.util.Arrays;
import java.util.List; import java.util.List;
public class VariantEvalIntegrationTest extends WalkerTest { public class VariantEvalIntegrationTest extends WalkerTest {
private static String variantEvalTestDataRoot = validationDataLocation + "VariantEval/"; private static String variantEvalTestDataRoot = privateTestDir + "VariantEval/";
private static String fundamentalTestVCF = variantEvalTestDataRoot + "FundamentalsTest.annotated.db.subset.snps_and_indels.vcf"; private static String fundamentalTestVCF = variantEvalTestDataRoot + "FundamentalsTest.annotated.db.subset.snps_and_indels.vcf";
private static String fundamentalTestSNPsVCF = variantEvalTestDataRoot + "FundamentalsTest.annotated.db.subset.final.vcf"; private static String fundamentalTestSNPsVCF = variantEvalTestDataRoot + "FundamentalsTest.annotated.db.subset.final.vcf";
private static String fundamentalTestSNPsWithMLEVCF = variantEvalTestDataRoot + "FundamentalsTest.annotated.db.subset.final.withMLE.vcf"; private static String fundamentalTestSNPsWithMLEVCF = variantEvalTestDataRoot + "FundamentalsTest.annotated.db.subset.final.withMLE.vcf";
@ -122,7 +122,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
"-o %s" "-o %s"
), ),
1, 1,
Arrays.asList("e62a3bd9914d48e2bb2fb4f5dfc5ebc0") Arrays.asList("40abbc9be663aed8ee1158f832463ca8")
); );
executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNovelty", spec); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNovelty", spec);
} }
@ -144,7 +144,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
"-o %s" "-o %s"
), ),
1, 1,
Arrays.asList("087a2d9943c53e7f49663667c3305c7e") Arrays.asList("106a0e8753e839c0a2c030eb4b165fa9")
); );
executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNoveltyAndFilter", spec); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNoveltyAndFilter", spec);
} }

View File

@ -152,7 +152,7 @@ public class VariantContextBenchmark extends SimpleBenchmark {
public void run(final VariantContext vc) { public void run(final VariantContext vc) {
if ( samples == null ) if ( samples == null )
samples = new HashSet<String>(new ArrayList<String>(vc.getSampleNames()).subList(0, nSamplesToTake)); samples = new HashSet<String>(new ArrayList<String>(vc.getSampleNames()).subList(0, nSamplesToTake));
VariantContext sub = vc.subContextFromSamples(samples, true); VariantContext sub = vc.subContextFromSamples(samples);
sub.getNSamples(); sub.getNSamples();
} }
}; };