From 327169b28360e24c025d98ad6967ff05d798ff2b Mon Sep 17 00:00:00 2001 From: Chris Hartl Date: Tue, 15 Jan 2013 12:13:45 -0500 Subject: [PATCH] Refactor the method that identifies the site overlap type into the type enum class (so it can be used elsewhere potentially). Completed todo item: for sites like (eval) 20 12345 A C 20 12345 A AC (comp) 20 12345 A C 20 12345 A ACCC the records will be matched by the presence of a non-empty intersection of alleles. Any leftover records are then paired with an empty variant context (as though the call was unique). This has one somewhat counterintuitive feature, which is that normally (eval) 20 12345 A AC (comp) 20 12345 A ACCC would be classified as 'ALLELES_DO_NOT_MATCH' (and not counted in genotype tables), in the presence of the SNP, they're counted as EVAL_ONLY and TRUTH_ONLY respectively. + integration test --- .../variantutils/ConcordanceMetrics.java | 40 +++++---- .../variantutils/GenotypeConcordance.java | 86 +++++++++++++++---- .../GenotypeConcordanceIntegrationTest.java | 11 +++ 3 files changed, 103 insertions(+), 34 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java index bb76006bf..8a87c9957 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetrics.java @@ -272,22 +272,7 @@ public class ConcordanceMetrics { @Requires({"evalVC != null","truthVC != null"}) private SiteConcordanceType getMatchType(VariantContext evalVC, VariantContext truthVC) { - if ( evalVC.isMonomorphicInSamples() ) - return SiteConcordanceType.TRUTH_ONLY; - if ( truthVC.isMonomorphicInSamples() ) - return SiteConcordanceType.EVAL_ONLY; - - boolean evalSusbsetTruth = VariantContextUtils.allelesAreSubset(evalVC,truthVC); - boolean truthSubsetEval = VariantContextUtils.allelesAreSubset(truthVC,evalVC); - - if ( evalSusbsetTruth && truthSubsetEval ) - return SiteConcordanceType.ALLELES_MATCH; - else if ( evalSusbsetTruth ) - return SiteConcordanceType.EVAL_SUBSET_TRUTH; - else if ( truthSubsetEval ) - return SiteConcordanceType.EVAL_SUPERSET_TRUTH; - - return SiteConcordanceType.ALLELES_DO_NOT_MATCH; + return SiteConcordanceType.getConcordanceType(evalVC,truthVC); } public int[] getSiteConcordance() { @@ -305,6 +290,27 @@ public class ConcordanceMetrics { EVAL_SUBSET_TRUTH, ALLELES_DO_NOT_MATCH, EVAL_ONLY, - TRUTH_ONLY + TRUTH_ONLY; + + public static SiteConcordanceType getConcordanceType(VariantContext eval, VariantContext truth) { + if ( eval.isMonomorphicInSamples() ) + return TRUTH_ONLY; + if ( truth.isMonomorphicInSamples() ) + return EVAL_ONLY; + + boolean evalSubsetTruth = VariantContextUtils.allelesAreSubset(eval,truth); + boolean truthSubsetEval = VariantContextUtils.allelesAreSubset(eval,truth); + + if ( evalSubsetTruth && truthSubsetEval ) + return ALLELES_MATCH; + + if ( evalSubsetTruth ) + return EVAL_SUBSET_TRUTH; + + if ( truthSubsetEval ) + return EVAL_SUPERSET_TRUTH; + + return ALLELES_DO_NOT_MATCH; + } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java index 07c9a0d77..0cd1882df 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java @@ -64,7 +64,7 @@ import java.util.*; /** * A simple walker for performing genotype concordance calculations between two callsets */ -public class GenotypeConcordance extends RodWalker,ConcordanceMetrics> { +public class GenotypeConcordance extends RodWalker>,ConcordanceMetrics> { @Input(fullName="eval",shortName="eval",doc="The variants and genotypes to evaluate",required=true) RodBinding evalBinding; @@ -81,7 +81,6 @@ public class GenotypeConcordance extends RodWalker evalSamples; List compSamples; - // todo -- integration test coverage // todo -- deal with occurrences like: // Eval: 20 4000 A C // Eval: 20 4000 A AC @@ -89,6 +88,7 @@ public class GenotypeConcordance extends RodWalker map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - Pair evalCompPair = null; + public List> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + List> evalCompPair = new ArrayList>(3); if ( tracker != null && ( tracker.getValues(evalBinding,ref.getLocus()).size() > 0 || - tracker.getValues(compBinding,ref.getLocus()).size() > 0 ) ) { + tracker.getValues(compBinding,ref.getLocus()).size() > 0 ) ) { List eval = tracker.getValues(evalBinding,ref.getLocus()); List comp = tracker.getValues(compBinding,ref.getLocus()); if ( eval.size() > 1 || comp.size() > 1 ) { - logger.warn("Eval or Comp Rod at position "+ref.getLocus().toString()+" has multiple records. Site will be skipped."); - return evalCompPair; + logger.info("Eval or Comp Rod at position " + ref.getLocus().toString() + " has multiple records. Resolving."); + evalCompPair = resolveMultipleRecords(eval,comp); + } else { + // if a rod is missing, explicitly create a variant context with 'missing' genotypes. Slow, but correct. + // note that if there is no eval rod there must be a comp rod, and also the reverse + VariantContext evalContext = eval.size() == 1 ? eval.get(0) : createEmptyContext(comp.get(0),evalSamples); + VariantContext compContext = comp.size() == 1 ? comp.get(0) : createEmptyContext(eval.get(0),compSamples); + evalContext = filterGenotypes(evalContext,ignoreFilters); + compContext = filterGenotypes(compContext,ignoreFilters); + evalCompPair.add(new Pair(evalContext,compContext)); } - // if a rod is missing, explicitly create a variant context with 'missing' genotypes. Slow, but correct. - // note that if there is no eval rod there must be a comp rod, and also the reverse - VariantContext evalContext = eval.size() == 1 ? eval.get(0) : createEmptyContext(ref,comp.get(0),evalSamples); - VariantContext compContext = comp.size() == 1 ? comp.get(0) : createEmptyContext(ref,eval.get(0),compSamples); - evalContext = filterGenotypes(evalContext,ignoreFilters); - compContext = filterGenotypes(compContext,ignoreFilters); - evalCompPair = new Pair(evalContext,compContext); } return evalCompPair; } - public ConcordanceMetrics reduce(Pair evalComp, ConcordanceMetrics metrics) { - if ( evalComp != null ) + /** + * The point of this method is to match up pairs of evals and comps by their alternate alleles. Basically multiple records could + * exist for a site such as: + * Eval: 20 4000 A C + * Eval: 20 4000 A AC + * Comp: 20 4000 A C + * So for each eval, loop through the comps. If the eval alleles (non-emptily) intersect the comp alleles, pair them up and remove + * that comp records. Continue until we're out of evals or comps. This is n^2, but should rarely actually happen. + * + * The remaining unpaired records get paird with an empty contexts. So in the example above we'd get a list of: + * 1 - (20,4000,A/C | 20,4000,A/C) + * 2 - (20,4000,A/AC | Empty ) + * @param evalList - list of eval variant contexts + * @param compList - list of comp variant contexts + * @return resolved pairs of the input lists + */ + private List> resolveMultipleRecords(List evalList, List compList) { + List> resolvedPairs = new ArrayList>(evalList.size()+compList.size()); // oversized but w/e + List pairedEval = new ArrayList(evalList.size()); + for ( VariantContext eval : evalList ) { + Set evalAlts = new HashSet(eval.getAlternateAlleles()); + VariantContext pairedComp = null; + for ( VariantContext comp : compList ) { + for ( Allele compAlt : comp.getAlternateAlleles() ) { + if ( evalAlts.contains(compAlt) ) { + // matching alt allele, pair these records + pairedComp = comp; + break; + } + } + } + if ( pairedComp != null ) { + compList.remove(pairedComp); + resolvedPairs.add(new Pair(eval,pairedComp)); + pairedEval.add(eval); + if ( compList.size() < 1 ) + break; + } + } + evalList.removeAll(pairedEval); + for ( VariantContext unpairedEval : evalList ) { + resolvedPairs.add(new Pair(unpairedEval,createEmptyContext(unpairedEval,compSamples))); + } + + for ( VariantContext unpairedComp : compList ) { + resolvedPairs.add(new Pair(createEmptyContext(unpairedComp,evalSamples),unpairedComp)); + } + + return resolvedPairs; + } + + public ConcordanceMetrics reduce(List> evalCompList, ConcordanceMetrics metrics) { + for ( Pair evalComp : evalCompList) metrics.update(evalComp.getFirst(),evalComp.getSecond()); return metrics; } @@ -233,7 +285,7 @@ public class GenotypeConcordance extends RodWalker samples) { + public VariantContext createEmptyContext(VariantContext other, List samples) { VariantContextBuilder builder = new VariantContextBuilder(); // set the alleles to be the same builder.alleles(other.getAlleles()); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordanceIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordanceIntegrationTest.java index d4d8d6f8c..113f098e3 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordanceIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordanceIntegrationTest.java @@ -60,4 +60,15 @@ public class GenotypeConcordanceIntegrationTest extends WalkerTest { executeTest("test non-overlapping samples", spec); } + + @Test + public void testMultipleRecordsPerSite() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("GenotypeConcordance.multipleRecordsTest1.eval.vcf","GenotypeConcordance.multipleRecordsTest1.comp.vcf"), + 0, + Arrays.asList("fdf2cac15775c613f596c27247a76570") + ); + + executeTest("test multiple records per site",spec); + } }