Fixed an issue where a no-call in the eval track would prevent a site from a comparison track from being loaded. Added a new test to cover the use case.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5169 348d0f76-0448-11de-a6fe-93d51630548a
2011-02-02 01:47:53 +00:00 · 2011-02-02 01:47:53 +00:00 · fd8dd8fb9b
parent aab0ec209b
commit fd8dd8fb9b
5 changed files with 87 additions and 75 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
@ -100,7 +100,9 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
    private Set<String> compNames = new TreeSet<String>();
    private Set<String> knownNames = new TreeSet<String>();
    private Set<String> evalNames = new TreeSet<String>();
-    private Set<String> sampleNames = new TreeSet<String>();
+
+    private Set<String> sampleNamesForEvaluation = new TreeSet<String>();
+    private Set<String> sampleNamesForStratification = new TreeSet<String>();
    private int numSamples = 0;

    // The list of stratifiers and evaluators to use
@ -186,7 +188,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr

                try {
                    VariantStratifier vs = c.newInstance();
-                    vs.initialize(jexlExpressions, compNames, knownNames, evalNames, sampleNames);
+                    vs.initialize(jexlExpressions, compNames, knownNames, evalNames, sampleNamesForStratification);

                    strats.add(vs);
                } catch (InstantiationException e) {
@ -372,15 +374,14 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
        Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evalNames);
        Set<String> vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);

-        // If we're not using the per-sample stratification, don't bother loading the sample list
-        if (Arrays.asList(STRATIFICATIONS_TO_USE).contains("Sample")) {
-            sampleNames.addAll(SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS));
-            numSamples = sampleNames.size();
-        } else {
-            numSamples = vcfSamples.size();
-        }
+        // Load the sample list
+        sampleNamesForEvaluation.addAll(SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS));
+        numSamples = sampleNamesForEvaluation.size();

-        sampleNames.add(ALL_SAMPLE_NAME);
+        if (Arrays.asList(STRATIFICATIONS_TO_USE).contains("Sample")) {
+            sampleNamesForStratification.addAll(sampleNamesForEvaluation);
+        }
+        sampleNamesForStratification.add(ALL_SAMPLE_NAME);

        // Initialize select expressions
        jexlExpressions.addAll(VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS));
@ -411,21 +412,30 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
     *
     * @param tracker    the reference metadata tracker
     * @param ref        the reference context
+     * @param compNames  the comp track names
     * @param evalNames  the evaluation track names
     * @return  the set of allowable variation types
     */
-    private EnumSet<VariantContext.Type> getAllowableVariationTypes(RefMetaDataTracker tracker, ReferenceContext ref, Set<String> evalNames) {
+    private EnumSet<VariantContext.Type> getAllowableVariationTypes(RefMetaDataTracker tracker, ReferenceContext ref, Set<String> compNames, Set<String> evalNames) {
        EnumSet<VariantContext.Type> allowableTypes = EnumSet.of(VariantContext.Type.NO_VARIATION);

        if (tracker != null) {
-            Collection<VariantContext> vcs = tracker.getVariantContexts(ref, evalNames, null, ref.getLocus(), true, false);
+            Collection<VariantContext> evalvcs = tracker.getVariantContexts(ref, evalNames, null, ref.getLocus(), true, false);

-            for ( VariantContext vc : vcs ) {
+            for ( VariantContext vc : evalvcs ) {
                allowableTypes.add(vc.getType());
            }
-        } else {
-            allowableTypes.add(VariantContext.Type.SNP);
+
+            if (allowableTypes.size() == 1) {
+                // We didn't find any variation in the eval track, so now let's look at the comp track for allowable types
+                Collection<VariantContext> compvcs = tracker.getVariantContexts(ref, compNames, null, ref.getLocus(), true, false);
+
+                for ( VariantContext vc : compvcs ) {
+                    allowableTypes.add(vc.getType());
+                }
+            }
        }
+
        return allowableTypes;
    }

@ -478,42 +488,41 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
     * @param tracker         the metadata tracker
     * @param ref             the reference context
     * @param trackNames      the list of track names to process
-     * @param sampleNames     the list of samples to include
     * @param allowableTypes  a set of allowable variation types
     * @param byFilter        if false, only accept PASSing VariantContexts.  Otherwise, accept both PASSing and filtered sites
+     * @param trackPerSample  if false, don't stratify per sample (and don't cut up the VariantContext like we would need to do this)
+     * @param allowNoCalls    if false, don't accept no-call loci from a variant track
     * @return  a mapping of track names to a list of VariantContext objects
     */
-    private HashMap<String, HashMap<String, VariantContext>> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set<String> trackNames, Set<String> sampleNames, EnumSet<VariantContext.Type> allowableTypes, boolean byFilter) {
+    private HashMap<String, HashMap<String, VariantContext>> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set<String> trackNames, EnumSet<VariantContext.Type> allowableTypes, boolean byFilter, boolean trackPerSample, boolean allowNoCalls) {
        HashMap<String, HashMap<String, VariantContext>> bindings = new HashMap<String, HashMap<String, VariantContext>>();

        for ( String trackName : trackNames ) {
-            Collection<VariantContext> contexts = tracker == null ? null : tracker.getVariantContexts(ref, trackName, allowableTypes, ref.getLocus(), true, true);
-
-            VariantContext vc = contexts != null && contexts.size() == 1 ? contexts.iterator().next() : null;
-
            HashMap<String, VariantContext> vcs = new HashMap<String, VariantContext>();

+            Collection<VariantContext> contexts = tracker == null ? null : tracker.getVariantContexts(ref, trackName, allowableTypes, ref.getLocus(), true, true);
+            VariantContext vc = contexts != null && contexts.size() == 1 ? contexts.iterator().next() : null;
+
+            // First, filter the VariantContext to represent only the samples for evaluation
            if ( vc != null ) {
-                ArrayList<String> sampleNamesMinusAll = new ArrayList<String>();
+                VariantContext vcsub = vc;

-                for ( String sampleName : sampleNames ) {
-                    VariantContext vcsub = vc;
-
-                    if (!sampleName.equals(ALL_SAMPLE_NAME)) {
-                        vcsub = getSubsetOfVariantContext(vc, sampleName);
-                        sampleNamesMinusAll.add(sampleName);
-                    }
-
-                    if (byFilter || !vcsub.isFiltered()) {
-                        vcs.put(sampleName, vcsub);
-                    }
+                if (vc.hasGenotypes(sampleNamesForEvaluation)) {
+                    vcsub = getSubsetOfVariantContext(vc, sampleNamesForEvaluation);
                }

-                if ( trackName.contains("eval") ) {
-                    VariantContext vcsub = (sampleNamesMinusAll.size() > 0) ? getSubsetOfVariantContext(vc, sampleNamesMinusAll) : vc;
+                if ((byFilter || !vcsub.isFiltered()) && (allowNoCalls || vcsub.getType() != VariantContext.Type.NO_VARIATION)) {
+                    vcs.put(ALL_SAMPLE_NAME, vcsub);
+                }

-                    if (byFilter || !vcsub.isFiltered()) {
-                        vcs.put(ALL_SAMPLE_NAME, vcsub);
+                // Now, if stratifying, split the subsetted vc per sample and add each as a new context
+                if ( trackPerSample ) {
+                    for ( String sampleName : sampleNamesForEvaluation ) {
+                        VariantContext samplevc = getSubsetOfVariantContext(vc, sampleName);
+    
+                        if ((byFilter || !samplevc.isFiltered()) && (allowNoCalls || samplevc.getType() != VariantContext.Type.NO_VARIATION)) {
+                            vcs.put(sampleName, samplevc);
+                        }
                    }
                }

@ -532,35 +541,25 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
     * @param ref          the reference context
     * @param compNames    the list of comp names to process
     * @param evalNames    the list of eval names to process
-     * @param sampleNames  the list of samples to include
     * @return  a mapping of track names to a list of VariantContext objects
     */
-    private HashMap<String, HashMap<String, VariantContext>> getVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set<String> compNames, Set<String> evalNames, Set<String> sampleNames) {
+    private HashMap<String, HashMap<String, VariantContext>> getVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set<String> compNames, Set<String> evalNames) {
        HashMap<String, HashMap<String, VariantContext>> vcs = new HashMap<String, HashMap<String, VariantContext>>();

-        Set<String> allSamplesList = new HashSet<String>();
-        allSamplesList.add(ALL_SAMPLE_NAME);
+        EnumSet<VariantContext.Type> allowableTypes = getAllowableVariationTypes(tracker, ref, compNames, evalNames);

-        EnumSet<VariantContext.Type> allowableTypes = getAllowableVariationTypes(tracker, ref, evalNames);
-
-        boolean perSampleIsEnabled = false;
        boolean byFilter = false;
+        boolean perSampleIsEnabled = false;
        for (VariantStratifier vs : stratificationObjects) {
-            if (vs.getClass().getSimpleName().equals("Sample")) {
-                perSampleIsEnabled = true;
-            } else if (vs.getClass().getSimpleName().equals("Filter")) {
+            if (vs.getClass().getSimpleName().equals("Filter")) {
                byFilter = true;
+            } else if (vs.getClass().getSimpleName().equals("Sample")) {
+                perSampleIsEnabled = true;
            }
        }

-        HashMap<String, HashMap<String, VariantContext>> evalBindings;
-        if (perSampleIsEnabled) {
-            evalBindings = bindVariantContexts(tracker, ref, evalNames, sampleNames, allowableTypes, byFilter);
-        } else {
-            evalBindings = bindVariantContexts(tracker, ref, evalNames, allSamplesList, allowableTypes, byFilter);
-        }
-
-        HashMap<String, HashMap<String, VariantContext>> compBindings = bindVariantContexts(tracker, ref, compNames, allSamplesList, allowableTypes, byFilter);
+        HashMap<String, HashMap<String, VariantContext>> evalBindings = bindVariantContexts(tracker, ref, evalNames, allowableTypes, byFilter, perSampleIsEnabled, true);
+        HashMap<String, HashMap<String, VariantContext>> compBindings = bindVariantContexts(tracker, ref, compNames, allowableTypes, byFilter, false, false);

        vcs.putAll(compBindings);
        vcs.putAll(evalBindings);
@ -657,13 +656,13 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
        }

        //      track           sample  vc
-        HashMap<String, HashMap<String, VariantContext>> vcs = getVariantContexts(tracker, ref, compNames, evalNames, sampleNames);
+        HashMap<String, HashMap<String, VariantContext>> vcs = getVariantContexts(tracker, ref, compNames, evalNames);

        for ( String compName : compNames ) {
            VariantContext comp = vcs.containsKey(compName) && vcs.get(compName) != null && vcs.get(compName).containsKey(ALL_SAMPLE_NAME) ? vcs.get(compName).get(ALL_SAMPLE_NAME) : null;

            for ( String evalName : evalNames ) {
-                for ( String sampleName : sampleNames ) {
+                for ( String sampleName : sampleNamesForStratification ) {
                    VariantContext eval = vcs.containsKey(evalName) && vcs.get(evalName) != null ? vcs.get(evalName).get(sampleName) : null;

                    HashMap<VariantStratifier, ArrayList<String>> stateMap = new HashMap<VariantStratifier, ArrayList<String>>();
--- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java
@ -76,10 +76,13 @@ public class CompOverlap extends VariantEvaluator implements StandardEval {
    }

    public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
-        boolean expectingIndels = false;
+        //boolean expectingIndels = false;

-        boolean compIsGood = expectingIndels ? comp != null && comp.isNotFiltered() && comp.isIndel() : comp != null && comp.isNotFiltered() && comp.isSNP() ;
-        boolean evalIsGood = expectingIndels ? eval != null && eval.isIndel() : eval != null && eval.isSNP() ;
+        //boolean compIsGood = expectingIndels ? comp != null && comp.isNotFiltered() && comp.isIndel() : comp != null && comp.isNotFiltered() && comp.isSNP() ;
+        //boolean evalIsGood = expectingIndels ? eval != null && eval.isIndel() : eval != null && eval.isSNP() ;
+
+        boolean compIsGood = comp != null && comp.isNotFiltered() && comp.isSNP() ;
+        boolean evalIsGood = eval != null && eval.isSNP() ;

        if (compIsGood) nCompSNPs++;           // count the number of comp events
        if (evalIsGood) nEvalSNPs++;           // count the number of eval events
@ -87,8 +90,9 @@ public class CompOverlap extends VariantEvaluator implements StandardEval {
        if (compIsGood && evalIsGood) {
            nSNPsAtComp++;

-            if (!discordantP(eval, comp))     // count whether we're concordant or not with the comp value
+            if (!discordantP(eval, comp)) {     // count whether we're concordant or not with the comp value
                nConcordant++;
+            }
        }

        return null; // we don't capture any interesting sites
--- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Sample.java
@ -15,7 +15,6 @@ public class Sample extends VariantStratifier {
    @Override
    public void initialize(Set<VariantContextUtils.JexlVCMatchExp> jexlExpressions, Set<String> compNames, Set<String> knownNames, Set<String> evalNames, Set<String> sampleNames) {
        samples = new ArrayList<String>();
-        samples.add(VariantEvalWalker.ALL_SAMPLE_NAME);
        samples.addAll(sampleNames);
    }

--- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/NewEvaluationContext.java
@ -52,7 +52,7 @@ public class NewEvaluationContext extends HashMap<VariantStratifier, String> {

    public void apply(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantContext comp, VariantContext eval) {
        for ( VariantEvaluator evaluation : evaluationInstances.values() ) {
-            //synchronized ( evaluation ) {
+            //synchronized ( this ) {
                // we always call update0 in case the evaluation tracks things like number of bases covered
                //evaluation.update0(tracker, ref, context);

@ -69,9 +69,9 @@ public class NewEvaluationContext extends HashMap<VariantStratifier, String> {

                        break;
                    case 2:
-                        if (eval != null) {
+                        //if (eval != null) {
                            evaluation.update2(eval, comp, tracker, ref, context);
-                        }
+                        //}

                        break;
                    default:
@ -82,8 +82,10 @@ public class NewEvaluationContext extends HashMap<VariantStratifier, String> {
    }

    public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
-        for ( VariantEvaluator evaluation : evaluationInstances.values() ) {
-            evaluation.update0(tracker, ref, context);
-        }
+        //synchronized (this) {
+            for ( VariantEvaluator evaluation : evaluationInstances.values() ) {
+                evaluation.update0(tracker, ref, context);
+            }
+        //}
    }
 }
--- a/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
+++ b/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
@ -29,7 +29,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
        String extraArgs = "-L 1:1-10,000,000";
        for (String tests : testsEnumerations) {
            WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -o %s",
-                    1, Arrays.asList("8ddc1c4a86cb3f4c22346497785b23e3"));
+                    1, Arrays.asList("0087b2a096aa99135b065aa9a0fff34c"));
            //executeTestParallel("testSelect1", spec);
            executeTest("testSelect1", spec);
        }
@ -50,7 +50,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
        for (String vcfFile : vcfFiles) {
            WalkerTestSpec spec = new WalkerTestSpec(cmdRoot + " -B:eval,VCF " + validationDataLocation + vcfFile + " -B:comp,VCF " + validationDataLocation + "GenotypeConcordanceComp.vcf -noEV -EV GenotypeConcordance -o %s",
                    1,
-                    Arrays.asList("7a6754176b573d14b6be7c808a04929d"));
+                    Arrays.asList("bb16335f9510bcab2bd14a4299afd879"));
            //executeTestParallel("testVEGenotypeConcordance" + vcfFile, spec);
            executeTest("testVEGenotypeConcordance" + vcfFile, spec);
        }
@ -60,8 +60,8 @@ public class VariantEvalIntegrationTest extends WalkerTest {
    @Test
    public void testVESimple() {
        HashMap<String, String> expectations = new HashMap<String, String>();
-        expectations.put("-L 1:1-10,000,000", "ffd1abed44faf1590d9026e478b2f8ee");
-        expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -mvq 0 -EV MendelianViolationEvaluator", "32c2411fbf58bae5750c8229d15b98eb");
+        expectations.put("-L 1:1-10,000,000", "e46e8e7457b338c4cfec62ee7aa51ffe");
+        expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -mvq 0 -EV MendelianViolationEvaluator", "a0554ca0baa097a1761da3f7e8487833");

        for ( Map.Entry<String, String> entry : expectations.entrySet() ) {
            String extraArgs = entry.getKey();
@ -84,10 +84,10 @@ public class VariantEvalIntegrationTest extends WalkerTest {
                " -B:comp_hapmap,VCF " + validationDataLocation + "CEU_hapmap_nogt_23.vcf";


-        String matchingMD5 = "6c2fa6573cc57ef8795e9cce2b654d0b";
+        String matchingMD5 = "5050011ad00b859faf2be679830bec90";
        expectations.put("", matchingMD5);
        expectations.put(" -knownName comp_hapmap -knownName dbsnp", matchingMD5);
-        expectations.put(" -knownName comp_hapmap", "6c2fa6573cc57ef8795e9cce2b654d0b");
+        expectations.put(" -knownName comp_hapmap", "5050011ad00b859faf2be679830bec90");
        for (String tests : testsEnumerations) {
            for (Map.Entry<String, String> entry : expectations.entrySet()) {
                String extraArgs2 = entry.getKey();
@ -133,7 +133,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
    @Test
    public void testCompVsEvalAC() {
        String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -EV GenotypeConcordance -B:evalYRI,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/yri.trio.gatk.ug.very.few.lines.vcf -B:compYRI,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/yri.trio.gatk.fake.genotypes.ac.test.vcf";
-        WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("929e4ec46fb6957c29803531322bb35e"));
+        WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("113228ffa35e0f67b8e067860c04171f"));
        //executeTestParallel("testACDiscordanceAtAC1EvalAC2Comp",spec);
        executeTest("testCompVsEvalAC",spec);
    }
@ -150,6 +150,14 @@ public class VariantEvalIntegrationTest extends WalkerTest {
        executeTest("testTranches",spec);
    }

+    @Test
+    public void testCompOverlap() {
+        String extraArgs = "-T VariantEval -R " + b37KGReference + " -L " + validationDataLocation + "VariantEval/pacbio.hg19.intervals -B:comphapmap,vcf " + comparisonDataLocation + "Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf -B:eval,vcf " + validationDataLocation + "VariantEval/pacbio.ts.recalibrated.vcf -noEV -EV CompOverlap -sn NA12878 -noST -ST Novelty -o %s";
+        WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("81377be26bf8fa32339d01c173428f7d"));
+        //executeTestParallel("testTranches",spec);
+        executeTest("testCompOverlap",spec);
+    }
+
 //    @Test
 //    public void testVEValidatePass() {
 //        String extraArgs = "-L 1:1-10,000,000";