From dede3a30e9171e70de5d1e7ee0e7613e509ce3cd Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 10 Jul 2012 18:00:16 -0700 Subject: [PATCH] Improvements to the validation report of VariantEval -- If eval has genotypes and comp has genotypes, then subset the genotypes of comp down to the samples being evaluated when considering TP, FP, FN, TN status. This is important in the case where you want to use this to assess, for example, the quality of calls on NA12878 but you have a CEU trio comp VCF. The previous version was counting sites polymorphic in mom against the calls in NA12878. -- Added test data VCF and integration tests to ensure this behavior continues in the future -- TODO: actually run integration tests when I have an internet connection --- .../evaluators/ValidationReport.java | 6 ++-- .../VariantEvalIntegrationTest.java | 33 ++++++++++++------- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java index 044a764c3..a2bcdaf1d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java @@ -11,6 +11,7 @@ import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Collection; +import java.util.Set; /** * The Broad Institute @@ -102,9 +103,10 @@ public class ValidationReport extends VariantEvaluator implements StandardEval { nDifferentAlleleSites++; else { SiteStatus evalStatus = calcSiteStatus(eval); - if ( comp.hasGenotypes() && ! 
getWalker().getSampleNamesForEvaluation().isEmpty() && comp.hasGenotypes(getWalker().getSampleNamesForEvaluation()) ) + final Set<String> evalSamples = getWalker().getSampleNamesForEvaluation(); + if ( comp.hasGenotypes() && ! evalSamples.isEmpty() && comp.hasGenotypes(evalSamples) ) // if we have genotypes in both eval and comp, subset comp down just the samples in eval - comp = comp.subContextFromSamples(eval.getSampleNames(), false); + comp = comp.subContextFromSamples(evalSamples, false); SiteStatus compStatus = calcSiteStatus(comp); counts[compStatus.ordinal()][evalStatus.ordinal()]++; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index d623e921a..dbedf49e5 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -27,8 +27,11 @@ package org.broadinstitute.sting.gatk.walkers.varianteval; import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; public class VariantEvalIntegrationTest extends WalkerTest { private static String variantEvalTestDataRoot = validationDataLocation + "VariantEval/"; @@ -617,25 +620,33 @@ public class VariantEvalIntegrationTest extends WalkerTest { // Test validation report is doing the right thing with sites only and genotypes files // where the validation comp has more genotypes than eval // - public void testValidationReport(final String comp, final String md5) { + @Test(dataProvider = "testValidationReportData") + public void testValidationReport(final String name, final String eval, final String comp, final 
String md5) { WalkerTestSpec spec = new WalkerTestSpec( buildCommandLine( "-T VariantEval", "-R " + b37KGReference, - "-eval " + privateTestDir + "/validationReportEval.vcf ", - "-L 20:10,000,000-10,000,010 -noST -noEV -EV ValidationReport -o %s" + "-eval " + eval, + "-comp " + comp, + "-L 20:10,000,000-10,000,010 -noST -noEV -EV ValidationReport -o %s" ), 1, Arrays.asList(md5)); - executeTest("testValidationReport with comp " + comp, spec); - } - - @Test public void testValidationReportSites() { - testValidationReport(privateTestDir + "/validationReportComp.noGenotypes.vcf", "f0dbb848a94b451e42765b0cb9d09ee2"); - } - @Test public void testValidationReportSubsetGenotypes() { - testValidationReport(privateTestDir + "/validationReportComp.vcf", "73790b530595fcbd467a88475ea9717f"); + executeTest("testValidationReport with " + name, spec); } + @DataProvider(name = "testValidationReportData") + public Object[][] testValidationReportData() { + final String compGenotypes = privateTestDir + "/validationReportComp.vcf"; + final String compSites = privateTestDir + "/validationReportComp.noGenotypes.vcf"; + final String evalGenotypes = privateTestDir + "/validationReportEval.vcf"; + final String evalSites = privateTestDir + "/validationReportEval.noGenotypes.vcf"; + List<Object[]> tests = new ArrayList<Object[]>(); + tests.add(new Object[]{"sites/sites", evalSites, compSites, ""}); + tests.add(new Object[]{"sites/genotypes", evalSites, compGenotypes, ""}); + tests.add(new Object[]{"genotypes/sites", evalGenotypes, compSites, ""}); + tests.add(new Object[]{"genotypes/genotypes", evalGenotypes, compGenotypes, ""}); + return tests.toArray(new Object[][]{}); + } }