PPV and Sensitivity added to validation tool output; support for arbitrary -sample arguments to subset variant contexts by sample

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2978 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-03-10 22:28:31 +00:00
parent 40d305bc7e
commit 4f4555c80f
4 changed files with 42 additions and 9 deletions

View File

@ -606,6 +606,14 @@ public class VariantContext {
*/
public boolean hasGenotypes() { return genotypes.size() > 0; }
public boolean hasGenotypes(Collection<String> sampleNames) {
for ( String name : sampleNames ) {
if ( ! genotypes.containsKey(name) )
return false;
}
return true;
}
/**
* @return set of all Genotypes associated with this context
*/

View File

@ -20,10 +20,15 @@ import java.util.Arrays;
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
public class ValidationRate extends VariantEvaluator {
// todo -- subset validation data by list of samples, if provided
// todo -- print out PPV and sensitivity numbers
class SiteStats {
long nPoly = 0, nMono = 0, nNoCall = 0;
double polyPercent() { return rate(nPoly, nPoly + nMono + nNoCall); }
double polyPercent() { return 100 * rate(nPoly, nPoly + nMono + nNoCall); }
}
private SiteStats validationStats = new SiteStats();
@ -47,16 +52,22 @@ public class ValidationRate extends VariantEvaluator {
}
private String summaryLine() {
return String.format("%d %d %.2f %d %d %d %.2f %d %d %d %.2f",
long TP = evalOverlapAtPoly.nPoly + evalOverlapAtMono.nMono;
long FP = evalOverlapAtMono.nPoly + evalOverlapAtPoly.nMono;
long FN = evalOverlapAtPoly.nMono + evalOverlapAtPoly.nNoCall;
return String.format("%d %d %.2f %d %d %d %.2f %d %d %d %.2f %.2f %.2f",
validationStats.nMono, validationStats.nPoly, validationStats.polyPercent(),
evalOverlapAtMono.nMono, evalOverlapAtMono.nPoly, evalOverlapAtMono.nNoCall, evalOverlapAtMono.polyPercent(),
evalOverlapAtPoly.nMono, evalOverlapAtPoly.nPoly, evalOverlapAtPoly.nNoCall, evalOverlapAtPoly.polyPercent());
evalOverlapAtPoly.nMono, evalOverlapAtPoly.nPoly, evalOverlapAtPoly.nNoCall, evalOverlapAtPoly.polyPercent(),
100 * rate(TP, TP + FP), 100 * rate(TP, TP + FN));
}
private static List<String> HEADER =
Arrays.asList("n_mono_in_comp", "n_poly_in_comp", "percent_poly_in_comp",
"n_mono_calls_at_mono_sites", "n_poly_calls_at_mono_sites", "n_nocalls_at_mono_sites", "percent_mono_sites_called_poly",
"n_mono_calls_at_poly_sites", "n_poly_calls_at_poly_sites", "n_nocalls_at_poly_sites", "percent_poly_sites_called_poly");
"n_mono_calls_at_poly_sites", "n_poly_calls_at_poly_sites", "n_nocalls_at_poly_sites", "percent_poly_sites_called_poly",
"PPV", "Sensitivity");
// making it a table
public List<String> getTableHeader() {

View File

@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.MutableVariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
@ -110,6 +111,10 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
@Argument(shortName="known", doc="Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required=false)
protected String[] KNOWN_NAMES = {"dbsnp"};
@Argument(shortName="sample", doc="Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context", required=false)
protected String[] SAMPLES = {};
private List<String> SAMPLES_LIST = null;
//
// Arguments for Mendelian Violation calculations
//
@ -213,6 +218,8 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
// --------------------------------------------------------------------------------------------------------------
public void initialize() {
SAMPLES_LIST = Arrays.asList(SAMPLES);
determineAllEvalations();
List<VariantContextUtils.JexlVCMatchExp> selectExps = VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS);
@ -494,6 +501,13 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
throw new StingException("Found multiple variant contexts at " + context.getLocation());
VariantContext vc = contexts.size() == 1 ? contexts.iterator().next() : null;
if ( vc != null && vc.hasGenotypes(SAMPLES_LIST) ) {
//if ( ! name.equals("eval") ) logger.info(String.format("subsetting VC %s", vc));
vc = vc.subContextFromGenotypes(vc.getGenotypes(SAMPLES_LIST).values());
//if ( ! name.equals("eval") ) logger.info(String.format(" => VC %s", vc));
}
map.put(name, allowExcludes && excludeComp(vc) ? null : vc);
}
}

View File

@ -20,8 +20,8 @@ public class VariantEval2IntegrationTest extends WalkerTest {
@Test
public void testVE2Simple() {
HashMap<String, String> expectations = new HashMap<String, String>();
expectations.put("-L 1:1-10,000,000", "d83605861576db9bc0d50d5c11b67a90");
expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0", "be922212c7cb8f8070158dab86949c4b");
expectations.put("-L 1:1-10,000,000", "32b2e9758078b66e6d50d140acb37947");
expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0", "5ee420ebf7c2d3c2e3827c0114a6706d");
for ( Map.Entry<String, String> entry : expectations.entrySet() ) {
String extraArgs = entry.getKey();
@ -41,10 +41,10 @@ public class VariantEval2IntegrationTest extends WalkerTest {
" -B dbsnp_130,dbSNP," + GATKDataLocation + "dbsnp_130_b36.rod" +
" -B comp_hapmap,VCF," + validationDataLocation + "CEU_hapmap_nogt_23.vcf";
String eqMD5s = "f03913dacc5e9938a0aa00f1e5a031de"; // next two examples should be the same!
String eqMD5s = "ba021a4c963200191710a220a5577753"; // next two examples should be the same!
expectations.put("", eqMD5s);
expectations.put(" -known comp_hapmap -known dbsnp", eqMD5s);
expectations.put(" -known comp_hapmap", "0f886e7042c3999c3d87f848e0b58eb8");
expectations.put(" -known comp_hapmap", "5ce16165f4242d77b4e82c704273c11d");
for ( Map.Entry<String, String> entry : expectations.entrySet() ) {
String extraArgs2 = entry.getKey();
@ -62,7 +62,7 @@ public class VariantEval2IntegrationTest extends WalkerTest {
String extraArgs = "-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 30";
WalkerTestSpec spec = new WalkerTestSpec( root + " " + extraArgs + " -o %s -outputVCF %s",
2,
Arrays.asList("8162dcf2539e8241fb27cba2055631bf", "a3ce1d70d8ae3874807e9d61994d42af"));
Arrays.asList("0b29285da3ca778b9c8b7f62e99aa72d", "d41d8cd98f00b204e9800998ecf8427e"));
executeTest("testVE2WriteVCF", spec);
}
}