A proper validation report, calculating TP, FP, FN, sensitivity, FDR, PPV. Treats comp as a set of sites that have been either filtered (failed in assay), validated (polymorphic among samples), or invalidated (AC=0 or all genotypes = hom-ref). Very useful.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5384 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-03-06 19:27:40 +00:00
parent af71576a07
commit 2f1e249aed
1 changed files with 165 additions and 0 deletions

View File

@ -0,0 +1,165 @@
package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFConstants;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
import org.broadinstitute.sting.gatk.walkers.varianteval.tags.Analysis;
import org.broadinstitute.sting.gatk.walkers.varianteval.tags.DataPoint;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.*;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
@Analysis(description = "Assess site accuracy and sensitivity of callset against follow-up validation assay")
public class ValidationReport extends VariantEvaluator implements StandardEval {
// todo -- note this isn't strictly allele away. It's really focused on sites. A/T call at a validated A/G site is currently counted as a TP
@DataPoint(description = "nComp") int nComp = 0;
@DataPoint(description = "TP") int TP = 0;
@DataPoint(description = "FP") int FP = 0;
@DataPoint(description = "FN") int FN = 0;
@DataPoint(description = "TN") int TN = 0;
@DataPoint(description = "Sensitivity") double sensitivity = 0;
@DataPoint(description = "PPV") double PPV = 0;
@DataPoint(description = "FDR") double FDR = 0;
@DataPoint(description = "CompMonoEvalNoCall") int CompMonoEvalNoCall = 0;
@DataPoint(description = "CompMonoEvalFiltered") int CompMonoEvalFiltered = 0;
@DataPoint(description = "CompMonoEvalMono") int CompMonoEvalMono = 0;
@DataPoint(description = "CompMonoEvalPoly") int CompMonoEvalPoly = 0;
@DataPoint(description = "CompPolyEvalNoCall") int CompPolyEvalNoCall = 0;
@DataPoint(description = "CompPolyEvalFiltered") int CompPolyEvalFiltered = 0;
@DataPoint(description = "CompPolyEvalMono") int CompPolyEvalMono = 0;
@DataPoint(description = "CompPolyEvalPoly") int CompPolyEvalPoly = 0;
@DataPoint(description = "CompFiltered") int CompFiltered = 0;
@DataPoint(description = "Eval and comp have different alleles") int nDifferentAlleleSites = 0;
private static final boolean TREAT_ALL_SITES_IN_EVAL_VCF_AS_CALLED = true;
private static final boolean REQUIRE_IDENTICAL_ALLELES = false;
private enum SiteStatus { NO_CALL, FILTERED, MONO, POLY }
// Counts of ValidationSiteStatus x CallSiteStatus
final int[][] counts = new int[SiteStatus.values().length][SiteStatus.values().length];
@Override public int getComparisonOrder() { return 2; }
@Override public boolean enabled() { return true; }
@Override
public void finalizeEvaluation() {
for ( SiteStatus x : SiteStatus.values() )
CompFiltered += getCounts(SiteStatus.FILTERED, x);
CompMonoEvalNoCall = getCounts(SiteStatus.MONO, SiteStatus.NO_CALL);
CompMonoEvalFiltered = getCounts(SiteStatus.MONO, SiteStatus.FILTERED);
CompMonoEvalMono = getCounts(SiteStatus.MONO, SiteStatus.MONO);
CompMonoEvalPoly = getCounts(SiteStatus.MONO, SiteStatus.POLY);
CompPolyEvalNoCall = getCounts(SiteStatus.POLY, SiteStatus.NO_CALL);
CompPolyEvalFiltered = getCounts(SiteStatus.POLY, SiteStatus.FILTERED);
CompPolyEvalMono = getCounts(SiteStatus.POLY, SiteStatus.MONO);
CompPolyEvalPoly = getCounts(SiteStatus.POLY, SiteStatus.POLY);
TP = CompPolyEvalPoly;
FN = CompPolyEvalNoCall + CompPolyEvalFiltered + CompPolyEvalMono;
FP = CompMonoEvalPoly;
TN = CompMonoEvalNoCall + CompMonoEvalFiltered + CompMonoEvalMono;
for ( SiteStatus x : SiteStatus.values() )
for ( SiteStatus y : SiteStatus.values() )
nComp += getCounts(x, y);
if ( nComp != TP + FN + FP + TN )
throw new ReviewedStingException("BUG: nComp != TP + FN + FP + TN!");
sensitivity = (100.0 * TP) / (TP + FN);
PPV = (100.0 * TP) / (TP + FP);
FDR = (100.0 * FP) / (FP + TP);
}
private int getCounts(SiteStatus comp, SiteStatus eval) {
return counts[comp.ordinal()][eval.ordinal()];
}
@Override
public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( comp != null ) { // we only need to consider sites in comp
if ( REQUIRE_IDENTICAL_ALLELES && (eval != null && haveDifferentAltAlleles(eval, comp)))
nDifferentAlleleSites++;
else {
SiteStatus evalStatus = calcSiteStatus(eval);
SiteStatus compStatus = calcSiteStatus(comp);
counts[compStatus.ordinal()][evalStatus.ordinal()]++;
}
}
return null; // we don't capture any interesting sites
}
//
// helper routines
//
public SiteStatus calcSiteStatus(VariantContext vc) {
if ( vc == null ) return SiteStatus.NO_CALL;
if ( vc.isFiltered() ) return SiteStatus.FILTERED;
if ( ! vc.isVariant() ) return SiteStatus.MONO;
if ( vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) {
int ac = 0;
if ( vc.getNAlleles() > 2 ) {
return SiteStatus.POLY;
//// System.out.printf("multiple alleles %s = %s%n", vc.getAlleles(), vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY));
// // todo -- omg this is painful. We need a better approach to dealing with multi-valued attributes
// for ( String v : (List<String>)vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY) )
// ac += Integer.valueOf(v);
//// System.out.printf(" ac = %d%n", ac);
}
else
ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY);
return ac > 0 ? SiteStatus.POLY : SiteStatus.MONO;
} else if ( vc.hasGenotypes() ) {
return vc.isPolymorphic() ? SiteStatus.POLY : SiteStatus.MONO;
} else {
return TREAT_ALL_SITES_IN_EVAL_VCF_AS_CALLED ? SiteStatus.POLY : SiteStatus.NO_CALL; // we can't figure out what to do
//return SiteStatus.NO_CALL; // we can't figure out what to do
}
}
public boolean haveDifferentAltAlleles(VariantContext eval, VariantContext comp) {
Set<Allele> evalAlts = eval.getAlternateAlleles();
Set<Allele> compAlts = comp.getAlternateAlleles();
if ( evalAlts.size() != compAlts.size() ) {
return true;
} else {
// same size => every alt from eval must be in comp
for ( Allele a : evalAlts ) {
if ( ! compAlts.contains(a) ) {
// System.out.printf("Different alleles: %s:%d eval=%s comp=%s\n\t\teval=%s\n\t\tcomp=%s%n",
// eval.getChr(), eval.getStart(), eval.getAlleles(), comp.getAlleles(), eval, comp);
return true;
}
}
return false;
}
}
}