Added histogram of variant quality scores broken out by true positive and false positive calls to the GenotypeConcordance module of VariantEval2

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3123 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-04-06 13:48:31 +00:00
parent 12e4f88ca7
commit 2d002c56c3
1 changed files with 142 additions and 20 deletions

View File

@ -23,7 +23,7 @@ import java.util.*;
*/
@Analysis(name = "Genotype Concordance", description = "Determine the genotype concordance between the genotypes in difference tracks")
public class GenotypeConcordance extends VariantEvaluator {
protected static Logger logger = Logger.getLogger(GenotypeConcordance.class);
protected final static Logger logger = Logger.getLogger(GenotypeConcordance.class);
// a mapping from allele count to stats
@DataPoint(description = "the frequency statistics for each allele")
@ -33,6 +33,9 @@ public class GenotypeConcordance extends VariantEvaluator {
@DataPoint(name="samples", description = "the concordance statistics for each sample")
SampleStats sampleStats = null;
@DataPoint(description = "the variant quality score histograms for true positive and false positive calls")
QualityScoreHistograms qualityScoreHistograms = null;
private static final int MAX_MISSED_VALIDATION_DATA = 10000;
@ -63,6 +66,93 @@ public class GenotypeConcordance extends VariantEvaluator {
}
}
class QualityScoreHistograms implements TableType {
final int NUM_BINS = 20;
final ArrayList<Double> truePositiveQualities = new ArrayList<Double>(); // An ArrayList holds all the qualities until we are able to bin them appropriately
final ArrayList<Double> falsePositiveQualities = new ArrayList<Double>();
final int truePositiveHist[] = new int[NUM_BINS]; // the final histograms that get reported out
final int falsePositiveHist[] = new int[NUM_BINS];
final String[] rowKeys = new String[]{"true_positive_hist", "false_positive_hist"};
public Object[] getRowKeys() {
return rowKeys;
}
public Object[] getColumnKeys() {
final String columnKeys[] = new String[NUM_BINS];
for( int iii = 0; iii < NUM_BINS; iii++ ) {
columnKeys[iii] = "histBin" + iii;
}
return columnKeys;
}
public String getName() {
return "QualityScoreHistogram";
}
public String getCell(int x, int y) {
if( x == 0 ) {
return String.valueOf(truePositiveHist[y]);
} else if ( x == 1 ) {
return String.valueOf(falsePositiveHist[y]);
} else {
throw new StingException( "Unknown row in " + getName() + ", row = " + x );
}
}
public String toString() {
String returnString = "";
// output both histogram arrays
returnString += "TP: ";
for( int iii = 0; iii < NUM_BINS; iii++ ) {
returnString += truePositiveHist[iii] + " ";
}
returnString += "\nFP: ";
for( int iii = 0; iii < NUM_BINS; iii++ ) {
returnString += falsePositiveHist[iii] + " ";
}
return returnString;
}
public void incrValue( final double qual, final boolean isTruePositiveCall ) {
if( isTruePositiveCall ) {
truePositiveQualities.add( qual );
} else {
falsePositiveQualities.add( qual );
}
}
public void organizeHistogramTables() {
for( int iii = 0; iii < NUM_BINS; iii++ ) {
truePositiveHist[iii] = 0;
falsePositiveHist[iii] = 0;
}
double maxQual = 0.0;
// Calculate the maximum quality score for both TP and FP calls in order to normalize and histogram
for( final Double qual : truePositiveQualities ) {
if( qual > maxQual ) {
maxQual = qual;
}
}
for( final Double qual : falsePositiveQualities ) {
if( qual > maxQual ) {
maxQual = qual;
}
}
final double binSize = maxQual / ((double) (NUM_BINS-1));
for( final Double qual : truePositiveQualities ) {
truePositiveHist[ (int)Math.floor( qual / binSize ) ]++;
}
for( final Double qual : falsePositiveQualities ) {
falsePositiveHist[ (int)Math.floor( qual / binSize ) ]++;
}
}
}
// keep a list of the validation data we saw before the first eval data
private HashSet<VariantContext> missedValidationData = new HashSet<VariantContext>();
@ -115,18 +205,24 @@ public class GenotypeConcordance extends VariantEvaluator {
private boolean warnedAboutValidationData = false;
public String update2(VariantContext eval, VariantContext validation, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
String interesting = null;
final String interesting = null;
// sanity check that we at least have either eval or validation data
if (eval == null && !isValidVC(validation))
if (eval == null && !isValidVC(validation)) {
return interesting;
}
if( qualityScoreHistograms == null ) {
qualityScoreHistograms = new QualityScoreHistograms();
}
if (sampleStats == null) {
if (eval != null) {
// initialize the concordance table
sampleStats = new SampleStats(eval,Genotype.Type.values().length);
for (VariantContext vc : missedValidationData)
for (final VariantContext vc : missedValidationData) {
determineStats(null, vc);
}
missedValidationData = null;
} else {
// todo -- Eric, this results in a memory problem when eval is WEx data but you are using CG calls genome-wide
@ -149,51 +245,77 @@ public class GenotypeConcordance extends VariantEvaluator {
return interesting; // we don't capture any interesting sites
}
private void determineStats(VariantContext eval, VariantContext validation) {
private void determineStats(final VariantContext eval, final VariantContext validation) {
final boolean validationIsValidVC = isValidVC(validation);
// determine concordance for eval data
if (eval != null) {
for (String sample : eval.getSampleNames()) {
Genotype.Type called = eval.getGenotype(sample).getType();
Genotype.Type truth;
for (final String sample : eval.getSampleNames()) {
final Genotype.Type called = eval.getGenotype(sample).getType();
final Genotype.Type truth;
if (!isValidVC(validation) || !validation.hasGenotype(sample))
if (!validationIsValidVC || !validation.hasGenotype(sample)) {
truth = Genotype.Type.NO_CALL;
else
} else {
truth = validation.getGenotype(sample).getType();
}
sampleStats.incrValue(sample, truth, called);
}
}
// otherwise, mark no-calls for all samples
else {
Genotype.Type called = Genotype.Type.NO_CALL;
final Genotype.Type called = Genotype.Type.NO_CALL;
for (String sample : validation.getSampleNames()) {
Genotype.Type truth = validation.getGenotype(sample).getType();
for (final String sample : validation.getSampleNames()) {
final Genotype.Type truth = validation.getGenotype(sample).getType();
sampleStats.incrValue(sample, truth, called);
}
}
// determine allele count concordance ()
if (isValidVC(validation) && validation.isPolymorphic()) {
if (validationIsValidVC && validation.isPolymorphic()) {
int trueAlleleCount = 0;
for (Allele a : validation.getAlternateAlleles())
for (final Allele a : validation.getAlternateAlleles()) {
trueAlleleCount += validation.getChromosomeCount(a);
}
if (!alleleCountStats.containsKey(trueAlleleCount))
if (!alleleCountStats.containsKey(trueAlleleCount)) {
alleleCountStats.put(trueAlleleCount, new FrequencyStats());
FrequencyStats stats = alleleCountStats.get(trueAlleleCount);
if (eval != null)
}
final FrequencyStats stats = alleleCountStats.get(trueAlleleCount);
if (eval != null) {
stats.nFound++;
else
} else {
stats.nMissed++;
}
}
// TP & FP quality score histograms
if( eval != null && eval.isPolymorphic() && validationIsValidVC ) {
if( eval.getSampleNames().size() == 1 ) { // single sample calls
for( final String sample : eval.getSampleNames() ) { // only one sample
if( validation.hasGenotype(sample) ) {
final Genotype truth = validation.getGenotype(sample);
qualityScoreHistograms.incrValue( eval.getPhredScaledQual(), !truth.isHomRef() );
}
}
} else { // multi sample calls
qualityScoreHistograms.incrValue( eval.getPhredScaledQual(), validation.isPolymorphic() );
}
}
}
private static boolean isValidVC(VariantContext vc) {
private static boolean isValidVC(final VariantContext vc) {
return (vc != null && !vc.isFiltered());
}
public void finalizeEvaluation() {
qualityScoreHistograms.organizeHistogramTables();
}
}
/**