diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/GenotypeConcordance.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/GenotypeConcordance.java index 8b9ee324a..13e859045 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/GenotypeConcordance.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/GenotypeConcordance.java @@ -23,7 +23,7 @@ import java.util.*; */ @Analysis(name = "Genotype Concordance", description = "Determine the genotype concordance between the genotypes in difference tracks") public class GenotypeConcordance extends VariantEvaluator { - protected static Logger logger = Logger.getLogger(GenotypeConcordance.class); + protected final static Logger logger = Logger.getLogger(GenotypeConcordance.class); // a mapping from allele count to stats @DataPoint(description = "the frequency statistics for each allele") @@ -33,6 +33,9 @@ public class GenotypeConcordance extends VariantEvaluator { @DataPoint(name="samples", description = "the concordance statistics for each sample") SampleStats sampleStats = null; + @DataPoint(description = "the variant quality score histograms for true positive and false positive calls") + QualityScoreHistograms qualityScoreHistograms = null; + private static final int MAX_MISSED_VALIDATION_DATA = 10000; @@ -63,6 +66,93 @@ public class GenotypeConcordance extends VariantEvaluator { } } + class QualityScoreHistograms implements TableType { + final int NUM_BINS = 20; + final ArrayList truePositiveQualities = new ArrayList(); // An ArrayList holds all the qualities until we are able to bin them appropriately + final ArrayList falsePositiveQualities = new ArrayList(); + final int truePositiveHist[] = new int[NUM_BINS]; // the final histograms that get reported out + final int falsePositiveHist[] = new int[NUM_BINS]; + final String[] rowKeys = new String[]{"true_positive_hist", "false_positive_hist"}; + + public Object[] getRowKeys() { + return rowKeys; + } + + public Object[] getColumnKeys() { + final String columnKeys[] = new String[NUM_BINS]; + for( int iii = 0; iii < NUM_BINS; iii++ ) { + columnKeys[iii] = "histBin" + iii; + } + return columnKeys; + } + + public String getName() { + return "QualityScoreHistogram"; + } + + public String getCell(int x, int y) { + if( x == 0 ) { + return String.valueOf(truePositiveHist[y]); + } else if ( x == 1 ) { + return String.valueOf(falsePositiveHist[y]); + } else { + throw new StingException( "Unknown row in " + getName() + ", row = " + x ); + } + } + + public String toString() { + String returnString = ""; + // output both histogram arrays + returnString += "TP: "; + for( int iii = 0; iii < NUM_BINS; iii++ ) { + returnString += truePositiveHist[iii] + " "; + } + returnString += "\nFP: "; + for( int iii = 0; iii < NUM_BINS; iii++ ) { + returnString += falsePositiveHist[iii] + " "; + } + return returnString; + } + + public void incrValue( final double qual, final boolean isTruePositiveCall ) { + if( isTruePositiveCall ) { + truePositiveQualities.add( qual ); + } else { + falsePositiveQualities.add( qual ); + } + } + + public void organizeHistogramTables() { + for( int iii = 0; iii < NUM_BINS; iii++ ) { + truePositiveHist[iii] = 0; + falsePositiveHist[iii] = 0; + } + + double maxQual = 0.0; + + // Calculate the maximum quality score for both TP and FP calls in order to normalize and histogram + for( final Double qual : truePositiveQualities ) { + if( qual > maxQual ) { + maxQual = qual; + } + } + for( final Double qual : falsePositiveQualities ) { + if( qual > maxQual ) { + maxQual = qual; + } + } + + final double binSize = maxQual / ((double) (NUM_BINS-1)); + + for( final Double qual : truePositiveQualities ) { + truePositiveHist[ (int)Math.floor( qual / binSize ) ]++; + } + for( final Double qual : falsePositiveQualities ) { + falsePositiveHist[ (int)Math.floor( qual / binSize ) ]++; + } + } + } + // keep a list of the validation data we saw before the first eval data private HashSet missedValidationData = new HashSet(); @@ -115,18 +205,24 @@ public class GenotypeConcordance extends VariantEvaluator { private boolean warnedAboutValidationData = false; public String update2(VariantContext eval, VariantContext validation, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - String interesting = null; + final String interesting = null; // sanity check that we at least have either eval or validation data - if (eval == null && !isValidVC(validation)) + if (eval == null && !isValidVC(validation)) { return interesting; + } + + if( qualityScoreHistograms == null ) { + qualityScoreHistograms = new QualityScoreHistograms(); + } if (sampleStats == null) { if (eval != null) { // initialize the concordance table sampleStats = new SampleStats(eval,Genotype.Type.values().length); - for (VariantContext vc : missedValidationData) + for (final VariantContext vc : missedValidationData) { determineStats(null, vc); + } missedValidationData = null; } else { // todo -- Eric, this results in a memory problem when eval is WEx data but you are using CG calls genome-wide @@ -149,51 +245,77 @@ public class GenotypeConcordance extends VariantEvaluator { return interesting; // we don't capture any interesting sites } - private void determineStats(VariantContext eval, VariantContext validation) { + private void determineStats(final VariantContext eval, final VariantContext validation) { + + final boolean validationIsValidVC = isValidVC(validation); // determine concordance for eval data if (eval != null) { - for (String sample : eval.getSampleNames()) { - Genotype.Type called = eval.getGenotype(sample).getType(); - Genotype.Type truth; + for (final String sample : eval.getSampleNames()) { + final Genotype.Type called = eval.getGenotype(sample).getType(); + final Genotype.Type truth; - if (!isValidVC(validation) || !validation.hasGenotype(sample)) + if (!validationIsValidVC || !validation.hasGenotype(sample)) { truth = Genotype.Type.NO_CALL; - else + } else { truth = validation.getGenotype(sample).getType(); + } sampleStats.incrValue(sample, truth, called); } } // otherwise, mark no-calls for all samples else { - Genotype.Type called = Genotype.Type.NO_CALL; + final Genotype.Type called = Genotype.Type.NO_CALL; - for (String sample : validation.getSampleNames()) { - Genotype.Type truth = validation.getGenotype(sample).getType(); + for (final String sample : validation.getSampleNames()) { + final Genotype.Type truth = validation.getGenotype(sample).getType(); sampleStats.incrValue(sample, truth, called); } } // determine allele count concordance () - if (isValidVC(validation) && validation.isPolymorphic()) { + if (validationIsValidVC && validation.isPolymorphic()) { int trueAlleleCount = 0; - for (Allele a : validation.getAlternateAlleles()) + for (final Allele a : validation.getAlternateAlleles()) { trueAlleleCount += validation.getChromosomeCount(a); + } - if (!alleleCountStats.containsKey(trueAlleleCount)) + if (!alleleCountStats.containsKey(trueAlleleCount)) { alleleCountStats.put(trueAlleleCount, new FrequencyStats()); - FrequencyStats stats = alleleCountStats.get(trueAlleleCount); - if (eval != null) + } + final FrequencyStats stats = alleleCountStats.get(trueAlleleCount); + if (eval != null) { stats.nFound++; - else + } else { stats.nMissed++; + + } + } + + // TP & FP quality score histograms + if( eval != null && eval.isPolymorphic() && validationIsValidVC ) { + if( eval.getSampleNames().size() == 1 ) { // single sample calls + for( final String sample : eval.getSampleNames() ) { // only one sample + if( validation.hasGenotype(sample) ) { + final Genotype truth = validation.getGenotype(sample); + qualityScoreHistograms.incrValue( eval.getPhredScaledQual(), !truth.isHomRef() ); + } + } + } else { // multi sample calls + qualityScoreHistograms.incrValue( eval.getPhredScaledQual(), validation.isPolymorphic() ); + } + } } - private static boolean isValidVC(VariantContext vc) { + private static boolean isValidVC(final VariantContext vc) { return (vc != null && !vc.isFiltered()); } + + public void finalizeEvaluation() { + qualityScoreHistograms.organizeHistogramTables(); + } } /**