Added histogram of variant quality scores broken out by true positive and false positive calls to the GenotypeConcordance module of VariantEval2
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3123 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
12e4f88ca7
commit
2d002c56c3
|
|
@ -23,7 +23,7 @@ import java.util.*;
|
|||
*/
|
||||
@Analysis(name = "Genotype Concordance", description = "Determine the genotype concordance between the genotypes in difference tracks")
|
||||
public class GenotypeConcordance extends VariantEvaluator {
|
||||
protected static Logger logger = Logger.getLogger(GenotypeConcordance.class);
|
||||
protected final static Logger logger = Logger.getLogger(GenotypeConcordance.class);
|
||||
|
||||
// a mapping from allele count to stats
|
||||
@DataPoint(description = "the frequency statistics for each allele")
|
||||
|
|
@ -33,6 +33,9 @@ public class GenotypeConcordance extends VariantEvaluator {
|
|||
@DataPoint(name="samples", description = "the concordance statistics for each sample")
|
||||
SampleStats sampleStats = null;
|
||||
|
||||
@DataPoint(description = "the variant quality score histograms for true positive and false positive calls")
|
||||
QualityScoreHistograms qualityScoreHistograms = null;
|
||||
|
||||
private static final int MAX_MISSED_VALIDATION_DATA = 10000;
|
||||
|
||||
|
||||
|
|
@ -63,6 +66,93 @@ public class GenotypeConcordance extends VariantEvaluator {
|
|||
}
|
||||
}
|
||||
|
||||
class QualityScoreHistograms implements TableType {
|
||||
final int NUM_BINS = 20;
|
||||
final ArrayList<Double> truePositiveQualities = new ArrayList<Double>(); // An ArrayList holds all the qualities until we are able to bin them appropriately
|
||||
final ArrayList<Double> falsePositiveQualities = new ArrayList<Double>();
|
||||
final int truePositiveHist[] = new int[NUM_BINS]; // the final histograms that get reported out
|
||||
final int falsePositiveHist[] = new int[NUM_BINS];
|
||||
final String[] rowKeys = new String[]{"true_positive_hist", "false_positive_hist"};
|
||||
|
||||
public Object[] getRowKeys() {
|
||||
return rowKeys;
|
||||
}
|
||||
|
||||
public Object[] getColumnKeys() {
|
||||
final String columnKeys[] = new String[NUM_BINS];
|
||||
for( int iii = 0; iii < NUM_BINS; iii++ ) {
|
||||
columnKeys[iii] = "histBin" + iii;
|
||||
}
|
||||
return columnKeys;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return "QualityScoreHistogram";
|
||||
}
|
||||
|
||||
public String getCell(int x, int y) {
|
||||
if( x == 0 ) {
|
||||
return String.valueOf(truePositiveHist[y]);
|
||||
} else if ( x == 1 ) {
|
||||
return String.valueOf(falsePositiveHist[y]);
|
||||
} else {
|
||||
throw new StingException( "Unknown row in " + getName() + ", row = " + x );
|
||||
}
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
String returnString = "";
|
||||
// output both histogram arrays
|
||||
returnString += "TP: ";
|
||||
for( int iii = 0; iii < NUM_BINS; iii++ ) {
|
||||
returnString += truePositiveHist[iii] + " ";
|
||||
}
|
||||
returnString += "\nFP: ";
|
||||
for( int iii = 0; iii < NUM_BINS; iii++ ) {
|
||||
returnString += falsePositiveHist[iii] + " ";
|
||||
}
|
||||
return returnString;
|
||||
}
|
||||
|
||||
public void incrValue( final double qual, final boolean isTruePositiveCall ) {
|
||||
if( isTruePositiveCall ) {
|
||||
truePositiveQualities.add( qual );
|
||||
} else {
|
||||
falsePositiveQualities.add( qual );
|
||||
}
|
||||
}
|
||||
|
||||
public void organizeHistogramTables() {
|
||||
for( int iii = 0; iii < NUM_BINS; iii++ ) {
|
||||
truePositiveHist[iii] = 0;
|
||||
falsePositiveHist[iii] = 0;
|
||||
}
|
||||
|
||||
double maxQual = 0.0;
|
||||
|
||||
// Calculate the maximum quality score for both TP and FP calls in order to normalize and histogram
|
||||
for( final Double qual : truePositiveQualities ) {
|
||||
if( qual > maxQual ) {
|
||||
maxQual = qual;
|
||||
}
|
||||
}
|
||||
for( final Double qual : falsePositiveQualities ) {
|
||||
if( qual > maxQual ) {
|
||||
maxQual = qual;
|
||||
}
|
||||
}
|
||||
|
||||
final double binSize = maxQual / ((double) (NUM_BINS-1));
|
||||
|
||||
for( final Double qual : truePositiveQualities ) {
|
||||
truePositiveHist[ (int)Math.floor( qual / binSize ) ]++;
|
||||
}
|
||||
for( final Double qual : falsePositiveQualities ) {
|
||||
falsePositiveHist[ (int)Math.floor( qual / binSize ) ]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// keep a list of the validation data we saw before the first eval data
|
||||
private HashSet<VariantContext> missedValidationData = new HashSet<VariantContext>();
|
||||
|
||||
|
|
@ -115,18 +205,24 @@ public class GenotypeConcordance extends VariantEvaluator {
|
|||
private boolean warnedAboutValidationData = false;
|
||||
|
||||
public String update2(VariantContext eval, VariantContext validation, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
String interesting = null;
|
||||
final String interesting = null;
|
||||
|
||||
// sanity check that we at least have either eval or validation data
|
||||
if (eval == null && !isValidVC(validation))
|
||||
if (eval == null && !isValidVC(validation)) {
|
||||
return interesting;
|
||||
}
|
||||
|
||||
if( qualityScoreHistograms == null ) {
|
||||
qualityScoreHistograms = new QualityScoreHistograms();
|
||||
}
|
||||
|
||||
if (sampleStats == null) {
|
||||
if (eval != null) {
|
||||
// initialize the concordance table
|
||||
sampleStats = new SampleStats(eval,Genotype.Type.values().length);
|
||||
for (VariantContext vc : missedValidationData)
|
||||
for (final VariantContext vc : missedValidationData) {
|
||||
determineStats(null, vc);
|
||||
}
|
||||
missedValidationData = null;
|
||||
} else {
|
||||
// todo -- Eric, this results in a memory problem when eval is WEx data but you are using CG calls genome-wide
|
||||
|
|
@ -149,51 +245,77 @@ public class GenotypeConcordance extends VariantEvaluator {
|
|||
return interesting; // we don't capture any interesting sites
|
||||
}
|
||||
|
||||
private void determineStats(VariantContext eval, VariantContext validation) {
|
||||
private void determineStats(final VariantContext eval, final VariantContext validation) {
|
||||
|
||||
final boolean validationIsValidVC = isValidVC(validation);
|
||||
|
||||
// determine concordance for eval data
|
||||
if (eval != null) {
|
||||
for (String sample : eval.getSampleNames()) {
|
||||
Genotype.Type called = eval.getGenotype(sample).getType();
|
||||
Genotype.Type truth;
|
||||
for (final String sample : eval.getSampleNames()) {
|
||||
final Genotype.Type called = eval.getGenotype(sample).getType();
|
||||
final Genotype.Type truth;
|
||||
|
||||
if (!isValidVC(validation) || !validation.hasGenotype(sample))
|
||||
if (!validationIsValidVC || !validation.hasGenotype(sample)) {
|
||||
truth = Genotype.Type.NO_CALL;
|
||||
else
|
||||
} else {
|
||||
truth = validation.getGenotype(sample).getType();
|
||||
}
|
||||
|
||||
sampleStats.incrValue(sample, truth, called);
|
||||
}
|
||||
}
|
||||
// otherwise, mark no-calls for all samples
|
||||
else {
|
||||
Genotype.Type called = Genotype.Type.NO_CALL;
|
||||
final Genotype.Type called = Genotype.Type.NO_CALL;
|
||||
|
||||
for (String sample : validation.getSampleNames()) {
|
||||
Genotype.Type truth = validation.getGenotype(sample).getType();
|
||||
for (final String sample : validation.getSampleNames()) {
|
||||
final Genotype.Type truth = validation.getGenotype(sample).getType();
|
||||
sampleStats.incrValue(sample, truth, called);
|
||||
}
|
||||
}
|
||||
|
||||
// determine allele count concordance ()
|
||||
if (isValidVC(validation) && validation.isPolymorphic()) {
|
||||
if (validationIsValidVC && validation.isPolymorphic()) {
|
||||
int trueAlleleCount = 0;
|
||||
for (Allele a : validation.getAlternateAlleles())
|
||||
for (final Allele a : validation.getAlternateAlleles()) {
|
||||
trueAlleleCount += validation.getChromosomeCount(a);
|
||||
}
|
||||
|
||||
if (!alleleCountStats.containsKey(trueAlleleCount))
|
||||
if (!alleleCountStats.containsKey(trueAlleleCount)) {
|
||||
alleleCountStats.put(trueAlleleCount, new FrequencyStats());
|
||||
FrequencyStats stats = alleleCountStats.get(trueAlleleCount);
|
||||
if (eval != null)
|
||||
}
|
||||
final FrequencyStats stats = alleleCountStats.get(trueAlleleCount);
|
||||
if (eval != null) {
|
||||
stats.nFound++;
|
||||
else
|
||||
} else {
|
||||
stats.nMissed++;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// TP & FP quality score histograms
|
||||
if( eval != null && eval.isPolymorphic() && validationIsValidVC ) {
|
||||
if( eval.getSampleNames().size() == 1 ) { // single sample calls
|
||||
for( final String sample : eval.getSampleNames() ) { // only one sample
|
||||
if( validation.hasGenotype(sample) ) {
|
||||
final Genotype truth = validation.getGenotype(sample);
|
||||
qualityScoreHistograms.incrValue( eval.getPhredScaledQual(), !truth.isHomRef() );
|
||||
}
|
||||
}
|
||||
} else { // multi sample calls
|
||||
qualityScoreHistograms.incrValue( eval.getPhredScaledQual(), validation.isPolymorphic() );
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean isValidVC(VariantContext vc) {
|
||||
private static boolean isValidVC(final VariantContext vc) {
|
||||
return (vc != null && !vc.isFiltered());
|
||||
}
|
||||
|
||||
public void finalizeEvaluation() {
|
||||
qualityScoreHistograms.organizeHistogramTables();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
Loading…
Reference in New Issue