Added histogram of variant quality scores broken out by true positive and false positive calls to the GenotypeConcordance module of VariantEval2
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3123 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
12e4f88ca7
commit
2d002c56c3
|
|
@ -23,7 +23,7 @@ import java.util.*;
|
||||||
*/
|
*/
|
||||||
@Analysis(name = "Genotype Concordance", description = "Determine the genotype concordance between the genotypes in difference tracks")
|
@Analysis(name = "Genotype Concordance", description = "Determine the genotype concordance between the genotypes in difference tracks")
|
||||||
public class GenotypeConcordance extends VariantEvaluator {
|
public class GenotypeConcordance extends VariantEvaluator {
|
||||||
protected static Logger logger = Logger.getLogger(GenotypeConcordance.class);
|
protected final static Logger logger = Logger.getLogger(GenotypeConcordance.class);
|
||||||
|
|
||||||
// a mapping from allele count to stats
|
// a mapping from allele count to stats
|
||||||
@DataPoint(description = "the frequency statistics for each allele")
|
@DataPoint(description = "the frequency statistics for each allele")
|
||||||
|
|
@ -33,6 +33,9 @@ public class GenotypeConcordance extends VariantEvaluator {
|
||||||
@DataPoint(name="samples", description = "the concordance statistics for each sample")
|
@DataPoint(name="samples", description = "the concordance statistics for each sample")
|
||||||
SampleStats sampleStats = null;
|
SampleStats sampleStats = null;
|
||||||
|
|
||||||
|
@DataPoint(description = "the variant quality score histograms for true positive and false positive calls")
|
||||||
|
QualityScoreHistograms qualityScoreHistograms = null;
|
||||||
|
|
||||||
private static final int MAX_MISSED_VALIDATION_DATA = 10000;
|
private static final int MAX_MISSED_VALIDATION_DATA = 10000;
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -63,6 +66,93 @@ public class GenotypeConcordance extends VariantEvaluator {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class QualityScoreHistograms implements TableType {
|
||||||
|
final int NUM_BINS = 20;
|
||||||
|
final ArrayList<Double> truePositiveQualities = new ArrayList<Double>(); // An ArrayList holds all the qualities until we are able to bin them appropriately
|
||||||
|
final ArrayList<Double> falsePositiveQualities = new ArrayList<Double>();
|
||||||
|
final int truePositiveHist[] = new int[NUM_BINS]; // the final histograms that get reported out
|
||||||
|
final int falsePositiveHist[] = new int[NUM_BINS];
|
||||||
|
final String[] rowKeys = new String[]{"true_positive_hist", "false_positive_hist"};
|
||||||
|
|
||||||
|
public Object[] getRowKeys() {
|
||||||
|
return rowKeys;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Object[] getColumnKeys() {
|
||||||
|
final String columnKeys[] = new String[NUM_BINS];
|
||||||
|
for( int iii = 0; iii < NUM_BINS; iii++ ) {
|
||||||
|
columnKeys[iii] = "histBin" + iii;
|
||||||
|
}
|
||||||
|
return columnKeys;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getName() {
|
||||||
|
return "QualityScoreHistogram";
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getCell(int x, int y) {
|
||||||
|
if( x == 0 ) {
|
||||||
|
return String.valueOf(truePositiveHist[y]);
|
||||||
|
} else if ( x == 1 ) {
|
||||||
|
return String.valueOf(falsePositiveHist[y]);
|
||||||
|
} else {
|
||||||
|
throw new StingException( "Unknown row in " + getName() + ", row = " + x );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
String returnString = "";
|
||||||
|
// output both histogram arrays
|
||||||
|
returnString += "TP: ";
|
||||||
|
for( int iii = 0; iii < NUM_BINS; iii++ ) {
|
||||||
|
returnString += truePositiveHist[iii] + " ";
|
||||||
|
}
|
||||||
|
returnString += "\nFP: ";
|
||||||
|
for( int iii = 0; iii < NUM_BINS; iii++ ) {
|
||||||
|
returnString += falsePositiveHist[iii] + " ";
|
||||||
|
}
|
||||||
|
return returnString;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void incrValue( final double qual, final boolean isTruePositiveCall ) {
|
||||||
|
if( isTruePositiveCall ) {
|
||||||
|
truePositiveQualities.add( qual );
|
||||||
|
} else {
|
||||||
|
falsePositiveQualities.add( qual );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void organizeHistogramTables() {
|
||||||
|
for( int iii = 0; iii < NUM_BINS; iii++ ) {
|
||||||
|
truePositiveHist[iii] = 0;
|
||||||
|
falsePositiveHist[iii] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
double maxQual = 0.0;
|
||||||
|
|
||||||
|
// Calculate the maximum quality score for both TP and FP calls in order to normalize and histogram
|
||||||
|
for( final Double qual : truePositiveQualities ) {
|
||||||
|
if( qual > maxQual ) {
|
||||||
|
maxQual = qual;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for( final Double qual : falsePositiveQualities ) {
|
||||||
|
if( qual > maxQual ) {
|
||||||
|
maxQual = qual;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
final double binSize = maxQual / ((double) (NUM_BINS-1));
|
||||||
|
|
||||||
|
for( final Double qual : truePositiveQualities ) {
|
||||||
|
truePositiveHist[ (int)Math.floor( qual / binSize ) ]++;
|
||||||
|
}
|
||||||
|
for( final Double qual : falsePositiveQualities ) {
|
||||||
|
falsePositiveHist[ (int)Math.floor( qual / binSize ) ]++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// keep a list of the validation data we saw before the first eval data
|
// keep a list of the validation data we saw before the first eval data
|
||||||
private HashSet<VariantContext> missedValidationData = new HashSet<VariantContext>();
|
private HashSet<VariantContext> missedValidationData = new HashSet<VariantContext>();
|
||||||
|
|
||||||
|
|
@ -115,18 +205,24 @@ public class GenotypeConcordance extends VariantEvaluator {
|
||||||
private boolean warnedAboutValidationData = false;
|
private boolean warnedAboutValidationData = false;
|
||||||
|
|
||||||
public String update2(VariantContext eval, VariantContext validation, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
public String update2(VariantContext eval, VariantContext validation, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
String interesting = null;
|
final String interesting = null;
|
||||||
|
|
||||||
// sanity check that we at least have either eval or validation data
|
// sanity check that we at least have either eval or validation data
|
||||||
if (eval == null && !isValidVC(validation))
|
if (eval == null && !isValidVC(validation)) {
|
||||||
return interesting;
|
return interesting;
|
||||||
|
}
|
||||||
|
|
||||||
|
if( qualityScoreHistograms == null ) {
|
||||||
|
qualityScoreHistograms = new QualityScoreHistograms();
|
||||||
|
}
|
||||||
|
|
||||||
if (sampleStats == null) {
|
if (sampleStats == null) {
|
||||||
if (eval != null) {
|
if (eval != null) {
|
||||||
// initialize the concordance table
|
// initialize the concordance table
|
||||||
sampleStats = new SampleStats(eval,Genotype.Type.values().length);
|
sampleStats = new SampleStats(eval,Genotype.Type.values().length);
|
||||||
for (VariantContext vc : missedValidationData)
|
for (final VariantContext vc : missedValidationData) {
|
||||||
determineStats(null, vc);
|
determineStats(null, vc);
|
||||||
|
}
|
||||||
missedValidationData = null;
|
missedValidationData = null;
|
||||||
} else {
|
} else {
|
||||||
// todo -- Eric, this results in a memory problem when eval is WEx data but you are using CG calls genome-wide
|
// todo -- Eric, this results in a memory problem when eval is WEx data but you are using CG calls genome-wide
|
||||||
|
|
@ -149,51 +245,77 @@ public class GenotypeConcordance extends VariantEvaluator {
|
||||||
return interesting; // we don't capture any interesting sites
|
return interesting; // we don't capture any interesting sites
|
||||||
}
|
}
|
||||||
|
|
||||||
private void determineStats(VariantContext eval, VariantContext validation) {
|
private void determineStats(final VariantContext eval, final VariantContext validation) {
|
||||||
|
|
||||||
|
final boolean validationIsValidVC = isValidVC(validation);
|
||||||
|
|
||||||
// determine concordance for eval data
|
// determine concordance for eval data
|
||||||
if (eval != null) {
|
if (eval != null) {
|
||||||
for (String sample : eval.getSampleNames()) {
|
for (final String sample : eval.getSampleNames()) {
|
||||||
Genotype.Type called = eval.getGenotype(sample).getType();
|
final Genotype.Type called = eval.getGenotype(sample).getType();
|
||||||
Genotype.Type truth;
|
final Genotype.Type truth;
|
||||||
|
|
||||||
if (!isValidVC(validation) || !validation.hasGenotype(sample))
|
if (!validationIsValidVC || !validation.hasGenotype(sample)) {
|
||||||
truth = Genotype.Type.NO_CALL;
|
truth = Genotype.Type.NO_CALL;
|
||||||
else
|
} else {
|
||||||
truth = validation.getGenotype(sample).getType();
|
truth = validation.getGenotype(sample).getType();
|
||||||
|
}
|
||||||
|
|
||||||
sampleStats.incrValue(sample, truth, called);
|
sampleStats.incrValue(sample, truth, called);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// otherwise, mark no-calls for all samples
|
// otherwise, mark no-calls for all samples
|
||||||
else {
|
else {
|
||||||
Genotype.Type called = Genotype.Type.NO_CALL;
|
final Genotype.Type called = Genotype.Type.NO_CALL;
|
||||||
|
|
||||||
for (String sample : validation.getSampleNames()) {
|
for (final String sample : validation.getSampleNames()) {
|
||||||
Genotype.Type truth = validation.getGenotype(sample).getType();
|
final Genotype.Type truth = validation.getGenotype(sample).getType();
|
||||||
sampleStats.incrValue(sample, truth, called);
|
sampleStats.incrValue(sample, truth, called);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// determine allele count concordance ()
|
// determine allele count concordance ()
|
||||||
if (isValidVC(validation) && validation.isPolymorphic()) {
|
if (validationIsValidVC && validation.isPolymorphic()) {
|
||||||
int trueAlleleCount = 0;
|
int trueAlleleCount = 0;
|
||||||
for (Allele a : validation.getAlternateAlleles())
|
for (final Allele a : validation.getAlternateAlleles()) {
|
||||||
trueAlleleCount += validation.getChromosomeCount(a);
|
trueAlleleCount += validation.getChromosomeCount(a);
|
||||||
|
}
|
||||||
|
|
||||||
if (!alleleCountStats.containsKey(trueAlleleCount))
|
if (!alleleCountStats.containsKey(trueAlleleCount)) {
|
||||||
alleleCountStats.put(trueAlleleCount, new FrequencyStats());
|
alleleCountStats.put(trueAlleleCount, new FrequencyStats());
|
||||||
FrequencyStats stats = alleleCountStats.get(trueAlleleCount);
|
}
|
||||||
if (eval != null)
|
final FrequencyStats stats = alleleCountStats.get(trueAlleleCount);
|
||||||
|
if (eval != null) {
|
||||||
stats.nFound++;
|
stats.nFound++;
|
||||||
else
|
} else {
|
||||||
stats.nMissed++;
|
stats.nMissed++;
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static boolean isValidVC(VariantContext vc) {
|
// TP & FP quality score histograms
|
||||||
|
if( eval != null && eval.isPolymorphic() && validationIsValidVC ) {
|
||||||
|
if( eval.getSampleNames().size() == 1 ) { // single sample calls
|
||||||
|
for( final String sample : eval.getSampleNames() ) { // only one sample
|
||||||
|
if( validation.hasGenotype(sample) ) {
|
||||||
|
final Genotype truth = validation.getGenotype(sample);
|
||||||
|
qualityScoreHistograms.incrValue( eval.getPhredScaledQual(), !truth.isHomRef() );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else { // multi sample calls
|
||||||
|
qualityScoreHistograms.incrValue( eval.getPhredScaledQual(), validation.isPolymorphic() );
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean isValidVC(final VariantContext vc) {
|
||||||
return (vc != null && !vc.isFiltered());
|
return (vc != null && !vc.isFiltered());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void finalizeEvaluation() {
|
||||||
|
qualityScoreHistograms.organizeHistogramTables();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue