Variant Quality Score modules in VariantEval2 no longer create huge lists holding every quality score encountered; instead, each quality score is rounded to an integer and counted in a hash table. Also fixes a bug for files in which all quality scores are set to -1.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3146 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-04-09 18:36:06 +00:00
parent 71f38a9199
commit c2a37e4b5c
3 changed files with 108 additions and 96 deletions

View File

@ -86,8 +86,8 @@ public class GenotypeConcordance extends VariantEvaluator {
class QualityScoreHistograms implements TableType { class QualityScoreHistograms implements TableType {
final int NUM_BINS = 20; final int NUM_BINS = 20;
final ArrayList<Double> truePositiveQualities = new ArrayList<Double>(); // An ArrayList holds all the qualities until we are able to bin them appropriately final HashMap<Integer,Integer> truePositiveQualityScoreMap = new HashMap<Integer,Integer>(); // A HashMap holds all the quality scores until we are able to bin them appropriately
final ArrayList<Double> falsePositiveQualities = new ArrayList<Double>(); final HashMap<Integer,Integer> falsePositiveQualityScoreMap = new HashMap<Integer,Integer>();
final int truePositiveHist[] = new int[NUM_BINS]; // the final histograms that get reported out final int truePositiveHist[] = new int[NUM_BINS]; // the final histograms that get reported out
final int falsePositiveHist[] = new int[NUM_BINS]; final int falsePositiveHist[] = new int[NUM_BINS];
final String[] rowKeys = new String[]{"true_positive_hist", "false_positive_hist"}; final String[] rowKeys = new String[]{"true_positive_hist", "false_positive_hist"};
@ -133,10 +133,17 @@ public class GenotypeConcordance extends VariantEvaluator {
} }
public void incrValue( final double qual, final boolean isTruePositiveCall ) { public void incrValue( final double qual, final boolean isTruePositiveCall ) {
HashMap<Integer,Integer> qualScoreMap;
if( isTruePositiveCall ) { if( isTruePositiveCall ) {
truePositiveQualities.add( qual ); qualScoreMap = truePositiveQualityScoreMap;
} else { } else {
falsePositiveQualities.add( qual ); qualScoreMap = falsePositiveQualityScoreMap;
}
final Integer qualKey = Math.round((float) qual);
if( qualScoreMap.containsKey(qualKey) ) {
qualScoreMap.put(qualKey, qualScoreMap.get(qualKey) + 1);
} else {
qualScoreMap.put(qualKey, 1);
} }
} }
@ -146,27 +153,33 @@ public class GenotypeConcordance extends VariantEvaluator {
falsePositiveHist[iii] = 0; falsePositiveHist[iii] = 0;
} }
double maxQual = 0.0; int maxQual = 0;
// Calculate the maximum quality score for both TP and FP calls in order to normalize and histogram // Calculate the maximum quality score for both TP and FP calls in order to normalize and histogram
for( final Double qual : truePositiveQualities ) { for( final Integer qual : truePositiveQualityScoreMap.keySet()) {
if( qual > maxQual ) { if( qual > maxQual ) {
maxQual = qual; maxQual = qual;
} }
} }
for( final Double qual : falsePositiveQualities ) { for( final Integer qual : falsePositiveQualityScoreMap.keySet()) {
if( qual > maxQual ) { if( qual > maxQual ) {
maxQual = qual; maxQual = qual;
} }
} }
final double binSize = maxQual / ((double) (NUM_BINS-1)); final double binSize = ((double)maxQual) / ((double) (NUM_BINS-1)); //BUGBUG: should be normalized max to min, not max to 0
for( final Double qual : truePositiveQualities ) { for( final Integer qual : truePositiveQualityScoreMap.keySet()) {
truePositiveHist[ (int)Math.floor( qual / binSize ) ]++; final int index = (int)Math.floor( ((double)qual) / binSize );
if(index >= 0) { //BUGBUG: problem when maxQual is zero?
truePositiveHist[ index ] += truePositiveQualityScoreMap.get(qual);
}
} }
for( final Double qual : falsePositiveQualities ) { for( final Integer qual : falsePositiveQualityScoreMap.keySet()) {
falsePositiveHist[ (int)Math.floor( qual / binSize ) ]++; final int index = (int)Math.floor( ((double)qual) / binSize );
if(index >= 0) {
falsePositiveHist[ index ] += falsePositiveQualityScoreMap.get(qual);
}
} }
} }
} }
@ -326,7 +339,7 @@ public class GenotypeConcordance extends VariantEvaluator {
} }
public void finalizeEvaluation() { public void finalizeEvaluation() {
if( qualityScoreHistograms!=null ) { if( qualityScoreHistograms != null ) {
qualityScoreHistograms.organizeHistogramTables(); qualityScoreHistograms.organizeHistogramTables();
} }
} }

View File

@ -2,11 +2,13 @@ package org.broadinstitute.sting.oneoffprojects.walkers.varianteval2;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.playground.utils.report.tags.Analysis; import org.broadinstitute.sting.playground.utils.report.tags.Analysis;
import org.broadinstitute.sting.playground.utils.report.tags.DataPoint; import org.broadinstitute.sting.playground.utils.report.tags.DataPoint;
import org.broadinstitute.sting.playground.utils.report.utils.TableType; import org.broadinstitute.sting.playground.utils.report.utils.TableType;
import org.broadinstitute.sting.utils.Pair;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
@ -48,13 +50,12 @@ public class VariantQualityScore extends VariantEvaluator {
@DataPoint(name="TiTv by Quality", description = "the Ti/Tv ratio broken out by variant quality") @DataPoint(name="TiTv by Quality", description = "the Ti/Tv ratio broken out by variant quality")
TiTvStats titvStats = null; TiTvStats titvStats = null;
//@DataPoint(name="Quality by Allele Count", description = "average variant quality for each allele count") @DataPoint(name="Quality by Allele Count", description = "average variant quality for each allele count")
//AlleleCountStats alleleCountStats = null; AlleleCountStats alleleCountStats = null;
class TiTvStats implements TableType { class TiTvStats implements TableType {
final int NUM_BINS = 20; final int NUM_BINS = 20;
final ArrayList<Double> qualities = new ArrayList<Double>(); // An ArrayList holds all the qualities until we are able to bin them appropriately final HashMap<Integer, Pair<Long,Long>> qualByIsTransition = new HashMap<Integer, Pair<Long,Long>>(); // A hashMap holds all the qualities until we are able to bin them appropriately
final ArrayList<Boolean> isTransition = new ArrayList<Boolean>();
final long transitionByQuality[] = new long[NUM_BINS]; final long transitionByQuality[] = new long[NUM_BINS];
final long transversionByQuality[] = new long[NUM_BINS]; final long transversionByQuality[] = new long[NUM_BINS];
final double titvByQuality[] = new double[NUM_BINS]; // the final ti/tv sets that get reported out final double titvByQuality[] = new double[NUM_BINS]; // the final ti/tv sets that get reported out
@ -89,9 +90,17 @@ public class VariantQualityScore extends VariantEvaluator {
return returnString; return returnString;
} }
public void incrValue( final double qual, final boolean _isTransition ) { public void incrValue( final double qual, final boolean isTransition ) {
qualities.add(qual); final Integer qualKey = Math.round((float) qual);
isTransition.add(_isTransition); final long numTransition = (isTransition ? 1L : 0L);
final long numTransversion = (isTransition ? 0L : 1L);
if( qualByIsTransition.containsKey(qualKey) ) {
Pair<Long,Long> transitionPair = qualByIsTransition.get(qualKey);
transitionPair.set(transitionPair.getFirst() + numTransition, transitionPair.getSecond() + numTransversion);
qualByIsTransition.put(qualKey, transitionPair);
} else {
qualByIsTransition.put(qualKey, new Pair<Long,Long>(numTransition,numTransversion));
}
} }
public void organizeTiTvTables() { public void organizeTiTvTables() {
@ -101,26 +110,24 @@ public class VariantQualityScore extends VariantEvaluator {
titvByQuality[iii] = 0.0; titvByQuality[iii] = 0.0;
} }
double maxQual = 0.0; int maxQual = 0;
// Calculate the maximum quality score in order to normalize and histogram // Calculate the maximum quality score in order to normalize and histogram
for( final Double qual : qualities ) { for( final Integer qual : qualByIsTransition.keySet() ) {
if( qual > maxQual ) { if( qual > maxQual ) {
maxQual = qual; maxQual = qual;
} }
} }
final double binSize = maxQual / ((double) (NUM_BINS-1)); final double binSize = ((double)maxQual) / ((double) (NUM_BINS-1));
int jjj = 0; for( final Integer qual : qualByIsTransition.keySet() ) {
for( final Double qual : qualities ) { final int index = (int)Math.floor( ((double) qual) / binSize );
final int index = (int)Math.floor( qual / binSize ); if( index >= 0 ) { // BUGBUG: why is there overflow here?
if(isTransition.get(jjj)) { Pair<Long,Long> transitionPair = qualByIsTransition.get(qual);
transitionByQuality[index]++; transitionByQuality[index] += transitionPair.getFirst();
} else { transversionByQuality[index] += transitionPair.getSecond();
transversionByQuality[index]++;
} }
jjj++;
} }
for( int iii = 0; iii < NUM_BINS; iii++ ) { for( int iii = 0; iii < NUM_BINS; iii++ ) {
@ -134,89 +141,71 @@ public class VariantQualityScore extends VariantEvaluator {
} }
} }
/*
class AlleleCountStats implements TableType { class AlleleCountStats implements TableType {
final HashMap<Integer, ArrayList<Double>> qualityListMap = new HashMap<Integer, ArrayList<Double>>(); final HashMap<Integer, ArrayList<Double>> qualityListMap = new HashMap<Integer, ArrayList<Double>>();
final HashMap<Integer, Double> qualityMap = new HashMap<Integer, Double>(); final HashMap<Integer, Double> qualityMap = new HashMap<Integer, Double>();
public Object[] getRowKeys() { public Object[] getRowKeys() {
return new String[]{"sample"}; final int NUM_BINS = qualityListMap.keySet().size();
final String rowKeys[] = new String[NUM_BINS];
int iii = 0;
for( final Integer key : qualityListMap.keySet() ) {
rowKeys[iii] = "AC" + key;
iii++;
}
return rowKeys;
} }
public Object[] getColumnKeys() { public Object[] getColumnKeys() {
final int NUM_BINS = qualityListMap.keySet().size(); return new String[]{"alleleCount","avgQual"};
final String columnKeys[] = new String[NUM_BINS];
int iii = 0;
for( final Integer key : qualityListMap.keySet() ) {
columnKeys[iii] = "AC" + key;
iii++;
}
return columnKeys;
} }
public String getName() { public String getName() {
return "TiTvStats"; return "AlleleCountStats";
} }
public String getCell(int x, int y) { public String getCell(int x, int y) {
return String.valueOf(titvByQuality[y]); int iii = 0;
for( final Integer key : qualityListMap.keySet() ) {
if(iii == x) {
if(y == 0) { return String.valueOf(key); }
else { return String.valueOf(qualityMap.get(key)); }
}
iii++;
}
return null;
} }
public String toString() { public String toString() {
String returnString = ""; String returnString = "";
// output the ti/tv array // output the quality map
returnString += "titvByQuality: "; returnString += "AlleleCountStats: ";
for( int iii = 0; iii < NUM_BINS; iii++ ) { //for( int iii = 0; iii < NUM_BINS; iii++ ) {
returnString += titvByQuality[iii] + " "; // returnString += titvByQuality[iii] + " ";
} //}
return returnString; return returnString;
} }
public void incrValue( final double qual, final boolean _isTransition ) { public void incrValue( final double qual, final int alleleCount ) {
qualities.add(qual); ArrayList<Double> list = qualityListMap.get(alleleCount);
isTransition.add(_isTransition); if(list==null) { list = new ArrayList<Double>(); }
list.add(qual);
qualityListMap.put(alleleCount, list);
} }
public void organizeTiTvTables() { public void organizeAlleleCountTables() {
for( int iii = 0; iii < NUM_BINS; iii++ ) { for( final Integer key : qualityListMap.keySet() ) {
transitionByQuality[iii] = 0L; final ArrayList<Double> list = qualityListMap.get(key);
transversionByQuality[iii] = 0L; double meanQual = 0.0;
titvByQuality[iii] = 0.0; final double numQuals = (double)list.size();
} for( Double qual : list ) {
meanQual += qual / numQuals;
double maxQual = 0.0;
// Calculate the maximum quality score in order to normalize and histogram
for( final Double qual : qualities ) {
if( qual > maxQual ) {
maxQual = qual;
} }
qualityMap.put(key, meanQual);
} }
final double binSize = maxQual / ((double) (NUM_BINS-1));
int jjj = 0;
for( final Double qual : qualities ) {
final int index = (int)Math.floor( qual / binSize );
if(isTransition.get(jjj)) {
transitionByQuality[index]++;
} else {
transversionByQuality[index]++;
}
jjj++;
}
for( int iii = 0; iii < NUM_BINS; iii++ ) {
if( transitionByQuality[iii] + transversionByQuality[iii] > 800L ) { // need to have a sufficient number of variants to get a useful Ti/Tv ratio
titvByQuality[iii] = ((double) transitionByQuality[iii]) / ((double) transversionByQuality[iii]);
} else {
titvByQuality[iii] = 0.0;
}
}
} }
} }
*/
public VariantQualityScore(VariantEval2Walker parent) { public VariantQualityScore(VariantEval2Walker parent) {
// don't do anything // don't do anything
@ -241,17 +230,27 @@ public class VariantQualityScore extends VariantEvaluator {
public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
final String interesting = null; final String interesting = null;
if(eval != null) { if( eval != null && eval.isSNP() && eval.isBiallelic() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites)
if( titvStats == null ) { titvStats = new TiTvStats(); } if( titvStats == null ) { titvStats = new TiTvStats(); }
titvStats.incrValue(eval.getPhredScaledQual(), eval.isTransition()); titvStats.incrValue(eval.getPhredScaledQual(), eval.isTransition());
if( alleleCountStats == null ) { alleleCountStats = new AlleleCountStats(); }
int alternateAlleleCount = 0;
for (final Allele a : eval.getAlternateAlleles()) {
alternateAlleleCount += eval.getChromosomeCount(a);
}
alleleCountStats.incrValue(eval.getPhredScaledQual(), alternateAlleleCount);
} }
return interesting; // This module doesn't capture any interesting sites, so return null return interesting; // This module doesn't capture any interesting sites, so return null
} }
public void finalizeEvaluation() { public void finalizeEvaluation() {
if( titvStats!=null ) { if( titvStats != null ) {
titvStats.organizeTiTvTables(); titvStats.organizeTiTvTables();
} }
if( alleleCountStats != null ) {
alleleCountStats.organizeAlleleCountTables();
}
} }
} }

View File

@ -18,8 +18,8 @@ public class VariantEval2IntegrationTest extends WalkerTest {
@Test @Test
public void testVE2Simple() { public void testVE2Simple() {
HashMap<String, String> expectations = new HashMap<String, String>(); HashMap<String, String> expectations = new HashMap<String, String>();
expectations.put("-L 1:1-10,000,000", "32d771b62dadc1c164f19bf3a1efc018"); expectations.put("-L 1:1-10,000,000", "44fca2e43dade85ca16fa9e323e4cc9b");
expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0", "de42a239847e3466f4526e4781710ce7"); expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0", "6910a9a49625a8df2ad797d751be9ed6");
for ( Map.Entry<String, String> entry : expectations.entrySet() ) { for ( Map.Entry<String, String> entry : expectations.entrySet() ) {
String extraArgs = entry.getKey(); String extraArgs = entry.getKey();
@ -39,10 +39,10 @@ public class VariantEval2IntegrationTest extends WalkerTest {
" -B dbsnp_130,dbSNP," + GATKDataLocation + "dbsnp_130_b36.rod" + " -B dbsnp_130,dbSNP," + GATKDataLocation + "dbsnp_130_b36.rod" +
" -B comp_hapmap,VCF," + validationDataLocation + "CEU_hapmap_nogt_23.vcf"; " -B comp_hapmap,VCF," + validationDataLocation + "CEU_hapmap_nogt_23.vcf";
String eqMD5s = "0e961437bec3d4ad28b1511c0c45d13a"; // next two examples should be the same! String eqMD5s = "a525597a34cd3fa84c75263dbd026acb"; // next two examples should be the same!
expectations.put("", eqMD5s); expectations.put("", eqMD5s);
expectations.put(" -known comp_hapmap -known dbsnp", eqMD5s); expectations.put(" -known comp_hapmap -known dbsnp", eqMD5s);
expectations.put(" -known comp_hapmap", "139b10dc0ea968dc035798492ef4ef97"); expectations.put(" -known comp_hapmap", "35feda6c361408755e7f1dae245462d7");
for ( Map.Entry<String, String> entry : expectations.entrySet() ) { for ( Map.Entry<String, String> entry : expectations.entrySet() ) {
String extraArgs2 = entry.getKey(); String extraArgs2 = entry.getKey();
@ -60,7 +60,7 @@ public class VariantEval2IntegrationTest extends WalkerTest {
String extraArgs = "-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 30"; String extraArgs = "-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 30";
WalkerTestSpec spec = new WalkerTestSpec( root + " " + extraArgs + " -o %s -outputVCF %s", WalkerTestSpec spec = new WalkerTestSpec( root + " " + extraArgs + " -o %s -outputVCF %s",
2, 2,
Arrays.asList("3d4de331f90ef49792d3da3ee9509edc", "a3ce1d70d8ae3874807e9d61994d42af")); Arrays.asList("0d2ac5ea23f8cd3237836080c722711b", "a3ce1d70d8ae3874807e9d61994d42af"));
executeTest("testVE2WriteVCF", spec); executeTest("testVE2WriteVCF", spec);
} }
} }