Variant Quality Score modules in VariantEval2 no longer create huge lists which hold all of the quality scores encountered and instead cast the quality score to an integer and use hash tables. Bug fix for files in which all the quality scores are set to -1.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3146 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-04-09 18:36:06 +00:00
parent 71f38a9199
commit c2a37e4b5c
3 changed files with 108 additions and 96 deletions

View File

@ -86,8 +86,8 @@ public class GenotypeConcordance extends VariantEvaluator {
class QualityScoreHistograms implements TableType {
final int NUM_BINS = 20;
final ArrayList<Double> truePositiveQualities = new ArrayList<Double>(); // An ArrayList holds all the qualities until we are able to bin them appropriately
final ArrayList<Double> falsePositiveQualities = new ArrayList<Double>();
final HashMap<Integer,Integer> truePositiveQualityScoreMap = new HashMap<Integer,Integer>(); // A HashMap holds all the quality scores until we are able to bin them appropriately
final HashMap<Integer,Integer> falsePositiveQualityScoreMap = new HashMap<Integer,Integer>();
final int truePositiveHist[] = new int[NUM_BINS]; // the final histograms that get reported out
final int falsePositiveHist[] = new int[NUM_BINS];
final String[] rowKeys = new String[]{"true_positive_hist", "false_positive_hist"};
@ -133,10 +133,17 @@ public class GenotypeConcordance extends VariantEvaluator {
}
public void incrValue( final double qual, final boolean isTruePositiveCall ) {
HashMap<Integer,Integer> qualScoreMap;
if( isTruePositiveCall ) {
truePositiveQualities.add( qual );
qualScoreMap = truePositiveQualityScoreMap;
} else {
falsePositiveQualities.add( qual );
qualScoreMap = falsePositiveQualityScoreMap;
}
final Integer qualKey = Math.round((float) qual);
if( qualScoreMap.containsKey(qualKey) ) {
qualScoreMap.put(qualKey, qualScoreMap.get(qualKey) + 1);
} else {
qualScoreMap.put(qualKey, 1);
}
}
@ -146,27 +153,33 @@ public class GenotypeConcordance extends VariantEvaluator {
falsePositiveHist[iii] = 0;
}
double maxQual = 0.0;
int maxQual = 0;
// Calculate the maximum quality score for both TP and FP calls in order to normalize and histogram
for( final Double qual : truePositiveQualities ) {
for( final Integer qual : truePositiveQualityScoreMap.keySet()) {
if( qual > maxQual ) {
maxQual = qual;
}
}
for( final Double qual : falsePositiveQualities ) {
for( final Integer qual : falsePositiveQualityScoreMap.keySet()) {
if( qual > maxQual ) {
maxQual = qual;
}
}
final double binSize = maxQual / ((double) (NUM_BINS-1));
for( final Double qual : truePositiveQualities ) {
truePositiveHist[ (int)Math.floor( qual / binSize ) ]++;
final double binSize = ((double)maxQual) / ((double) (NUM_BINS-1)); //BUGBUG: should be normalized max to min, not max to 0
for( final Integer qual : truePositiveQualityScoreMap.keySet()) {
final int index = (int)Math.floor( ((double)qual) / binSize );
if(index >= 0) { //BUGBUG: problem when maxQual is zero?
truePositiveHist[ index ] += truePositiveQualityScoreMap.get(qual);
}
}
for( final Double qual : falsePositiveQualities ) {
falsePositiveHist[ (int)Math.floor( qual / binSize ) ]++;
for( final Integer qual : falsePositiveQualityScoreMap.keySet()) {
final int index = (int)Math.floor( ((double)qual) / binSize );
if(index >= 0) {
falsePositiveHist[ index ] += falsePositiveQualityScoreMap.get(qual);
}
}
}
}
@ -326,7 +339,7 @@ public class GenotypeConcordance extends VariantEvaluator {
}
public void finalizeEvaluation() {
if( qualityScoreHistograms!=null ) {
if( qualityScoreHistograms != null ) {
qualityScoreHistograms.organizeHistogramTables();
}
}

View File

@ -2,11 +2,13 @@ package org.broadinstitute.sting.oneoffprojects.walkers.varianteval2;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.playground.utils.report.tags.Analysis;
import org.broadinstitute.sting.playground.utils.report.tags.DataPoint;
import org.broadinstitute.sting.playground.utils.report.utils.TableType;
import org.broadinstitute.sting.utils.Pair;
import java.util.ArrayList;
import java.util.HashMap;
@ -48,13 +50,12 @@ public class VariantQualityScore extends VariantEvaluator {
@DataPoint(name="TiTv by Quality", description = "the Ti/Tv ratio broken out by variant quality")
TiTvStats titvStats = null;
//@DataPoint(name="Quality by Allele Count", description = "average variant quality for each allele count")
//AlleleCountStats alleleCountStats = null;
@DataPoint(name="Quality by Allele Count", description = "average variant quality for each allele count")
AlleleCountStats alleleCountStats = null;
class TiTvStats implements TableType {
final int NUM_BINS = 20;
final ArrayList<Double> qualities = new ArrayList<Double>(); // An ArrayList holds all the qualities until we are able to bin them appropriately
final ArrayList<Boolean> isTransition = new ArrayList<Boolean>();
final HashMap<Integer, Pair<Long,Long>> qualByIsTransition = new HashMap<Integer, Pair<Long,Long>>(); // A hashMap holds all the qualities until we are able to bin them appropriately
final long transitionByQuality[] = new long[NUM_BINS];
final long transversionByQuality[] = new long[NUM_BINS];
final double titvByQuality[] = new double[NUM_BINS]; // the final ti/tv sets that get reported out
@ -89,9 +90,17 @@ public class VariantQualityScore extends VariantEvaluator {
return returnString;
}
public void incrValue( final double qual, final boolean _isTransition ) {
qualities.add(qual);
isTransition.add(_isTransition);
public void incrValue( final double qual, final boolean isTransition ) {
final Integer qualKey = Math.round((float) qual);
final long numTransition = (isTransition ? 1L : 0L);
final long numTransversion = (isTransition ? 0L : 1L);
if( qualByIsTransition.containsKey(qualKey) ) {
Pair<Long,Long> transitionPair = qualByIsTransition.get(qualKey);
transitionPair.set(transitionPair.getFirst() + numTransition, transitionPair.getSecond() + numTransversion);
qualByIsTransition.put(qualKey, transitionPair);
} else {
qualByIsTransition.put(qualKey, new Pair<Long,Long>(numTransition,numTransversion));
}
}
public void organizeTiTvTables() {
@ -101,26 +110,24 @@ public class VariantQualityScore extends VariantEvaluator {
titvByQuality[iii] = 0.0;
}
double maxQual = 0.0;
int maxQual = 0;
// Calculate the maximum quality score in order to normalize and histogram
for( final Double qual : qualities ) {
for( final Integer qual : qualByIsTransition.keySet() ) {
if( qual > maxQual ) {
maxQual = qual;
}
}
final double binSize = maxQual / ((double) (NUM_BINS-1));
final double binSize = ((double)maxQual) / ((double) (NUM_BINS-1));
int jjj = 0;
for( final Double qual : qualities ) {
final int index = (int)Math.floor( qual / binSize );
if(isTransition.get(jjj)) {
transitionByQuality[index]++;
} else {
transversionByQuality[index]++;
for( final Integer qual : qualByIsTransition.keySet() ) {
final int index = (int)Math.floor( ((double) qual) / binSize );
if( index >= 0 ) { // BUGBUG: why is there overflow here?
Pair<Long,Long> transitionPair = qualByIsTransition.get(qual);
transitionByQuality[index] += transitionPair.getFirst();
transversionByQuality[index] += transitionPair.getSecond();
}
jjj++;
}
for( int iii = 0; iii < NUM_BINS; iii++ ) {
@ -134,89 +141,71 @@ public class VariantQualityScore extends VariantEvaluator {
}
}
/*
class AlleleCountStats implements TableType {
final HashMap<Integer, ArrayList<Double>> qualityListMap = new HashMap<Integer, ArrayList<Double>>();
final HashMap<Integer, Double> qualityMap = new HashMap<Integer, Double>();
public Object[] getRowKeys() {
return new String[]{"sample"};
public Object[] getRowKeys() {
final int NUM_BINS = qualityListMap.keySet().size();
final String rowKeys[] = new String[NUM_BINS];
int iii = 0;
for( final Integer key : qualityListMap.keySet() ) {
rowKeys[iii] = "AC" + key;
iii++;
}
return rowKeys;
}
public Object[] getColumnKeys() {
final int NUM_BINS = qualityListMap.keySet().size();
final String columnKeys[] = new String[NUM_BINS];
int iii = 0;
for( final Integer key : qualityListMap.keySet() ) {
columnKeys[iii] = "AC" + key;
iii++;
}
return columnKeys;
return new String[]{"alleleCount","avgQual"};
}
public String getName() {
return "TiTvStats";
return "AlleleCountStats";
}
public String getCell(int x, int y) {
return String.valueOf(titvByQuality[y]);
int iii = 0;
for( final Integer key : qualityListMap.keySet() ) {
if(iii == x) {
if(y == 0) { return String.valueOf(key); }
else { return String.valueOf(qualityMap.get(key)); }
}
iii++;
}
return null;
}
public String toString() {
String returnString = "";
// output the ti/tv array
returnString += "titvByQuality: ";
for( int iii = 0; iii < NUM_BINS; iii++ ) {
returnString += titvByQuality[iii] + " ";
}
// output the quality map
returnString += "AlleleCountStats: ";
//for( int iii = 0; iii < NUM_BINS; iii++ ) {
// returnString += titvByQuality[iii] + " ";
//}
return returnString;
}
public void incrValue( final double qual, final boolean _isTransition ) {
qualities.add(qual);
isTransition.add(_isTransition);
public void incrValue( final double qual, final int alleleCount ) {
ArrayList<Double> list = qualityListMap.get(alleleCount);
if(list==null) { list = new ArrayList<Double>(); }
list.add(qual);
qualityListMap.put(alleleCount, list);
}
public void organizeTiTvTables() {
for( int iii = 0; iii < NUM_BINS; iii++ ) {
transitionByQuality[iii] = 0L;
transversionByQuality[iii] = 0L;
titvByQuality[iii] = 0.0;
}
double maxQual = 0.0;
// Calculate the maximum quality score in order to normalize and histogram
for( final Double qual : qualities ) {
if( qual > maxQual ) {
maxQual = qual;
public void organizeAlleleCountTables() {
for( final Integer key : qualityListMap.keySet() ) {
final ArrayList<Double> list = qualityListMap.get(key);
double meanQual = 0.0;
final double numQuals = (double)list.size();
for( Double qual : list ) {
meanQual += qual / numQuals;
}
qualityMap.put(key, meanQual);
}
final double binSize = maxQual / ((double) (NUM_BINS-1));
int jjj = 0;
for( final Double qual : qualities ) {
final int index = (int)Math.floor( qual / binSize );
if(isTransition.get(jjj)) {
transitionByQuality[index]++;
} else {
transversionByQuality[index]++;
}
jjj++;
}
for( int iii = 0; iii < NUM_BINS; iii++ ) {
if( transitionByQuality[iii] + transversionByQuality[iii] > 800L ) { // need to have a sufficient number of variants to get a useful Ti/Tv ratio
titvByQuality[iii] = ((double) transitionByQuality[iii]) / ((double) transversionByQuality[iii]);
} else {
titvByQuality[iii] = 0.0;
}
}
}
}
*/
public VariantQualityScore(VariantEval2Walker parent) {
// don't do anything
@ -241,17 +230,27 @@ public class VariantQualityScore extends VariantEvaluator {
public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
final String interesting = null;
if(eval != null) {
if( eval != null && eval.isSNP() && eval.isBiallelic() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites)
if( titvStats == null ) { titvStats = new TiTvStats(); }
titvStats.incrValue(eval.getPhredScaledQual(), eval.isTransition());
if( alleleCountStats == null ) { alleleCountStats = new AlleleCountStats(); }
int alternateAlleleCount = 0;
for (final Allele a : eval.getAlternateAlleles()) {
alternateAlleleCount += eval.getChromosomeCount(a);
}
alleleCountStats.incrValue(eval.getPhredScaledQual(), alternateAlleleCount);
}
return interesting; // This module doesn't capture any interesting sites, so return null
}
public void finalizeEvaluation() {
if( titvStats!=null ) {
if( titvStats != null ) {
titvStats.organizeTiTvTables();
}
if( alleleCountStats != null ) {
alleleCountStats.organizeAlleleCountTables();
}
}
}

View File

@ -18,8 +18,8 @@ public class VariantEval2IntegrationTest extends WalkerTest {
@Test
public void testVE2Simple() {
HashMap<String, String> expectations = new HashMap<String, String>();
expectations.put("-L 1:1-10,000,000", "32d771b62dadc1c164f19bf3a1efc018");
expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0", "de42a239847e3466f4526e4781710ce7");
expectations.put("-L 1:1-10,000,000", "44fca2e43dade85ca16fa9e323e4cc9b");
expectations.put("-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 0", "6910a9a49625a8df2ad797d751be9ed6");
for ( Map.Entry<String, String> entry : expectations.entrySet() ) {
String extraArgs = entry.getKey();
@ -39,10 +39,10 @@ public class VariantEval2IntegrationTest extends WalkerTest {
" -B dbsnp_130,dbSNP," + GATKDataLocation + "dbsnp_130_b36.rod" +
" -B comp_hapmap,VCF," + validationDataLocation + "CEU_hapmap_nogt_23.vcf";
String eqMD5s = "0e961437bec3d4ad28b1511c0c45d13a"; // next two examples should be the same!
String eqMD5s = "a525597a34cd3fa84c75263dbd026acb"; // next two examples should be the same!
expectations.put("", eqMD5s);
expectations.put(" -known comp_hapmap -known dbsnp", eqMD5s);
expectations.put(" -known comp_hapmap", "139b10dc0ea968dc035798492ef4ef97");
expectations.put(" -known comp_hapmap", "35feda6c361408755e7f1dae245462d7");
for ( Map.Entry<String, String> entry : expectations.entrySet() ) {
String extraArgs2 = entry.getKey();
@ -60,7 +60,7 @@ public class VariantEval2IntegrationTest extends WalkerTest {
String extraArgs = "-L 1:1-10,000,000 -family NA19238+NA19239=NA19240 -MVQ 30";
WalkerTestSpec spec = new WalkerTestSpec( root + " " + extraArgs + " -o %s -outputVCF %s",
2,
Arrays.asList("3d4de331f90ef49792d3da3ee9509edc", "a3ce1d70d8ae3874807e9d61994d42af"));
Arrays.asList("0d2ac5ea23f8cd3237836080c722711b", "a3ce1d70d8ae3874807e9d61994d42af"));
executeTest("testVE2WriteVCF", spec);
}
}