diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantQualityScore.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantQualityScore.java index a89bae98b..9d7bbad49 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantQualityScore.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantQualityScore.java @@ -9,6 +9,7 @@ import org.broadinstitute.sting.playground.utils.report.tags.DataPoint; import org.broadinstitute.sting.playground.utils.report.utils.TableType; import java.util.ArrayList; +import java.util.HashMap; /* * Copyright (c) 2010 The Broad Institute @@ -47,6 +48,9 @@ public class VariantQualityScore extends VariantEvaluator { @DataPoint(name="TiTv by Quality", description = "the Ti/Tv ratio broken out by variant quality") TiTvStats titvStats = null; + //@DataPoint(name="Quality by Allele Count", description = "average variant quality for each allele count") + //AlleleCountStats alleleCountStats = null; + class TiTvStats implements TableType { final int NUM_BINS = 20; final ArrayList qualities = new ArrayList(); // An ArrayList holds all the qualities until we are able to bin them appropriately @@ -130,6 +134,90 @@ public class VariantQualityScore extends VariantEvaluator { } } + /* + class AlleleCountStats implements TableType { + final HashMap> qualityListMap = new HashMap>(); + final HashMap qualityMap = new HashMap(); + + public Object[] getRowKeys() { + return new String[]{"sample"}; + } + + public Object[] getColumnKeys() { + final int NUM_BINS = qualityListMap.keySet().size(); + final String columnKeys[] = new String[NUM_BINS]; + int iii = 0; + for( final Integer key : qualityListMap.keySet() ) { + columnKeys[iii] = "AC" + key; + iii++; + } + return columnKeys; + } + + public String getName() { + return "TiTvStats"; + } + + public String getCell(int x, int y) { + return String.valueOf(titvByQuality[y]); + } + + public String toString() { + String returnString = ""; + // output the ti/tv array + returnString += "titvByQuality: "; + for( int iii = 0; iii < NUM_BINS; iii++ ) { + returnString += titvByQuality[iii] + " "; + } + return returnString; + } + + public void incrValue( final double qual, final boolean _isTransition ) { + qualities.add(qual); + isTransition.add(_isTransition); + } + + public void organizeTiTvTables() { + for( int iii = 0; iii < NUM_BINS; iii++ ) { + transitionByQuality[iii] = 0L; + transversionByQuality[iii] = 0L; + titvByQuality[iii] = 0.0; + } + + double maxQual = 0.0; + + // Calculate the maximum quality score in order to normalize and histogram + for( final Double qual : qualities ) { + if( qual > maxQual ) { + maxQual = qual; + } + } + + final double binSize = maxQual / ((double) (NUM_BINS-1)); + + int jjj = 0; + for( final Double qual : qualities ) { + final int index = (int)Math.floor( qual / binSize ); + if(isTransition.get(jjj)) { + transitionByQuality[index]++; + } else { + transversionByQuality[index]++; + } + jjj++; + } + + for( int iii = 0; iii < NUM_BINS; iii++ ) { + if( transitionByQuality[iii] + transversionByQuality[iii] > 800L ) { // need to have a sufficient number of variants to get a useful Ti/Tv ratio + titvByQuality[iii] = ((double) transitionByQuality[iii]) / ((double) transversionByQuality[iii]); + } else { + titvByQuality[iii] = 0.0; + } + } + + } + } + */ + public VariantQualityScore(VariantEval2Walker parent) { // don't do anything } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java index 8cb6d7f49..fc681fc59 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java @@ -53,7 +53,7 @@ public class ApplyVariantClustersWalker extends RodWalker annotationLines = new ExpandingArrayList(); final ExpandingArrayList clusterLines = new ExpandingArrayList(); @@ -583,6 +583,16 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel final int numVariants = data.length; final boolean[] markedVariant = new boolean[numVariants]; + final double MAX_QUAL = 100.0; + final double QUAL_STEP = 0.1; + final int NUM_BINS = (int) ((MAX_QUAL / QUAL_STEP) + 1); + + final int numKnownAtCut[] = new int[NUM_BINS]; + final int numNovelAtCut[] = new int[NUM_BINS]; + final double knownTiTvAtCut[] = new double[NUM_BINS]; + final double novelTiTvAtCut[] = new double[NUM_BINS]; + final double theCut[] = new double[NUM_BINS]; + for( int iii = 0; iii < numVariants; iii++ ) { markedVariant[iii] = false; } @@ -601,11 +611,12 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel int numNovelTi = 0; int numNovelTv = 0; boolean foundDesiredNumVariants = false; + int jjj = 0; outputFile.println("pCut,numKnown,numNovel,knownTITV,novelTITV"); - for( double pCut = 100.0; pCut >= 0.0; pCut -= 0.1 ) { + for( double qCut = MAX_QUAL; qCut >= 0.0; qCut -= QUAL_STEP ) { for( int iii = 0; iii < numVariants; iii++ ) { if( !markedVariant[iii] ) { - if( data[iii].qual >= pCut ) { + if( data[iii].qual >= qCut ) { markedVariant[iii] = true; if( data[iii].isKnown ) { // known numKnown++; @@ -626,7 +637,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel } } if( desiredNumVariants != 0 && !foundDesiredNumVariants && (numKnown + numNovel) >= desiredNumVariants ) { - System.out.println( "Keeping variants with QUAL >= " + String.format("%.1f",pCut) + " results in a filtered set with: " ); + System.out.println( "Keeping variants with QUAL >= " + String.format("%.1f",qCut) + " results in a filtered set with: " ); System.out.println("\t" + numKnown + " known variants"); System.out.println("\t" + numNovel + " novel variants, (dbSNP rate = " + String.format("%.2f",((double) numKnown * 100.0) / ((double) numKnown + numNovel) ) + "%)"); System.out.println("\t" + String.format("%.4f known Ti/Tv ratio", ((double)numKnownTi) / ((double)numKnownTv))); @@ -634,9 +645,55 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel System.out.println(); foundDesiredNumVariants = true; } - outputFile.println( pCut + "," + numKnown + "," + numNovel + "," + + outputFile.println( qCut + "," + numKnown + "," + numNovel + "," + ( numKnownTi == 0 || numKnownTv == 0 ? "NaN" : ( ((double)numKnownTi) / ((double)numKnownTv) ) ) + "," + - ( numNovelTi == 0 || numNovelTv == 0 ? "NaN" : ( ((double)numNovelTi) / ((double)numNovelTv) ))); + ( numNovelTi == 0 || numNovelTv == 0 ? "NaN" : ( ((double)numNovelTi) / ((double)numNovelTv) ) )); + + numKnownAtCut[jjj] = numKnown; + numNovelAtCut[jjj] = numNovel; + knownTiTvAtCut[jjj] = ( numKnownTi == 0 || numKnownTv == 0 ? 0.0 : ( ((double)numKnownTi) / ((double)numKnownTv) ) ); + novelTiTvAtCut[jjj] = ( numNovelTi == 0 || numNovelTv == 0 ? 0.0 : ( ((double)numNovelTi) / ((double)numNovelTv) ) ); + theCut[jjj] = qCut; + jjj++; + } + + // loop back through the data points looking for appropriate places to cut the data to get the target novel titv ratio + int checkQuantile = 0; + for( jjj = NUM_BINS-1; jjj >= 0; jjj-- ) { + boolean foundCut = false; + if( checkQuantile == 0 ) { + if( novelTiTvAtCut[jjj] >= 0.9 * targetTITV ) { + foundCut = true; + checkQuantile++; + } + } else if( checkQuantile == 1 ) { + if( novelTiTvAtCut[jjj] >= 0.95 * targetTITV ) { + foundCut = true; + checkQuantile++; + } + } else if( checkQuantile == 2 ) { + if( novelTiTvAtCut[jjj] >= 0.98 * targetTITV ) { + foundCut = true; + checkQuantile++; + } + } else if( checkQuantile == 3 ) { + if( novelTiTvAtCut[jjj] >= targetTITV ) { + foundCut = true; + checkQuantile++; + } + } else if( checkQuantile == 4 ) { + break; // break out + } + + if( foundCut ) { + System.out.println( "Keeping variants with QUAL >= " + String.format("%.1f",theCut[jjj]) + " results in a filtered set with: " ); + System.out.println("\t" + numKnownAtCut[jjj] + " known variants"); + System.out.println("\t" + numNovelAtCut[jjj] + " novel variants, (dbSNP rate = " + + String.format("%.2f",((double) numKnownAtCut[jjj] * 100.0) / ((double) numKnownAtCut[jjj] + numNovelAtCut[jjj]) ) + "%)"); + System.out.println("\t" + String.format("%.4f known Ti/Tv ratio", knownTiTvAtCut[jjj])); + System.out.println("\t" + String.format("%.4f novel Ti/Tv ratio", novelTiTvAtCut[jjj])); + System.out.println(); + } } outputFile.close();