ApplyVariantClusters now outputs interesting threshold points based on hitting the target novel TiTv

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3126 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-04-06 19:47:29 +00:00
parent 60c227d67f
commit 7b44e6bd55
3 changed files with 154 additions and 9 deletions

View File

@ -9,6 +9,7 @@ import org.broadinstitute.sting.playground.utils.report.tags.DataPoint;
import org.broadinstitute.sting.playground.utils.report.utils.TableType;
import java.util.ArrayList;
import java.util.HashMap;
/*
* Copyright (c) 2010 The Broad Institute
@ -47,6 +48,9 @@ public class VariantQualityScore extends VariantEvaluator {
@DataPoint(name="TiTv by Quality", description = "the Ti/Tv ratio broken out by variant quality")
TiTvStats titvStats = null;
//@DataPoint(name="Quality by Allele Count", description = "average variant quality for each allele count")
//AlleleCountStats alleleCountStats = null;
class TiTvStats implements TableType {
final int NUM_BINS = 20;
final ArrayList<Double> qualities = new ArrayList<Double>(); // An ArrayList holds all the qualities until we are able to bin them appropriately
@ -130,6 +134,90 @@ public class VariantQualityScore extends VariantEvaluator {
}
}
/*
class AlleleCountStats implements TableType {
final HashMap<Integer, ArrayList<Double>> qualityListMap = new HashMap<Integer, ArrayList<Double>>();
final HashMap<Integer, Double> qualityMap = new HashMap<Integer, Double>();
public Object[] getRowKeys() {
return new String[]{"sample"};
}
public Object[] getColumnKeys() {
final int NUM_BINS = qualityListMap.keySet().size();
final String columnKeys[] = new String[NUM_BINS];
int iii = 0;
for( final Integer key : qualityListMap.keySet() ) {
columnKeys[iii] = "AC" + key;
iii++;
}
return columnKeys;
}
public String getName() {
return "TiTvStats";
}
public String getCell(int x, int y) {
return String.valueOf(titvByQuality[y]);
}
public String toString() {
String returnString = "";
// output the ti/tv array
returnString += "titvByQuality: ";
for( int iii = 0; iii < NUM_BINS; iii++ ) {
returnString += titvByQuality[iii] + " ";
}
return returnString;
}
public void incrValue( final double qual, final boolean _isTransition ) {
qualities.add(qual);
isTransition.add(_isTransition);
}
public void organizeTiTvTables() {
for( int iii = 0; iii < NUM_BINS; iii++ ) {
transitionByQuality[iii] = 0L;
transversionByQuality[iii] = 0L;
titvByQuality[iii] = 0.0;
}
double maxQual = 0.0;
// Calculate the maximum quality score in order to normalize and histogram
for( final Double qual : qualities ) {
if( qual > maxQual ) {
maxQual = qual;
}
}
final double binSize = maxQual / ((double) (NUM_BINS-1));
int jjj = 0;
for( final Double qual : qualities ) {
final int index = (int)Math.floor( qual / binSize );
if(isTransition.get(jjj)) {
transitionByQuality[index]++;
} else {
transversionByQuality[index]++;
}
jjj++;
}
for( int iii = 0; iii < NUM_BINS; iii++ ) {
if( transitionByQuality[iii] + transversionByQuality[iii] > 800L ) { // need to have a sufficient number of variants to get a useful Ti/Tv ratio
titvByQuality[iii] = ((double) transitionByQuality[iii]) / ((double) transversionByQuality[iii]);
} else {
titvByQuality[iii] = 0.0;
}
}
}
}
*/
public VariantQualityScore(VariantEval2Walker parent) {
// don't do anything
}

View File

@ -53,7 +53,7 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
/////////////////////////////
// Command Line Arguments
/////////////////////////////
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.1 for whole genome experiments)", required=false)
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.1 for whole genome experiments)", required=true)
private double TARGET_TITV = 2.1;
@Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by spreading out the Gaussians.", required=false)
private double BACKOFF_FACTOR = 1.0;
@ -99,7 +99,7 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
switch (OPTIMIZATION_MODEL) {
case GAUSSIAN_MIXTURE_MODEL:
theModel = new VariantGaussianMixtureModel( CLUSTER_FILENAME, BACKOFF_FACTOR );
theModel = new VariantGaussianMixtureModel( TARGET_TITV, CLUSTER_FILENAME, BACKOFF_FACTOR );
break;
//case K_NEAREST_NEIGHBORS:
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );

View File

@ -85,8 +85,8 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
knownAlphaFactor = _knownAlphaFactor;
}
public VariantGaussianMixtureModel( final String clusterFileName, final double backOffGaussianFactor ) {
super( 0.0 );
public VariantGaussianMixtureModel( final double _targetTITV, final String clusterFileName, final double backOffGaussianFactor ) {
super( _targetTITV );
final ExpandingArrayList<String> annotationLines = new ExpandingArrayList<String>();
final ExpandingArrayList<String> clusterLines = new ExpandingArrayList<String>();
@ -583,6 +583,16 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
final int numVariants = data.length;
final boolean[] markedVariant = new boolean[numVariants];
final double MAX_QUAL = 100.0;
final double QUAL_STEP = 0.1;
final int NUM_BINS = (int) ((MAX_QUAL / QUAL_STEP) + 1);
final int numKnownAtCut[] = new int[NUM_BINS];
final int numNovelAtCut[] = new int[NUM_BINS];
final double knownTiTvAtCut[] = new double[NUM_BINS];
final double novelTiTvAtCut[] = new double[NUM_BINS];
final double theCut[] = new double[NUM_BINS];
for( int iii = 0; iii < numVariants; iii++ ) {
markedVariant[iii] = false;
}
@ -601,11 +611,12 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
int numNovelTi = 0;
int numNovelTv = 0;
boolean foundDesiredNumVariants = false;
int jjj = 0;
outputFile.println("pCut,numKnown,numNovel,knownTITV,novelTITV");
for( double pCut = 100.0; pCut >= 0.0; pCut -= 0.1 ) {
for( double qCut = MAX_QUAL; qCut >= 0.0; qCut -= QUAL_STEP ) {
for( int iii = 0; iii < numVariants; iii++ ) {
if( !markedVariant[iii] ) {
if( data[iii].qual >= pCut ) {
if( data[iii].qual >= qCut ) {
markedVariant[iii] = true;
if( data[iii].isKnown ) { // known
numKnown++;
@ -626,7 +637,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
}
if( desiredNumVariants != 0 && !foundDesiredNumVariants && (numKnown + numNovel) >= desiredNumVariants ) {
System.out.println( "Keeping variants with QUAL >= " + String.format("%.1f",pCut) + " results in a filtered set with: " );
System.out.println( "Keeping variants with QUAL >= " + String.format("%.1f",qCut) + " results in a filtered set with: " );
System.out.println("\t" + numKnown + " known variants");
System.out.println("\t" + numNovel + " novel variants, (dbSNP rate = " + String.format("%.2f",((double) numKnown * 100.0) / ((double) numKnown + numNovel) ) + "%)");
System.out.println("\t" + String.format("%.4f known Ti/Tv ratio", ((double)numKnownTi) / ((double)numKnownTv)));
@ -634,9 +645,55 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
System.out.println();
foundDesiredNumVariants = true;
}
outputFile.println( pCut + "," + numKnown + "," + numNovel + "," +
outputFile.println( qCut + "," + numKnown + "," + numNovel + "," +
( numKnownTi == 0 || numKnownTv == 0 ? "NaN" : ( ((double)numKnownTi) / ((double)numKnownTv) ) ) + "," +
( numNovelTi == 0 || numNovelTv == 0 ? "NaN" : ( ((double)numNovelTi) / ((double)numNovelTv) )));
( numNovelTi == 0 || numNovelTv == 0 ? "NaN" : ( ((double)numNovelTi) / ((double)numNovelTv) ) ));
numKnownAtCut[jjj] = numKnown;
numNovelAtCut[jjj] = numNovel;
knownTiTvAtCut[jjj] = ( numKnownTi == 0 || numKnownTv == 0 ? 0.0 : ( ((double)numKnownTi) / ((double)numKnownTv) ) );
novelTiTvAtCut[jjj] = ( numNovelTi == 0 || numNovelTv == 0 ? 0.0 : ( ((double)numNovelTi) / ((double)numNovelTv) ) );
theCut[jjj] = qCut;
jjj++;
}
// loop back through the data points looking for appropriate places to cut the data to get the target novel titv ratio
int checkQuantile = 0;
for( jjj = NUM_BINS-1; jjj >= 0; jjj-- ) {
boolean foundCut = false;
if( checkQuantile == 0 ) {
if( novelTiTvAtCut[jjj] >= 0.9 * targetTITV ) {
foundCut = true;
checkQuantile++;
}
} else if( checkQuantile == 1 ) {
if( novelTiTvAtCut[jjj] >= 0.95 * targetTITV ) {
foundCut = true;
checkQuantile++;
}
} else if( checkQuantile == 2 ) {
if( novelTiTvAtCut[jjj] >= 0.98 * targetTITV ) {
foundCut = true;
checkQuantile++;
}
} else if( checkQuantile == 3 ) {
if( novelTiTvAtCut[jjj] >= targetTITV ) {
foundCut = true;
checkQuantile++;
}
} else if( checkQuantile == 4 ) {
break; // break out
}
if( foundCut ) {
System.out.println( "Keeping variants with QUAL >= " + String.format("%.1f",theCut[jjj]) + " results in a filtered set with: " );
System.out.println("\t" + numKnownAtCut[jjj] + " known variants");
System.out.println("\t" + numNovelAtCut[jjj] + " novel variants, (dbSNP rate = " +
String.format("%.2f",((double) numKnownAtCut[jjj] * 100.0) / ((double) numKnownAtCut[jjj] + numNovelAtCut[jjj]) ) + "%)");
System.out.println("\t" + String.format("%.4f known Ti/Tv ratio", knownTiTvAtCut[jjj]));
System.out.println("\t" + String.format("%.4f novel Ti/Tv ratio", novelTiTvAtCut[jjj]));
System.out.println();
}
}
outputFile.close();