ApplyVariantClusters now outputs interesting threshold points based on hitting the target novel TiTv
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3126 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
60c227d67f
commit
7b44e6bd55
|
|
@ -9,6 +9,7 @@ import org.broadinstitute.sting.playground.utils.report.tags.DataPoint;
|
|||
import org.broadinstitute.sting.playground.utils.report.utils.TableType;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
|
|
@ -47,6 +48,9 @@ public class VariantQualityScore extends VariantEvaluator {
|
|||
@DataPoint(name="TiTv by Quality", description = "the Ti/Tv ratio broken out by variant quality")
|
||||
TiTvStats titvStats = null;
|
||||
|
||||
//@DataPoint(name="Quality by Allele Count", description = "average variant quality for each allele count")
|
||||
//AlleleCountStats alleleCountStats = null;
|
||||
|
||||
class TiTvStats implements TableType {
|
||||
final int NUM_BINS = 20;
|
||||
final ArrayList<Double> qualities = new ArrayList<Double>(); // An ArrayList holds all the qualities until we are able to bin them appropriately
|
||||
|
|
@ -130,6 +134,90 @@ public class VariantQualityScore extends VariantEvaluator {
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
class AlleleCountStats implements TableType {
|
||||
final HashMap<Integer, ArrayList<Double>> qualityListMap = new HashMap<Integer, ArrayList<Double>>();
|
||||
final HashMap<Integer, Double> qualityMap = new HashMap<Integer, Double>();
|
||||
|
||||
public Object[] getRowKeys() {
|
||||
return new String[]{"sample"};
|
||||
}
|
||||
|
||||
public Object[] getColumnKeys() {
|
||||
final int NUM_BINS = qualityListMap.keySet().size();
|
||||
final String columnKeys[] = new String[NUM_BINS];
|
||||
int iii = 0;
|
||||
for( final Integer key : qualityListMap.keySet() ) {
|
||||
columnKeys[iii] = "AC" + key;
|
||||
iii++;
|
||||
}
|
||||
return columnKeys;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return "TiTvStats";
|
||||
}
|
||||
|
||||
public String getCell(int x, int y) {
|
||||
return String.valueOf(titvByQuality[y]);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
String returnString = "";
|
||||
// output the ti/tv array
|
||||
returnString += "titvByQuality: ";
|
||||
for( int iii = 0; iii < NUM_BINS; iii++ ) {
|
||||
returnString += titvByQuality[iii] + " ";
|
||||
}
|
||||
return returnString;
|
||||
}
|
||||
|
||||
public void incrValue( final double qual, final boolean _isTransition ) {
|
||||
qualities.add(qual);
|
||||
isTransition.add(_isTransition);
|
||||
}
|
||||
|
||||
public void organizeTiTvTables() {
|
||||
for( int iii = 0; iii < NUM_BINS; iii++ ) {
|
||||
transitionByQuality[iii] = 0L;
|
||||
transversionByQuality[iii] = 0L;
|
||||
titvByQuality[iii] = 0.0;
|
||||
}
|
||||
|
||||
double maxQual = 0.0;
|
||||
|
||||
// Calculate the maximum quality score in order to normalize and histogram
|
||||
for( final Double qual : qualities ) {
|
||||
if( qual > maxQual ) {
|
||||
maxQual = qual;
|
||||
}
|
||||
}
|
||||
|
||||
final double binSize = maxQual / ((double) (NUM_BINS-1));
|
||||
|
||||
int jjj = 0;
|
||||
for( final Double qual : qualities ) {
|
||||
final int index = (int)Math.floor( qual / binSize );
|
||||
if(isTransition.get(jjj)) {
|
||||
transitionByQuality[index]++;
|
||||
} else {
|
||||
transversionByQuality[index]++;
|
||||
}
|
||||
jjj++;
|
||||
}
|
||||
|
||||
for( int iii = 0; iii < NUM_BINS; iii++ ) {
|
||||
if( transitionByQuality[iii] + transversionByQuality[iii] > 800L ) { // need to have a sufficient number of variants to get a useful Ti/Tv ratio
|
||||
titvByQuality[iii] = ((double) transitionByQuality[iii]) / ((double) transversionByQuality[iii]);
|
||||
} else {
|
||||
titvByQuality[iii] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
public VariantQualityScore(VariantEval2Walker parent) {
|
||||
// don't do anything
|
||||
}
|
||||
|
|
|
|||
|
|
@ -53,7 +53,7 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
|
|||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.1 for whole genome experiments)", required=false)
|
||||
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.1 for whole genome experiments)", required=true)
|
||||
private double TARGET_TITV = 2.1;
|
||||
@Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by spreading out the Gaussians.", required=false)
|
||||
private double BACKOFF_FACTOR = 1.0;
|
||||
|
|
@ -99,7 +99,7 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
|
|||
|
||||
switch (OPTIMIZATION_MODEL) {
|
||||
case GAUSSIAN_MIXTURE_MODEL:
|
||||
theModel = new VariantGaussianMixtureModel( CLUSTER_FILENAME, BACKOFF_FACTOR );
|
||||
theModel = new VariantGaussianMixtureModel( TARGET_TITV, CLUSTER_FILENAME, BACKOFF_FACTOR );
|
||||
break;
|
||||
//case K_NEAREST_NEIGHBORS:
|
||||
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
|
||||
|
|
|
|||
|
|
@ -85,8 +85,8 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
knownAlphaFactor = _knownAlphaFactor;
|
||||
}
|
||||
|
||||
public VariantGaussianMixtureModel( final String clusterFileName, final double backOffGaussianFactor ) {
|
||||
super( 0.0 );
|
||||
public VariantGaussianMixtureModel( final double _targetTITV, final String clusterFileName, final double backOffGaussianFactor ) {
|
||||
super( _targetTITV );
|
||||
final ExpandingArrayList<String> annotationLines = new ExpandingArrayList<String>();
|
||||
final ExpandingArrayList<String> clusterLines = new ExpandingArrayList<String>();
|
||||
|
||||
|
|
@ -583,6 +583,16 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
final int numVariants = data.length;
|
||||
final boolean[] markedVariant = new boolean[numVariants];
|
||||
|
||||
final double MAX_QUAL = 100.0;
|
||||
final double QUAL_STEP = 0.1;
|
||||
final int NUM_BINS = (int) ((MAX_QUAL / QUAL_STEP) + 1);
|
||||
|
||||
final int numKnownAtCut[] = new int[NUM_BINS];
|
||||
final int numNovelAtCut[] = new int[NUM_BINS];
|
||||
final double knownTiTvAtCut[] = new double[NUM_BINS];
|
||||
final double novelTiTvAtCut[] = new double[NUM_BINS];
|
||||
final double theCut[] = new double[NUM_BINS];
|
||||
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
markedVariant[iii] = false;
|
||||
}
|
||||
|
|
@ -601,11 +611,12 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
int numNovelTi = 0;
|
||||
int numNovelTv = 0;
|
||||
boolean foundDesiredNumVariants = false;
|
||||
int jjj = 0;
|
||||
outputFile.println("pCut,numKnown,numNovel,knownTITV,novelTITV");
|
||||
for( double pCut = 100.0; pCut >= 0.0; pCut -= 0.1 ) {
|
||||
for( double qCut = MAX_QUAL; qCut >= 0.0; qCut -= QUAL_STEP ) {
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
if( !markedVariant[iii] ) {
|
||||
if( data[iii].qual >= pCut ) {
|
||||
if( data[iii].qual >= qCut ) {
|
||||
markedVariant[iii] = true;
|
||||
if( data[iii].isKnown ) { // known
|
||||
numKnown++;
|
||||
|
|
@ -626,7 +637,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
}
|
||||
}
|
||||
if( desiredNumVariants != 0 && !foundDesiredNumVariants && (numKnown + numNovel) >= desiredNumVariants ) {
|
||||
System.out.println( "Keeping variants with QUAL >= " + String.format("%.1f",pCut) + " results in a filtered set with: " );
|
||||
System.out.println( "Keeping variants with QUAL >= " + String.format("%.1f",qCut) + " results in a filtered set with: " );
|
||||
System.out.println("\t" + numKnown + " known variants");
|
||||
System.out.println("\t" + numNovel + " novel variants, (dbSNP rate = " + String.format("%.2f",((double) numKnown * 100.0) / ((double) numKnown + numNovel) ) + "%)");
|
||||
System.out.println("\t" + String.format("%.4f known Ti/Tv ratio", ((double)numKnownTi) / ((double)numKnownTv)));
|
||||
|
|
@ -634,9 +645,55 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
System.out.println();
|
||||
foundDesiredNumVariants = true;
|
||||
}
|
||||
outputFile.println( pCut + "," + numKnown + "," + numNovel + "," +
|
||||
outputFile.println( qCut + "," + numKnown + "," + numNovel + "," +
|
||||
( numKnownTi == 0 || numKnownTv == 0 ? "NaN" : ( ((double)numKnownTi) / ((double)numKnownTv) ) ) + "," +
|
||||
( numNovelTi == 0 || numNovelTv == 0 ? "NaN" : ( ((double)numNovelTi) / ((double)numNovelTv) )));
|
||||
( numNovelTi == 0 || numNovelTv == 0 ? "NaN" : ( ((double)numNovelTi) / ((double)numNovelTv) ) ));
|
||||
|
||||
numKnownAtCut[jjj] = numKnown;
|
||||
numNovelAtCut[jjj] = numNovel;
|
||||
knownTiTvAtCut[jjj] = ( numKnownTi == 0 || numKnownTv == 0 ? 0.0 : ( ((double)numKnownTi) / ((double)numKnownTv) ) );
|
||||
novelTiTvAtCut[jjj] = ( numNovelTi == 0 || numNovelTv == 0 ? 0.0 : ( ((double)numNovelTi) / ((double)numNovelTv) ) );
|
||||
theCut[jjj] = qCut;
|
||||
jjj++;
|
||||
}
|
||||
|
||||
// loop back through the data points looking for appropriate places to cut the data to get the target novel titv ratio
|
||||
int checkQuantile = 0;
|
||||
for( jjj = NUM_BINS-1; jjj >= 0; jjj-- ) {
|
||||
boolean foundCut = false;
|
||||
if( checkQuantile == 0 ) {
|
||||
if( novelTiTvAtCut[jjj] >= 0.9 * targetTITV ) {
|
||||
foundCut = true;
|
||||
checkQuantile++;
|
||||
}
|
||||
} else if( checkQuantile == 1 ) {
|
||||
if( novelTiTvAtCut[jjj] >= 0.95 * targetTITV ) {
|
||||
foundCut = true;
|
||||
checkQuantile++;
|
||||
}
|
||||
} else if( checkQuantile == 2 ) {
|
||||
if( novelTiTvAtCut[jjj] >= 0.98 * targetTITV ) {
|
||||
foundCut = true;
|
||||
checkQuantile++;
|
||||
}
|
||||
} else if( checkQuantile == 3 ) {
|
||||
if( novelTiTvAtCut[jjj] >= targetTITV ) {
|
||||
foundCut = true;
|
||||
checkQuantile++;
|
||||
}
|
||||
} else if( checkQuantile == 4 ) {
|
||||
break; // break out
|
||||
}
|
||||
|
||||
if( foundCut ) {
|
||||
System.out.println( "Keeping variants with QUAL >= " + String.format("%.1f",theCut[jjj]) + " results in a filtered set with: " );
|
||||
System.out.println("\t" + numKnownAtCut[jjj] + " known variants");
|
||||
System.out.println("\t" + numNovelAtCut[jjj] + " novel variants, (dbSNP rate = " +
|
||||
String.format("%.2f",((double) numKnownAtCut[jjj] * 100.0) / ((double) numKnownAtCut[jjj] + numNovelAtCut[jjj]) ) + "%)");
|
||||
System.out.println("\t" + String.format("%.4f known Ti/Tv ratio", knownTiTvAtCut[jjj]));
|
||||
System.out.println("\t" + String.format("%.4f novel Ti/Tv ratio", novelTiTvAtCut[jjj]));
|
||||
System.out.println();
|
||||
}
|
||||
}
|
||||
|
||||
outputFile.close();
|
||||
|
|
|
|||
Loading…
Reference in New Issue