Removed the StingException when mkdir fails for Sendu in AnalyzeCovariates. Incremental updates to VariantOptimizer.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3013 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-03-16 19:45:02 +00:00
parent 2525ecaa43
commit 933823c8bc
7 changed files with 132 additions and 68 deletions

View File

@ -0,0 +1,33 @@
#!/broad/tools/apps/R-2.6.0/bin/Rscript
# Plot the variant optimization curve: Ti/Tv ratio (left axis) and number of
# variants retained (right axis) as a function of the p(true) cutoff, with the
# target Ti/Tv ratio drawn as a reference line.
#
# Usage: plot_OptimizationCurve.R <input.dat> <targetTITV>
#   input.dat  - comma-separated table with header columns:
#                pCut, numKnown, numNovel, knownTITV, novelTITV
#   targetTITV - the target Ti/Tv ratio towards which optimization was run
args <- commandArgs(TRUE)
input <- args[1]
targetTITV <- as.numeric(args[2])

data <- read.table(input, sep = ",", header = TRUE)

# Axis limits: only trust Ti/Tv values backed by more than 2000 variants, and
# drop non-finite ratios (e.g. division by zero when no transversions exist).
maxVars <- max(data$numKnown, data$numNovel)
maxTITV <- max(data$knownTITV[is.finite(data$knownTITV) & data$numKnown > 2000],
               data$novelTITV[is.finite(data$novelTITV) & data$numNovel > 2000],
               targetTITV)
# The last row (lowest cutoff, i.e. most variants kept) bounds the ratio below.
minTITV <- min(data$knownTITV[length(data$knownTITV)],
               data$novelTITV[length(data$novelTITV)],
               targetTITV)

outfile <- paste(input, ".optimizationCurve.pdf", sep = "")
pdf(outfile, height = 7, width = 8)
par(mar = c(4, 4, 1, 4), cex = 1.3)

# Left axis: Ti/Tv ratio of known and novel variants vs. the p(true) cutoff.
plot(data$pCut, data$knownTITV, axes = FALSE,
     xlab = "Keep variants with p(true) > X", ylab = "",
     ylim = c(minTITV, maxTITV), col = "Blue", pch = 20)
points(data$pCut, data$novelTITV, col = "DarkBlue", pch = 20)
abline(h = targetTITV, lty = 3, col = "Blue")  # target Ti/Tv reference line
axis(side = 2, col = "DarkBlue")
axis(side = 1)
mtext("Ti/Tv Ratio", side = 2, line = 2, col = "blue", cex = 1.4)
legend("left", c("Known Ti/Tv", "Novel Ti/Tv"),
       col = c("Blue", "DarkBlue"), pch = c(20, 20), cex = 0.7)

# Right axis (overlaid plot): number of variants retained at each cutoff.
par(new = TRUE)
plot(data$pCut, data$numKnown, axes = FALSE, xlab = "", ylab = "",
     ylim = c(0, maxVars), col = "Green", pch = 20)
points(data$pCut, data$numNovel, col = "DarkGreen", pch = 20)
axis(side = 4, col = "DarkGreen")
mtext("Number of Variants", side = 4, line = 2, col = "DarkGreen", cex = 1.4)
legend("topright", c("Known", "Novel"),
       col = c("Green", "DarkGreen"), pch = c(20, 20), cex = 0.7)

dev.off()

View File

@ -84,7 +84,8 @@ class AnalyzeCovariatesCLP extends CommandLineProgram {
try { try {
Process p = Runtime.getRuntime().exec("mkdir " + OUTPUT_DIR); Process p = Runtime.getRuntime().exec("mkdir " + OUTPUT_DIR);
} catch (IOException e) { } catch (IOException e) {
throw new RuntimeException("Couldn't create directory: " + OUTPUT_DIR); System.out.println("Couldn't create directory: " + OUTPUT_DIR);
System.out.println("User is responsible for making sure the output directory exists.");
} }
if( !OUTPUT_DIR.endsWith("/") ) { OUTPUT_DIR = OUTPUT_DIR + "/"; } if( !OUTPUT_DIR.endsWith("/") ) { OUTPUT_DIR = OUTPUT_DIR + "/"; }
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; } if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }

View File

@ -3,6 +3,8 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
import org.broadinstitute.sting.utils.ExpandingArrayList; import org.broadinstitute.sting.utils.ExpandingArrayList;
import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.StingException;
import java.io.PrintStream;
/* /*
* Copyright (c) 2010 The Broad Institute * Copyright (c) 2010 The Broad Institute
* *
@ -39,6 +41,7 @@ public class VariantDataManager {
public final int numVariants; public final int numVariants;
public final int numAnnotations; public final int numAnnotations;
public final double[] varianceVector; public final double[] varianceVector;
public final double[] meanVector;
public boolean isNormalized; public boolean isNormalized;
private final ExpandingArrayList<String> annotationKeys; private final ExpandingArrayList<String> annotationKeys;
@ -52,6 +55,7 @@ public class VariantDataManager {
if( numAnnotations <= 0 ) { if( numAnnotations <= 0 ) {
throw new StingException( "There are zero annotations! (or possibly a problem with integer overflow)" ); throw new StingException( "There are zero annotations! (or possibly a problem with integer overflow)" );
} }
meanVector = new double[numAnnotations];
varianceVector = new double[numAnnotations]; varianceVector = new double[numAnnotations];
isNormalized = false; isNormalized = false;
annotationKeys = _annotationKeys; annotationKeys = _annotationKeys;
@ -68,7 +72,7 @@ public class VariantDataManager {
if( theSTD < 1E-8 ) { if( theSTD < 1E-8 ) {
throw new StingException("Zero variance is a problem: standard deviation = " + theSTD); throw new StingException("Zero variance is a problem: standard deviation = " + theSTD);
} }
meanVector[jjj] = theMean;
varianceVector[jjj] = theSTD * theSTD; varianceVector[jjj] = theSTD * theSTD;
for( int iii = 0; iii < numVariants; iii++ ) { for( int iii = 0; iii < numVariants; iii++ ) {
data[iii].annotations[jjj] = ( data[iii].annotations[jjj] - theMean ) / theSTD; data[iii].annotations[jjj] = ( data[iii].annotations[jjj] - theMean ) / theSTD;
@ -95,4 +99,10 @@ public class VariantDataManager {
return Math.sqrt( sum ); return Math.sqrt( sum );
} }
// public void printClusterFileHeader( PrintStream outputFile ) {
// for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
// outputFile.println("@ANNOTATION," + (jjj == numAnnotations-1 ? "QUAL" : annotationKeys.get(jjj)) + "," + meanVector[jjj] + "," + varianceVector[jjj]);
// }
// }
} }

View File

@ -36,5 +36,4 @@ public class VariantDatum {
public double[] originalAnnotations; public double[] originalAnnotations;
public boolean isTransition; public boolean isTransition;
public boolean isKnown; public boolean isKnown;
public boolean isFiltered;
} }

View File

@ -43,13 +43,13 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
private final double MIN_PROB = 1E-30; private final double MIN_PROB = 1E-30;
private final double MIN_SUM_PROB = 1E-20; private final double MIN_SUM_PROB = 1E-20;
private final double[][] mu; private final double[][] mu; // The means for the clusters
private final double[][] sigma; private final double[][] sigma; // The variances for the clusters, sigma is really sigma^2
private final double[] pCluster; private final double[] pCluster;
private final int[] numMaxClusterKnown; private final int[] numMaxClusterKnown;
private final int[] numMaxClusterNovel; private final int[] numMaxClusterNovel;
private final double[] clusterTITV; private final double[] clusterTITV;
private final double[] clusterTruePositiveRate; private final double[] clusterTruePositiveRate; // The true positive rate implied by the cluster's Ti/Tv ratio
public VariantGaussianMixtureModel( VariantDataManager _dataManager, final double _targetTITV, final int _numGaussians, final int _numIterations ) { public VariantGaussianMixtureModel( VariantDataManager _dataManager, final double _targetTITV, final int _numGaussians, final int _numIterations ) {
super( _dataManager, _targetTITV ); super( _dataManager, _targetTITV );
@ -76,15 +76,18 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
} }
VariantDatum[] data; VariantDatum[] data;
if( numNovel * 2 * 1.3 < dataManager.numVariants ) { // Grab a set of data that is all of the novel variants plus 1.5x as many known variants drawn at random
data = new VariantDatum[numNovel*2]; // If there are almost as many novels as known, simply use all the variants
final int numSubset = (int)Math.floor(numNovel*2.5);
if( numSubset * 1.3 < dataManager.numVariants ) {
data = new VariantDatum[numSubset];
int iii = 0; int iii = 0;
for( final VariantDatum datum : dataManager.data ) { for( final VariantDatum datum : dataManager.data ) {
if( !datum.isKnown ) { if( !datum.isKnown ) {
data[iii++] = datum; data[iii++] = datum;
} }
} }
while( iii < numNovel*2 ) { // grab an equal number of known variants at random while( iii < numSubset ) { // grab an equal number of known variants at random
final VariantDatum datum = dataManager.data[rand.nextInt(dataManager.numVariants)]; final VariantDatum datum = dataManager.data[rand.nextInt(dataManager.numVariants)];
if( datum.isKnown ) { if( datum.isKnown ) {
data[iii++] = datum; data[iii++] = datum;
@ -95,11 +98,11 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
} }
System.out.println("Clustering with " + numNovel + " novel variants and " + (data.length - numNovel) + " known variants..."); System.out.println("Clustering with " + numNovel + " novel variants and " + (data.length - numNovel) + " known variants...");
if( data.length == dataManager.numVariants ) { System.out.println(" (used all variants since 2*numNovel is so large compared to the full set) "); } if( data.length == dataManager.numVariants ) { System.out.println(" (used all variants since 2.5*numNovel is so large compared to the full set) "); }
createClusters( data ); // Using a subset of the data createClusters( data ); // Using a subset of the data
System.out.println("Printing out cluster parameters..."); System.out.println("Printing out cluster parameters...");
printClusters( outputPrefix ); printClusters( outputPrefix );
System.out.println("Applying clusters to all the variants..."); System.out.println("Applying clusters to all variants...");
applyClusters( dataManager.data, outputPrefix ); // Using all the data applyClusters( dataManager.data, outputPrefix ); // Using all the data
} }
@ -108,7 +111,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
final int numVariants = data.length; final int numVariants = data.length;
final int numAnnotations = data[0].annotations.length; final int numAnnotations = data[0].annotations.length;
final double[][] pVarInCluster = new double[numGaussians][numVariants]; final double[][] pVarInCluster = new double[numGaussians][numVariants]; // Probability that the variant is in that cluster = simply evaluate the multivariate Gaussian
final double[] probTi = new double[numGaussians]; final double[] probTi = new double[numGaussians];
final double[] probTv = new double[numGaussians]; final double[] probTv = new double[numGaussians];
@ -123,11 +126,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
numMaxClusterKnown[kkk] = 0; numMaxClusterKnown[kkk] = 0;
numMaxClusterNovel[kkk] = 0; numMaxClusterNovel[kkk] = 0;
pCluster[kkk] = 1.0 / ((double) numGaussians); pCluster[kkk] = 1.0 / ((double) numGaussians);
//final double[] randMu = new double[numAnnotations]; mu[kkk] = data[rand.nextInt(numVariants)].annotations;
//for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
// randMu[jjj] = data[rand.nextInt(numVariants)].annotations[jjj];
//}
mu[kkk] = data[rand.nextInt(numVariants)].annotations; //randMu;
final double[] randSigma = new double[numAnnotations]; final double[] randSigma = new double[numAnnotations];
if( dataManager.isNormalized ) { if( dataManager.isNormalized ) {
for( int jjj = 0; jjj < numAnnotations; jjj++ ) { for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
@ -141,6 +140,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
sigma[kkk] = randSigma; sigma[kkk] = randSigma;
} }
// The EM loop
for( int ttt = 0; ttt < numIterations; ttt++ ) { for( int ttt = 0; ttt < numIterations; ttt++ ) {
///////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -165,6 +165,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
probTi[kkk] = 0.0; probTi[kkk] = 0.0;
probTv[kkk] = 0.0; probTv[kkk] = 0.0;
} }
// Use the cluster's probabilistic Ti/Tv ratio as the indication of the cluster's true positive rate
for( int iii = 0; iii < numVariants; iii++ ) { for( int iii = 0; iii < numVariants; iii++ ) {
if( data[iii].isTransition ) { // transition if( data[iii].isTransition ) { // transition
for( int kkk = 0; kkk < numGaussians; kkk++ ) { for( int kkk = 0; kkk < numGaussians; kkk++ ) {
@ -176,6 +177,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
} }
} }
// Calculate which cluster has the maximum probability for this variant for use as a metric of how well clustered the data is
double maxProb = pVarInCluster[0][iii]; double maxProb = pVarInCluster[0][iii];
int maxCluster = 0; int maxCluster = 0;
for( int kkk = 1; kkk < numGaussians; kkk++ ) { for( int kkk = 1; kkk < numGaussians; kkk++ ) {
@ -202,7 +204,8 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
int clusterNumber = 0; int clusterNumber = 0;
final int numAnnotations = mu[0].length; final int numAnnotations = mu[0].length;
for( int kkk = 0; kkk < numGaussians; kkk++ ) { for( int kkk = 0; kkk < numGaussians; kkk++ ) {
if( numMaxClusterKnown[kkk] + numMaxClusterNovel[kkk] >= 2000 ) { //BUGBUG: only output the good clusters here to protect against over-fitting?
//if( numMaxClusterKnown[kkk] + numMaxClusterNovel[kkk] >= 2000 ) {
outputFile.print(clusterNumber + ","); outputFile.print(clusterNumber + ",");
outputFile.print(numMaxClusterKnown[kkk] + ","); outputFile.print(numMaxClusterKnown[kkk] + ",");
outputFile.print(numMaxClusterNovel[kkk] + ","); outputFile.print(numMaxClusterNovel[kkk] + ",");
@ -216,7 +219,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
} }
outputFile.println(-1); outputFile.println(-1);
clusterNumber++; clusterNumber++;
} //}
} }
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
@ -227,11 +230,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
public final void applyClusters( final VariantDatum[] data, final String outputPrefix ) { public final void applyClusters( final VariantDatum[] data, final String outputPrefix ) {
final int numVariants = data.length; final int numVariants = data.length;
final int numAnnotations = data[0].annotations.length;
final double[] pTrueVariant = new double[numVariants]; final double[] pTrueVariant = new double[numVariants];
final boolean[] markedVariant = new boolean[numVariants];
final double[] pVarInCluster = new double[numGaussians]; final double[] pVarInCluster = new double[numGaussians];
final int[] clusterAssignment = new int[numVariants];
///////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Evaluate each variant using the probability of being in each cluster and that cluster's true positive rate // Evaluate each variant using the probability of being in each cluster and that cluster's true positive rate
@ -240,50 +242,51 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
evaluateGaussiansForSingleVariant( data[iii], pVarInCluster ); evaluateGaussiansForSingleVariant( data[iii], pVarInCluster );
pTrueVariant[iii] = 0.0; pTrueVariant[iii] = 0.0;
markedVariant[iii] = false;
for( int kkk = 0; kkk < numGaussians; kkk++ ) { for( int kkk = 0; kkk < numGaussians; kkk++ ) {
pTrueVariant[iii] += pVarInCluster[kkk] * clusterTruePositiveRate[kkk]; pTrueVariant[iii] += pVarInCluster[kkk] * clusterTruePositiveRate[kkk];
} }
double maxProb = -1.0;//pVarInCluster[0][iii];
int maxCluster = -1;
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
if( numMaxClusterKnown[kkk] + numMaxClusterNovel[kkk] >= 2000 && pVarInCluster[kkk] > maxProb ) {
maxProb = pVarInCluster[kkk];
maxCluster = kkk;
}
}
clusterAssignment[iii] = maxCluster;
} }
PrintStream outputFile = null; PrintStream outputFile = null;
try { try {
outputFile = new PrintStream( outputPrefix + ".data" ); outputFile = new PrintStream( outputPrefix + ".dat" );
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
System.exit(-1); System.exit(-1);
} }
for(int iii = 0; iii < numVariants; iii++) {
outputFile.print(pTrueVariant[iii] + ",");
outputFile.print(clusterAssignment[iii] + ",");
for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
outputFile.print(data[iii].originalAnnotations[jjj] + ",");
}
outputFile.println( (data[iii].isTransition ? 1 : 0)
+ "," + (data[iii].isKnown? 1 : 0)
+ "," + (data[iii].isFiltered ? 1 : 0));
}
try { int numKnown = 0;
outputFile = new PrintStream( outputPrefix + ".optimize" ); int numNovel = 0;
} catch (Exception e) { int numKnownTi = 0;
e.printStackTrace(); int numKnownTv = 0;
System.exit(-1); int numNovelTi = 0;
} int numNovelTv = 0;
for(int iii = 0; iii < numVariants; iii++) { outputFile.println("pCut,numKnown,numNovel,knownTITV,novelTITV");
outputFile.print(String.format("%.4f",pTrueVariant[iii]) + ","); for( double pCut = 1.0; pCut >= 0.0; pCut -= 0.001 ) {
outputFile.println( (data[iii].isTransition ? 1 : 0) for( int iii = 0; iii < numVariants; iii++ ) {
+ "," + (data[iii].isKnown? 1 : 0) if( !markedVariant[iii] ) {
+ "," + (data[iii].isFiltered ? 1 : 0)); if( pTrueVariant[iii] >= pCut ) {
markedVariant[iii] = true;
if( data[iii].isKnown ) { // known
numKnown++;
if( data[iii].isTransition ) { // transition
numKnownTi++;
} else { // transversion
numKnownTv++;
}
} else { // novel
numNovel++;
if( data[iii].isTransition ) { // transition
numNovelTi++;
} else { // transversion
numNovelTv++;
}
}
}
}
}
outputFile.println( pCut + "," + numKnown + "," + numNovel + "," + ( ((double)numKnownTi) / ((double)numKnownTv) ) + "," + ( ((double)numNovelTi) / ((double)numNovelTv) ));
} }
} }
@ -324,6 +327,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
double sumProb = 0.0; double sumProb = 0.0;
for( int kkk = 0; kkk < numGaussians; kkk++ ) { for( int kkk = 0; kkk < numGaussians; kkk++ ) {
//BUGBUG: only use the good clusters here to protect against over-fitting?
double sum = 0.0; double sum = 0.0;
for( int jjj = 0; jjj < numAnnotations; jjj++ ) { for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
sum += ( (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]) ) sum += ( (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]) )
@ -398,7 +402,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
// Clean up extra big or extra small clusters // Clean up extra big or extra small clusters
for( int kkk = 0; kkk < numGaussians; kkk++ ) { for( int kkk = 0; kkk < numGaussians; kkk++ ) {
if( pCluster[kkk] > 0.45 ) { // This is a very large cluster compared to all the others if( pCluster[kkk] > 0.45 ) { // This is a very large cluster compared to all the others
System.out.println("Very large cluster!");
final int numToReplace = 4; final int numToReplace = 4;
final double[] savedSigma = sigma[kkk]; final double[] savedSigma = sigma[kkk];
for( int rrr = 0; rrr < numToReplace; rrr++ ) { for( int rrr = 0; rrr < numToReplace; rrr++ ) {
@ -445,15 +448,15 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
} }
} }
// Replace small clusters with another random draw from the dataset
for( int kkk = 0; kkk < numGaussians; kkk++ ) { for( int kkk = 0; kkk < numGaussians; kkk++ ) {
if( pCluster[kkk] < 0.05 * (1.0 / ((double) numGaussians)) ) { // This is a very small cluster compared to all the others if( pCluster[kkk] < 0.05 * (1.0 / ((double) numGaussians)) ) { // This is a very small cluster compared to all the others
System.out.println("Very small cluster!");
pCluster[kkk] = 1.0 / ((double) numGaussians); pCluster[kkk] = 1.0 / ((double) numGaussians);
mu[kkk] = data[rand.nextInt(numVariants)].annotations; mu[kkk] = data[rand.nextInt(numVariants)].annotations;
final double[] randSigma = new double[numAnnotations]; final double[] randSigma = new double[numAnnotations];
if( dataManager.isNormalized ) { if( dataManager.isNormalized ) {
for( int jjj = 0; jjj < numAnnotations; jjj++ ) { for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
randSigma[jjj] = 0.6 + 0.4 * rand.nextDouble(); // Explore a wider range randSigma[jjj] = 0.6 + 0.4 * rand.nextDouble(); // Explore a wider range of possible sigma values since we are tossing out clusters anyway
} }
} else { // BUGBUG: if not normalized then the varianceVector hasn't been calculated --> null pointer } else { // BUGBUG: if not normalized then the varianceVector hasn't been calculated --> null pointer
for( int jjj = 0; jjj < numAnnotations; jjj++ ) { for( int jjj = 0; jjj < numAnnotations; jjj++ ) {

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer; package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
import org.broadinstitute.sting.utils.StingException;
import java.io.PrintStream; import java.io.PrintStream;
/* /*
@ -44,6 +46,8 @@ public final class VariantNearestNeighborsModel extends VariantOptimizationModel
public void run( final String outputPrefix ) { public void run( final String outputPrefix ) {
throw new StingException( "Nearest Neighbors model hasn't been updated yet." );
/*
final int numVariants = dataManager.numVariants; final int numVariants = dataManager.numVariants;
final double[] pTrueVariant = new double[numVariants]; final double[] pTrueVariant = new double[numVariants];
@ -67,8 +71,8 @@ public final class VariantNearestNeighborsModel extends VariantOptimizationModel
for(int iii = 0; iii < numVariants; iii++) { for(int iii = 0; iii < numVariants; iii++) {
outputFile.print(String.format("%.4f",pTrueVariant[iii]) + ","); outputFile.print(String.format("%.4f",pTrueVariant[iii]) + ",");
outputFile.println( (dataManager.data[iii].isTransition ? 1 : 0) outputFile.println( (dataManager.data[iii].isTransition ? 1 : 0)
+ "," + (dataManager.data[iii].isKnown? 1 : 0) + "," + (dataManager.data[iii].isKnown? 1 : 0));
+ "," + (dataManager.data[iii].isFiltered ? 1 : 0) );
} }
*/
} }
} }

View File

@ -10,6 +10,8 @@ import org.broadinstitute.sting.utils.ExpandingArrayList;
import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.io.IOException;
/* /*
* Copyright (c) 2010 The Broad Institute * Copyright (c) 2010 The Broad Institute
* *
@ -51,17 +53,15 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
// Command Line Arguments // Command Line Arguments
///////////////////////////// /////////////////////////////
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=true) @Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=true)
private double TARGET_TITV = 2.12; private double TARGET_TITV = 2.1;
//@Argument(fullName="filter_output", shortName="filter", doc="If specified the optimizer will not only update the QUAL field of the output VCF file but will also filter the variants", required=false)
//private boolean FILTER_OUTPUT = false;
@Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false) @Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
private boolean IGNORE_INPUT_FILTERS = false; private boolean IGNORE_INPUT_FILTERS = false;
@Argument(fullName="exclude_annotation", shortName="exclude", doc="The names of the annotations which should be excluded from the calculations", required=false) @Argument(fullName="exclude_annotation", shortName="exclude", doc="The names of the annotations which should be excluded from the calculations", required=false)
private String[] EXCLUDED_ANNOTATIONS = null; private String[] EXCLUDED_ANNOTATIONS = null;
@Argument(fullName="force_annotation", shortName="force", doc="The names of the annotations which should be forced into the calculations even if they aren't present in every variant", required=false) @Argument(fullName="force_annotation", shortName="force", doc="The names of the annotations which should be forced into the calculations even if they aren't present in every variant", required=false)
private String[] FORCED_ANNOTATIONS = null; private String[] FORCED_ANNOTATIONS = null;
@Argument(fullName="output", shortName="output", doc="The output file name", required=false) @Argument(fullName="output_prefix", shortName="output", doc="The output prefix added to output cluster file name and optimization curve pdf file name", required=false)
private String OUTPUT_FILE = "optimizer.data"; private String OUTPUT_PREFIX = "optimizer";
@Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used in the Gaussian Mixture model", required=false) @Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used in the Gaussian Mixture model", required=false)
private int NUM_GAUSSIANS = 32; private int NUM_GAUSSIANS = 32;
@Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed in the Gaussian Mixture model", required=false) @Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed in the Gaussian Mixture model", required=false)
@ -70,7 +70,10 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
private int NUM_KNN = 2000; private int NUM_KNN = 2000;
@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false) @Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL; private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
private String PATH_TO_RESOURCES = "R/";
///////////////////////////// /////////////////////////////
// Private Member Variables // Private Member Variables
@ -87,7 +90,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
public void initialize() { public void initialize() {
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
} }
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
@ -138,7 +141,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
value = ( value > 0 ? 1.0 : -1.0 ) * INFINITE_ANNOTATION_VALUE; value = ( value > 0 ? 1.0 : -1.0 ) * INFINITE_ANNOTATION_VALUE;
} }
} catch( NumberFormatException e ) { } catch( NumberFormatException e ) {
// do nothing, default value is 0.0, // do nothing, default value is 0.0
} }
annotationValues[iii++] = value; annotationValues[iii++] = value;
} }
@ -150,7 +153,6 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
variantDatum.annotations = annotationValues; variantDatum.annotations = annotationValues;
variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0; variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
variantDatum.isKnown = !vc.getAttribute("ID").equals("."); variantDatum.isKnown = !vc.getAttribute("ID").equals(".");
variantDatum.isFiltered = vc.isFiltered(); // BUGBUG: This field won't be needed in the final version.
mapList.add( variantDatum ); mapList.add( variantDatum );
} }
} }
@ -195,8 +197,20 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
default: default:
throw new StingException("Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS"); throw new StingException("Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS");
} }
theModel.run( OUTPUT_PREFIX );
theModel.run( OUTPUT_FILE ); // Execute Rscript command to plot the optimization curve
// Print out the command line to make it clear to the user what is being executed and how one might modify it
final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
System.out.println( rScriptCommandLine );
// Execute the RScript command to plot the table of truth values
try {
Runtime.getRuntime().exec( rScriptCommandLine );
} catch ( IOException e ) {
throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
}
} }
} }