Major bug fixes for the Variant Recalibrator. Covariance matrix values are now allowed to be negative. When probabilities are multiplied together the calculation is done in log space, normalized, then converted back to real-valued probabilities. Clustering weights have been changed to only use HapMap and 1000 Genomes sites. The -nI argument was removed and now clustering simply runs until convergence. Test cases seem to work best when using just two annotations (QD and SB). More changes are in the works and are being evaluated. Misc fixes to walkers that use Rscript due to CentOS changes.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3590 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-06-18 17:37:11 +00:00
parent c3434493b0
commit 724affc3cc
8 changed files with 281 additions and 220 deletions

View File

@ -55,8 +55,8 @@ class AnalyzeCovariatesCLP extends CommandLineProgram {
private String RECAL_FILE = "output.recal_data.csv";
@Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false)
private String OUTPUT_DIR = "analyzeCovariates/";
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
private String PATH_TO_RSCRIPT = "Rscript";
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
private String PATH_TO_RESOURCES = "R/";
@Argument(fullName = "ignoreQ", shortName = "ignoreQ", doc = "Ignore bases with reported quality less than this number.", required = false)

View File

@ -644,7 +644,7 @@ public class GenomeAnalysisEngine {
for (RMDTrack track : tracks) {
SAMSequenceDictionary trackDict = track.getSequenceDictionary();
if (trackDict == null) {
logger.info("Track " + track.getName() + "doesn't have a sequence dictionary built in, skipping dictionary validation");
logger.info("Track " + track.getName() + " doesn't have a sequence dictionary built in, skipping dictionary validation");
continue;
}
Set<String> trackSequences = new TreeSet<String>();

View File

@ -40,6 +40,7 @@ import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.commandline.Argument;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;
/**
@ -65,23 +66,32 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
private String CLUSTER_FILENAME = "optimizer.cluster";
@Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used when clustering", required=false)
private int NUM_GAUSSIANS = 6;
@Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed when clustering", required=false)
private int NUM_ITERATIONS = 10;
@Argument(fullName="minVarInCluster", shortName="minVar", doc="The minimum number of variants in a cluster to be considered a valid cluster. It can be used to prevent overfitting.", required=false)
private int NUM_GAUSSIANS = 4;
@Argument(fullName="maxIterations", shortName="mI", doc="The maximum number of iterations to be performed when clustering. Clustering will normally end when convergence is detected.", required=false)
private int MAX_ITERATIONS = 200;
//@Argument(fullName="minVarInCluster", shortName="minVar", doc="The minimum number of variants in a cluster to be considered a valid cluster. It can be used to prevent overfitting.", required=false)
private int MIN_VAR_IN_CLUSTER = 0;
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
private String PATH_TO_RSCRIPT = "Rscript";
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
private String PATH_TO_RESOURCES = "R/";
@Argument(fullName="weightNovels", shortName="weightNovels", doc="The weight for novel variants during clustering", required=false)
private double WEIGHT_NOVELS = 0.0;
@Argument(fullName="weightKnowns", shortName="weightKnowns", doc="The weight for known variants during clustering", required=false)
private double WEIGHT_KNOWNS = 8.0;
private double WEIGHT_KNOWNS = 0.0;
@Argument(fullName="weightHapMap", shortName="weightHapMap", doc="The weight for known HapMap variants during clustering", required=false)
private double WEIGHT_HAPMAP = 120.0;
private double WEIGHT_HAPMAP = 100.0;
@Argument(fullName="weight1000Genomes", shortName="weight1000Genomes", doc="The weight for known 1000 Genomes Project variants during clustering", required=false)
private double WEIGHT_1000GENOMES = 12.0;
private double WEIGHT_1000GENOMES = 1.0;
@Argument(fullName="weightMQ1", shortName="weightMQ1", doc="The weight for MQ1 dbSNP variants during clustering", required=false)
private double WEIGHT_MQ1 = 10.0;
private double WEIGHT_MQ1 = 0.0;
@Argument(fullName="forceIndependent", shortName="forceIndependent", doc="Force off-diagonal entries in the covariance matrix to be zero.", required=false)
private boolean FORCE_INDEPENDENT = false;
@Argument(fullName="stdThreshold", shortName="std", doc="If a variant has annotations more than -std standard deviations away from mean then don't use it for clustering.", required=false)
private double STD_THRESHOLD = 6.0;
@Argument(fullName="qualThreshold", shortName="qual", doc="If a known variant has raw QUAL value less than -qual then don't use it for clustering.", required=false)
private double QUAL_THRESHOLD = 300.0;
//@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
//private int NUM_KNN = 2000;
@ -94,6 +104,8 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
private ExpandingArrayList<String> annotationKeys;
private Set<String> ignoreInputFilterSet = null;
private int maxAC = 0;
private PrintStream outFile;
private final static boolean FXYZ_FILE = false; // Debug argument
//---------------------------------------------------------------------------------------------------------------
//
@ -122,6 +134,14 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
if(!foundDBSNP) {
throw new StingException("dbSNP track is required. This calculation is critically dependent on being able to distinguish known and novel sites.");
}
if(FXYZ_FILE) {
try {
outFile = new PrintStream("variants.filtered.xyz");
} catch(Exception e) {
throw new StingException("Can't create file!");
}
}
}
//---------------------------------------------------------------------------------------------------------------
@ -158,15 +178,19 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
}
variantDatum.isKnown = false;
variantDatum.weight = 1.0;
variantDatum.weight = WEIGHT_NOVELS;
variantDatum.knownStatus = VariantDatum.NOVEL;
variantDatum.qual = vc.getPhredScaledQual();
final DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
if( dbsnp != null ) {
variantDatum.isKnown = true;
variantDatum.weight = WEIGHT_KNOWNS;
if( DbSNPHelper.isHapmap( dbsnp ) ) { variantDatum.weight = WEIGHT_HAPMAP; }
else if( DbSNPHelper.is1000genomes( dbsnp ) ) { variantDatum.weight = WEIGHT_1000GENOMES; }
else if( DbSNPHelper.isMQ1( dbsnp ) ) { variantDatum.weight = WEIGHT_MQ1; }
variantDatum.knownStatus = VariantDatum.KNOWN_MQ0;
if( DbSNPHelper.isHapmap( dbsnp ) ) { variantDatum.weight = WEIGHT_HAPMAP; variantDatum.knownStatus = VariantDatum.KNOWN_HAPMAP; }
else if( DbSNPHelper.is1000genomes( dbsnp ) ) { variantDatum.weight = WEIGHT_1000GENOMES; variantDatum.knownStatus = VariantDatum.KNOWN_1KG; }
else if( DbSNPHelper.isMQ1( dbsnp ) ) { variantDatum.weight = WEIGHT_MQ1; variantDatum.knownStatus = VariantDatum.KNOWN_MQ1; }
}
mapList.add( variantDatum );
@ -201,12 +225,19 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
logger.info( "The annotations are: " + annotationKeys );
dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ]
if(FXYZ_FILE) { // Debug output
for(final VariantDatum datum : dataManager.data) {
outFile.println(String.format("%f,%f,%f,%d,%f",datum.annotations[0], datum.annotations[1], datum.weight, datum.knownStatus, datum.qual));
}
outFile.close();
}
// Create either the Gaussian Mixture Model or the Nearest Neighbors model and run it
VariantGaussianMixtureModel theModel;
switch (OPTIMIZATION_MODEL) {
case GAUSSIAN_MIXTURE_MODEL:
theModel = new VariantGaussianMixtureModel( dataManager, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER, maxAC );
theModel = new VariantGaussianMixtureModel( dataManager, NUM_GAUSSIANS, MAX_ITERATIONS, MIN_VAR_IN_CLUSTER, maxAC, FORCE_INDEPENDENT, STD_THRESHOLD, QUAL_THRESHOLD );
break;
//case K_NEAREST_NEIGHBORS:
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
@ -222,7 +253,7 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
// Execute Rscript command to plot the optimization curve
// Print out the command line to make it clear to the user what is being executed and how one might modify it
final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_ClusterReport.R" + " " + CLUSTER_FILENAME + "." + annotation + ".dat " + annotation;
System.out.println( rScriptCommandLine );
logger.info( rScriptCommandLine );
// Execute the RScript command to plot the table of truth values
try {

View File

@ -38,4 +38,11 @@ public class VariantDatum {
public double qual;
public double weight;
public int alleleCount;
public int knownStatus;
public final static int NOVEL = -1;
public final static int KNOWN_MQ0 = 0;
public final static int KNOWN_MQ1 = 1;
public final static int KNOWN_1KG = 2;
public final static int KNOWN_HAPMAP = 3;
}

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.text.XReadLines;
@ -51,36 +52,43 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
public final VariantDataManager dataManager;
private final int numGaussians;
private final int numIterations;
private final int maxIterations;
private final static long RANDOM_SEED = 91801305;
private final static Random rand = new Random( RANDOM_SEED );
private final double MIN_PROB = 1E-7;
private final double MIN_SIGMA = 1E-5;
private final double MIN_DETERMINANT = 1E-5;
private final double MIN_SIGMA = 1E-10;
private final double MIN_DETERMINANT = 1E-3;
private final double MIN_PROB_CONVERGENCE = 1E-5;
private final boolean FORCE_INDEPENDENT_ANNOTATIONS;
private final double[][] mu; // The means for each cluster
private final Matrix[] sigma; // The covariance matrix for each cluster
private final Matrix[] sigmaInverse;
private final double[] pCluster;
private double[] pClusterLog10;
private final double[] determinant;
private final double[] alleleCountFactorArray;
private final int minVarInCluster;
private final double stdThreshold;
private final double qualThreshold;
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
private static final Pattern ALLELECOUNT_PATTERN = Pattern.compile("^@!ALLELECOUNT.*");
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final int _numGaussians, final int _numIterations, final int _minVarInCluster, final int maxAC ) {
public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final int _numGaussians, final int _maxIterations, final int _minVarInCluster,
final int maxAC, final boolean _forceIndependent, final double _stdThreshold, final double _qualThreshold ) {
dataManager = _dataManager;
numGaussians = _numGaussians;
numIterations = _numIterations;
maxIterations = _maxIterations;
mu = new double[numGaussians][];
sigma = new Matrix[numGaussians];
determinant = new double[numGaussians];
pCluster = new double[numGaussians];
pClusterLog10 = new double[numGaussians];
alleleCountFactorArray = new double[maxAC + 1];
minVarInCluster = _minVarInCluster;
stdThreshold = _stdThreshold;
qualThreshold = _qualThreshold;
FORCE_INDEPENDENT_ANNOTATIONS = _forceIndependent;
sigmaInverse = null; // This field isn't used during VariantOptimizer pass
}
@ -108,16 +116,19 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
dataManager = new VariantDataManager( annotationLines );
// Several of the clustering parameters aren't used the second time around in ApplyVariantClusters
numIterations = 0;
maxIterations = 0;
minVarInCluster = 0;
stdThreshold = 0.0;
qualThreshold = 0.0;
FORCE_INDEPENDENT_ANNOTATIONS = false;
// BUGBUG: move this parsing out of the constructor
numGaussians = clusterLines.size();
mu = new double[numGaussians][dataManager.numAnnotations];
double sigmaVals[][][] = new double[numGaussians][dataManager.numAnnotations][dataManager.numAnnotations];
final double sigmaVals[][][] = new double[numGaussians][dataManager.numAnnotations][dataManager.numAnnotations];
sigma = new Matrix[numGaussians];
sigmaInverse = new Matrix[numGaussians];
pCluster = new double[numGaussians];
pClusterLog10 = new double[numGaussians];
determinant = new double[numGaussians];
alleleCountFactorArray = new double[alleleCountLines.size() + 1];
@ -129,7 +140,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
int kkk = 0;
for( final String line : clusterLines ) {
final String[] vals = line.split(",");
pCluster[kkk] = Double.parseDouble(vals[1]); // BUGBUG: #define these magic index numbers, very easy to make a mistake here
pClusterLog10[kkk] = Math.log10( Double.parseDouble(vals[1]) ); // BUGBUG: #define these magic index numbers, very easy to make a mistake here
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
mu[kkk][jjj] = Double.parseDouble(vals[2+jjj]);
for( int ppp = 0; ppp < dataManager.numAnnotations; ppp++ ) {
@ -143,7 +154,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
kkk++;
}
logger.info("Found " + numGaussians + " clusters and using " + dataManager.numAnnotations + " annotations: " + dataManager.annotationKeys);
logger.info("Found " + numGaussians + " clusters using " + dataManager.numAnnotations + " annotations: " + dataManager.annotationKeys);
}
public final void run( final String clusterFileName ) {
@ -151,9 +162,68 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
// Initialize the Allele Count prior
generateAlleleCountPrior();
int numValid = 0;
int numOutlier = 0;
int numBadQual = 0;
int numZeroWeight = 0;
// Only cluster with a good set of knowns. Filter based on being too many std's away from the mean annotation value or having low quality score
for( final VariantDatum datum : dataManager.data ) {
boolean goodVar = true;
if(!(datum.weight > 0.0)) {
goodVar = false;
numZeroWeight++;
}
if(goodVar) {
for( final double val : datum.annotations ) {
if( Math.abs(val) > stdThreshold ) {
goodVar = false;
numOutlier++;
break;
}
}
}
if(goodVar) {
if( datum.qual < qualThreshold ) {
goodVar = false;
numBadQual++;
}
}
if(goodVar) { numValid++; }
}
final VariantDatum data[] = new VariantDatum[numValid];
int iii = 0;
for( final VariantDatum datum : dataManager.data ) {
boolean goodVar = true;
if(!(datum.weight > 0.0)) {
goodVar = false;
}
if(goodVar) {
for( final double val : datum.annotations ) {
if( Math.abs(val) > stdThreshold ) {
goodVar = false;
break;
}
}
}
if(goodVar) {
if( datum.qual < qualThreshold ) {
goodVar = false;
}
}
if(goodVar) { data[iii++] = datum; }
}
logger.info("Clustering with " + data.length + " valid variants.");
logger.info(" " + numZeroWeight + " variants were removed from clustering due to having zero clustering weight.");
logger.info(" " + numOutlier + " variants were removed due to having annotations that were more than " + stdThreshold + " standard deviations away from the mean annotation value.");
logger.info(" " + numBadQual + " variants were removed because raw QUAL value was less than threshold (" + qualThreshold + ").");
createClusters( data, 0, numGaussians, clusterFileName );
// Simply cluster with all the variants. The knowns have been given more weight than the novels
logger.info("Clustering with " + dataManager.data.length + " variants.");
createClusters( dataManager.data, 0, numGaussians, clusterFileName, false );
//logger.info("Clustering with " + dataManager.data.length + " variants.");
//createClusters( dataManager.data, 0, numGaussians, clusterFileName );
}
private void generateAlleleCountPrior() {
@ -169,13 +239,13 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
acExpectation[iii] /= sumExpectation; // Turn acExpectation into a probability distribution
alleleCount[iii] = 0;
alleleCount[iii] = 1; // Start off with one count to smooth the estimate
}
for( final VariantDatum datum : dataManager.data ) {
alleleCount[datum.alleleCount]++;
}
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
acActual[iii] = ((double)alleleCount[iii]) / ((double)dataManager.data.length); // Turn acActual into a probability distribution
acActual[iii] = ((double)alleleCount[iii]) / ((double) (dataManager.data.length+(alleleCountFactorArray.length-1))); // Turn acActual into a probability distribution
}
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
alleleCountFactorArray[iii] = acExpectation[iii] / acActual[iii]; // Prior is (expected / observed)
@ -186,7 +256,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
return alleleCountFactorArray[alleleCount];
}
public final void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final String clusterFileName, final boolean useTITV ) {
public final void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final String clusterFileName ) {
final int numVariants = data.length;
final int numAnnotations = data[0].annotations.length;
@ -202,15 +272,28 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
// Set up the initial random Gaussians
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
pCluster[kkk] = 1.0 / ((double) (stopCluster - startCluster));
pClusterLog10[kkk] = Math.log10(1.0 / ((double) (stopCluster - startCluster)));
mu[kkk] = data[rand.nextInt(numVariants)].annotations;
final double[][] randSigma = new double[numAnnotations][numAnnotations];
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
randSigma[ppp][jjj] = 0.5 + 0.5 * rand.nextDouble(); // data has been normalized so sigmas are centered at 1.0
if(jjj != ppp) { randSigma[jjj][ppp] = 0.0; } // Sigma is a symmetric, positive-definite matrix
randSigma[ppp][jjj] = 0.55 + 1.25 * rand.nextDouble();
if(rand.nextBoolean()) {
randSigma[ppp][jjj] *= -1.0;
}
if(jjj != ppp) { randSigma[jjj][ppp] = 0.0; } // Sigma is a symmetric, positive-definite matrix created by taking a lower diagonal matrix and multiplying by its transpose
}
}
if( FORCE_INDEPENDENT_ANNOTATIONS ) {
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
if(jjj!=ppp) {
randSigma[jjj][ppp] = 0.0;
}
}
}
}
Matrix tmp = new Matrix(randSigma);
tmp = tmp.times(tmp.transpose());
sigma[kkk] = tmp;
@ -218,12 +301,15 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
// The EM loop
for( int ttt = 0; ttt < numIterations; ttt++ ) {
double previousLikelihood = -1E20;
double currentLikelihood;
int ttt = 1;
while( ttt < maxIterations ) {
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Expectation Step (calculate the probability that each data point is in each cluster)
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
evaluateGaussians( data, pVarInCluster, startCluster, stopCluster );
currentLikelihood = evaluateGaussians( data, pVarInCluster, startCluster, stopCluster );
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Maximization Step (move the clusters to maximize the sum probability of each data point)
@ -233,9 +319,15 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Output cluster parameters at each iteration
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
printClusterParameters( clusterFileName + "." + (ttt+1) );
printClusterParameters( clusterFileName + "." + ttt );
logger.info("Finished iteration " + (ttt+1) );
logger.info("Finished iteration " + ttt );
ttt++;
if( currentLikelihood - previousLikelihood < MIN_PROB_CONVERGENCE) {
logger.info("Convergence!");
break;
}
previousLikelihood = currentLikelihood;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -256,10 +348,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
final int numAnnotations = mu[0].length;
final int numVariants = dataManager.numVariants;
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
if( pCluster[kkk] * numVariants > minVarInCluster ) {
if( Math.pow(10.0, pClusterLog10[kkk]) * numVariants > minVarInCluster ) {
final double sigmaVals[][] = sigma[kkk].getArray();
outputFile.print("@!CLUSTER,");
outputFile.print(pCluster[kkk] + ",");
outputFile.print(Math.pow(10.0, pClusterLog10[kkk]) + ",");
for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
outputFile.print(mu[kkk][jjj] + ",");
}
@ -295,7 +387,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
return value;
}
public final double evaluateVariant( final VariantContext vc ) {
public final double evaluateVariantLog10( final VariantContext vc ) {
final double[] pVarInCluster = new double[numGaussians];
final double[] annotations = new double[dataManager.numAnnotations];
@ -310,7 +402,13 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
sum += pVarInCluster[kkk]; // * clusterTruePositiveRate[kkk];
}
return sum;
double sumLog10 = Math.log10(sum);
if( Double.isInfinite(sumLog10) || Double.isNaN(sumLog10) ) {
//logger.warn("pTrueLog10 = -Infinity, capped at -20");
sumLog10 = -20;
}
return sumLog10;
}
public final void outputClusterReports( final String outputPrefix ) {
@ -330,7 +428,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
}
for( VariantDatum datum : dataManager.data ) {
for( final VariantDatum datum : dataManager.data ) {
final int isKnown = ( datum.isKnown ? 1 : 0 );
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
int histBin = (int)Math.round((datum.annotations[jjj]-MIN_STD) * (1.0 / STD_STEP));
@ -368,7 +466,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix,
final int desiredNumVariants, final Double[] FDRtranches, double QUAL_STEP ) {
final int desiredNumVariants, final Double[] FDRtranches, final double QUAL_STEP ) {
final int numVariants = data.length;
final boolean[] markedVariant = new boolean[numVariants];
@ -510,19 +608,27 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
private void evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster, final int startCluster, final int stopCluster ) {
private double evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster, final int startCluster, final int stopCluster ) {
final int numAnnotations = data[0].annotations.length;
double likelihood = 0.0;
final double sigmaVals[][][] = new double[numGaussians][][];
final double denom[] = new double[numGaussians];
final double denomLog10[] = new double[numGaussians];
final double pVarInClusterLog10[] = new double[numGaussians];
double pVarInClusterReals[];
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
sigmaVals[kkk] = sigma[kkk].inverse().getArray();
denom[kkk] = Math.pow(2.0 * 3.14159, ((double)numAnnotations) / 2.0) * Math.pow(Math.abs(determinant[kkk]), 0.5);
denomLog10[kkk] = Math.log10(Math.pow(2.0 * Math.PI, ((double)numAnnotations) / 2.0)) + Math.log10(Math.pow(determinant[kkk], 0.5));
if( Double.isInfinite(denomLog10[kkk]) ) {
throw new StingException("Numerical Instability! Determinant value is too small: " + determinant[kkk] +
"Try running with fewer annotations and then with fewer Gaussians.");
}
}
final double mult[] = new double[numAnnotations];
double sumWeight = 0.0;
for( int iii = 0; iii < data.length; iii++ ) {
double sumProb = 0.0;
sumWeight += data[iii].weight;
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
double sum = 0.0;
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
@ -535,47 +641,40 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
sum += mult[jjj] * (data[iii].annotations[jjj] - mu[kkk][jjj]);
}
pVarInCluster[kkk][iii] = pCluster[kkk] * (Math.exp( -0.5 * sum ) / denom[kkk]);
likelihood += pVarInCluster[kkk][iii];
if(Double.isNaN(denom[kkk]) || determinant[kkk] < 0.5 * MIN_DETERMINANT) {
System.out.println("det = " + sigma[kkk].det());
System.out.println("denom = " + denom[kkk]);
System.out.println("sumExp = " + sum);
System.out.println("pVar = " + pVarInCluster[kkk][iii]);
System.out.println("=-------=");
throw new StingException("Numerical Instability! determinant of covariance matrix <= 0. Try running with fewer clusters and then with better behaved annotation values.");
}
if(sum < 0.0) {
System.out.println("det = " + sigma[kkk].det());
System.out.println("denom = " + denom[kkk]);
System.out.println("sumExp = " + sum);
System.out.println("pVar = " + pVarInCluster[kkk][iii]);
System.out.println("=-------=");
throw new StingException("Numerical Instability! covariance matrix no longer positive definite. Try running with fewer clusters and then with better behaved annotation values.");
}
if(pVarInCluster[kkk][iii] > 1.0) {
System.out.println("det = " + sigma[kkk].det());
System.out.println("denom = " + denom[kkk]);
System.out.println("sumExp = " + sum);
System.out.println("pVar = " + pVarInCluster[kkk][iii]);
System.out.println("=-------=");
throw new StingException("Numerical Instability! probability distribution returns > 1.0. Try running with fewer clusters and then with better behaved annotation values.");
}
pVarInClusterLog10[kkk] = pClusterLog10[kkk] + (( -0.5 * sum )/Math.log(10.0)) - denomLog10[kkk];
final double pVar = Math.pow(10.0, pVarInClusterLog10[kkk]);
likelihood += pVar * data[iii].weight;
if( pVarInCluster[kkk][iii] < MIN_PROB ) { // Very small numbers are a very big problem
pVarInCluster[kkk][iii] = MIN_PROB; // + MIN_PROB * rand.nextDouble();
}
if( pVarInClusterLog10[kkk] > 0.0 || Double.isNaN(pVarInClusterLog10[kkk]) || Double.isInfinite(pVarInClusterLog10[kkk]) ) {
logger.warn("det = " + sigma[kkk].det());
logger.warn("denom = " + denomLog10[kkk]);
logger.warn("sumExp = " + sum);
logger.warn("pVar = " + pVar);
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
logger.warn(sigmaVals[kkk][ppp][jjj]);
}
}
sumProb += pVarInCluster[kkk][iii];
logger.warn("About to throw exception due to numerical instability. Try running with fewer annotations and then with fewer Gaussians. " +
"It is best to only use the annotations which appear to be Gaussianly distributed for this Gaussian mixture model.");
throw new StingException("Numerical Instability! Found NaN after performing log10: " + pVarInClusterLog10[kkk] + ", cluster = " + kkk + ", variant index = " + iii);
}
}
pVarInClusterReals = MathUtils.normalizeFromLog10( pVarInClusterLog10 );
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
pVarInCluster[kkk][iii] /= sumProb;
pVarInCluster[kkk][iii] *= data[iii].weight;
pVarInCluster[kkk][iii] = pVarInClusterReals[kkk] * data[iii].weight;
if( Double.isNaN(pVarInCluster[kkk][iii]) ) {
logger.warn("About to throw exception due to numerical instability. Try running with fewer annotations and then with fewer Gaussians. " +
"It is best to only use the annotations which appear to be Gaussianly distributed for this Gaussian mixture model.");
throw new StingException("Numerical Instability! Found NaN after rescaling log10 values: " + pVarInCluster[kkk][iii] + ", cluster = " + kkk + ", variant index = " + iii);
}
}
}
logger.info("Explained likelihood = " + String.format("%.5f",likelihood / data.length));
logger.info("Explained likelihood = " + String.format("%.5f",likelihood / sumWeight));
return likelihood / sumWeight;
}
@ -596,8 +695,8 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
sum += mult[jjj] * (annotations[jjj] - mu[kkk][jjj]);
}
final double denom = Math.pow(2.0 * 3.14159, ((double)numAnnotations) / 2.0) * Math.pow(determinant[kkk], 0.5);
pVarInCluster[kkk] = pCluster[kkk] * (Math.exp( -0.5 * sum )) / denom;
final double denomLog10 = Math.log10(Math.pow(2.0 * Math.PI, ((double)numAnnotations) / 2.0)) + Math.log10(Math.pow(determinant[kkk], 0.5));
pVarInCluster[kkk] = Math.pow(10.0, pClusterLog10[kkk] + (( -0.5 * sum ) / Math.log(10.0)) - denomLog10);
}
}
@ -607,10 +706,11 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
final int numVariants = data.length;
final int numAnnotations = data[0].annotations.length;
final double sigmaVals[][][] = new double[numGaussians][numAnnotations][numAnnotations];
final double meanVals[][] = new double[numGaussians][numAnnotations];
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
mu[kkk][jjj] = 0.0;
meanVals[kkk][jjj] = 0.0;
for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
sigmaVals[kkk][jjj][ppp] = 0.0;
}
@ -621,29 +721,34 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
double sumProb = 0.0;
for( int iii = 0; iii < numVariants; iii++ ) {
final double prob = pVarInCluster[kkk][iii];
if( Double.isNaN(prob) ) {
logger.warn("About to throw exception due to numerical instability. Try running with fewer annotations and then with fewer Gaussians. " +
"It is best to only use the annotations which appear to be Gaussianly distributed for this Gaussian mixture model.");
throw new StingException("Numerical Instability! Found NaN in M-step: " + pVarInCluster[kkk][iii] + ", cluster = " + kkk + ", variant index = " + iii);
}
sumProb += prob;
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
mu[kkk][jjj] += prob * data[iii].annotations[jjj];
meanVals[kkk][jjj] += prob * data[iii].annotations[jjj];
}
}
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
mu[kkk][jjj] /= sumProb;
meanVals[kkk][jjj] /= sumProb;
}
for( int iii = 0; iii < numVariants; iii++ ) {
final double prob = pVarInCluster[kkk][iii];
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
sigmaVals[kkk][jjj][ppp] += prob * (data[iii].annotations[jjj]-mu[kkk][jjj]) * (data[iii].annotations[ppp]-mu[kkk][ppp]);
sigmaVals[kkk][jjj][ppp] += prob * (data[iii].annotations[jjj]-meanVals[kkk][jjj]) * (data[iii].annotations[ppp]-meanVals[kkk][ppp]);
}
}
}
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
if( sigmaVals[kkk][jjj][ppp] < MIN_SIGMA ) { // Very small numbers are a very big problem
sigmaVals[kkk][jjj][ppp] = MIN_SIGMA;// + MIN_SIGMA * rand.nextDouble();
if( sigmaVals[kkk][jjj][ppp] < MIN_SIGMA && sigmaVals[kkk][jjj][ppp] > -MIN_SIGMA ) { // Very small numbers are a very big problem
logger.warn("The sigma values look exceptionally small.... Probably about to crash due to numeric instability.");
}
sigmaVals[kkk][ppp][jjj] = sigmaVals[kkk][jjj][ppp]; // sigma must be a symmetric matrix
}
@ -651,103 +756,41 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
sigmaVals[kkk][jjj][ppp] /= sumProb;
sigmaVals[kkk][jjj][ppp] /= sumProb;
}
}
sigma[kkk] = new Matrix(sigmaVals[kkk]);
determinant[kkk] = sigma[kkk].det();
pCluster[kkk] = sumProb / numVariants;
sumPK += pCluster[kkk];
}
// ensure pCluster sums to one, it doesn't automatically due to very small numbers getting capped
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
pCluster[kkk] /= sumPK;
}
/*
// Clean up extra big or extra small clusters
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
if( pCluster[kkk] > 0.45 ) { // This is a very large cluster compared to all the others
System.out.println("!! Found very large cluster! Busting it up into smaller clusters.");
final int numToReplace = 3;
final Matrix savedSigma = sigma[kkk];
for( int rrr = 0; rrr < numToReplace; rrr++ ) {
// Find an example variant in the large cluster, drawn randomly
int randVarIndex = -1;
boolean foundVar = false;
while( !foundVar ) {
randVarIndex = rand.nextInt( numVariants );
final double probK = pVarInCluster[kkk][randVarIndex];
boolean inClusterK = true;
for( int ccc = startCluster; ccc < stopCluster; ccc++ ) {
if( pVarInCluster[ccc][randVarIndex] > probK ) {
inClusterK = false;
break;
}
}
if( inClusterK ) { foundVar = true; }
}
// Find a place to put the example variant
if( rrr == 0 ) { // Replace the big cluster that kicked this process off
mu[kkk] = data[randVarIndex].annotations;
pCluster[kkk] = 1.0 / ((double) (stopCluster-startCluster));
} else { // Replace the cluster with the minimum prob
double minProb = pCluster[startCluster];
int minClusterIndex = startCluster;
for( int ccc = startCluster; ccc < stopCluster; ccc++ ) {
if( pCluster[ccc] < minProb ) {
minProb = pCluster[ccc];
minClusterIndex = ccc;
}
}
mu[minClusterIndex] = data[randVarIndex].annotations;
sigma[minClusterIndex] = savedSigma;
//for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
// for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
// sigma[minClusterIndex].set(jjj, ppp, sigma[minClusterIndex].get(jjj, ppp) - 0.06 + 0.12 * rand.nextDouble());
// }
//}
pCluster[minClusterIndex] = 0.5 / ((double) (stopCluster-startCluster));
}
}
}
}
*/
/*
// Replace extremely small clusters with another random draw from the dataset
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
if( pCluster[kkk] < 0.0005 * (1.0 / ((double) (stopCluster-startCluster))) ||
determinant[kkk] < MIN_DETERMINANT ) { // This is a very small cluster compared to all the others
logger.info("!! Found singular cluster! Initializing a new random cluster.");
pCluster[kkk] = 0.1 / ((double) (stopCluster-startCluster));
mu[kkk] = data[rand.nextInt(numVariants)].annotations;
final double[][] randSigma = new double[numAnnotations][numAnnotations];
if( FORCE_INDEPENDENT_ANNOTATIONS ) {
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
randSigma[ppp][jjj] = 0.50 + 0.5 * rand.nextDouble(); // data is normalized so this is centered at 1.0
if(jjj != ppp) { randSigma[jjj][ppp] = 0.0; } // Sigma is a symmetric, positive-definite matrix
for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
if(jjj!=ppp) {
sigmaVals[kkk][jjj][ppp] = 0.0;
}
}
}
Matrix tmp = new Matrix(randSigma);
tmp = tmp.times(tmp.transpose());
sigma[kkk] = tmp;
determinant[kkk] = sigma[kkk].det();
}
final Matrix tmpMatrix = new Matrix(sigmaVals[kkk]);
if( tmpMatrix.det() > MIN_DETERMINANT ) {
sigma[kkk] = new Matrix(sigmaVals[kkk]);
determinant[kkk] = sigma[kkk].det();
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
mu[kkk][jjj] = meanVals[kkk][jjj];
}
} else {
logger.warn("Tried to create a covariance matrix with exceptionally small determinant.");
}
pClusterLog10[kkk] = sumProb;
sumPK += sumProb;
}
// renormalize pCluster since things might have changed due to the previous step
sumPK = 0.0;
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
sumPK += pCluster[kkk];
pClusterLog10[kkk] = Math.log10( pClusterLog10[kkk] / sumPK );
}
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
pCluster[kkk] /= sumPK;
}
*/
pClusterLog10 = MathUtils.normalizeFromLog10( pClusterLog10, true );
}
}

View File

@ -64,7 +64,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
@Argument(fullName="target_titv", shortName="titv", doc="The expected Ti/Tv ratio to display on optimization curve output figures. (~~2.1 for whole genome experiments)", required=false)
private double TARGET_TITV = 2.1;
@Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by spreading out the Gaussians.", required=false)
private double BACKOFF_FACTOR = 1.0;
private double BACKOFF_FACTOR = 1.4;
@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
private int DESIRED_NUM_VARIANTS = 0;
@Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
@ -76,7 +76,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
@Argument(fullName="novel_prior", shortName="novelPrior", doc="A prior on the quality of novel variants, a phred scaled probability of being true.", required=false)
private int NOVEL_QUAL_PRIOR = 2;
@Argument(fullName="quality_scale_factor", shortName="qScale", doc="Multiply all final quality scores by this value. Needed to normalize the quality scores.", required=false)
private double QUALITY_SCALE_FACTOR = 50.0;
private double QUALITY_SCALE_FACTOR = 200.0;
@Argument(fullName="output_prefix", shortName="output", doc="The prefix added to output VCF file name and optimization curve pdf file name", required=false)
private String OUTPUT_PREFIX = "optimizer";
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
@ -85,16 +85,13 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
private Double[] FDR_TRANCHES = null;
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
private String PATH_TO_RSCRIPT = "Rscript";
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
private String PATH_TO_RESOURCES = "R/";
@Argument(fullName="quality_step", shortName="qStep", doc="Resolution in QUAL units for optimization and tranche calculations", required=false)
private double QUAL_STEP = 0.1;
// TODO: RYAN - remove me, even though this switch is apparently super awesome
private final static boolean AARONS_SUPER_AWESOME_SWITCH = false;
/////////////////////////////
// Private Member Variables
/////////////////////////////
@ -136,27 +133,11 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
final TreeSet<String> samples = new TreeSet<String>();
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "The original variant quality score"));
hInfo.add(new VCFHeaderLine("source", "VariantOptimizer"));
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit()));
if (AARONS_SUPER_AWESOME_SWITCH) {
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "The original variant quality score"));
hInfo.add(new VCFHeaderLine("source", "VariantOptimizer"));
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit()));
} else {
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "The original variant quality score"));
hInfo.add(new VCFHeaderLine("source", "VariantOptimizer"));
for( final ReferenceOrderedDataSource source : dataSources ) {
final RMDTrack rod = source.getReferenceOrderedData();
if( rod.getRecordType().equals(VCFRecord.class) ) {
final VCFReader reader = new VCFReader(rod.getFile());
final Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
samples.addAll(vcfSamples);
reader.close();
}
}
}
vcfWriter = new VCFWriter( new File(OUTPUT_PREFIX + ".vcf") );
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
vcfWriter.writeHeader(vcfHeader);
@ -209,11 +190,11 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
final DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
variantDatum.isKnown = dbsnp != null;
variantDatum.alleleCount = vc.getChromosomeCount(vc.getAlternateAllele(0)); // BUGBUG: assumes file has genotypes
variantDatum.alleleCount = vc.getChromosomeCount(vc.getAlternateAllele(0)); // BUGBUG: assumes file has genotypes. Also, what to do about tri-allelic sites?
final double acPrior = theModel.getAlleleCountPrior( variantDatum.alleleCount );
final double knownPrior = ( variantDatum.isKnown ? QualityUtils.qualToProb(KNOWN_QUAL_PRIOR) : QualityUtils.qualToProb(NOVEL_QUAL_PRIOR) );
final double pTrue = theModel.evaluateVariant( vc ) * acPrior * knownPrior;
final double acPriorLog10 = Math.log10(theModel.getAlleleCountPrior( variantDatum.alleleCount ));
final double knownPriorLog10 = Math.log10( variantDatum.isKnown ? QualityUtils.qualToProb(KNOWN_QUAL_PRIOR) : QualityUtils.qualToProb(NOVEL_QUAL_PRIOR) );
final double pTrue = Math.pow(10.0, theModel.evaluateVariantLog10( vc ) + acPriorLog10 + knownPriorLog10);
variantDatum.qual = QUALITY_SCALE_FACTOR * QualityUtils.phredScaleErrorRate( Math.max(1.0 - pTrue, 0.000000001) ); // BUGBUG: don't have a normalizing constant, so need to scale up qual scores arbitrarily
mapList.add( variantDatum );
@ -260,7 +241,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
// Execute Rscript command to plot the optimization curve
// Print out the command line to make it clear to the user what is being executed and how one might modify it
final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
System.out.println( rScriptCommandLine );
logger.info( rScriptCommandLine );
// Execute the RScript command to plot the table of truth values
try {

View File

@ -50,8 +50,8 @@ public class AnalyzeAnnotationsWalker extends RodWalker<Integer, Integer> {
/////////////////////////////
@Argument(fullName = "output_prefix", shortName = "output", doc = "The output path and name to prepend to all plots and intermediate data files", required = false)
private String OUTPUT_PREFIX = "analyzeAnnotations/";
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
private String PATH_TO_RSCRIPT = "Rscript";
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
private String PATH_TO_RESOURCES = "R/";
@Argument(fullName = "min_variants_per_bin", shortName = "minBinSize", doc = "The minimum number of variants in a bin in order to calculate truth metrics.", required = false)

View File

@ -12,7 +12,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testGenerateVariantClusters() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "b39bde091254e90490b0d7bf9f0ef24a" );
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "9b7517ff1fd0fc23a2596acc82e2ed96" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey();
@ -25,7 +25,6 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
" -B input,VCF," + vcf +
" -L 1:1-100,000,000" +
" -nG 6" +
" -nI 5" +
" --ignore_filter GATK_STANDARD" +
" -an QD -an HRun -an SB" +
" -clusterFile %s",
@ -39,7 +38,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testVariantRecalibrator() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "cf8dac86df97d55f866dd379434bbdc2" );
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "e0e3d959929aa3940a81c9926d1406e2" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey();