From 724affc3cc7078455a295f2f1f76a4ec51af381a Mon Sep 17 00:00:00 2001
From: rpoplin
Date: Fri, 18 Jun 2010 17:37:11 +0000
Subject: [PATCH] Major bug fixes for the Variant Recalibrator.

Covariance matrix values (the off-diagonal covariances) are now allowed to be
negative. When probabilities are multiplied together, the calculation is done
in log space, normalized there, and only then converted back to real-valued
probabilities (a standalone sketch of this log-space normalization follows the
diff). Clustering weights have been changed to use only HapMap and
by-1000genomes sites. The -nI argument was removed; clustering now simply runs
until convergence. Test cases seem to work best when using just two
annotations (QD and SB). More changes are in the works and are being
evaluated. Also, miscellaneous fixes to walkers that use Rscript, needed after
the CentOS changes.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3590 348d0f76-0448-11de-a6fe-93d51630548a
---
 .../analyzecovariates/AnalyzeCovariates.java  |   4 +-
 .../sting/gatk/GenomeAnalysisEngine.java      |   2 +-
 .../GenerateVariantClustersWalker.java        |  65 ++-
 .../variantrecalibration/VariantDatum.java    |   7 +
 .../VariantGaussianMixtureModel.java          | 369 ++++++++++--------
 .../VariantRecalibrator.java                  |  45 +--
 .../AnalyzeAnnotationsWalker.java             |   4 +-
 ...ntRecalibrationWalkersIntegrationTest.java |   5 +-
 8 files changed, 281 insertions(+), 220 deletions(-)

diff --git a/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java b/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
index cfc65df79..2c6371d30 100755
--- a/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
+++ b/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
@@ -55,8 +55,8 @@ class AnalyzeCovariatesCLP extends CommandLineProgram {
     private String RECAL_FILE = "output.recal_data.csv";
     @Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false)
     private String OUTPUT_DIR = "analyzeCovariates/";
-    @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
-    private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
+    @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
+    private String PATH_TO_RSCRIPT = "Rscript";
     @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
     private String PATH_TO_RESOURCES = "R/";
     @Argument(fullName = "ignoreQ", shortName = "ignoreQ", doc = "Ignore bases with reported quality less than this number.", required = false)
diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
index 8b3d05675..652dda55e 100755
--- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
+++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
@@ -644,7 +644,7 @@ public class GenomeAnalysisEngine {
         for (RMDTrack track : tracks) {
             SAMSequenceDictionary trackDict = track.getSequenceDictionary();
             if (trackDict == null) {
-                logger.info("Track " + track.getName() + "doesn't have a sequence dictionary built in, skipping dictionary validation");
+                logger.info("Track " + track.getName() + " doesn't have a sequence dictionary built in, skipping dictionary validation");
                 continue;
             }
             Set<String> trackSequences = new TreeSet<String>();
diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java
index 60e715cda..e0b20e931 100755
--- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java
@@ -40,6 +40,7 @@ import org.broadinstitute.sting.utils.StingException;
 import org.broadinstitute.sting.commandline.Argument;
 
 import java.io.IOException;
+import java.io.PrintStream;
 import java.util.*;
 
 /**
@@ -65,23 +66,32 @@ public class GenerateVariantClustersWalker extends RodWalker
     private List<String> annotationKeys;
     private Set<String> ignoreInputFilterSet = null;
     private int maxAC = 0;
+    private PrintStream outFile;
+    private final static boolean FXYZ_FILE = false; // Debug argument
 
     //---------------------------------------------------------------------------------------------------------------
     //
@@ -122,6 +134,14 @@ public class GenerateVariantClustersWalker extends RodWalker
+            if(!(datum.weight > 0.0)) {
+                goodVar = false;
+                numZeroWeight++;
+            }
+            if(goodVar) {
+                for( final double val : datum.annotations ) {
+                    if( Math.abs(val) > stdThreshold ) {
+                        goodVar = false;
+                        numOutlier++;
+                        break;
+                    }
+                }
+            }
+            if(goodVar) {
+                if( datum.qual < qualThreshold ) {
+                    goodVar = false;
+                    numBadQual++;
+                }
+            }
+            if(goodVar) { numValid++; }
+        }
+
+        final VariantDatum data[] = new VariantDatum[numValid];
+        int iii = 0;
+        for( final VariantDatum datum : dataManager.data ) {
+            boolean goodVar = true;
+            if(!(datum.weight > 0.0)) {
+                goodVar = false;
+            }
+            if(goodVar) {
+                for( final double val : datum.annotations ) {
+                    if( Math.abs(val) > stdThreshold ) {
+                        goodVar = false;
+                        break;
+                    }
+                }
+            }
+            if(goodVar) {
+                if( datum.qual < qualThreshold ) {
+                    goodVar = false;
+                }
+            }
+            if(goodVar) { data[iii++] = datum; }
+        }
+
+        logger.info("Clustering with " + data.length + " valid variants.");
+        logger.info("  " + numZeroWeight + " variants were removed from clustering due to having zero clustering weight.");
+        logger.info("  " + numOutlier + " variants were removed due to having annotations that were more than " + stdThreshold + " standard deviations away from the mean annotation value.");
+        logger.info("  " + numBadQual + " variants were removed because raw QUAL value was less than threshold (" + qualThreshold + ").");
+        createClusters( data, 0, numGaussians, clusterFileName );
+
         // Simply cluster with all the variants. The knowns have been given more weight than the novels
-        logger.info("Clustering with " + dataManager.data.length + " variants.");
-        createClusters( dataManager.data, 0, numGaussians, clusterFileName, false );
+        //logger.info("Clustering with " + dataManager.data.length + " variants.");
+        //createClusters( dataManager.data, 0, numGaussians, clusterFileName );
     }
 
     private void generateAlleleCountPrior() {
@@ -169,13 +239,13 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
         }
         for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
             acExpectation[iii] /= sumExpectation; // Turn acExpectation into a probability distribution
-            alleleCount[iii] = 0;
+            alleleCount[iii] = 1; // Start off with one count to smooth the estimate
         }
         for( final VariantDatum datum : dataManager.data ) {
             alleleCount[datum.alleleCount]++;
         }
         for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
-            acActual[iii] = ((double)alleleCount[iii]) / ((double)dataManager.data.length); // Turn acActual into a probability distribution
+            acActual[iii] = ((double)alleleCount[iii]) / ((double) (dataManager.data.length+(alleleCountFactorArray.length-1))); // Turn acActual into a probability distribution
         }
         for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
             alleleCountFactorArray[iii] = acExpectation[iii] / acActual[iii]; // Prior is (expected / observed)
@@ -186,7 +256,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
         return alleleCountFactorArray[alleleCount];
     }
 
-    public final void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final String clusterFileName, final boolean useTITV ) {
+    public final void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final String clusterFileName ) {
 
         final int numVariants = data.length;
         final int numAnnotations = data[0].annotations.length;
@@ -202,15 +272,28 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
 
         // Set up the initial random Gaussians
         for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
-            pCluster[kkk] = 1.0 / ((double) (stopCluster - startCluster));
+            pClusterLog10[kkk] = Math.log10(1.0 / ((double) (stopCluster - startCluster)));
             mu[kkk] = data[rand.nextInt(numVariants)].annotations;
             final double[][] randSigma = new double[numAnnotations][numAnnotations];
             for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                 for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
-                    randSigma[ppp][jjj] = 0.5 + 0.5 * rand.nextDouble(); // data has been normalized so sigmas are centered at 1.0
-                    if(jjj != ppp) { randSigma[jjj][ppp] = 0.0; } // Sigma is a symmetric, positive-definite matrix
+                    randSigma[ppp][jjj] = 0.55 + 1.25 * rand.nextDouble();
+                    if(rand.nextBoolean()) {
+                        randSigma[ppp][jjj] *= -1.0;
+                    }
+                    if(jjj != ppp) { randSigma[jjj][ppp] = 0.0; } // Sigma is a symmetric, positive-definite matrix created by taking a lower diagonal matrix and multiplying by its transpose
                 }
             }
+            if( FORCE_INDEPENDENT_ANNOTATIONS ) {
+                for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
+                    for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
+                        if(jjj!=ppp) {
+                            randSigma[jjj][ppp] = 0.0;
+                        }
+                    }
+                }
+            }
             Matrix tmp = new Matrix(randSigma);
             tmp = tmp.times(tmp.transpose());
             sigma[kkk] = tmp;
@@ -218,12 +301,15 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
         }
 
         // The EM loop
-        for( int ttt = 0; ttt < numIterations; ttt++ ) {
+        double previousLikelihood = -1E20;
+        double currentLikelihood;
+        int ttt = 1;
+        while( ttt < maxIterations ) {
 
             /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
             // Expectation Step (calculate the probability that each data point is in each cluster)
             /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-            evaluateGaussians( data, pVarInCluster, startCluster, stopCluster );
+            currentLikelihood = evaluateGaussians( data, pVarInCluster, startCluster, stopCluster );
 
             /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
             // Maximization Step (move the clusters to maximize the sum probability of each data point)
@@ -233,9 +319,15 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
             /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
             // Output cluster parameters at each iteration
             /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-            printClusterParameters( clusterFileName + "." + (ttt+1) );
+            printClusterParameters( clusterFileName + "." + ttt );
 
-            logger.info("Finished iteration " + (ttt+1) );
+            logger.info("Finished iteration " + ttt );
+            ttt++;
+            if( currentLikelihood - previousLikelihood < MIN_PROB_CONVERGENCE) {
+                logger.info("Convergence!");
+                break;
+            }
+            previousLikelihood = currentLikelihood;
         }
 
         /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -256,10 +348,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
         final int numAnnotations = mu[0].length;
         final int numVariants = dataManager.numVariants;
         for( int kkk = 0; kkk < numGaussians; kkk++ ) {
-            if( pCluster[kkk] * numVariants > minVarInCluster ) {
+            if( Math.pow(10.0, pClusterLog10[kkk]) * numVariants > minVarInCluster ) {
                 final double sigmaVals[][] = sigma[kkk].getArray();
                 outputFile.print("@!CLUSTER,");
-                outputFile.print(pCluster[kkk] + ",");
+                outputFile.print(Math.pow(10.0, pClusterLog10[kkk]) + ",");
                 for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
                     outputFile.print(mu[kkk][jjj] + ",");
                 }
@@ -295,7 +387,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
         return value;
     }
 
-    public final double evaluateVariant( final VariantContext vc ) {
+    public final double evaluateVariantLog10( final VariantContext vc ) {
         final double[] pVarInCluster = new double[numGaussians];
         final double[] annotations = new double[dataManager.numAnnotations];
 
@@ -310,7 +402,13 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
         for( int kkk = 0; kkk < numGaussians; kkk++ ) {
             sum += pVarInCluster[kkk]; // * clusterTruePositiveRate[kkk];
         }
-        return sum;
+
+        double sumLog10 = Math.log10(sum);
+        if( Double.isInfinite(sumLog10) || Double.isNaN(sumLog10) ) {
+            //logger.warn("pTrueLog10 = -Infinity, capped at -20");
+            sumLog10 = -20;
+        }
+        return sumLog10;
     }
 
     public final void outputClusterReports( final String outputPrefix ) {
@@ -330,7 +428,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
             }
         }
 
-        for( VariantDatum datum : dataManager.data ) {
+        for( final VariantDatum datum : dataManager.data ) {
             final int isKnown = ( datum.isKnown ? 1 : 0 );
             for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                 int histBin = (int)Math.round((datum.annotations[jjj]-MIN_STD) * (1.0 / STD_STEP));
@@ -368,7 +466,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
     }
 
     public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix,
-                                               final int desiredNumVariants, final Double[] FDRtranches, double QUAL_STEP ) {
+                                               final int desiredNumVariants, final Double[] FDRtranches, final double QUAL_STEP ) {
 
         final int numVariants = data.length;
         final boolean[] markedVariant = new boolean[numVariants];
@@ -510,19 +608,27 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
     }
 
-    private void evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster, final int startCluster, final int stopCluster ) {
+    private double evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster, final int startCluster, final int stopCluster ) {
 
         final int numAnnotations = data[0].annotations.length;
         double likelihood = 0.0;
         final double sigmaVals[][][] = new double[numGaussians][][];
-        final double denom[] = new double[numGaussians];
+        final double denomLog10[] = new double[numGaussians];
+        final double pVarInClusterLog10[] = new double[numGaussians];
+        double pVarInClusterReals[];
+
         for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
             sigmaVals[kkk] = sigma[kkk].inverse().getArray();
-            denom[kkk] = Math.pow(2.0 * 3.14159, ((double)numAnnotations) / 2.0) * Math.pow(Math.abs(determinant[kkk]), 0.5);
+            denomLog10[kkk] = Math.log10(Math.pow(2.0 * Math.PI, ((double)numAnnotations) / 2.0)) + Math.log10(Math.pow(determinant[kkk], 0.5));
+            if( Double.isInfinite(denomLog10[kkk]) ) {
+                throw new StingException("Numerical Instability! Determinant value is too small: " + determinant[kkk] +
+                        ". Try running with fewer annotations and then with fewer Gaussians.");
+            }
         }
 
         final double mult[] = new double[numAnnotations];
+        double sumWeight = 0.0;
         for( int iii = 0; iii < data.length; iii++ ) {
-            double sumProb = 0.0;
+            sumWeight += data[iii].weight;
             for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
                 double sum = 0.0;
                 for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
@@ -535,47 +641,40 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
                     sum += mult[jjj] * (data[iii].annotations[jjj] - mu[kkk][jjj]);
                 }
 
-                pVarInCluster[kkk][iii] = pCluster[kkk] * (Math.exp( -0.5 * sum ) / denom[kkk]);
-                likelihood += pVarInCluster[kkk][iii];
-                if(Double.isNaN(denom[kkk]) || determinant[kkk] < 0.5 * MIN_DETERMINANT) {
-                    System.out.println("det = " + sigma[kkk].det());
-                    System.out.println("denom = " + denom[kkk]);
-                    System.out.println("sumExp = " + sum);
-                    System.out.println("pVar = " + pVarInCluster[kkk][iii]);
-                    System.out.println("=-------=");
-                    throw new StingException("Numerical Instability! determinant of covariance matrix <= 0. Try running with fewer clusters and then with better behaved annotation values.");
-                }
-                if(sum < 0.0) {
-                    System.out.println("det = " + sigma[kkk].det());
-                    System.out.println("denom = " + denom[kkk]);
-                    System.out.println("sumExp = " + sum);
-                    System.out.println("pVar = " + pVarInCluster[kkk][iii]);
-                    System.out.println("=-------=");
-                    throw new StingException("Numerical Instability! covariance matrix no longer positive definite. Try running with fewer clusters and then with better behaved annotation values.");
-                }
-                if(pVarInCluster[kkk][iii] > 1.0) {
-                    System.out.println("det = " + sigma[kkk].det());
-                    System.out.println("denom = " + denom[kkk]);
-                    System.out.println("sumExp = " + sum);
-                    System.out.println("pVar = " + pVarInCluster[kkk][iii]);
-                    System.out.println("=-------=");
-                    throw new StingException("Numerical Instability! probability distribution returns > 1.0. Try running with fewer clusters and then with better behaved annotation values.");
-                }
+                pVarInClusterLog10[kkk] = pClusterLog10[kkk] + (( -0.5 * sum )/Math.log(10.0)) - denomLog10[kkk];
+                final double pVar = Math.pow(10.0, pVarInClusterLog10[kkk]);
+                likelihood += pVar * data[iii].weight;
 
-                if( pVarInCluster[kkk][iii] < MIN_PROB ) { // Very small numbers are a very big problem
-                    pVarInCluster[kkk][iii] = MIN_PROB; // + MIN_PROB * rand.nextDouble();
-                }
+                if( pVarInClusterLog10[kkk] > 0.0 || Double.isNaN(pVarInClusterLog10[kkk]) || Double.isInfinite(pVarInClusterLog10[kkk]) ) {
+                    logger.warn("det = " + sigma[kkk].det());
+                    logger.warn("denom = " + denomLog10[kkk]);
+                    logger.warn("sumExp = " + sum);
+                    logger.warn("pVar = " + pVar);
+                    for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
+                        for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
+                            logger.warn(sigmaVals[kkk][ppp][jjj]);
+                        }
+                    }
 
-                sumProb += pVarInCluster[kkk][iii];
+                    logger.warn("About to throw exception due to numerical instability. Try running with fewer annotations and then with fewer Gaussians. " +
+                            "It is best to only use the annotations which appear to be Gaussianly distributed for this Gaussian mixture model.");
+                    throw new StingException("Numerical Instability! Found NaN after performing log10: " + pVarInClusterLog10[kkk] + ", cluster = " + kkk + ", variant index = " + iii);
+                }
             }
 
+            pVarInClusterReals = MathUtils.normalizeFromLog10( pVarInClusterLog10 );
             for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
-                pVarInCluster[kkk][iii] /= sumProb;
-                pVarInCluster[kkk][iii] *= data[iii].weight;
+                pVarInCluster[kkk][iii] = pVarInClusterReals[kkk] * data[iii].weight;
+                if( Double.isNaN(pVarInCluster[kkk][iii]) ) {
+                    logger.warn("About to throw exception due to numerical instability. Try running with fewer annotations and then with fewer Gaussians. " +
+                            "It is best to only use the annotations which appear to be Gaussianly distributed for this Gaussian mixture model.");
+                    throw new StingException("Numerical Instability! Found NaN after rescaling log10 values: " + pVarInCluster[kkk][iii] + ", cluster = " + kkk + ", variant index = " + iii);
+                }
             }
         }
 
-        logger.info("Explained likelihood = " + String.format("%.5f",likelihood / data.length));
+        logger.info("Explained likelihood = " + String.format("%.5f",likelihood / sumWeight));
+        return likelihood / sumWeight;
     }
 
@@ -596,8 +695,8 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
                 sum += mult[jjj] * (annotations[jjj] - mu[kkk][jjj]);
             }
 
-            final double denom = Math.pow(2.0 * 3.14159, ((double)numAnnotations) / 2.0) * Math.pow(determinant[kkk], 0.5);
-            pVarInCluster[kkk] = pCluster[kkk] * (Math.exp( -0.5 * sum )) / denom;
+            final double denomLog10 = Math.log10(Math.pow(2.0 * Math.PI, ((double)numAnnotations) / 2.0)) + Math.log10(Math.pow(determinant[kkk], 0.5));
+            pVarInCluster[kkk] = Math.pow(10.0, pClusterLog10[kkk] + (( -0.5 * sum ) / Math.log(10.0)) - denomLog10);
         }
     }
 
@@ -607,10 +706,11 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
         final int numVariants = data.length;
         final int numAnnotations = data[0].annotations.length;
         final double sigmaVals[][][] = new double[numGaussians][numAnnotations][numAnnotations];
+        final double meanVals[][] = new double[numGaussians][numAnnotations];
 
         for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
             for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
-                mu[kkk][jjj] = 0.0;
+                meanVals[kkk][jjj] = 0.0;
                 for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
                     sigmaVals[kkk][jjj][ppp] = 0.0;
                 }
@@ -621,29 +721,34 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
             double sumProb = 0.0;
             for( int iii = 0; iii < numVariants; iii++ ) {
                 final double prob = pVarInCluster[kkk][iii];
+                if( Double.isNaN(prob) ) {
+                    logger.warn("About to throw exception due to numerical instability. Try running with fewer annotations and then with fewer Gaussians. " +
+                            "It is best to only use the annotations which appear to be Gaussianly distributed for this Gaussian mixture model.");
+                    throw new StingException("Numerical Instability! Found NaN in M-step: " + pVarInCluster[kkk][iii] + ", cluster = " + kkk + ", variant index = " + iii);
+                }
                 sumProb += prob;
                 for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
-                    mu[kkk][jjj] += prob * data[iii].annotations[jjj];
+                    meanVals[kkk][jjj] += prob * data[iii].annotations[jjj];
                 }
             }
             for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
-                mu[kkk][jjj] /= sumProb;
+                meanVals[kkk][jjj] /= sumProb;
             }
 
             for( int iii = 0; iii < numVariants; iii++ ) {
                 final double prob = pVarInCluster[kkk][iii];
                 for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                     for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
-                        sigmaVals[kkk][jjj][ppp] += prob * (data[iii].annotations[jjj]-mu[kkk][jjj]) * (data[iii].annotations[ppp]-mu[kkk][ppp]);
+                        sigmaVals[kkk][jjj][ppp] += prob * (data[iii].annotations[jjj]-meanVals[kkk][jjj]) * (data[iii].annotations[ppp]-meanVals[kkk][ppp]);
                     }
                 }
             }
 
             for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                 for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
-                    if( sigmaVals[kkk][jjj][ppp] < MIN_SIGMA ) { // Very small numbers are a very big problem
-                        sigmaVals[kkk][jjj][ppp] = MIN_SIGMA;// + MIN_SIGMA * rand.nextDouble();
+                    if( sigmaVals[kkk][jjj][ppp] < MIN_SIGMA && sigmaVals[kkk][jjj][ppp] > -MIN_SIGMA ) { // Very small numbers are a very big problem
+                        logger.warn("The sigma values look exceptionally small.... Probably about to crash due to numeric instability.");
                     }
                     sigmaVals[kkk][ppp][jjj] = sigmaVals[kkk][jjj][ppp]; // sigma must be a symmetric matrix
                 }
@@ -651,103 +756,41 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
 
             for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                 for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
-                sigmaVals[kkk][jjj][ppp] /= sumProb;
+                    sigmaVals[kkk][jjj][ppp] /= sumProb;
                 }
             }
 
-            sigma[kkk] = new Matrix(sigmaVals[kkk]);
-            determinant[kkk] = sigma[kkk].det();
-
-            pCluster[kkk] = sumProb / numVariants;
-            sumPK += pCluster[kkk];
-        }
-
-        // ensure pCluster sums to one, it doesn't automatically due to very small numbers getting capped
-        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
-            pCluster[kkk] /= sumPK;
-        }
-
-        /*
-        // Clean up extra big or extra small clusters
-        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
-            if( pCluster[kkk] > 0.45 ) { // This is a very large cluster compared to all the others
-                System.out.println("!! Found very large cluster! Busting it up into smaller clusters.");
-                final int numToReplace = 3;
-                final Matrix savedSigma = sigma[kkk];
-                for( int rrr = 0; rrr < numToReplace; rrr++ ) {
-                    // Find an example variant in the large cluster, drawn randomly
-                    int randVarIndex = -1;
-                    boolean foundVar = false;
-                    while( !foundVar ) {
-                        randVarIndex = rand.nextInt( numVariants );
-                        final double probK = pVarInCluster[kkk][randVarIndex];
-                        boolean inClusterK = true;
-                        for( int ccc = startCluster; ccc < stopCluster; ccc++ ) {
-                            if( pVarInCluster[ccc][randVarIndex] > probK ) {
-                                inClusterK = false;
-                                break;
-                            }
-                        }
-                        if( inClusterK ) { foundVar = true; }
-                    }
-
-                    // Find a place to put the example variant
-                    if( rrr == 0 ) { // Replace the big cluster that kicked this process off
-                        mu[kkk] = data[randVarIndex].annotations;
-                        pCluster[kkk] = 1.0 / ((double) (stopCluster-startCluster));
-                    } else { // Replace the cluster with the minimum prob
-                        double minProb = pCluster[startCluster];
-                        int minClusterIndex = startCluster;
-                        for( int ccc = startCluster; ccc < stopCluster; ccc++ ) {
-                            if( pCluster[ccc] < minProb ) {
-                                minProb = pCluster[ccc];
-                                minClusterIndex = ccc;
-                            }
-                        }
-                        mu[minClusterIndex] = data[randVarIndex].annotations;
-                        sigma[minClusterIndex] = savedSigma;
-                        //for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
-                        //    for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
-                        //        sigma[minClusterIndex].set(jjj, ppp, sigma[minClusterIndex].get(jjj, ppp) - 0.06 + 0.12 * rand.nextDouble());
-                        //    }
-                        //}
-                        pCluster[minClusterIndex] = 0.5 / ((double) (stopCluster-startCluster));
-                    }
-                }
-            }
-        }
-        */
-
-        /*
-        // Replace extremely small clusters with another random draw from the dataset
-        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
-            if( pCluster[kkk] < 0.0005 * (1.0 / ((double) (stopCluster-startCluster))) ||
-                    determinant[kkk] < MIN_DETERMINANT ) { // This is a very small cluster compared to all the others
Initializing a new random cluster."); - pCluster[kkk] = 0.1 / ((double) (stopCluster-startCluster)); - mu[kkk] = data[rand.nextInt(numVariants)].annotations; - final double[][] randSigma = new double[numAnnotations][numAnnotations]; + if( FORCE_INDEPENDENT_ANNOTATIONS ) { for( int jjj = 0; jjj < numAnnotations; jjj++ ) { - for( int ppp = jjj; ppp < numAnnotations; ppp++ ) { - randSigma[ppp][jjj] = 0.50 + 0.5 * rand.nextDouble(); // data is normalized so this is centered at 1.0 - if(jjj != ppp) { randSigma[jjj][ppp] = 0.0; } // Sigma is a symmetric, positive-definite matrix + for( int ppp = 0; ppp < numAnnotations; ppp++ ) { + if(jjj!=ppp) { + sigmaVals[kkk][jjj][ppp] = 0.0; + } } } - Matrix tmp = new Matrix(randSigma); - tmp = tmp.times(tmp.transpose()); - sigma[kkk] = tmp; - determinant[kkk] = sigma[kkk].det(); + } + + final Matrix tmpMatrix = new Matrix(sigmaVals[kkk]); + if( tmpMatrix.det() > MIN_DETERMINANT ) { + sigma[kkk] = new Matrix(sigmaVals[kkk]); + determinant[kkk] = sigma[kkk].det(); + + for( int jjj = 0; jjj < numAnnotations; jjj++ ) { + mu[kkk][jjj] = meanVals[kkk][jjj]; + } + } else { + logger.warn("Tried to create a covariance matrix with exceptionally small determinant."); + } + + pClusterLog10[kkk] = sumProb; + sumPK += sumProb; } - // renormalize pCluster since things might have changed due to the previous step - sumPK = 0.0; for( int kkk = startCluster; kkk < stopCluster; kkk++ ) { - sumPK += pCluster[kkk]; + pClusterLog10[kkk] = Math.log10( pClusterLog10[kkk] / sumPK ); } - for( int kkk = startCluster; kkk < stopCluster; kkk++ ) { - pCluster[kkk] /= sumPK; - } - */ + + pClusterLog10 = MathUtils.normalizeFromLog10( pClusterLog10, true ); } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 75ce0d4df..231d2349b 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -64,7 +64,7 @@ public class VariantRecalibrator extends RodWalker hInfo = new HashSet(); final TreeSet samples = new TreeSet(); final List dataSources = this.getToolkit().getRodDataSources(); + hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); + hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "The original variant quality score")); + hInfo.add(new VCFHeaderLine("source", "VariantOptimizer")); + samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit())); - if (AARONS_SUPER_AWESOME_SWITCH) { - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "The original variant quality score")); - hInfo.add(new VCFHeaderLine("source", "VariantOptimizer")); - samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit())); - - } else { - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "The original variant quality score")); - hInfo.add(new VCFHeaderLine("source", "VariantOptimizer")); - for( final ReferenceOrderedDataSource source : dataSources ) { - final RMDTrack rod = source.getReferenceOrderedData(); - if( rod.getRecordType().equals(VCFRecord.class) ) { - final VCFReader reader = new VCFReader(rod.getFile()); - final Set vcfSamples = reader.getHeader().getGenotypeSamples(); - 
samples.addAll(vcfSamples); - reader.close(); - } - } - } vcfWriter = new VCFWriter( new File(OUTPUT_PREFIX + ".vcf") ); final VCFHeader vcfHeader = new VCFHeader(hInfo, samples); vcfWriter.writeHeader(vcfHeader); @@ -209,11 +190,11 @@ public class VariantRecalibrator extends RodWalker { ///////////////////////////// @Argument(fullName = "output_prefix", shortName = "output", doc = "The output path and name to prepend to all plots and intermediate data files", required = false) private String OUTPUT_PREFIX = "analyzeAnnotations/"; - @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false) - private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript"; + @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false) + private String PATH_TO_RSCRIPT = "Rscript"; @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false) private String PATH_TO_RESOURCES = "R/"; @Argument(fullName = "min_variants_per_bin", shortName = "minBinSize", doc = "The minimum number of variants in a bin in order to calculate truth metrics.", required = false) diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index 7a67bf8ce..05eefcc13 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -12,7 +12,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testGenerateVariantClusters() { HashMap e = new HashMap(); - e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "b39bde091254e90490b0d7bf9f0ef24a" ); + e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "9b7517ff1fd0fc23a2596acc82e2ed96" ); for ( Map.Entry entry : e.entrySet() ) { String vcf = entry.getKey(); @@ -25,7 +25,6 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -B input,VCF," + vcf + " -L 1:1-100,000,000" + " -nG 6" + - " -nI 5" + " --ignore_filter GATK_STANDARD" + " -an QD -an HRun -an SB" + " -clusterFile %s", @@ -39,7 +38,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testVariantRecalibrator() { HashMap e = new HashMap(); - e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "cf8dac86df97d55f866dd379434bbdc2" ); + e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "e0e3d959929aa3940a81c9926d1406e2" ); for ( Map.Entry entry : e.entrySet() ) { String vcf = entry.getKey();
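
A short, self-contained sketch of the log-space computation the commit message describes (a standalone analog of the MathUtils.normalizeFromLog10 call in the E-step above; the class name and the main() driver are illustrative, not part of the codebase). Multiplying many small Gaussian densities underflows double precision, so the per-cluster values are kept as log10 sums, rescaled by the largest exponent, and only then converted back to real-valued, normalized probabilities:

// Standalone sketch of the log10-space probability handling described above.
// Illustrative only -- not the GATK MathUtils implementation.
public final class Log10SpaceSketch {

    // Turn per-cluster log10 probabilities into a normalized real-space
    // distribution. Subtracting the max exponent first means the largest
    // term exponentiates to 10^0 = 1, so nothing underflows to 0.0 before
    // the final divide.
    public static double[] normalizeFromLog10(final double[] log10Probs) {
        double maxLog10 = Double.NEGATIVE_INFINITY;
        for (final double x : log10Probs) {
            if (x > maxLog10) { maxLog10 = x; }
        }
        final double[] probs = new double[log10Probs.length];
        double sum = 0.0;
        for (int kkk = 0; kkk < log10Probs.length; kkk++) {
            probs[kkk] = Math.pow(10.0, log10Probs[kkk] - maxLog10); // exponent <= 0, always finite
            sum += probs[kkk];
        }
        for (int kkk = 0; kkk < probs.length; kkk++) {
            probs[kkk] /= sum; // now a proper probability distribution over clusters
        }
        return probs;
    }

    public static void main(final String[] args) {
        // A product of tiny densities underflows in real space (each factor can
        // be ~1e-300), but its log10 sum is an ordinary number such as -450.
        final double[] pVarInClusterLog10 = { -450.0, -452.0, -451.3 };
        final double[] pVarInClusterReals = normalizeFromLog10(pVarInClusterLog10);
        double total = 0.0;
        for (final double p : pVarInClusterReals) {
            System.out.println(p); // finite, non-zero responsibilities
            total += p;
        }
        System.out.println("sum = " + total); // 1.0
    }
}

With this rescaling the largest term is always exactly 1.0, so the normalizing sum stays finite and well away from zero; that is what lets the E-step divide safely where the old real-space code needed the MIN_PROB clamp, and it is the same reason the EM loop can now test the data likelihood for convergence instead of running a fixed -nI iteration count.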