diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java index 52402ad8a..87c614bda 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java @@ -4,6 +4,7 @@ import Jama.Matrix; import org.apache.commons.math.special.Gamma; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; +import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.Arrays; import java.util.List; @@ -88,14 +89,24 @@ public class MultivariateGaussian { } } + private void precomputeInverse() { + try { + cachedSigmaInverse = sigma.inverse(); + } catch( Exception e ) { + throw new UserException("Error during clustering. Most likely there are too few variants used during Gaussian mixture modeling."); + } + } + + public void precomputeDenominatorForEvaluation() { - cachedSigmaInverse = sigma.inverse(); + precomputeInverse(); cachedDenomLog10 = Math.log10(Math.pow(2.0 * Math.PI, -1.0 * ((double) mu.length) / 2.0)) + Math.log10(Math.pow(sigma.det(), -0.5)) ; } public void precomputeDenominatorForVariationalBayes( final double sumHyperParameterLambda ) { + // Variational Bayes calculations from Bishop - cachedSigmaInverse = sigma.inverse(); + precomputeInverse(); cachedSigmaInverse.timesEquals( hyperParameter_a ); double sum = 0.0; for(int jjj = 1; jjj <= mu.length; jjj++) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 3c4985d92..c523a8337 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -118,13 +118,21 @@ public class VariantDataManager { } } logger.info( "Training with " + trainingData.size() + " variants after standard deviation thresholding." ); + if( trainingData.size() < ) return trainingData; } - public ExpandingArrayList selectWorstVariants( final double bottomPercentage ) { + public ExpandingArrayList selectWorstVariants( double bottomPercentage, final int minimumNumber ) { Collections.sort( data ); final ExpandingArrayList trainingData = new ExpandingArrayList(); - final int numToAdd = Math.round((float)bottomPercentage * data.size()); + final int numToAdd = Math.max( minimumNumber, Math.round((float)bottomPercentage * data.size()) ); + if( numToAdd > data.size() ) { + throw new UserException.BadInput("Error during negative model training. Minimum number of variants to use in training is larger than the whole call set. One can attempt to lower the --minNumBadVariants arugment but this is unsafe."); + } + if( numToAdd == minimumNumber ) { + logger.warn("WARNING: Training with very few variant sites! Please check the model reporting PDF to ensure the quality of the model is reliable."); + bottomPercentage = ((float) numToAdd) / ((float) data.size()); + } int index = 0; int numAdded = 0; while( numAdded < numToAdd ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index fb6da5ed3..85336afa4 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -225,7 +225,7 @@ public class VariantRecalibrator extends RodWalker randomData = dataManager.getRandomDataForPlotting( 6000 ); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java index 7a524f259..e177cebab 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorArgumentCollection.java @@ -36,4 +36,6 @@ public class VariantRecalibratorArgumentCollection { public double PRIOR_COUNTS = 20.0; @Argument(fullName="percentBadVariants", shortName="percentBad", doc="What percentage of the worst scoring variants to use when building the Gaussian mixture model of bad variants. 0.07 means bottom 7 percent.", required=false) public double PERCENT_BAD_VARIANTS = 0.015; + @Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad arugment if necessary.", required=false) + public int MIN_NUM_BAD_VARIANTS = 2000; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java index 21b7f1bdf..e87048a13 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java @@ -32,9 +32,6 @@ public class VariantRecalibratorEngine { } public GaussianMixtureModel generateModel( final List data ) { - if( data.size() < 1500 ) { - logger.warn("WARNING: Training with very few variant sites! Please check the model reporting PDF to ensure the quality of the model is reliable."); - } final GaussianMixtureModel model = new GaussianMixtureModel( VRAC.MAX_GAUSSIANS, data.get(0).annotations.length, VRAC.SHRINKAGE, VRAC.DIRICHLET_PARAMETER, VRAC.PRIOR_COUNTS ); variationalBayesExpectationMaximization( model, data ); return model; diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index b44615c6b..9267276fd 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -48,6 +48,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -L 20:1,000,000-40,000,000" + " -an QD -an HaplotypeScore -an HRun" + " -percentBad 0.07" + + " --minNumBadVariants 0" + " --trustAllPolymorphic" + // for speed " -recalFile %s" + " -tranchesFile %s",