Ensuring a minimum number of variants when clustering with bad variants. Better error message when Matrix library fails to calculate inverse.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5793 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2011-05-12 01:48:37 +00:00
parent a20d257773
commit 40797f9d45
6 changed files with 27 additions and 8 deletions

View File

@ -4,6 +4,7 @@ import Jama.Matrix;
import org.apache.commons.math.special.Gamma;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.Arrays;
import java.util.List;
@ -88,14 +89,24 @@ public class MultivariateGaussian {
}
}
// Computes and caches the inverse of the covariance matrix (sigma) in
// cachedSigmaInverse. A singular / near-singular sigma -- most likely caused
// by too few variants being used during Gaussian mixture modeling -- makes
// the Jama Matrix.inverse() call throw; that failure is converted here into
// a UserException with an actionable message instead of a raw library error.
private void precomputeInverse() {
try {
cachedSigmaInverse = sigma.inverse();
} catch( Exception e ) {
// NOTE(review): the original cause 'e' is dropped here; if UserException
// has a (String, Throwable) constructor, chaining 'e' would preserve the
// underlying Jama failure for debugging -- confirm against the project API.
throw new UserException("Error during clustering. Most likely there are too few variants used during Gaussian mixture modeling.");
}
}
public void precomputeDenominatorForEvaluation() {
cachedSigmaInverse = sigma.inverse();
precomputeInverse();
cachedDenomLog10 = Math.log10(Math.pow(2.0 * Math.PI, -1.0 * ((double) mu.length) / 2.0)) + Math.log10(Math.pow(sigma.det(), -0.5)) ;
}
public void precomputeDenominatorForVariationalBayes( final double sumHyperParameterLambda ) {
// Variational Bayes calculations from Bishop
cachedSigmaInverse = sigma.inverse();
precomputeInverse();
cachedSigmaInverse.timesEquals( hyperParameter_a );
double sum = 0.0;
for(int jjj = 1; jjj <= mu.length; jjj++) {

View File

@ -118,13 +118,21 @@ public class VariantDataManager {
}
}
logger.info( "Training with " + trainingData.size() + " variants after standard deviation thresholding." );
if( trainingData.size() < )
return trainingData;
}
public ExpandingArrayList<VariantDatum> selectWorstVariants( final double bottomPercentage ) {
public ExpandingArrayList<VariantDatum> selectWorstVariants( double bottomPercentage, final int minimumNumber ) {
Collections.sort( data );
final ExpandingArrayList<VariantDatum> trainingData = new ExpandingArrayList<VariantDatum>();
final int numToAdd = Math.round((float)bottomPercentage * data.size());
final int numToAdd = Math.max( minimumNumber, Math.round((float)bottomPercentage * data.size()) );
if( numToAdd > data.size() ) {
throw new UserException.BadInput("Error during negative model training. Minimum number of variants to use in training is larger than the whole call set. One can attempt to lower the --minNumBadVariants argument but this is unsafe.");
}
if( numToAdd == minimumNumber ) {
logger.warn("WARNING: Training with very few variant sites! Please check the model reporting PDF to ensure the quality of the model is reliable.");
bottomPercentage = ((float) numToAdd) / ((float) data.size());
}
int index = 0;
int numAdded = 0;
while( numAdded < numToAdd ) {

View File

@ -225,7 +225,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
dataManager.normalizeData(); // Each data point is now (x - mean) / standard deviation
final GaussianMixtureModel goodModel = engine.generateModel( dataManager.getTrainingData() );
engine.evaluateData( dataManager.getData(), goodModel, false );
final GaussianMixtureModel badModel = engine.generateModel( dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS ) );
final GaussianMixtureModel badModel = engine.generateModel( dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS, VRAC.MIN_NUM_BAD_VARIANTS ) );
engine.evaluateData( dataManager.getData(), badModel, true );
final ExpandingArrayList<VariantDatum> randomData = dataManager.getRandomDataForPlotting( 6000 );

View File

@ -36,4 +36,6 @@ public class VariantRecalibratorArgumentCollection {
public double PRIOR_COUNTS = 20.0;
@Argument(fullName="percentBadVariants", shortName="percentBad", doc="What percentage of the worst scoring variants to use when building the Gaussian mixture model of bad variants. 0.07 means bottom 7 percent.", required=false)
public double PERCENT_BAD_VARIANTS = 0.015;
@Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum number of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad argument if necessary.", required=false)
public int MIN_NUM_BAD_VARIANTS = 2000;
}

View File

@ -32,9 +32,6 @@ public class VariantRecalibratorEngine {
}
public GaussianMixtureModel generateModel( final List<VariantDatum> data ) {
if( data.size() < 1500 ) {
logger.warn("WARNING: Training with very few variant sites! Please check the model reporting PDF to ensure the quality of the model is reliable.");
}
final GaussianMixtureModel model = new GaussianMixtureModel( VRAC.MAX_GAUSSIANS, data.get(0).annotations.length, VRAC.SHRINKAGE, VRAC.DIRICHLET_PARAMETER, VRAC.PRIOR_COUNTS );
variationalBayesExpectationMaximization( model, data );
return model;

View File

@ -48,6 +48,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
" -L 20:1,000,000-40,000,000" +
" -an QD -an HaplotypeScore -an HRun" +
" -percentBad 0.07" +
" --minNumBadVariants 0" +
" --trustAllPolymorphic" + // for speed
" -recalFile %s" +
" -tranchesFile %s",