Ensure a minimum number of variants is used when clustering the bad variants. Provide a better error message when the Matrix library fails to calculate an inverse.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5793 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a20d257773
commit
40797f9d45
|
|
@ -4,6 +4,7 @@ import Jama.Matrix;
|
|||
import org.apache.commons.math.special.Gamma;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
|
@ -88,14 +89,24 @@ public class MultivariateGaussian {
|
|||
}
|
||||
}
|
||||
|
||||
private void precomputeInverse() {
|
||||
try {
|
||||
cachedSigmaInverse = sigma.inverse();
|
||||
} catch( Exception e ) {
|
||||
throw new UserException("Error during clustering. Most likely there are too few variants used during Gaussian mixture modeling.");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void precomputeDenominatorForEvaluation() {
|
||||
cachedSigmaInverse = sigma.inverse();
|
||||
precomputeInverse();
|
||||
cachedDenomLog10 = Math.log10(Math.pow(2.0 * Math.PI, -1.0 * ((double) mu.length) / 2.0)) + Math.log10(Math.pow(sigma.det(), -0.5)) ;
|
||||
}
|
||||
|
||||
public void precomputeDenominatorForVariationalBayes( final double sumHyperParameterLambda ) {
|
||||
|
||||
// Variational Bayes calculations from Bishop
|
||||
cachedSigmaInverse = sigma.inverse();
|
||||
precomputeInverse();
|
||||
cachedSigmaInverse.timesEquals( hyperParameter_a );
|
||||
double sum = 0.0;
|
||||
for(int jjj = 1; jjj <= mu.length; jjj++) {
|
||||
|
|
|
|||
|
|
@ -118,13 +118,21 @@ public class VariantDataManager {
|
|||
}
|
||||
}
|
||||
logger.info( "Training with " + trainingData.size() + " variants after standard deviation thresholding." );
|
||||
if( trainingData.size() < )
|
||||
return trainingData;
|
||||
}
|
||||
|
||||
public ExpandingArrayList<VariantDatum> selectWorstVariants( final double bottomPercentage ) {
|
||||
public ExpandingArrayList<VariantDatum> selectWorstVariants( double bottomPercentage, final int minimumNumber ) {
|
||||
Collections.sort( data );
|
||||
final ExpandingArrayList<VariantDatum> trainingData = new ExpandingArrayList<VariantDatum>();
|
||||
final int numToAdd = Math.round((float)bottomPercentage * data.size());
|
||||
final int numToAdd = Math.max( minimumNumber, Math.round((float)bottomPercentage * data.size()) );
|
||||
if( numToAdd > data.size() ) {
|
||||
throw new UserException.BadInput("Error during negative model training. Minimum number of variants to use in training is larger than the whole call set. One can attempt to lower the --minNumBadVariants argument but this is unsafe.");
|
||||
}
|
||||
if( numToAdd == minimumNumber ) {
|
||||
logger.warn("WARNING: Training with very few variant sites! Please check the model reporting PDF to ensure the quality of the model is reliable.");
|
||||
bottomPercentage = ((float) numToAdd) / ((float) data.size());
|
||||
}
|
||||
int index = 0;
|
||||
int numAdded = 0;
|
||||
while( numAdded < numToAdd ) {
|
||||
|
|
|
|||
|
|
@ -225,7 +225,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
dataManager.normalizeData(); // Each data point is now (x - mean) / standard deviation
|
||||
final GaussianMixtureModel goodModel = engine.generateModel( dataManager.getTrainingData() );
|
||||
engine.evaluateData( dataManager.getData(), goodModel, false );
|
||||
final GaussianMixtureModel badModel = engine.generateModel( dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS ) );
|
||||
final GaussianMixtureModel badModel = engine.generateModel( dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS, VRAC.MIN_NUM_BAD_VARIANTS ) );
|
||||
engine.evaluateData( dataManager.getData(), badModel, true );
|
||||
|
||||
final ExpandingArrayList<VariantDatum> randomData = dataManager.getRandomDataForPlotting( 6000 );
|
||||
|
|
|
|||
|
|
@ -36,4 +36,6 @@ public class VariantRecalibratorArgumentCollection {
|
|||
public double PRIOR_COUNTS = 20.0;
|
||||
@Argument(fullName="percentBadVariants", shortName="percentBad", doc="What percentage of the worst scoring variants to use when building the Gaussian mixture model of bad variants. 0.07 means bottom 7 percent.", required=false)
|
||||
public double PERCENT_BAD_VARIANTS = 0.015;
|
||||
@Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum number of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override the -percentBad argument if necessary.", required=false)
|
||||
public int MIN_NUM_BAD_VARIANTS = 2000;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -32,9 +32,6 @@ public class VariantRecalibratorEngine {
|
|||
}
|
||||
|
||||
public GaussianMixtureModel generateModel( final List<VariantDatum> data ) {
|
||||
if( data.size() < 1500 ) {
|
||||
logger.warn("WARNING: Training with very few variant sites! Please check the model reporting PDF to ensure the quality of the model is reliable.");
|
||||
}
|
||||
final GaussianMixtureModel model = new GaussianMixtureModel( VRAC.MAX_GAUSSIANS, data.get(0).annotations.length, VRAC.SHRINKAGE, VRAC.DIRICHLET_PARAMETER, VRAC.PRIOR_COUNTS );
|
||||
variationalBayesExpectationMaximization( model, data );
|
||||
return model;
|
||||
|
|
|
|||
|
|
@ -48,6 +48,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
" -L 20:1,000,000-40,000,000" +
|
||||
" -an QD -an HaplotypeScore -an HRun" +
|
||||
" -percentBad 0.07" +
|
||||
" --minNumBadVariants 0" +
|
||||
" --trustAllPolymorphic" + // for speed
|
||||
" -recalFile %s" +
|
||||
" -tranchesFile %s",
|
||||
|
|
|
|||
Loading…
Reference in New Issue