VQSR now detects when the negative model fails to converge because it was trained on too few data points, and automatically retries with more appropriate clustering parameters (a lower maximum number of Gaussians).

This commit is contained in:
Ryan Poplin 2011-11-11 11:52:30 -05:00
parent 59945a41e8
commit 40fbeafa37
3 changed files with 25 additions and 3 deletions

View File

@ -52,6 +52,7 @@ public class GaussianMixtureModel {
private final double[] empiricalMu;
private final Matrix empiricalSigma;
public boolean isModelReadyForEvaluation;
public boolean failedToConverge = false;
public GaussianMixtureModel( final int numGaussians, final int numAnnotations,
final double shrinkage, final double dirichletParameter, final double priorCounts ) {

View File

@ -309,8 +309,23 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
engine.evaluateData( dataManager.getData(), goodModel, false );
// Generate the negative model using the worst performing data and evaluate each variant contrastively
final GaussianMixtureModel badModel = engine.generateModel( dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS, VRAC.MIN_NUM_BAD_VARIANTS ) );
final ExpandingArrayList<VariantDatum> negativeTrainingData = dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS, VRAC.MIN_NUM_BAD_VARIANTS );
GaussianMixtureModel badModel = engine.generateModel( negativeTrainingData );
engine.evaluateData( dataManager.getData(), badModel, true );
// Detect if the negative model failed to converge because of too few points and/or too many Gaussians and try again
while( badModel.failedToConverge && VRAC.MAX_GAUSSIANS > 4 ) {
logger.info("Negative model failed to converge. Retrying...");
VRAC.MAX_GAUSSIANS--;
badModel = engine.generateModel( negativeTrainingData );
engine.evaluateData( dataManager.getData(), goodModel, false );
engine.evaluateData( dataManager.getData(), badModel, true );
}
if( badModel.failedToConverge || goodModel.failedToConverge ) {
throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider raising the number of variants used to train the negative model (via --percentBadVariants 0.05, for example) or lowering the maximum number of Gaussians to use in the model (via --maxGaussians 4, for example)");
}
engine.calculateWorstPerformingAnnotation( dataManager.getData(), goodModel, badModel );
// Find the VQSLOD cutoff values which correspond to the various tranches of calls requested by the user

View File

@ -67,14 +67,20 @@ public class VariantRecalibratorEngine {
public void evaluateData( final List<VariantDatum> data, final GaussianMixtureModel model, final boolean evaluateContrastively ) {
if( !model.isModelReadyForEvaluation ) {
model.precomputeDenominatorForEvaluation();
try {
model.precomputeDenominatorForEvaluation();
} catch( Exception e ) {
model.failedToConverge = true;
return;
}
}
logger.info("Evaluating full set of " + data.size() + " variants...");
for( final VariantDatum datum : data ) {
final double thisLod = evaluateDatum( datum, model );
if( Double.isNaN(thisLod) ) {
throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider raising the number of variants used to train the negative model (via --percentBadVariants 0.05, for example) or lowering the maximum number of Gaussians to use in the model (via --maxGaussians 4, for example)");
model.failedToConverge = true;
return;
}
datum.lod = ( evaluateContrastively ?