VQSR now detects when the negative model fails to converge because it was trained on too few data points, and automatically retries with more appropriate clustering parameters (a lower maximum number of Gaussians).
This commit is contained in:
parent
59945a41e8
commit
40fbeafa37
|
|
@ -52,6 +52,7 @@ public class GaussianMixtureModel {
|
|||
private final double[] empiricalMu;
|
||||
private final Matrix empiricalSigma;
|
||||
public boolean isModelReadyForEvaluation;
|
||||
public boolean failedToConverge = false;
|
||||
|
||||
public GaussianMixtureModel( final int numGaussians, final int numAnnotations,
|
||||
final double shrinkage, final double dirichletParameter, final double priorCounts ) {
|
||||
|
|
|
|||
|
|
@ -309,8 +309,23 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
engine.evaluateData( dataManager.getData(), goodModel, false );
|
||||
|
||||
// Generate the negative model using the worst performing data and evaluate each variant contrastively
|
||||
final GaussianMixtureModel badModel = engine.generateModel( dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS, VRAC.MIN_NUM_BAD_VARIANTS ) );
|
||||
final ExpandingArrayList<VariantDatum> negativeTrainingData = dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS, VRAC.MIN_NUM_BAD_VARIANTS );
|
||||
GaussianMixtureModel badModel = engine.generateModel( negativeTrainingData );
|
||||
engine.evaluateData( dataManager.getData(), badModel, true );
|
||||
|
||||
// Detect if the negative model failed to converge because of too few points and/or too many Gaussians and try again
|
||||
while( badModel.failedToConverge && VRAC.MAX_GAUSSIANS > 4 ) {
|
||||
logger.info("Negative model failed to converge. Retrying...");
|
||||
VRAC.MAX_GAUSSIANS--;
|
||||
badModel = engine.generateModel( negativeTrainingData );
|
||||
engine.evaluateData( dataManager.getData(), goodModel, false );
|
||||
engine.evaluateData( dataManager.getData(), badModel, true );
|
||||
}
|
||||
|
||||
if( badModel.failedToConverge || goodModel.failedToConverge ) {
|
||||
throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider raising the number of variants used to train the negative model (via --percentBadVariants 0.05, for example) or lowering the maximum number of Gaussians to use in the model (via --maxGaussians 4, for example)");
|
||||
}
|
||||
|
||||
engine.calculateWorstPerformingAnnotation( dataManager.getData(), goodModel, badModel );
|
||||
|
||||
// Find the VQSLOD cutoff values which correspond to the various tranches of calls requested by the user
|
||||
|
|
|
|||
|
|
@ -67,14 +67,20 @@ public class VariantRecalibratorEngine {
|
|||
|
||||
public void evaluateData( final List<VariantDatum> data, final GaussianMixtureModel model, final boolean evaluateContrastively ) {
|
||||
if( !model.isModelReadyForEvaluation ) {
|
||||
model.precomputeDenominatorForEvaluation();
|
||||
try {
|
||||
model.precomputeDenominatorForEvaluation();
|
||||
} catch( Exception e ) {
|
||||
model.failedToConverge = true;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
logger.info("Evaluating full set of " + data.size() + " variants...");
|
||||
for( final VariantDatum datum : data ) {
|
||||
final double thisLod = evaluateDatum( datum, model );
|
||||
if( Double.isNaN(thisLod) ) {
|
||||
throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider raising the number of variants used to train the negative model (via --percentBadVariants 0.05, for example) or lowering the maximum number of Gaussians to use in the model (via --maxGaussians 4, for example)");
|
||||
model.failedToConverge = true;
|
||||
return;
|
||||
}
|
||||
|
||||
datum.lod = ( evaluateContrastively ?
|
||||
|
|
|
|||
Loading…
Reference in New Issue