misc cleanup in VQSR
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5732 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f3bd11a02e
commit
6323fb8673
|
|
@ -210,7 +210,7 @@ public class ContrastiveRecalibrator extends RodWalker<ExpandingArrayList<Varian
|
|||
|
||||
public void onTraversalDone( final ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
dataManager.setData( reduceSum );
|
||||
dataManager.normalizeData();
|
||||
dataManager.normalizeData(); // Each data point is now (x - mean) / standard deviation
|
||||
final GaussianMixtureModel goodModel = engine.generateModel( dataManager.getTrainingData() );
|
||||
engine.evaluateData( dataManager.getData(), goodModel, false );
|
||||
final GaussianMixtureModel badModel = engine.generateModel( dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS ) );
|
||||
|
|
@ -250,12 +250,10 @@ public class ContrastiveRecalibrator extends RodWalker<ExpandingArrayList<Varian
|
|||
|
||||
createArrangeFunction( stream );
|
||||
|
||||
stream.println("pdf(\"" + RSCRIPT_FILE + ".pdf\")");
|
||||
stream.println("pdf(\"" + RSCRIPT_FILE + ".pdf\")"); // Unfortunately this is a huge pdf file, BUGBUG: need to work on reducing the file size
|
||||
|
||||
for(int iii = 0; iii < USE_ANNOTATIONS.length; iii++) {
|
||||
for( int jjj = iii + 1; jjj < USE_ANNOTATIONS.length; jjj++) {
|
||||
//stream.println("png(\"" + RSCRIPT_FILE + "." + USE_ANNOTATIONS[iii] + "." + USE_ANNOTATIONS[jjj] + ".png\", type=\"cairo\", width = 960, height = 960)");
|
||||
//stream.println("pdf(\"" + RSCRIPT_FILE + "." + USE_ANNOTATIONS[iii] + "." + USE_ANNOTATIONS[jjj] + ".pdf\")");
|
||||
logger.info( "Building " + USE_ANNOTATIONS[iii] + " x " + USE_ANNOTATIONS[jjj] + " plot...");
|
||||
|
||||
final ExpandingArrayList<VariantDatum> fakeData = new ExpandingArrayList<VariantDatum>();
|
||||
|
|
@ -266,6 +264,7 @@ public class ContrastiveRecalibrator extends RodWalker<ExpandingArrayList<Varian
|
|||
minAnn2 = Math.min(minAnn2, datum.annotations[jjj]);
|
||||
maxAnn2 = Math.max(maxAnn2, datum.annotations[jjj]);
|
||||
}
|
||||
// Create a fake set of data which spans the full extent of these two annotation dimensions in order to calculate the model PDF projected to 2D
|
||||
for(double ann1 = minAnn1; ann1 <= maxAnn1; ann1+=0.1) {
|
||||
for(double ann2 = minAnn2; ann2 <= maxAnn2; ann2+=0.1) {
|
||||
final VariantDatum datum = new VariantDatum();
|
||||
|
|
|
|||
|
|
@ -42,9 +42,6 @@ public class GaussianMixtureModel {
|
|||
empiricalMu = new double[numAnnotations];
|
||||
empiricalSigma = new Matrix(numAnnotations, numAnnotations);
|
||||
isModelReadyForEvaluation = false;
|
||||
}
|
||||
|
||||
public void cacheEmpiricalStats() {
|
||||
Arrays.fill(empiricalMu, 0.0);
|
||||
empiricalSigma.setMatrix(0, empiricalMu.length - 1, 0, empiricalMu.length - 1, Matrix.identity(empiricalMu.length, empiricalMu.length).times(200.0).inverse());
|
||||
}
|
||||
|
|
@ -75,6 +72,7 @@ public class GaussianMixtureModel {
|
|||
|
||||
int ttt = 0;
|
||||
while( ttt++ < numIterations ) {
|
||||
// E-step: assign each variant to the nearest cluster
|
||||
for( final VariantDatum datum : data ) {
|
||||
double minDistance = Double.MAX_VALUE;
|
||||
MultivariateGaussian minGaussian = null;
|
||||
|
|
@ -89,6 +87,7 @@ public class GaussianMixtureModel {
|
|||
datum.assignment = minGaussian;
|
||||
}
|
||||
|
||||
// M-step: update Gaussian means based on assigned variants
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
gaussian.zeroOutMu();
|
||||
int numAssigned = 0;
|
||||
|
|
@ -188,7 +187,7 @@ public class GaussianMixtureModel {
|
|||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + gaussian.evaluateDatumLog10( datum );
|
||||
}
|
||||
return MathUtils.log10sumLog10(pVarInGaussianLog10);
|
||||
return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
|
||||
}
|
||||
|
||||
public double evaluateDatumMarginalized( final VariantDatum datum ) {
|
||||
|
|
@ -197,6 +196,7 @@ public class GaussianMixtureModel {
|
|||
int numIter = 10;
|
||||
final double[] pVarInGaussianLog10 = new double[gaussians.size()];
|
||||
for( int iii = 0; iii < datum.annotations.length; iii++ ) {
|
||||
// Marginalize over the missing dimension by drawing numIter random values for the missing annotation and averaging the LOD
|
||||
if( datum.isNull[iii] ) {
|
||||
for( int ttt = 0; ttt < numIter; ttt++ ) {
|
||||
datum.annotations[iii] = Normal.staticNextDouble(0.0, 1.0);
|
||||
|
|
|
|||
|
|
@ -94,6 +94,7 @@ public class MultivariateGaussian {
|
|||
}
|
||||
|
||||
public void precomputeDenominatorForVariationalBayes( final double sumHyperParameterLambda ) {
|
||||
// Variational Bayes calculations from Bishop
|
||||
cachedSigmaInverse = sigma.inverse();
|
||||
cachedSigmaInverse.timesEquals( hyperParameter_a );
|
||||
double sum = 0.0;
|
||||
|
|
|
|||
|
|
@ -73,7 +73,7 @@ public class VariantDataManager {
|
|||
throw new UserException.BadInput( "Found annotations with zero variance. They must be excluded before proceeding." );
|
||||
}
|
||||
|
||||
// trim data by standard deviation threshold and place into two sets: data and failingData
|
||||
// trim data by standard deviation threshold and mark failing data for exclusion later
|
||||
for( final VariantDatum datum : data ) {
|
||||
boolean remove = false;
|
||||
for( final double val : datum.annotations ) {
|
||||
|
|
|
|||
|
|
@ -29,7 +29,6 @@ public class VariantRecalibratorEngine {
|
|||
|
||||
public VariantRecalibratorEngine( final VariantRecalibratorArgumentCollection VRAC ) {
|
||||
this.VRAC = VRAC;
|
||||
initialize( this.VRAC );
|
||||
}
|
||||
|
||||
public GaussianMixtureModel generateModel( final List<VariantDatum> data ) {
|
||||
|
|
@ -50,36 +49,28 @@ public class VariantRecalibratorEngine {
|
|||
}
|
||||
}
|
||||
|
||||
/////////////////////////////
|
||||
// Private Methods used for initialization
|
||||
/////////////////////////////
|
||||
|
||||
private void initialize( final VariantRecalibratorArgumentCollection VRAC ) {
|
||||
}
|
||||
|
||||
/////////////////////////////
|
||||
// Private Methods used for generating a GaussianMixtureModel
|
||||
/////////////////////////////
|
||||
|
||||
private void variationalBayesExpectationMaximization( final GaussianMixtureModel model, final List<VariantDatum> data ) {
|
||||
|
||||
model.cacheEmpiricalStats();
|
||||
model.initializeRandomModel( data, VRAC.NUM_KMEANS_ITERATIONS );
|
||||
|
||||
// The VBEM loop
|
||||
model.normalizePMixtureLog10();
|
||||
model.expectationStep( data );
|
||||
double currentLikelihood;
|
||||
double currentChangeInMixtureCoefficients;
|
||||
int iteration = 0;
|
||||
logger.info("Finished iteration " + iteration );
|
||||
while( iteration < VRAC.MAX_ITERATIONS ) {
|
||||
iteration++;
|
||||
model.maximizationStep( data );
|
||||
currentLikelihood = model.normalizePMixtureLog10();
|
||||
model.expectationStep( data );
|
||||
logger.info("Current change in mixture coefficients = " + String.format("%.5f", currentLikelihood));
|
||||
currentChangeInMixtureCoefficients = model.normalizePMixtureLog10();
|
||||
model.expectationStep(data);
|
||||
logger.info("Current change in mixture coefficients = " + String.format("%.5f", currentChangeInMixtureCoefficients));
|
||||
logger.info("Finished iteration " + iteration );
|
||||
if( iteration > 2 && currentLikelihood < MIN_PROB_CONVERGENCE ) {
|
||||
if( iteration > 2 && currentChangeInMixtureCoefficients < MIN_PROB_CONVERGENCE ) {
|
||||
logger.info("Convergence!");
|
||||
break;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue