Fixing non-determinism in single-threaded VQSR by moving references to cern.Normal over to the static random generator available in GenomeAnalysisEngine

This commit is contained in:
Ryan Poplin 2011-07-07 15:02:48 -04:00
parent 54121eb082
commit 50111db2b7
2 changed files with 14 additions and 13 deletions

View File

@ -26,7 +26,6 @@
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import Jama.Matrix;
import cern.jet.random.Normal;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.MathUtils;
@ -97,7 +96,7 @@ public class GaussianMixtureModel {
int ttt = 0;
while( ttt++ < numIterations ) {
// Estep: assign each variant to the nearest cluster
// E step: assign each variant to the nearest cluster
for( final VariantDatum datum : data ) {
double minDistance = Double.MAX_VALUE;
MultivariateGaussian minGaussian = null;
@ -112,7 +111,7 @@ public class GaussianMixtureModel {
datum.assignment = minGaussian;
}
// Mstep: update gaussian means based on assigned variants
// M step: update gaussian means based on assigned variants
for( final MultivariateGaussian gaussian : gaussians ) {
gaussian.zeroOutMu();
int numAssigned = 0;
@ -216,26 +215,29 @@ public class GaussianMixtureModel {
}
public double evaluateDatumMarginalized( final VariantDatum datum ) {
int numVals = 0;
int numSamples = 0;
double sumPVarInGaussian = 0.0;
int numIter = 10;
final int numIterPerMissingAnnotation = 10; // Trade off here between speed of computation and accuracy of the marginalization
final double[] pVarInGaussianLog10 = new double[gaussians.size()];
// for each dimension
for( int iii = 0; iii < datum.annotations.length; iii++ ) {
// marginalize over the missing dimension by drawing X random values for the missing annotation and averaging the lod
// if it is missing marginalize over the missing dimension by drawing X random values for the missing annotation and averaging the lod
if( datum.isNull[iii] ) {
for( int ttt = 0; ttt < numIter; ttt++ ) {
datum.annotations[iii] = Normal.staticNextDouble(0.0, 1.0);
for( int ttt = 0; ttt < numIterPerMissingAnnotation; ttt++ ) {
datum.annotations[iii] = GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); // draw a random sample from the standard normal distribution
// evaluate this random data point
int gaussianIndex = 0;
for( final MultivariateGaussian gaussian : gaussians ) {
pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + gaussian.evaluateDatumLog10( datum );
}
sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10));
numVals++;
// add this sample's probability to the pile in order to take an average in the end
sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10)); // p = 10 ^ Sum(pi_k * p(v|n,k))
numSamples++;
}
}
}
return Math.log10( sumPVarInGaussian / ((double) numVals) );
return Math.log10( sumPVarInGaussian / ((double) numSamples) );
}
}

View File

@ -25,7 +25,6 @@
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import cern.jet.random.Normal;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
@ -91,7 +90,7 @@ public class VariantDataManager {
meanVector[iii] = theMean;
varianceVector[iii] = theSTD;
for( final VariantDatum datum : data ) {
datum.annotations[iii] = ( datum.isNull[iii] ? Normal.staticNextDouble(0.0, 1.0) : ( datum.annotations[iii] - theMean ) / theSTD );
datum.annotations[iii] = ( datum.isNull[iii] ? GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD );
// Each data point is now [ (x - mean) / standard deviation ]
if( annotationKeys.get(iii).toLowerCase().contains("ranksum") && datum.isNull[iii] && datum.annotations[iii] > 0.0 ) {
datum.annotations[iii] /= 3.0;