Fixing non-determinism in single-threaded VQSR by moving references to cern.Normal over to the static random generator available in GenomeAnalysisEngine
This commit is contained in:
parent
54121eb082
commit
50111db2b7
|
|
@ -26,7 +26,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import Jama.Matrix;
|
||||
import cern.jet.random.Normal;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
|
@ -97,7 +96,7 @@ public class GaussianMixtureModel {
|
|||
|
||||
int ttt = 0;
|
||||
while( ttt++ < numIterations ) {
|
||||
// Estep: assign each variant to the nearest cluster
|
||||
// E step: assign each variant to the nearest cluster
|
||||
for( final VariantDatum datum : data ) {
|
||||
double minDistance = Double.MAX_VALUE;
|
||||
MultivariateGaussian minGaussian = null;
|
||||
|
|
@ -112,7 +111,7 @@ public class GaussianMixtureModel {
|
|||
datum.assignment = minGaussian;
|
||||
}
|
||||
|
||||
// Mstep: update gaussian means based on assigned variants
|
||||
// M step: update gaussian means based on assigned variants
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
gaussian.zeroOutMu();
|
||||
int numAssigned = 0;
|
||||
|
|
@ -216,26 +215,29 @@ public class GaussianMixtureModel {
|
|||
}
|
||||
|
||||
public double evaluateDatumMarginalized( final VariantDatum datum ) {
|
||||
int numVals = 0;
|
||||
int numSamples = 0;
|
||||
double sumPVarInGaussian = 0.0;
|
||||
int numIter = 10;
|
||||
final int numIterPerMissingAnnotation = 10; // Trade off here between speed of computation and accuracy of the marginalization
|
||||
final double[] pVarInGaussianLog10 = new double[gaussians.size()];
|
||||
// for each dimension
|
||||
for( int iii = 0; iii < datum.annotations.length; iii++ ) {
|
||||
// marginalize over the missing dimension by drawing X random values for the missing annotation and averaging the lod
|
||||
// if it is missing marginalize over the missing dimension by drawing X random values for the missing annotation and averaging the lod
|
||||
if( datum.isNull[iii] ) {
|
||||
for( int ttt = 0; ttt < numIter; ttt++ ) {
|
||||
datum.annotations[iii] = Normal.staticNextDouble(0.0, 1.0);
|
||||
for( int ttt = 0; ttt < numIterPerMissingAnnotation; ttt++ ) {
|
||||
datum.annotations[iii] = GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); // draw a random sample from the standard normal distribution
|
||||
|
||||
// evaluate this random data point
|
||||
int gaussianIndex = 0;
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + gaussian.evaluateDatumLog10( datum );
|
||||
}
|
||||
|
||||
sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10));
|
||||
numVals++;
|
||||
// add this sample's probability to the pile in order to take an average in the end
|
||||
sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10)); // p = 10 ^ Sum(pi_k * p(v|n,k))
|
||||
numSamples++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return Math.log10( sumPVarInGaussian / ((double) numVals) );
|
||||
return Math.log10( sumPVarInGaussian / ((double) numSamples) );
|
||||
}
|
||||
}
|
||||
|
|
@ -25,7 +25,6 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import cern.jet.random.Normal;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
|
|
@ -91,7 +90,7 @@ public class VariantDataManager {
|
|||
meanVector[iii] = theMean;
|
||||
varianceVector[iii] = theSTD;
|
||||
for( final VariantDatum datum : data ) {
|
||||
datum.annotations[iii] = ( datum.isNull[iii] ? Normal.staticNextDouble(0.0, 1.0) : ( datum.annotations[iii] - theMean ) / theSTD );
|
||||
datum.annotations[iii] = ( datum.isNull[iii] ? GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD );
|
||||
// Each data point is now [ (x - mean) / standard deviation ]
|
||||
if( annotationKeys.get(iii).toLowerCase().contains("ranksum") && datum.isNull[iii] && datum.annotations[iii] > 0.0 ) {
|
||||
datum.annotations[iii] /= 3.0;
|
||||
|
|
|
|||
Loading…
Reference in New Issue