Merge branch 'master' of ssh://copper.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/stable
This commit is contained in:
commit
ce5a1c2cbe
|
|
@ -26,7 +26,6 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||||
|
|
||||||
import Jama.Matrix;
|
import Jama.Matrix;
|
||||||
import cern.jet.random.Normal;
|
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
import org.broadinstitute.sting.utils.MathUtils;
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
|
|
@ -97,7 +96,7 @@ public class GaussianMixtureModel {
|
||||||
|
|
||||||
int ttt = 0;
|
int ttt = 0;
|
||||||
while( ttt++ < numIterations ) {
|
while( ttt++ < numIterations ) {
|
||||||
// Estep: assign each variant to the nearest cluster
|
// E step: assign each variant to the nearest cluster
|
||||||
for( final VariantDatum datum : data ) {
|
for( final VariantDatum datum : data ) {
|
||||||
double minDistance = Double.MAX_VALUE;
|
double minDistance = Double.MAX_VALUE;
|
||||||
MultivariateGaussian minGaussian = null;
|
MultivariateGaussian minGaussian = null;
|
||||||
|
|
@ -112,7 +111,7 @@ public class GaussianMixtureModel {
|
||||||
datum.assignment = minGaussian;
|
datum.assignment = minGaussian;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Mstep: update gaussian means based on assigned variants
|
// M step: update gaussian means based on assigned variants
|
||||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||||
gaussian.zeroOutMu();
|
gaussian.zeroOutMu();
|
||||||
int numAssigned = 0;
|
int numAssigned = 0;
|
||||||
|
|
@ -216,26 +215,29 @@ public class GaussianMixtureModel {
|
||||||
}
|
}
|
||||||
|
|
||||||
public double evaluateDatumMarginalized( final VariantDatum datum ) {
|
public double evaluateDatumMarginalized( final VariantDatum datum ) {
|
||||||
int numVals = 0;
|
int numSamples = 0;
|
||||||
double sumPVarInGaussian = 0.0;
|
double sumPVarInGaussian = 0.0;
|
||||||
int numIter = 10;
|
final int numIterPerMissingAnnotation = 10; // Trade off here between speed of computation and accuracy of the marginalization
|
||||||
final double[] pVarInGaussianLog10 = new double[gaussians.size()];
|
final double[] pVarInGaussianLog10 = new double[gaussians.size()];
|
||||||
|
// for each dimension
|
||||||
for( int iii = 0; iii < datum.annotations.length; iii++ ) {
|
for( int iii = 0; iii < datum.annotations.length; iii++ ) {
|
||||||
// marginalize over the missing dimension by drawing X random values for the missing annotation and averaging the lod
|
// if it is missing marginalize over the missing dimension by drawing X random values for the missing annotation and averaging the lod
|
||||||
if( datum.isNull[iii] ) {
|
if( datum.isNull[iii] ) {
|
||||||
for( int ttt = 0; ttt < numIter; ttt++ ) {
|
for( int ttt = 0; ttt < numIterPerMissingAnnotation; ttt++ ) {
|
||||||
datum.annotations[iii] = Normal.staticNextDouble(0.0, 1.0);
|
datum.annotations[iii] = GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); // draw a random sample from the standard normal distribution
|
||||||
|
|
||||||
|
// evaluate this random data point
|
||||||
int gaussianIndex = 0;
|
int gaussianIndex = 0;
|
||||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||||
pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + gaussian.evaluateDatumLog10( datum );
|
pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + gaussian.evaluateDatumLog10( datum );
|
||||||
}
|
}
|
||||||
|
|
||||||
sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10));
|
// add this sample's probability to the pile in order to take an average in the end
|
||||||
numVals++;
|
sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10)); // p = 10 ^ Sum(pi_k * p(v|n,k))
|
||||||
|
numSamples++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return Math.log10( sumPVarInGaussian / ((double) numVals) );
|
return Math.log10( sumPVarInGaussian / ((double) numSamples) );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -25,7 +25,6 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||||
|
|
||||||
import cern.jet.random.Normal;
|
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
|
|
@ -91,7 +90,7 @@ public class VariantDataManager {
|
||||||
meanVector[iii] = theMean;
|
meanVector[iii] = theMean;
|
||||||
varianceVector[iii] = theSTD;
|
varianceVector[iii] = theSTD;
|
||||||
for( final VariantDatum datum : data ) {
|
for( final VariantDatum datum : data ) {
|
||||||
datum.annotations[iii] = ( datum.isNull[iii] ? Normal.staticNextDouble(0.0, 1.0) : ( datum.annotations[iii] - theMean ) / theSTD );
|
datum.annotations[iii] = ( datum.isNull[iii] ? GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD );
|
||||||
// Each data point is now [ (x - mean) / standard deviation ]
|
// Each data point is now [ (x - mean) / standard deviation ]
|
||||||
if( annotationKeys.get(iii).toLowerCase().contains("ranksum") && datum.isNull[iii] && datum.annotations[iii] > 0.0 ) {
|
if( annotationKeys.get(iii).toLowerCase().contains("ranksum") && datum.isNull[iii] && datum.annotations[iii] > 0.0 ) {
|
||||||
datum.annotations[iii] /= 3.0;
|
datum.annotations[iii] /= 3.0;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue