Moving some math around in VQSRv2.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5431 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2011-03-12 15:15:05 +00:00
parent d01d4fdeb5
commit ceb08f9ee6
3 changed files with 42 additions and 23 deletions

View File

@ -28,8 +28,9 @@ public class GaussianMixtureModel {
private final ArrayList<MultivariateGaussian> gaussians;
private final double shrinkage;
private final double dirichletParameter;
private final double degreesOfFreedom;
private final double[] empiricalMu; // BUGBUG: move these to VariantData class
private final Matrix empiricalSigma;
private final Matrix empiricalSigma; // BUGBUG: move these to VariantData class
public boolean isModelReadyForEvaluation;
public GaussianMixtureModel( final int numGaussians, final int numAnnotations,
@ -42,6 +43,7 @@ public class GaussianMixtureModel {
}
this.shrinkage = shrinkage;
this.dirichletParameter = dirichletParameter;
degreesOfFreedom = numAnnotations;
empiricalMu = new double[numAnnotations];
empiricalSigma = new Matrix(numAnnotations, numAnnotations);
isModelReadyForEvaluation = false;
@ -57,27 +59,49 @@ public class GaussianMixtureModel {
for( int iii = 0; iii < annotationLines.size(); iii++ ) {
gaussian.mu[iii] = Double.parseDouble(vals[2+iii]); //BUGBUG: recreated here to match the integration tests
for( int jjj = 0; jjj < annotationLines.size(); jjj++ ) {
gaussian.sigma.set(iii, jjj, Double.parseDouble(vals[2+annotationLines.size()+(iii*annotationLines.size())+jjj]) * 1.3); // BUGBUG: VRAC backOff, or get rid of this completely!?
gaussian.sigma.set(iii, jjj, 1.3*Double.parseDouble(vals[2+annotationLines.size()+(iii*annotationLines.size())+jjj])); //BUGBUG: VRAC backoff
}
}
gaussians.add( gaussian );
}
this.shrinkage = 0.0; // not used when evaluating data, BUGBUG: move this to VariantData class
this.dirichletParameter = 0.0; // not used when evaluating data
shrinkage = 0.0; // not used when evaluating data, BUGBUG: move this to VariantData class
dirichletParameter = 0.0; // not used when evaluating data
degreesOfFreedom = 0.0; // not used when evaluating data
empiricalMu = null; // not used when evaluating data
empiricalSigma = null; // not used when evaluating data
isModelReadyForEvaluation = false;
}
public void cacheEmpiricalStats( final List<VariantDatum> data ) {
//final double[][] tmpSigmaVals = new double[empiricalMu.length][empiricalMu.length];
for( int iii = 0; iii < empiricalMu.length; iii++ ) {
empiricalMu[iii] = 0.0;
//for( int jjj = iii; jjj < empiricalMu.length; jjj++ ) {
// tmpSigmaVals[iii][jjj] = 0.0;
//}
}
for( final VariantDatum datum : data ) {
for( int jjj = 0; jjj < empiricalMu.length; jjj++ ) {
empiricalMu[jjj] += datum.annotations[jjj] / ((double) data.size());
for( int iii = 0; iii < empiricalMu.length; iii++ ) {
empiricalMu[iii] += datum.annotations[iii] / ((double) data.size());
}
}
empiricalSigma.setMatrix(0, empiricalMu.length - 1, 0, empiricalMu.length - 1, Matrix.identity(empiricalMu.length, empiricalMu.length)); // BUGBUG: why does the identity matrix work best here?
// is it because of a bug in the old implementation in which std>X variants were still counted in this calculation??
/*
for( final VariantDatum datum : data ) {
for( int iii = 0; iii < empiricalMu.length; iii++ ) {
for( int jjj = 0; jjj < empiricalMu.length; jjj++ ) {
tmpSigmaVals[iii][jjj] += (datum.annotations[iii]-empiricalMu[iii]) * (datum.annotations[jjj]-empiricalMu[jjj]);
}
}
}
empiricalSigma.setMatrix(0, empiricalMu.length - 1, 0, empiricalMu.length - 1, new Matrix(tmpSigmaVals));
empiricalSigma.timesEquals( 1.0 / ((double) data.size()) );
empiricalSigma.timesEquals( 1.0 / (Math.pow(gaussians.size(), 2.0 / ((double) empiricalMu.length))) );
*/
empiricalSigma.setMatrix(0, empiricalMu.length - 1, 0, empiricalMu.length - 1, Matrix.identity(empiricalMu.length, empiricalMu.length));
}
public void initializeRandomModel( final List<VariantDatum> data, final Random rand ) {
@ -94,7 +118,7 @@ public class GaussianMixtureModel {
for( final MultivariateGaussian gaussian : gaussians ) {
gaussian.pMixtureLog10 = Math.log10( 1.0 / ((double) gaussians.size()) );
gaussian.initializeRandomSigma( rand );
gaussian.hyperParameter_a = gaussian.mu.length;
gaussian.hyperParameter_a = degreesOfFreedom;
gaussian.hyperParameter_b = shrinkage;
gaussian.hyperParameter_lambda = dirichletParameter;
}
@ -164,7 +188,7 @@ public class GaussianMixtureModel {
public void maximizationStep( final List<VariantDatum> data ) {
for( final MultivariateGaussian gaussian : gaussians ) {
gaussian.maximizeGaussian( data, empiricalMu, empiricalSigma, shrinkage, dirichletParameter );
gaussian.maximizeGaussian( data, empiricalMu, empiricalSigma, shrinkage, dirichletParameter, degreesOfFreedom );
}
}
@ -186,9 +210,9 @@ public class GaussianMixtureModel {
for(int jjj = 0; jjj < gaussian.mu.length; jjj++ ) {
clusterFile.print(String.format(",%.8f", gaussian.mu[jjj]));
}
for(int jjj = 0; jjj < gaussian.mu.length; jjj++ ) {
for(int ppp = 0; ppp < gaussian.mu.length; ppp++ ) {
clusterFile.print(String.format(",%.8f", (sigmaVals[jjj][ppp] / gaussian.hyperParameter_a) )); // BUGBUG: this is a bug which should be fixed after passing integration tests
for(int iii = 0; iii < gaussian.mu.length; iii++ ) {
for(int jjj = 0; jjj < gaussian.mu.length; jjj++ ) {
clusterFile.print(String.format(",%.8f", (sigmaVals[iii][jjj] / gaussian.hyperParameter_a)));
}
}
clusterFile.println();

View File

@ -18,7 +18,6 @@ public class MultivariateGaussian {
public double pMixtureLog10;
final public double[] mu;
final public Matrix sigma;
private double cachedDeterminant;
public double hyperParameter_a;
public double hyperParameter_b;
public double hyperParameter_lambda;
@ -65,7 +64,6 @@ public class MultivariateGaussian {
Matrix tmp = new Matrix( randSigma );
tmp = tmp.times(tmp.transpose());
sigma.setMatrix(0, mu.length - 1, 0, mu.length - 1, tmp);
cachedDeterminant = sigma.det();
}
public double calculateDistanceFromMeanSquared( final VariantDatum datum ) {
@ -90,9 +88,7 @@ public class MultivariateGaussian {
public void precomputeDenominatorForEvaluation() {
cachedSigmaInverse = sigma.inverse();
cachedDenomLog10 = -1.0 * ( Math.log10(Math.pow(2.0 * Math.PI, ((double) mu.length) / 2.0)) + Math.log10(Math.pow(sigma.det(), 0.5)) );
//BUGBUG: This should be determinant of sigma inverse?
//BUGBUG: Denom --> constant factor log10
cachedDenomLog10 = Math.log10(Math.pow(2.0 * Math.PI, -1.0 * ((double) mu.length) / 2.0)) + Math.log10(Math.pow(sigma.det(), -0.5)) ;
}
public void precomputeDenominatorForVariationalBayes( final double sumHyperParameterLambda ) {
@ -102,7 +98,7 @@ public class MultivariateGaussian {
for(int jjj = 1; jjj < mu.length; jjj++) {
sum += MathUtils.diGamma( (hyperParameter_a + 1.0 - jjj) / 2.0 );
}
sum -= Math.log( cachedDeterminant );
sum -= Math.log( sigma.det() );
sum += Math.log(2.0) * mu.length;
final double gamma = 0.5 * sum;
final double pi = MathUtils.diGamma( hyperParameter_lambda ) - MathUtils.diGamma( sumHyperParameterLambda );
@ -135,7 +131,7 @@ public class MultivariateGaussian {
}
public void maximizeGaussian( final List<VariantDatum> data, final double[] empiricalMu, final Matrix empiricalSigma,
final double SHRINKAGE, final double DIRICHLET_PARAMETER ) {
final double SHRINKAGE, final double DIRICHLET_PARAMETER, final double DEGREES_OF_FREEDOM ) {
double sumProb = 0.0;
Matrix wishart = new Matrix(mu.length, mu.length);
zeroOutMu();
@ -173,11 +169,10 @@ public class MultivariateGaussian {
sigma.plusEquals( empiricalSigma );
sigma.plusEquals( wishart );
cachedDeterminant = sigma.det();
pMixtureLog10 = sumProb; // will be normalized later by GaussianMixtureModel so no need to do it every iteration
hyperParameter_a = sumProb + mu.length;
hyperParameter_a = sumProb + DEGREES_OF_FREEDOM;
hyperParameter_b = sumProb + SHRINKAGE;
hyperParameter_lambda = sumProb + DIRICHLET_PARAMETER;

View File

@ -40,7 +40,7 @@ public class VariantRecalibratorEngine {
}
public GaussianMixtureModel generateModel( final List<VariantDatum> data ) {
final GaussianMixtureModel model = new GaussianMixtureModel( 4, 3, 0.0001, 1000.0 ); //BUGBUG: VRAC.maxGaussians, VRAC.numAnnotations
final GaussianMixtureModel model = new GaussianMixtureModel( 4, 3, 0.0001, 1000.0 ); //BUGBUG: VRAC arguments
variationalBayesExpectationMaximization( model, data );
return model;
}