Intermediate optimization checkin. LinearExact model now about 10-20% faster than previous commit, by reorganizing and optimizing the if statements and genotype likelihood calculations. Next commit will include a banded implementation.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5362 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-03-02 22:01:35 +00:00
parent f0f4bc3363
commit 0181d95fe4
1 changed files with 280 additions and 593 deletions

View File

@ -38,24 +38,32 @@ import java.util.*;
import java.io.PrintStream;
public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
//
// code for testing purposes
//
private final static boolean DEBUG = false;
private final static boolean PRINT_LIKELIHOODS = false;
private final static int N_CYCLES = 1;
private SimpleTimer timerExpt = new SimpleTimer("linearExact");
private SimpleTimer timerGS = new SimpleTimer("linearExactGS");
private final static boolean COMPARE_TO_GS = false;
public enum ExactCalculation {
N2_GOLD_STANDARD,
LINEAR_EXPERIMENTAL
}
private final static boolean COMPARE_TO_GS = false;
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
private boolean SIMPLE_GREEDY_GENOTYPER = false;
private static final double[] log10Cache;
private static final double[] jacobianLogTable;
// private static final int JACOBIAN_LOG_TABLE_SIZE = 100001;
// private static final double JACOBIAN_LOG_TABLE_STEP = 0.0001;
private static final int JACOBIAN_LOG_TABLE_SIZE = 101;
private static final double JACOBIAN_LOG_TABLE_STEP = 0.1;
private static final double MAX_JACOBIAN_TOLERANCE = 10.0;
private static final int MAXN = 10000;
private static final int MAXN = 10000; // todo -- warning, this might be hit at some point...
static {
log10Cache = new double[2*MAXN];
@ -66,8 +74,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
log10Cache[k] = Math.log10(k);
for (int k=0; k < JACOBIAN_LOG_TABLE_SIZE; k++) {
jacobianLogTable[k] = Math.log10(1.0+Math.pow(10.0,-((double)k)
* JACOBIAN_LOG_TABLE_STEP));
jacobianLogTable[k] = Math.log10(1.0+Math.pow(10.0,-((double)k) * JACOBIAN_LOG_TABLE_STEP));
}
@ -91,6 +98,22 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( COMPARE_TO_GS ) // due to annoying special values in incoming array, we have to clone up here
gsPosteriors = log10AlleleFrequencyPosteriors.clone();
// todo -- remove me after testing
if ( N_CYCLES > 1 ) {
for ( int i = 0; i < N_CYCLES; i++) {
timerGS.restart();
linearExactClean(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.clone());
timerGS.stop();
timerExpt.restart();
linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.clone());
timerExpt.stop();
}
System.out.printf("good = %.2f, expt = %.2f, delta = %.2f%n",
timerGS.getElapsedTime(), timerExpt.getElapsedTime(), timerExpt.getElapsedTime()-timerGS.getElapsedTime());
}
int lastK = -1;
switch ( calcToUse ) {
case N2_GOLD_STANDARD:
@ -141,37 +164,45 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
return genotypeLikelihoods;
}
private static class ExactACCache {
// -------------------------------------------------------------------------------------
//
// Linearized, ~O(N), implementation.
//
// -------------------------------------------------------------------------------------
/**
* A simple data structure that holds the current, prev, and prev->prev likelihoods vectors
* for the exact model calculation
*/
private final static class ExactACCache {
double[] kMinus2, kMinus1, kMinus0;
private static double[] create(int n, double defaultValue) {
double[] v = new double[n];
Arrays.fill(v, defaultValue);
return v;
private final static double[] create(int n) {
return new double[n];
}
public ExactACCache(int n, double defaultValue) {
kMinus2 = create(n, defaultValue);
kMinus1 = create(n, defaultValue);
kMinus0 = create(n, defaultValue);
public ExactACCache(int n) {
kMinus2 = create(n);
kMinus1 = create(n);
kMinus0 = create(n);
}
public void rotate() {
final public void rotate() {
double[] tmp = kMinus2;
kMinus2 = kMinus1;
kMinus1 = kMinus0;
kMinus0 = tmp;
}
public double[] getkMinus2() {
final public double[] getkMinus2() {
return kMinus2;
}
public double[] getkMinus1() {
final public double[] getkMinus1() {
return kMinus1;
}
public double[] getkMinus0() {
final public double[] getkMinus0() {
return kMinus0;
}
}
@ -179,6 +210,109 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
public int linearExact(Map<String, Genotype> GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors) {
final int numSamples = GLs.size();
final int numChr = 2*numSamples;
final double[][] genotypeLikelihoods = getGLs(GLs);
final ExactACCache logY = new ExactACCache(numSamples+1);
logY.getkMinus0()[0] = 0.0; // the zero case
double maxLog10L = Double.NEGATIVE_INFINITY;
boolean done = false;
int lastK = -1;
for (int k=0; k <= numChr && ! done; k++ ) {
final double[] kMinus0 = logY.getkMinus0();
if ( k == 0 ) { // special case for k = 0
for ( int j=1; j <= numSamples; j++ ) {
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][GenotypeType.AA.ordinal()];
}
} else { // k > 0
final double[] kMinus1 = logY.getkMinus1();
final double[] kMinus2 = logY.getkMinus2();
for ( int j=1; j <= numSamples; j++ ) {
final double[] gl = genotypeLikelihoods[j];
final double logDenominator = log10Cache[2*j] + log10Cache[2*j-1];
double aa = Double.NEGATIVE_INFINITY;
double ab = Double.NEGATIVE_INFINITY;
if (k < 2*j-1)
aa = log10Cache[2*j-k] + log10Cache[2*j-k-1] + kMinus0[j-1] + gl[GenotypeType.AA.ordinal()];
if (k < 2*j)
ab = log10Cache[2*k] + log10Cache[2*j-k]+ kMinus1[j-1] + gl[GenotypeType.AB.ordinal()];
double log10Max;
if (k > 1) {
final double bb = log10Cache[k] + log10Cache[k-1] + kMinus2[j-1] + gl[GenotypeType.BB.ordinal()];
log10Max = approximateLog10SumLog10(aa, ab, bb);
} else {
// we know we aren't considering the BB case, so we can use an optimized log10 function
log10Max = approximateLog10SumLog10(aa, ab);
}
// finally, update the L(j,k) value
kMinus0[j] = log10Max - logDenominator;
}
}
// update the posteriors vector
final double log10LofK = kMinus0[numSamples];
log10AlleleFrequencyPosteriors[k] = log10LofK + log10AlleleFrequencyPriors[k];
// can we abort early?
lastK = k;
maxLog10L = Math.max(maxLog10L, log10LofK);
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
done = true;
}
logY.rotate();
}
return lastK;
}
final static double approximateLog10SumLog10(double a, double b, double c) {
//return softMax(new double[]{a, b, c});
return approximateLog10SumLog10(approximateLog10SumLog10(a, b), c);
}
final static double approximateLog10SumLog10(double small, double big) {
// make sure small is really the smaller value
if ( small > big ) {
final double t = big;
big = small;
small = t;
}
if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY )
return big;
if (big >= small + MAX_JACOBIAN_TOLERANCE)
return big;
// OK, so |y-x| < tol: we use the following identity then:
// we need to compute log10(10^x + 10^y)
// By Jacobian logarithm identity, this is equal to
// max(x,y) + log10(1+10^-abs(x-y))
// we compute the second term as a table lookup
// with integer quantization
// we have pre-stored correction for 0,0.1,0.2,... 10.0
//final int ind = (int)(((big-small)/JACOBIAN_LOG_TABLE_STEP)); // hard rounding
int ind = (int)(Math.round((big-small)/JACOBIAN_LOG_TABLE_STEP)); // hard rounding
//double z =Math.log10(1+Math.pow(10.0,-diff));
//System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
return big + jacobianLogTable[ind];
}
public int linearExactClean(Map<String, Genotype> GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors) {
int numSamples = GLs.size();
int numChr = 2*numSamples;
double[][] genotypeLikelihoods = getGLs(GLs); // todo -- remove me, not sure this is helping
@ -187,7 +321,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// set posteriors to negative infinity by default:
//Arrays.fill(log10AlleleFrequencyPosteriors, Double.NEGATIVE_INFINITY);
ExactACCache logY = new ExactACCache(numSamples+1, Double.NEGATIVE_INFINITY);
ExactACCache logY = new ExactACCache(numSamples+1);
logY.getkMinus0()[0] = 0.0; // the zero case
double maxLog10L = Double.NEGATIVE_INFINITY;
@ -251,129 +385,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
return lastK;
}
public int gdaN2GoldStandard(Map<String, Genotype> GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors) {
int numSamples = GLs.size();
int numChr = 2*numSamples;
double[][] logYMatrix = new double[1+numSamples][1+numChr];
for (int i=0; i <=numSamples; i++)
for (int j=0; j <=numChr; j++)
logYMatrix[i][j] = Double.NEGATIVE_INFINITY;
//YMatrix[0][0] = 1.0;
logYMatrix[0][0] = 0.0;
int j=0;
for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
j++;
if ( !sample.getValue().hasLikelihoods() )
continue;
//double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
double[] genotypeLikelihoods = sample.getValue().getLikelihoods().getAsVector();
//double logDenominator = Math.log10(2.0*j*(2.0*j-1));
double logDenominator = log10Cache[2*j] + log10Cache[2*j-1];
// special treatment for k=0: iteration reduces to:
//YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()];
logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[GenotypeType.AA.ordinal()];
for (int k=1; k <= 2*j; k++ ) {
//double num = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[GenotypeType.AA.ordinal()];
double logNumerator[];
logNumerator = new double[3];
if (k < 2*j-1)
logNumerator[0] = log10Cache[2*j-k] + log10Cache[2*j-k-1] + logYMatrix[j-1][k] +
genotypeLikelihoods[GenotypeType.AA.ordinal()];
else
logNumerator[0] = Double.NEGATIVE_INFINITY;
if (k < 2*j)
logNumerator[1] = log10Cache[2*k] + log10Cache[2*j-k]+ logYMatrix[j-1][k-1] +
genotypeLikelihoods[GenotypeType.AB.ordinal()];
else
logNumerator[1] = Double.NEGATIVE_INFINITY;
if (k > 1)
logNumerator[2] = log10Cache[k] + log10Cache[k-1] + logYMatrix[j-1][k-2] +
genotypeLikelihoods[GenotypeType.BB.ordinal()];
else
logNumerator[2] = Double.NEGATIVE_INFINITY;
double logNum = softMax(logNumerator);
//YMatrix[j][k] = num/den;
logYMatrix[j][k] = logNum - logDenominator;
}
}
for (int k=0; k <= numChr; k++)
log10AlleleFrequencyPosteriors[k] = logYMatrix[j][k] + log10AlleleFrequencyPriors[k];
return numChr;
}
private final static void printLikelihoods(int numChr, double[][] logYMatrix, double[] log10AlleleFrequencyPriors) {
int j = logYMatrix.length - 1;
System.out.printf("-----------------------------------%n");
for (int k=0; k <= numChr; k++) {
double posterior = logYMatrix[j][k] + log10AlleleFrequencyPriors[k];
System.out.printf(" %4d\t%8.2f\t%8.2f\t%8.2f%n", k, logYMatrix[j][k], log10AlleleFrequencyPriors[k], posterior);
}
}
double softMax(double[] vec) {
// compute naively log10(10^x[0] + 10^x[1]+...)
// return Math.log10(MathUtils.sumLog10(vec));
// better approximation: do Jacobian logarithm function on data pairs
double a = softMaxPair(vec[0],vec[1]);
return softMaxPair(a,vec[2]);
}
static public double softMaxPair(double x, double y) {
if (Double.isInfinite(x))
return y;
if (Double.isInfinite(y))
return x;
if (y >= x + MAX_JACOBIAN_TOLERANCE)
return y;
if (x >= y + MAX_JACOBIAN_TOLERANCE)
return x;
// OK, so |y-x| < tol: we use the following identity then:
// we need to compute log10(10^x + 10^y)
// By Jacobian logarithm identity, this is equal to
// max(x,y) + log10(1+10^-abs(x-y))
// we compute the second term as a table lookup
// with integer quantization
double diff = Math.abs(x-y);
double t1 =x;
if (y > x)
t1 = y;
// t has max(x,y)
// we have pre-stored correction for 0,0.1,0.2,... 10.0
int ind = (int)Math.round(diff/JACOBIAN_LOG_TABLE_STEP);
double t2 = jacobianLogTable[ind];
// gdebug+
//double z =Math.log10(1+Math.pow(10.0,-diff));
//System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
//gdebug-
return t1+t2;
// return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
}
/**
* Can be overridden by concrete subclasses
@ -501,456 +512,132 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
return calls;
}
}
// -------------------------------------------------------------------------------------
//
// Gold standard, but O(N^2), implementation.
//
// TODO -- remove me for clarity in this code
//
// -------------------------------------------------------------------------------------
public int gdaN2GoldStandard(Map<String, Genotype> GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors) {
int numSamples = GLs.size();
int numChr = 2*numSamples;
// working linearized version
//public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// private final static boolean PRINT_LIKELIHOODS = false;
//
// public enum ExactCalculation {
// N2_GOLD_STANDARD,
// LINEAR_EXPERIMENTAL
// }
//
// private final static boolean COMPARE_TO_GS = false;
// private final static boolean PRINT_MAD_AC_POSTERIORS = false;
// private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
//
// private boolean SIMPLE_GREEDY_GENOTYPER = false;
// private static final double[] log10Cache;
// private static final double[] jacobianLogTable;
// private static final int JACOBIAN_LOG_TABLE_SIZE = 101;
// private static final double JACOBIAN_LOG_TABLE_STEP = 0.1;
// private static final double MAX_JACOBIAN_TOLERANCE = 10.0;
// private static final int MAXN = 10000;
//
// static {
// log10Cache = new double[2*MAXN];
// jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE];
//
// log10Cache[0] = Double.NEGATIVE_INFINITY;
// for (int k=1; k < 2*MAXN; k++)
// log10Cache[k] = Math.log10(k);
//
// for (int k=0; k < JACOBIAN_LOG_TABLE_SIZE; k++) {
// jacobianLogTable[k] = Math.log10(1.0+Math.pow(10.0,-((double)k)
// * JACOBIAN_LOG_TABLE_STEP));
//
// }
//
// }
//
// final private ExactCalculation calcToUse;
// protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
// super(UAC, N, logger, verboseWriter);
// calcToUse = UAC.EXACT_CALCULATION_TYPE;
// }
//
// public void getLog10PNonRef(RefMetaDataTracker tracker,
// ReferenceContext ref,
// Map<String, Genotype> GLs,
// double[] log10AlleleFrequencyPriors,
// double[] log10AlleleFrequencyPosteriors) {
// switch ( calcToUse ) {
// case N2_GOLD_STANDARD:
// gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors);
// break;
// case LINEAR_EXPERIMENTAL:
// madByAC(ref, GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors);
// break;
// }
// }
//
// private static final double[][] getGLs(Map<String, Genotype> GLs) {
// double[][] genotypeLikelihoods = new double[GLs.size()+1][];
//
// int j = 0;
// for ( Genotype sample : GLs.values() ) {
// j++;
//
// if ( sample.hasLikelihoods() ) {
// //double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
// genotypeLikelihoods[j] = sample.getLikelihoods().getAsVector();
// }
// }
//
// return genotypeLikelihoods;
// }
//
// private static class ExactACCache {
// double[] kMinus2, kMinus1, kMinus0;
//
// private static double[] create(int n, double defaultValue) {
// double[] v = new double[n];
// Arrays.fill(v, defaultValue);
// return v;
// }
//
// public ExactACCache(int nSamples, double defaultValue) {
// kMinus2 = create(nSamples, defaultValue);
// kMinus1 = create(nSamples, defaultValue);
// kMinus0 = create(nSamples, defaultValue);
// }
//
// public void rotate() {
// double[] tmp = kMinus2;
// kMinus2 = kMinus1;
// kMinus1 = kMinus0;
// kMinus0 = tmp;
// }
//
// public double[] getkMinus2() {
// return kMinus2;
// }
//
// public double[] getkMinus1() {
// return kMinus1;
// }
//
// public double[] getkMinus0() {
// return kMinus0;
// }
// }
//
// public void madByAC(ReferenceContext ref,
// Map<String, Genotype> GLs,
// double[] log10AlleleFrequencyPriors,
// double[] log10AlleleFrequencyPosteriors) {
// // todo -- remove me after testing
// double[] gsPosteriors = log10AlleleFrequencyPosteriors;
// if ( COMPARE_TO_GS ) {
// gsPosteriors = log10AlleleFrequencyPosteriors.clone();
// gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, gsPosteriors);
// }
//
// int numSamples = GLs.size();
// int numChr = 2*numSamples;
// double[][] genotypeLikelihoods = getGLs(GLs); // todo -- remove me, not sure this is helping
//
// // set posteriors to negative infinity by default:
// Arrays.fill(log10AlleleFrequencyPosteriors, Double.NEGATIVE_INFINITY);
//
// // todo -- replace this matrix with 3 vectors (k, k-1, k-2) and cycle through them
// // todo -- this is *CRITICAL* to reduce the algorithm to a true ~linear algorithm
// double[][] logYMatrix = new double[1+numSamples][1+numChr];
// for (int i=0; i <= numSamples; i++) // initialize
// Arrays.fill(logYMatrix[i], Double.NEGATIVE_INFINITY);
// logYMatrix[0][0] = 0.0; // the zero case
//
// double maxLog10L = Double.NEGATIVE_INFINITY;
// boolean done = false;
// int lastK = -1;
//
// // todo -- we may be able to start second loop some way down the calculation, since GdAs loop only
// // todo -- considers part of the matrix as well
// for (int k=0; k <= numChr && ! done; k++ ) {
// if ( k == 0 ) {
// // special case for k = 0
// for ( int j=1; j <= numSamples; j++ ) {
// logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[j][GenotypeType.AA.ordinal()];
// }
// } else { // k > 0
// for ( int j=1; j <= numSamples; j++ ) {
// double[] gl = genotypeLikelihoods[j];
// double logDenominator = log10Cache[2*j] + log10Cache[2*j-1];
//
// double[] logNumerator = {Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY};
// if (k < 2*j-1)
// logNumerator[0] = log10Cache[2*j-k] + log10Cache[2*j-k-1] + logYMatrix[j-1][k] +
// gl[GenotypeType.AA.ordinal()];
//
// if (k < 2*j)
// logNumerator[1] = log10Cache[2*k] + log10Cache[2*j-k]+ logYMatrix[j-1][k-1] +
// gl[GenotypeType.AB.ordinal()];
//
// if (k > 1)
// logNumerator[2] = log10Cache[k] + log10Cache[k-1] + logYMatrix[j-1][k-2] +
// gl[GenotypeType.BB.ordinal()];
//
// logYMatrix[j][k] = softMax(logNumerator) - logDenominator;
// }
// }
//
// // update the posteriors vector
// double log10LofK = logYMatrix[numSamples][k];
// log10AlleleFrequencyPosteriors[k] = log10LofK + log10AlleleFrequencyPriors[k];
//
// // can we abort early?
// lastK = k;
// maxLog10L = Math.max(maxLog10L, log10LofK);
// if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
// if ( PRINT_MAD_AC_POSTERIORS )
// System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
// done = true;
// }
// }
//
// if ( PRINT_MAD_AC_POSTERIORS ) {
// System.out.printf("----------------------------------------%n");
// for (int k=0; k <= numChr; k++) {
// System.out.printf(" %d\t%.2f\t%.2f\t%b%n", k,
// log10AlleleFrequencyPosteriors[k], gsPosteriors[k],
// log10AlleleFrequencyPosteriors[k] == gsPosteriors[k]);
// }
// double log10thisPVar = Math.log10(MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors)[0]);
// double log10gsPVar = Math.log10(MathUtils.normalizeFromLog10(gsPosteriors)[0]);
// System.out.printf("MAD_AC\t%d\t%d\t%.2f\t%.2f\t%.6f%n",
// ref.getLocus().getStart(), lastK, log10thisPVar, log10gsPVar, log10thisPVar - log10gsPVar);
// }
//
// if ( PRINT_LIKELIHOODS ) printLikelihoods(numChr, logYMatrix, log10AlleleFrequencyPriors);
// }
//
//
// public void gdaN2GoldStandard(Map<String, Genotype> GLs,
// double[] log10AlleleFrequencyPriors,
// double[] log10AlleleFrequencyPosteriors) {
// int numSamples = GLs.size();
// int numChr = 2*numSamples;
//
// double[][] logYMatrix = new double[1+numSamples][1+numChr];
//
// for (int i=0; i <=numSamples; i++)
// for (int j=0; j <=numChr; j++)
// logYMatrix[i][j] = Double.NEGATIVE_INFINITY;
//
// //YMatrix[0][0] = 1.0;
// logYMatrix[0][0] = 0.0;
// int j=0;
//
// for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
// j++;
//
// if ( !sample.getValue().hasLikelihoods() )
// continue;
//
// //double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
// double[] genotypeLikelihoods = sample.getValue().getLikelihoods().getAsVector();
// //double logDenominator = Math.log10(2.0*j*(2.0*j-1));
// double logDenominator = log10Cache[2*j] + log10Cache[2*j-1];
//
// // special treatment for k=0: iteration reduces to:
// //YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()];
// logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[GenotypeType.AA.ordinal()];
//
// for (int k=1; k <= 2*j; k++ ) {
//
// //double num = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[GenotypeType.AA.ordinal()];
// double logNumerator[];
// logNumerator = new double[3];
// if (k < 2*j-1)
// logNumerator[0] = log10Cache[2*j-k] + log10Cache[2*j-k-1] + logYMatrix[j-1][k] +
// genotypeLikelihoods[GenotypeType.AA.ordinal()];
// else
// logNumerator[0] = Double.NEGATIVE_INFINITY;
//
//
// if (k < 2*j)
// logNumerator[1] = log10Cache[2*k] + log10Cache[2*j-k]+ logYMatrix[j-1][k-1] +
// genotypeLikelihoods[GenotypeType.AB.ordinal()];
// else
// logNumerator[1] = Double.NEGATIVE_INFINITY;
//
// if (k > 1)
// logNumerator[2] = log10Cache[k] + log10Cache[k-1] + logYMatrix[j-1][k-2] +
// genotypeLikelihoods[GenotypeType.BB.ordinal()];
// else
// logNumerator[2] = Double.NEGATIVE_INFINITY;
//
// double logNum = softMax(logNumerator);
//
// //YMatrix[j][k] = num/den;
// logYMatrix[j][k] = logNum - logDenominator;
// }
//
// }
//
//
// for (int k=0; k <= numChr; k++)
// log10AlleleFrequencyPosteriors[k] = logYMatrix[j][k] + log10AlleleFrequencyPriors[k];
//
// if ( PRINT_LIKELIHOODS ) printLikelihoods(numChr, logYMatrix, log10AlleleFrequencyPriors);
// }
//
// private final static void printLikelihoods(int numChr, double[][] logYMatrix, double[] log10AlleleFrequencyPriors) {
// int j = logYMatrix.length - 1;
// System.out.printf("-----------------------------------%n");
// for (int k=0; k <= numChr; k++) {
// double posterior = logYMatrix[j][k] + log10AlleleFrequencyPriors[k];
// System.out.printf(" %4d\t%8.2f\t%8.2f\t%8.2f%n", k, logYMatrix[j][k], log10AlleleFrequencyPriors[k], posterior);
// }
// }
//
// double softMax(double[] vec) {
// // compute naively log10(10^x[0] + 10^x[1]+...)
// // return Math.log10(MathUtils.sumLog10(vec));
//
// // better approximation: do Jacobian logarithm function on data pairs
// double a = softMaxPair(vec[0],vec[1]);
// return softMaxPair(a,vec[2]);
// }
//
// static public double softMaxPair(double x, double y) {
// if (Double.isInfinite(x))
// return y;
//
// if (Double.isInfinite(y))
// return x;
//
// if (y >= x + MAX_JACOBIAN_TOLERANCE)
// return y;
// if (x >= y + MAX_JACOBIAN_TOLERANCE)
// return x;
//
// // OK, so |y-x| < tol: we use the following identity then:
// // we need to compute log10(10^x + 10^y)
// // By Jacobian logarithm identity, this is equal to
// // max(x,y) + log10(1+10^-abs(x-y))
// // we compute the second term as a table lookup
// // with integer quantization
// double diff = Math.abs(x-y);
// double t1 =x;
// if (y > x)
// t1 = y;
// // t has max(x,y)
// // we have pre-stored correction for 0,0.1,0.2,... 10.0
// int ind = (int)Math.round(diff/JACOBIAN_LOG_TABLE_STEP);
// double t2 = jacobianLogTable[ind];
//
// // gdebug+
// //double z =Math.log10(1+Math.pow(10.0,-diff));
// //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
// //gdebug-
// return t1+t2;
// // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
// }
//
//
//
// /**
// * Can be overridden by concrete subclasses
// * @param vc variant context with genotype likelihoods
// * @param log10AlleleFrequencyPosteriors allele frequency results
// * @param AFofMaxLikelihood allele frequency of max likelihood
// *
// * @return calls
// */
// public Map<String, Genotype> assignGenotypes(VariantContext vc,
// double[] log10AlleleFrequencyPosteriors,
// int AFofMaxLikelihood) {
// if ( !vc.isVariant() )
// throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());
//
// Allele refAllele = vc.getReference();
// Allele altAllele = vc.getAlternateAllele(0);
//
// Map<String, Genotype> GLs = vc.getGenotypes();
// double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1];
// int[][] tracebackArray = new int[GLs.size()+1][AFofMaxLikelihood+1];
//
// ArrayList<String> sampleIndices = new ArrayList<String>();
// int sampleIdx = 0;
//
// // todo - optimize initialization
// for (int k=0; k <= AFofMaxLikelihood; k++)
// for (int j=0; j <= GLs.size(); j++)
// pathMetricArray[j][k] = -1e30;
//
// pathMetricArray[0][0] = 0.0;
//
// if (SIMPLE_GREEDY_GENOTYPER) {
// sampleIndices.addAll(GLs.keySet());
// sampleIdx = GLs.size();
// }
// else {
//
// for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
// if ( !sample.getValue().hasLikelihoods() )
// continue;
//
// double[] likelihoods = sample.getValue().getLikelihoods().getAsVector();
// sampleIndices.add(sample.getKey());
//
// for (int k=0; k <= AFofMaxLikelihood; k++) {
//
// double bestMetric = pathMetricArray[sampleIdx][k] + likelihoods[0];
// int bestIndex = k;
//
// if (k>0) {
// double m2 = pathMetricArray[sampleIdx][k-1] + likelihoods[1];
// if (m2 > bestMetric) {
// bestMetric = m2;
// bestIndex = k-1;
// }
// }
//
// if (k>1) {
// double m2 = pathMetricArray[sampleIdx][k-2] + likelihoods[2];
// if (m2 > bestMetric) {
// bestMetric = m2;
// bestIndex = k-2;
// }
// }
//
// pathMetricArray[sampleIdx+1][k] = bestMetric;
// tracebackArray[sampleIdx+1][k] = bestIndex;
// }
// sampleIdx++;
// }
// }
//
// HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
//
// int startIdx = AFofMaxLikelihood;
// for (int k = sampleIdx; k > 0; k--) {
// int bestGTguess;
// String sample = sampleIndices.get(k-1);
// Genotype g = GLs.get(sample);
// if ( !g.hasLikelihoods() )
// continue;
//
// if (SIMPLE_GREEDY_GENOTYPER)
// bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
// else {
// int newIdx = tracebackArray[k][startIdx];
// bestGTguess = startIdx - newIdx;
// startIdx = newIdx;
// }
//
// ArrayList<Allele> myAlleles = new ArrayList<Allele>();
//
// double qual;
// double[] likelihoods = g.getLikelihoods().getAsVector();
//
// if (bestGTguess == 0) {
// myAlleles.add(refAllele);
// myAlleles.add(refAllele);
// qual = likelihoods[0] - Math.max(likelihoods[1], likelihoods[2]);
// } else if(bestGTguess == 1) {
// myAlleles.add(refAllele);
// myAlleles.add(altAllele);
// qual = likelihoods[1] - Math.max(likelihoods[0], likelihoods[2]);
//
// } else {
// myAlleles.add(altAllele);
// myAlleles.add(altAllele);
// qual = likelihoods[2] - Math.max(likelihoods[1], likelihoods[0]);
// }
//
//
// if (qual < 0) {
// // QUAL can be negative if the chosen genotype is not the most likely one individually.
// // In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen on
// double[] normalized = MathUtils.normalizeFromLog10(likelihoods);
// double chosenGenotype = normalized[bestGTguess];
// qual = -1.0 * Math.log10(1.0 - chosenGenotype);
// }
//
// calls.put(sample, new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false));
//
// }
//
// return calls;
// }
//
//}
double[][] logYMatrix = new double[1+numSamples][1+numChr];
for (int i=0; i <=numSamples; i++)
for (int j=0; j <=numChr; j++)
logYMatrix[i][j] = Double.NEGATIVE_INFINITY;
//YMatrix[0][0] = 1.0;
logYMatrix[0][0] = 0.0;
int j=0;
for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
j++;
if ( !sample.getValue().hasLikelihoods() )
continue;
//double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
double[] genotypeLikelihoods = sample.getValue().getLikelihoods().getAsVector();
//double logDenominator = Math.log10(2.0*j*(2.0*j-1));
double logDenominator = log10Cache[2*j] + log10Cache[2*j-1];
// special treatment for k=0: iteration reduces to:
//YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()];
logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[GenotypeType.AA.ordinal()];
for (int k=1; k <= 2*j; k++ ) {
//double num = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[GenotypeType.AA.ordinal()];
double logNumerator[];
logNumerator = new double[3];
if (k < 2*j-1)
logNumerator[0] = log10Cache[2*j-k] + log10Cache[2*j-k-1] + logYMatrix[j-1][k] +
genotypeLikelihoods[GenotypeType.AA.ordinal()];
else
logNumerator[0] = Double.NEGATIVE_INFINITY;
if (k < 2*j)
logNumerator[1] = log10Cache[2*k] + log10Cache[2*j-k]+ logYMatrix[j-1][k-1] +
genotypeLikelihoods[GenotypeType.AB.ordinal()];
else
logNumerator[1] = Double.NEGATIVE_INFINITY;
if (k > 1)
logNumerator[2] = log10Cache[k] + log10Cache[k-1] + logYMatrix[j-1][k-2] +
genotypeLikelihoods[GenotypeType.BB.ordinal()];
else
logNumerator[2] = Double.NEGATIVE_INFINITY;
double logNum = softMax(logNumerator);
//YMatrix[j][k] = num/den;
logYMatrix[j][k] = logNum - logDenominator;
}
}
for (int k=0; k <= numChr; k++)
log10AlleleFrequencyPosteriors[k] = logYMatrix[j][k] + log10AlleleFrequencyPriors[k];
return numChr;
}
private final static void printLikelihoods(int numChr, double[][] logYMatrix, double[] log10AlleleFrequencyPriors) {
int j = logYMatrix.length - 1;
System.out.printf("-----------------------------------%n");
for (int k=0; k <= numChr; k++) {
double posterior = logYMatrix[j][k] + log10AlleleFrequencyPriors[k];
System.out.printf(" %4d\t%8.2f\t%8.2f\t%8.2f%n", k, logYMatrix[j][k], log10AlleleFrequencyPriors[k], posterior);
}
}
double softMax(double[] vec) {
// compute naively log10(10^x[0] + 10^x[1]+...)
// return Math.log10(MathUtils.sumLog10(vec));
// better approximation: do Jacobian logarithm function on data pairs
double a = softMaxPair(vec[0],vec[1]);
return softMaxPair(a,vec[2]);
}
static public double softMaxPair(double x, double y) {
if (Double.isInfinite(x))
return y;
if (Double.isInfinite(y))
return x;
if (y >= x + MAX_JACOBIAN_TOLERANCE)
return y;
if (x >= y + MAX_JACOBIAN_TOLERANCE)
return x;
// OK, so |y-x| < tol: we use the following identity then:
// we need to compute log10(10^x + 10^y)
// By Jacobian logarithm identity, this is equal to
// max(x,y) + log10(1+10^-abs(x-y))
// we compute the second term as a table lookup
// with integer quantization
double diff = Math.abs(x-y);
double t1 =x;
if (y > x)
t1 = y;
// t has max(x,y)
// we have pre-stored correction for 0,0.1,0.2,... 10.0
int ind = (int)Math.round(diff/JACOBIAN_LOG_TABLE_STEP);
double t2 = jacobianLogTable[ind];
// gdebug+
//double z =Math.log10(1+Math.pow(10.0,-diff));
//System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind);
//gdebug-
return t1+t2;
// return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
}
}