diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index d5c62cc8d..b82679959 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -38,24 +38,32 @@ import java.util.*; import java.io.PrintStream; public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { + // + // code for testing purposes + // private final static boolean DEBUG = false; private final static boolean PRINT_LIKELIHOODS = false; + private final static int N_CYCLES = 1; + private SimpleTimer timerExpt = new SimpleTimer("linearExact"); + private SimpleTimer timerGS = new SimpleTimer("linearExactGS"); + private final static boolean COMPARE_TO_GS = false; public enum ExactCalculation { N2_GOLD_STANDARD, LINEAR_EXPERIMENTAL } - private final static boolean COMPARE_TO_GS = false; private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 private boolean SIMPLE_GREEDY_GENOTYPER = false; private static final double[] log10Cache; private static final double[] jacobianLogTable; +// private static final int JACOBIAN_LOG_TABLE_SIZE = 100001; +// private static final double JACOBIAN_LOG_TABLE_STEP = 0.0001; private static final int JACOBIAN_LOG_TABLE_SIZE = 101; private static final double JACOBIAN_LOG_TABLE_STEP = 0.1; private static final double MAX_JACOBIAN_TOLERANCE = 10.0; - private static final int MAXN = 10000; + private static final int MAXN = 10000; // todo -- warning, this might be hit at some point... static { log10Cache = new double[2*MAXN]; @@ -66,8 +74,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { log10Cache[k] = Math.log10(k); for (int k=0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { - jacobianLogTable[k] = Math.log10(1.0+Math.pow(10.0,-((double)k) - * JACOBIAN_LOG_TABLE_STEP)); + jacobianLogTable[k] = Math.log10(1.0+Math.pow(10.0,-((double)k) * JACOBIAN_LOG_TABLE_STEP)); } @@ -91,6 +98,22 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( COMPARE_TO_GS ) // due to annoying special values in incoming array, we have to clone up here gsPosteriors = log10AlleleFrequencyPosteriors.clone(); + // todo -- remove me after testing + if ( N_CYCLES > 1 ) { + for ( int i = 0; i < N_CYCLES; i++) { + timerGS.restart(); + linearExactClean(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.clone()); + timerGS.stop(); + + timerExpt.restart(); + linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.clone()); + timerExpt.stop(); + } + + System.out.printf("good = %.2f, expt = %.2f, delta = %.2f%n", + timerGS.getElapsedTime(), timerExpt.getElapsedTime(), timerExpt.getElapsedTime()-timerGS.getElapsedTime()); + } + int lastK = -1; switch ( calcToUse ) { case N2_GOLD_STANDARD: @@ -141,37 +164,45 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { return genotypeLikelihoods; } - private static class ExactACCache { + // ------------------------------------------------------------------------------------- + // + // Linearized, ~O(N), implementation. + // + // ------------------------------------------------------------------------------------- + + /** + * A simple data structure that holds the current, prev, and prev->prev likelihoods vectors + * for the exact model calculation + */ + private final static class ExactACCache { double[] kMinus2, kMinus1, kMinus0; - private static double[] create(int n, double defaultValue) { - double[] v = new double[n]; - Arrays.fill(v, defaultValue); - return v; + private final static double[] create(int n) { + return new double[n]; } - public ExactACCache(int n, double defaultValue) { - kMinus2 = create(n, defaultValue); - kMinus1 = create(n, defaultValue); - kMinus0 = create(n, defaultValue); + public ExactACCache(int n) { + kMinus2 = create(n); + kMinus1 = create(n); + kMinus0 = create(n); } - public void rotate() { + final public void rotate() { double[] tmp = kMinus2; kMinus2 = kMinus1; kMinus1 = kMinus0; kMinus0 = tmp; } - public double[] getkMinus2() { + final public double[] getkMinus2() { return kMinus2; } - public double[] getkMinus1() { + final public double[] getkMinus1() { return kMinus1; } - public double[] getkMinus0() { + final public double[] getkMinus0() { return kMinus0; } } @@ -179,6 +210,109 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { public int linearExact(Map GLs, double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPosteriors) { + final int numSamples = GLs.size(); + final int numChr = 2*numSamples; + final double[][] genotypeLikelihoods = getGLs(GLs); + + final ExactACCache logY = new ExactACCache(numSamples+1); + logY.getkMinus0()[0] = 0.0; // the zero case + + double maxLog10L = Double.NEGATIVE_INFINITY; + boolean done = false; + int lastK = -1; + + for (int k=0; k <= numChr && ! done; k++ ) { + final double[] kMinus0 = logY.getkMinus0(); + + if ( k == 0 ) { // special case for k = 0 + for ( int j=1; j <= numSamples; j++ ) { + kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][GenotypeType.AA.ordinal()]; + } + } else { // k > 0 + final double[] kMinus1 = logY.getkMinus1(); + final double[] kMinus2 = logY.getkMinus2(); + + for ( int j=1; j <= numSamples; j++ ) { + final double[] gl = genotypeLikelihoods[j]; + final double logDenominator = log10Cache[2*j] + log10Cache[2*j-1]; + + double aa = Double.NEGATIVE_INFINITY; + double ab = Double.NEGATIVE_INFINITY; + if (k < 2*j-1) + aa = log10Cache[2*j-k] + log10Cache[2*j-k-1] + kMinus0[j-1] + gl[GenotypeType.AA.ordinal()]; + + if (k < 2*j) + ab = log10Cache[2*k] + log10Cache[2*j-k]+ kMinus1[j-1] + gl[GenotypeType.AB.ordinal()]; + + double log10Max; + if (k > 1) { + final double bb = log10Cache[k] + log10Cache[k-1] + kMinus2[j-1] + gl[GenotypeType.BB.ordinal()]; + log10Max = approximateLog10SumLog10(aa, ab, bb); + } else { + // we know we aren't considering the BB case, so we can use an optimized log10 function + log10Max = approximateLog10SumLog10(aa, ab); + } + + // finally, update the L(j,k) value + kMinus0[j] = log10Max - logDenominator; + } + } + + // update the posteriors vector + final double log10LofK = kMinus0[numSamples]; + log10AlleleFrequencyPosteriors[k] = log10LofK + log10AlleleFrequencyPriors[k]; + + // can we abort early? + lastK = k; + maxLog10L = Math.max(maxLog10L, log10LofK); + if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { + if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L); + done = true; + } + + logY.rotate(); + } + + return lastK; + } + + final static double approximateLog10SumLog10(double a, double b, double c) { + //return softMax(new double[]{a, b, c}); + return approximateLog10SumLog10(approximateLog10SumLog10(a, b), c); + } + + final static double approximateLog10SumLog10(double small, double big) { + // make sure small is really the smaller value + if ( small > big ) { + final double t = big; + big = small; + small = t; + } + + if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY ) + return big; + + if (big >= small + MAX_JACOBIAN_TOLERANCE) + return big; + + // OK, so |y-x| < tol: we use the following identity then: + // we need to compute log10(10^x + 10^y) + // By Jacobian logarithm identity, this is equal to + // max(x,y) + log10(1+10^-abs(x-y)) + // we compute the second term as a table lookup + // with integer quantization + // we have pre-stored correction for 0,0.1,0.2,... 10.0 + //final int ind = (int)(((big-small)/JACOBIAN_LOG_TABLE_STEP)); // hard rounding + int ind = (int)(Math.round((big-small)/JACOBIAN_LOG_TABLE_STEP)); // hard rounding + + //double z =Math.log10(1+Math.pow(10.0,-diff)); + //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind); + return big + jacobianLogTable[ind]; + } + + public int linearExactClean(Map GLs, + double[] log10AlleleFrequencyPriors, + double[] log10AlleleFrequencyPosteriors) { int numSamples = GLs.size(); int numChr = 2*numSamples; double[][] genotypeLikelihoods = getGLs(GLs); // todo -- remove me, not sure this is helping @@ -187,7 +321,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // set posteriors to negative infinity by default: //Arrays.fill(log10AlleleFrequencyPosteriors, Double.NEGATIVE_INFINITY); - ExactACCache logY = new ExactACCache(numSamples+1, Double.NEGATIVE_INFINITY); + ExactACCache logY = new ExactACCache(numSamples+1); logY.getkMinus0()[0] = 0.0; // the zero case double maxLog10L = Double.NEGATIVE_INFINITY; @@ -251,129 +385,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { return lastK; } - public int gdaN2GoldStandard(Map GLs, - double[] log10AlleleFrequencyPriors, - double[] log10AlleleFrequencyPosteriors) { - int numSamples = GLs.size(); - int numChr = 2*numSamples; - - double[][] logYMatrix = new double[1+numSamples][1+numChr]; - - for (int i=0; i <=numSamples; i++) - for (int j=0; j <=numChr; j++) - logYMatrix[i][j] = Double.NEGATIVE_INFINITY; - - //YMatrix[0][0] = 1.0; - logYMatrix[0][0] = 0.0; - int j=0; - - for ( Map.Entry sample : GLs.entrySet() ) { - j++; - - if ( !sample.getValue().hasLikelihoods() ) - continue; - - //double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods()); - double[] genotypeLikelihoods = sample.getValue().getLikelihoods().getAsVector(); - //double logDenominator = Math.log10(2.0*j*(2.0*j-1)); - double logDenominator = log10Cache[2*j] + log10Cache[2*j-1]; - - // special treatment for k=0: iteration reduces to: - //YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()]; - logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[GenotypeType.AA.ordinal()]; - - for (int k=1; k <= 2*j; k++ ) { - - //double num = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[GenotypeType.AA.ordinal()]; - double logNumerator[]; - logNumerator = new double[3]; - if (k < 2*j-1) - logNumerator[0] = log10Cache[2*j-k] + log10Cache[2*j-k-1] + logYMatrix[j-1][k] + - genotypeLikelihoods[GenotypeType.AA.ordinal()]; - else - logNumerator[0] = Double.NEGATIVE_INFINITY; - - - if (k < 2*j) - logNumerator[1] = log10Cache[2*k] + log10Cache[2*j-k]+ logYMatrix[j-1][k-1] + - genotypeLikelihoods[GenotypeType.AB.ordinal()]; - else - logNumerator[1] = Double.NEGATIVE_INFINITY; - - if (k > 1) - logNumerator[2] = log10Cache[k] + log10Cache[k-1] + logYMatrix[j-1][k-2] + - genotypeLikelihoods[GenotypeType.BB.ordinal()]; - else - logNumerator[2] = Double.NEGATIVE_INFINITY; - - double logNum = softMax(logNumerator); - - //YMatrix[j][k] = num/den; - logYMatrix[j][k] = logNum - logDenominator; - } - - } - - for (int k=0; k <= numChr; k++) - log10AlleleFrequencyPosteriors[k] = logYMatrix[j][k] + log10AlleleFrequencyPriors[k]; - - return numChr; - } - - private final static void printLikelihoods(int numChr, double[][] logYMatrix, double[] log10AlleleFrequencyPriors) { - int j = logYMatrix.length - 1; - System.out.printf("-----------------------------------%n"); - for (int k=0; k <= numChr; k++) { - double posterior = logYMatrix[j][k] + log10AlleleFrequencyPriors[k]; - System.out.printf(" %4d\t%8.2f\t%8.2f\t%8.2f%n", k, logYMatrix[j][k], log10AlleleFrequencyPriors[k], posterior); - } - } - - double softMax(double[] vec) { - // compute naively log10(10^x[0] + 10^x[1]+...) - // return Math.log10(MathUtils.sumLog10(vec)); - - // better approximation: do Jacobian logarithm function on data pairs - double a = softMaxPair(vec[0],vec[1]); - return softMaxPair(a,vec[2]); - } - - static public double softMaxPair(double x, double y) { - if (Double.isInfinite(x)) - return y; - - if (Double.isInfinite(y)) - return x; - - if (y >= x + MAX_JACOBIAN_TOLERANCE) - return y; - if (x >= y + MAX_JACOBIAN_TOLERANCE) - return x; - - // OK, so |y-x| < tol: we use the following identity then: - // we need to compute log10(10^x + 10^y) - // By Jacobian logarithm identity, this is equal to - // max(x,y) + log10(1+10^-abs(x-y)) - // we compute the second term as a table lookup - // with integer quantization - double diff = Math.abs(x-y); - double t1 =x; - if (y > x) - t1 = y; - // t has max(x,y) - // we have pre-stored correction for 0,0.1,0.2,... 10.0 - int ind = (int)Math.round(diff/JACOBIAN_LOG_TABLE_STEP); - double t2 = jacobianLogTable[ind]; - - // gdebug+ - //double z =Math.log10(1+Math.pow(10.0,-diff)); - //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind); - //gdebug- - return t1+t2; - // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y)); - } - - /** * Can be overridden by concrete subclasses @@ -501,456 +512,132 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { return calls; } -} + // ------------------------------------------------------------------------------------- + // + // Gold standard, but O(N^2), implementation. + // + // TODO -- remove me for clarity in this code + // + // ------------------------------------------------------------------------------------- + public int gdaN2GoldStandard(Map GLs, + double[] log10AlleleFrequencyPriors, + double[] log10AlleleFrequencyPosteriors) { + int numSamples = GLs.size(); + int numChr = 2*numSamples; -// working linearized version -//public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { -// private final static boolean PRINT_LIKELIHOODS = false; -// -// public enum ExactCalculation { -// N2_GOLD_STANDARD, -// LINEAR_EXPERIMENTAL -// } -// -// private final static boolean COMPARE_TO_GS = false; -// private final static boolean PRINT_MAD_AC_POSTERIORS = false; -// private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 -// -// private boolean SIMPLE_GREEDY_GENOTYPER = false; -// private static final double[] log10Cache; -// private static final double[] jacobianLogTable; -// private static final int JACOBIAN_LOG_TABLE_SIZE = 101; -// private static final double JACOBIAN_LOG_TABLE_STEP = 0.1; -// private static final double MAX_JACOBIAN_TOLERANCE = 10.0; -// private static final int MAXN = 10000; -// -// static { -// log10Cache = new double[2*MAXN]; -// jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; -// -// log10Cache[0] = Double.NEGATIVE_INFINITY; -// for (int k=1; k < 2*MAXN; k++) -// log10Cache[k] = Math.log10(k); -// -// for (int k=0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { -// jacobianLogTable[k] = Math.log10(1.0+Math.pow(10.0,-((double)k) -// * JACOBIAN_LOG_TABLE_STEP)); -// -// } -// -// } -// -// final private ExactCalculation calcToUse; -// protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { -// super(UAC, N, logger, verboseWriter); -// calcToUse = UAC.EXACT_CALCULATION_TYPE; -// } -// -// public void getLog10PNonRef(RefMetaDataTracker tracker, -// ReferenceContext ref, -// Map GLs, -// double[] log10AlleleFrequencyPriors, -// double[] log10AlleleFrequencyPosteriors) { -// switch ( calcToUse ) { -// case N2_GOLD_STANDARD: -// gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors); -// break; -// case LINEAR_EXPERIMENTAL: -// madByAC(ref, GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors); -// break; -// } -// } -// -// private static final double[][] getGLs(Map GLs) { -// double[][] genotypeLikelihoods = new double[GLs.size()+1][]; -// -// int j = 0; -// for ( Genotype sample : GLs.values() ) { -// j++; -// -// if ( sample.hasLikelihoods() ) { -// //double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods()); -// genotypeLikelihoods[j] = sample.getLikelihoods().getAsVector(); -// } -// } -// -// return genotypeLikelihoods; -// } -// -// private static class ExactACCache { -// double[] kMinus2, kMinus1, kMinus0; -// -// private static double[] create(int n, double defaultValue) { -// double[] v = new double[n]; -// Arrays.fill(v, defaultValue); -// return v; -// } -// -// public ExactACCache(int nSamples, double defaultValue) { -// kMinus2 = create(nSamples, defaultValue); -// kMinus1 = create(nSamples, defaultValue); -// kMinus0 = create(nSamples, defaultValue); -// } -// -// public void rotate() { -// double[] tmp = kMinus2; -// kMinus2 = kMinus1; -// kMinus1 = kMinus0; -// kMinus0 = tmp; -// } -// -// public double[] getkMinus2() { -// return kMinus2; -// } -// -// public double[] getkMinus1() { -// return kMinus1; -// } -// -// public double[] getkMinus0() { -// return kMinus0; -// } -// } -// -// public void madByAC(ReferenceContext ref, -// Map GLs, -// double[] log10AlleleFrequencyPriors, -// double[] log10AlleleFrequencyPosteriors) { -// // todo -- remove me after testing -// double[] gsPosteriors = log10AlleleFrequencyPosteriors; -// if ( COMPARE_TO_GS ) { -// gsPosteriors = log10AlleleFrequencyPosteriors.clone(); -// gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, gsPosteriors); -// } -// -// int numSamples = GLs.size(); -// int numChr = 2*numSamples; -// double[][] genotypeLikelihoods = getGLs(GLs); // todo -- remove me, not sure this is helping -// -// // set posteriors to negative infinity by default: -// Arrays.fill(log10AlleleFrequencyPosteriors, Double.NEGATIVE_INFINITY); -// -// // todo -- replace this matrix with 3 vectors (k, k-1, k-2) and cycle through them -// // todo -- this is *CRITICAL* to reduce the algorithm to a true ~linear algorithm -// double[][] logYMatrix = new double[1+numSamples][1+numChr]; -// for (int i=0; i <= numSamples; i++) // initialize -// Arrays.fill(logYMatrix[i], Double.NEGATIVE_INFINITY); -// logYMatrix[0][0] = 0.0; // the zero case -// -// double maxLog10L = Double.NEGATIVE_INFINITY; -// boolean done = false; -// int lastK = -1; -// -// // todo -- we may be able to start second loop some way down the calculation, since GdAs loop only -// // todo -- considers part of the matrix as well -// for (int k=0; k <= numChr && ! done; k++ ) { -// if ( k == 0 ) { -// // special case for k = 0 -// for ( int j=1; j <= numSamples; j++ ) { -// logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[j][GenotypeType.AA.ordinal()]; -// } -// } else { // k > 0 -// for ( int j=1; j <= numSamples; j++ ) { -// double[] gl = genotypeLikelihoods[j]; -// double logDenominator = log10Cache[2*j] + log10Cache[2*j-1]; -// -// double[] logNumerator = {Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY}; -// if (k < 2*j-1) -// logNumerator[0] = log10Cache[2*j-k] + log10Cache[2*j-k-1] + logYMatrix[j-1][k] + -// gl[GenotypeType.AA.ordinal()]; -// -// if (k < 2*j) -// logNumerator[1] = log10Cache[2*k] + log10Cache[2*j-k]+ logYMatrix[j-1][k-1] + -// gl[GenotypeType.AB.ordinal()]; -// -// if (k > 1) -// logNumerator[2] = log10Cache[k] + log10Cache[k-1] + logYMatrix[j-1][k-2] + -// gl[GenotypeType.BB.ordinal()]; -// -// logYMatrix[j][k] = softMax(logNumerator) - logDenominator; -// } -// } -// -// // update the posteriors vector -// double log10LofK = logYMatrix[numSamples][k]; -// log10AlleleFrequencyPosteriors[k] = log10LofK + log10AlleleFrequencyPriors[k]; -// -// // can we abort early? -// lastK = k; -// maxLog10L = Math.max(maxLog10L, log10LofK); -// if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { -// if ( PRINT_MAD_AC_POSTERIORS ) -// System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L); -// done = true; -// } -// } -// -// if ( PRINT_MAD_AC_POSTERIORS ) { -// System.out.printf("----------------------------------------%n"); -// for (int k=0; k <= numChr; k++) { -// System.out.printf(" %d\t%.2f\t%.2f\t%b%n", k, -// log10AlleleFrequencyPosteriors[k], gsPosteriors[k], -// log10AlleleFrequencyPosteriors[k] == gsPosteriors[k]); -// } -// double log10thisPVar = Math.log10(MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors)[0]); -// double log10gsPVar = Math.log10(MathUtils.normalizeFromLog10(gsPosteriors)[0]); -// System.out.printf("MAD_AC\t%d\t%d\t%.2f\t%.2f\t%.6f%n", -// ref.getLocus().getStart(), lastK, log10thisPVar, log10gsPVar, log10thisPVar - log10gsPVar); -// } -// -// if ( PRINT_LIKELIHOODS ) printLikelihoods(numChr, logYMatrix, log10AlleleFrequencyPriors); -// } -// -// -// public void gdaN2GoldStandard(Map GLs, -// double[] log10AlleleFrequencyPriors, -// double[] log10AlleleFrequencyPosteriors) { -// int numSamples = GLs.size(); -// int numChr = 2*numSamples; -// -// double[][] logYMatrix = new double[1+numSamples][1+numChr]; -// -// for (int i=0; i <=numSamples; i++) -// for (int j=0; j <=numChr; j++) -// logYMatrix[i][j] = Double.NEGATIVE_INFINITY; -// -// //YMatrix[0][0] = 1.0; -// logYMatrix[0][0] = 0.0; -// int j=0; -// -// for ( Map.Entry sample : GLs.entrySet() ) { -// j++; -// -// if ( !sample.getValue().hasLikelihoods() ) -// continue; -// -// //double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods()); -// double[] genotypeLikelihoods = sample.getValue().getLikelihoods().getAsVector(); -// //double logDenominator = Math.log10(2.0*j*(2.0*j-1)); -// double logDenominator = log10Cache[2*j] + log10Cache[2*j-1]; -// -// // special treatment for k=0: iteration reduces to: -// //YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()]; -// logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[GenotypeType.AA.ordinal()]; -// -// for (int k=1; k <= 2*j; k++ ) { -// -// //double num = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[GenotypeType.AA.ordinal()]; -// double logNumerator[]; -// logNumerator = new double[3]; -// if (k < 2*j-1) -// logNumerator[0] = log10Cache[2*j-k] + log10Cache[2*j-k-1] + logYMatrix[j-1][k] + -// genotypeLikelihoods[GenotypeType.AA.ordinal()]; -// else -// logNumerator[0] = Double.NEGATIVE_INFINITY; -// -// -// if (k < 2*j) -// logNumerator[1] = log10Cache[2*k] + log10Cache[2*j-k]+ logYMatrix[j-1][k-1] + -// genotypeLikelihoods[GenotypeType.AB.ordinal()]; -// else -// logNumerator[1] = Double.NEGATIVE_INFINITY; -// -// if (k > 1) -// logNumerator[2] = log10Cache[k] + log10Cache[k-1] + logYMatrix[j-1][k-2] + -// genotypeLikelihoods[GenotypeType.BB.ordinal()]; -// else -// logNumerator[2] = Double.NEGATIVE_INFINITY; -// -// double logNum = softMax(logNumerator); -// -// //YMatrix[j][k] = num/den; -// logYMatrix[j][k] = logNum - logDenominator; -// } -// -// } -// -// -// for (int k=0; k <= numChr; k++) -// log10AlleleFrequencyPosteriors[k] = logYMatrix[j][k] + log10AlleleFrequencyPriors[k]; -// -// if ( PRINT_LIKELIHOODS ) printLikelihoods(numChr, logYMatrix, log10AlleleFrequencyPriors); -// } -// -// private final static void printLikelihoods(int numChr, double[][] logYMatrix, double[] log10AlleleFrequencyPriors) { -// int j = logYMatrix.length - 1; -// System.out.printf("-----------------------------------%n"); -// for (int k=0; k <= numChr; k++) { -// double posterior = logYMatrix[j][k] + log10AlleleFrequencyPriors[k]; -// System.out.printf(" %4d\t%8.2f\t%8.2f\t%8.2f%n", k, logYMatrix[j][k], log10AlleleFrequencyPriors[k], posterior); -// } -// } -// -// double softMax(double[] vec) { -// // compute naively log10(10^x[0] + 10^x[1]+...) -// // return Math.log10(MathUtils.sumLog10(vec)); -// -// // better approximation: do Jacobian logarithm function on data pairs -// double a = softMaxPair(vec[0],vec[1]); -// return softMaxPair(a,vec[2]); -// } -// -// static public double softMaxPair(double x, double y) { -// if (Double.isInfinite(x)) -// return y; -// -// if (Double.isInfinite(y)) -// return x; -// -// if (y >= x + MAX_JACOBIAN_TOLERANCE) -// return y; -// if (x >= y + MAX_JACOBIAN_TOLERANCE) -// return x; -// -// // OK, so |y-x| < tol: we use the following identity then: -// // we need to compute log10(10^x + 10^y) -// // By Jacobian logarithm identity, this is equal to -// // max(x,y) + log10(1+10^-abs(x-y)) -// // we compute the second term as a table lookup -// // with integer quantization -// double diff = Math.abs(x-y); -// double t1 =x; -// if (y > x) -// t1 = y; -// // t has max(x,y) -// // we have pre-stored correction for 0,0.1,0.2,... 10.0 -// int ind = (int)Math.round(diff/JACOBIAN_LOG_TABLE_STEP); -// double t2 = jacobianLogTable[ind]; -// -// // gdebug+ -// //double z =Math.log10(1+Math.pow(10.0,-diff)); -// //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind); -// //gdebug- -// return t1+t2; -// // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y)); -// } -// -// -// -// /** -// * Can be overridden by concrete subclasses -// * @param vc variant context with genotype likelihoods -// * @param log10AlleleFrequencyPosteriors allele frequency results -// * @param AFofMaxLikelihood allele frequency of max likelihood -// * -// * @return calls -// */ -// public Map assignGenotypes(VariantContext vc, -// double[] log10AlleleFrequencyPosteriors, -// int AFofMaxLikelihood) { -// if ( !vc.isVariant() ) -// throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart()); -// -// Allele refAllele = vc.getReference(); -// Allele altAllele = vc.getAlternateAllele(0); -// -// Map GLs = vc.getGenotypes(); -// double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1]; -// int[][] tracebackArray = new int[GLs.size()+1][AFofMaxLikelihood+1]; -// -// ArrayList sampleIndices = new ArrayList(); -// int sampleIdx = 0; -// -// // todo - optimize initialization -// for (int k=0; k <= AFofMaxLikelihood; k++) -// for (int j=0; j <= GLs.size(); j++) -// pathMetricArray[j][k] = -1e30; -// -// pathMetricArray[0][0] = 0.0; -// -// if (SIMPLE_GREEDY_GENOTYPER) { -// sampleIndices.addAll(GLs.keySet()); -// sampleIdx = GLs.size(); -// } -// else { -// -// for ( Map.Entry sample : GLs.entrySet() ) { -// if ( !sample.getValue().hasLikelihoods() ) -// continue; -// -// double[] likelihoods = sample.getValue().getLikelihoods().getAsVector(); -// sampleIndices.add(sample.getKey()); -// -// for (int k=0; k <= AFofMaxLikelihood; k++) { -// -// double bestMetric = pathMetricArray[sampleIdx][k] + likelihoods[0]; -// int bestIndex = k; -// -// if (k>0) { -// double m2 = pathMetricArray[sampleIdx][k-1] + likelihoods[1]; -// if (m2 > bestMetric) { -// bestMetric = m2; -// bestIndex = k-1; -// } -// } -// -// if (k>1) { -// double m2 = pathMetricArray[sampleIdx][k-2] + likelihoods[2]; -// if (m2 > bestMetric) { -// bestMetric = m2; -// bestIndex = k-2; -// } -// } -// -// pathMetricArray[sampleIdx+1][k] = bestMetric; -// tracebackArray[sampleIdx+1][k] = bestIndex; -// } -// sampleIdx++; -// } -// } -// -// HashMap calls = new HashMap(); -// -// int startIdx = AFofMaxLikelihood; -// for (int k = sampleIdx; k > 0; k--) { -// int bestGTguess; -// String sample = sampleIndices.get(k-1); -// Genotype g = GLs.get(sample); -// if ( !g.hasLikelihoods() ) -// continue; -// -// if (SIMPLE_GREEDY_GENOTYPER) -// bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector()); -// else { -// int newIdx = tracebackArray[k][startIdx]; -// bestGTguess = startIdx - newIdx; -// startIdx = newIdx; -// } -// -// ArrayList myAlleles = new ArrayList(); -// -// double qual; -// double[] likelihoods = g.getLikelihoods().getAsVector(); -// -// if (bestGTguess == 0) { -// myAlleles.add(refAllele); -// myAlleles.add(refAllele); -// qual = likelihoods[0] - Math.max(likelihoods[1], likelihoods[2]); -// } else if(bestGTguess == 1) { -// myAlleles.add(refAllele); -// myAlleles.add(altAllele); -// qual = likelihoods[1] - Math.max(likelihoods[0], likelihoods[2]); -// -// } else { -// myAlleles.add(altAllele); -// myAlleles.add(altAllele); -// qual = likelihoods[2] - Math.max(likelihoods[1], likelihoods[0]); -// } -// -// -// if (qual < 0) { -// // QUAL can be negative if the chosen genotype is not the most likely one individually. -// // In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen on -// double[] normalized = MathUtils.normalizeFromLog10(likelihoods); -// double chosenGenotype = normalized[bestGTguess]; -// qual = -1.0 * Math.log10(1.0 - chosenGenotype); -// } -// -// calls.put(sample, new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false)); -// -// } -// -// return calls; -// } -// -//} \ No newline at end of file + double[][] logYMatrix = new double[1+numSamples][1+numChr]; + + for (int i=0; i <=numSamples; i++) + for (int j=0; j <=numChr; j++) + logYMatrix[i][j] = Double.NEGATIVE_INFINITY; + + //YMatrix[0][0] = 1.0; + logYMatrix[0][0] = 0.0; + int j=0; + + for ( Map.Entry sample : GLs.entrySet() ) { + j++; + + if ( !sample.getValue().hasLikelihoods() ) + continue; + + //double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods()); + double[] genotypeLikelihoods = sample.getValue().getLikelihoods().getAsVector(); + //double logDenominator = Math.log10(2.0*j*(2.0*j-1)); + double logDenominator = log10Cache[2*j] + log10Cache[2*j-1]; + + // special treatment for k=0: iteration reduces to: + //YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()]; + logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[GenotypeType.AA.ordinal()]; + + for (int k=1; k <= 2*j; k++ ) { + + //double num = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[GenotypeType.AA.ordinal()]; + double logNumerator[]; + logNumerator = new double[3]; + if (k < 2*j-1) + logNumerator[0] = log10Cache[2*j-k] + log10Cache[2*j-k-1] + logYMatrix[j-1][k] + + genotypeLikelihoods[GenotypeType.AA.ordinal()]; + else + logNumerator[0] = Double.NEGATIVE_INFINITY; + + + if (k < 2*j) + logNumerator[1] = log10Cache[2*k] + log10Cache[2*j-k]+ logYMatrix[j-1][k-1] + + genotypeLikelihoods[GenotypeType.AB.ordinal()]; + else + logNumerator[1] = Double.NEGATIVE_INFINITY; + + if (k > 1) + logNumerator[2] = log10Cache[k] + log10Cache[k-1] + logYMatrix[j-1][k-2] + + genotypeLikelihoods[GenotypeType.BB.ordinal()]; + else + logNumerator[2] = Double.NEGATIVE_INFINITY; + + double logNum = softMax(logNumerator); + + //YMatrix[j][k] = num/den; + logYMatrix[j][k] = logNum - logDenominator; + } + + } + + for (int k=0; k <= numChr; k++) + log10AlleleFrequencyPosteriors[k] = logYMatrix[j][k] + log10AlleleFrequencyPriors[k]; + + return numChr; + } + + private final static void printLikelihoods(int numChr, double[][] logYMatrix, double[] log10AlleleFrequencyPriors) { + int j = logYMatrix.length - 1; + System.out.printf("-----------------------------------%n"); + for (int k=0; k <= numChr; k++) { + double posterior = logYMatrix[j][k] + log10AlleleFrequencyPriors[k]; + System.out.printf(" %4d\t%8.2f\t%8.2f\t%8.2f%n", k, logYMatrix[j][k], log10AlleleFrequencyPriors[k], posterior); + } + } + + double softMax(double[] vec) { + // compute naively log10(10^x[0] + 10^x[1]+...) + // return Math.log10(MathUtils.sumLog10(vec)); + + // better approximation: do Jacobian logarithm function on data pairs + double a = softMaxPair(vec[0],vec[1]); + return softMaxPair(a,vec[2]); + } + + static public double softMaxPair(double x, double y) { + if (Double.isInfinite(x)) + return y; + + if (Double.isInfinite(y)) + return x; + + if (y >= x + MAX_JACOBIAN_TOLERANCE) + return y; + if (x >= y + MAX_JACOBIAN_TOLERANCE) + return x; + + // OK, so |y-x| < tol: we use the following identity then: + // we need to compute log10(10^x + 10^y) + // By Jacobian logarithm identity, this is equal to + // max(x,y) + log10(1+10^-abs(x-y)) + // we compute the second term as a table lookup + // with integer quantization + double diff = Math.abs(x-y); + double t1 =x; + if (y > x) + t1 = y; + // t has max(x,y) + // we have pre-stored correction for 0,0.1,0.2,... 10.0 + int ind = (int)Math.round(diff/JACOBIAN_LOG_TABLE_STEP); + double t2 = jacobianLogTable[ind]; + + // gdebug+ + //double z =Math.log10(1+Math.pow(10.0,-diff)); + //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind); + //gdebug- + return t1+t2; + // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y)); + } +}