Remove N2 EXACT model code, which should never be used
This commit is contained in:
parent
27ce3c822e
commit
e3d4efb283
|
|
@ -48,27 +48,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
// code for testing purposes
|
||||
//
|
||||
// Debug switch, off by default. In this excerpt it is referenced only from the
// commented-out linearExactBanded draft, where it presumably gates an early-exit message.
private final static boolean DEBUG = false;
|
||||
// When true, the gold-standard comparison table is printed even if both results agree.
private final static boolean PRINT_LIKELIHOODS = false;
|
||||
// Number of timing repetitions; the timing loop in getLog10PNonRef only runs when this is > 1.
private final static int N_CYCLES = 1;
|
||||
// Timer wrapped around the linearExactBanded calls in the timing loop.
private SimpleTimer timerExpt = new SimpleTimer("linearExactBanded");
|
||||
// Timer wrapped around the linearExact calls in the timing loop (despite the "GS" label).
private SimpleTimer timerGS = new SimpleTimer("linearExactGS");
|
||||
// When true, getLog10PNonRef clones the incoming posteriors and re-runs gdaN2GoldStandard
// on the clone so the two results can be compared.
private final static boolean COMPARE_TO_GS = false;
|
||||
|
||||
// Selects which exact calculation getLog10PNonRef dispatches to via its switch on calcToUse.
public enum ExactCalculation {
|
||||
// Dispatches to gdaN2GoldStandard.
N2_GOLD_STANDARD,
|
||||
// Dispatches to linearExact; this is the default value of UnifiedArgumentCollection.EXACT_CALCULATION_TYPE.
LINEAR_EXPERIMENTAL
|
||||
}
|
||||
|
||||
// Early-stop threshold in log10 units. In this excerpt it is only referenced from the
// commented-out linearExactBanded draft (stop once log10L falls this far below the maximum seen).
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
|
||||
// NOTE(review): SIMPLE_GREEDY_GENOTYPER is declared twice below (without and with `final`).
// This looks like the before/after lines of the diff being shown together -- confirm which one applies.
private boolean SIMPLE_GREEDY_GENOTYPER = false;
|
||||
|
||||
private final boolean SIMPLE_GREEDY_GENOTYPER = false;
|
||||
private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
|
||||
|
||||
// Calculation variant chosen at construction time from UAC.EXACT_CALCULATION_TYPE;
// getLog10PNonRef switches on it.
final private ExactCalculation calcToUse;
|
||||
// Builds the model: forwards all four arguments unchanged to the superclass constructor,
// then records which exact calculation to run.
//
// UAC           argument collection; only EXACT_CALCULATION_TYPE is read directly here
// N             passed through to the superclass (meaning not visible in this excerpt)
// logger        passed through to the superclass
// verboseWriter passed through to the superclass
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
super(UAC, N, logger, verboseWriter);
|
||||
// Fixed for the lifetime of this instance (calcToUse is final).
calcToUse = UAC.EXACT_CALCULATION_TYPE;
|
||||
}
|
||||
|
||||
public void getLog10PNonRef(RefMetaDataTracker tracker,
|
||||
|
|
@ -76,43 +61,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
Map<String, Genotype> GLs, Set<Allele>alleles,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyPosteriors) {
|
||||
// todo -- REMOVE ME AFTER TESTING
|
||||
// todo -- REMOVE ME AFTER TESTING
|
||||
// todo -- REMOVE ME AFTER TESTING
|
||||
double[] gsPosteriors;
|
||||
if ( COMPARE_TO_GS ) // due to annoying special values in incoming array, we have to clone up here
|
||||
gsPosteriors = log10AlleleFrequencyPosteriors.clone();
|
||||
|
||||
int idxAA = GenotypeType.AA.ordinal();
|
||||
int idxAB = GenotypeType.AB.ordinal();
|
||||
int idxBB = GenotypeType.BB.ordinal();
|
||||
|
||||
// todo -- remove me after testing
|
||||
if ( N_CYCLES > 1 ) {
|
||||
for ( int i = 0; i < N_CYCLES; i++) {
|
||||
timerGS.restart();
|
||||
linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.clone(), idxAA, idxAB, idxBB);
|
||||
timerGS.stop();
|
||||
|
||||
timerExpt.restart();
|
||||
linearExactBanded(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.clone());
|
||||
timerExpt.stop();
|
||||
}
|
||||
|
||||
System.out.printf("good = %.2f, expt = %.2f, delta = %.2f%n",
|
||||
timerGS.getElapsedTime(), timerExpt.getElapsedTime(), timerExpt.getElapsedTime()-timerGS.getElapsedTime());
|
||||
}
|
||||
|
||||
int lastK = -1;
|
||||
|
||||
int numAlleles = alleles.size();
|
||||
final int numAlleles = alleles.size();
|
||||
final double[][] posteriorCache = numAlleles > 2 ? new double[numAlleles-1][] : null;
|
||||
final double[] bestAFguess = numAlleles > 2 ? new double[numAlleles-1] : null;
|
||||
|
||||
int idxDiag = numAlleles;
|
||||
int incr = numAlleles - 1;
|
||||
|
||||
double[][] posteriorCache = new double[numAlleles-1][];
|
||||
double[] bestAFguess = new double[numAlleles-1];
|
||||
|
||||
for (int k=1; k < numAlleles; k++) {
|
||||
// multi-allelic approximation, part 1: Ideally
|
||||
// for each alt allele compute marginal (suboptimal) posteriors -
|
||||
|
|
@ -121,24 +75,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
// So, for example, with 2 alt alleles, likelihoods have AA,AB,AC,BB,BC,CC.
|
||||
// 3 alt alleles: AA,AB,AC,AD BB BC BD CC CD DD
|
||||
|
||||
idxAA = 0;
|
||||
idxAB = k;
|
||||
final int idxAA = 0;
|
||||
final int idxAB = k;
|
||||
// yy is always element on the diagonal.
|
||||
// 2 alleles: BB element 2
|
||||
// 3 alleles: BB element 3. CC element 5
|
||||
// 4 alleles: BB element 4. CC element 7. DD element 9
|
||||
idxBB = idxDiag;
|
||||
final int idxBB = idxDiag;
|
||||
idxDiag += incr--;
|
||||
|
||||
// todo - possible cleanup
|
||||
switch ( calcToUse ) {
|
||||
case N2_GOLD_STANDARD:
|
||||
lastK = gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB);
|
||||
break;
|
||||
case LINEAR_EXPERIMENTAL:
|
||||
lastK = linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB);
|
||||
break;
|
||||
}
|
||||
final int lastK = linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB);
|
||||
|
||||
if (numAlleles > 2) {
|
||||
posteriorCache[k-1] = log10AlleleFrequencyPosteriors.clone();
|
||||
bestAFguess[k-1] = (double)MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors);
|
||||
|
|
@ -153,39 +100,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
log10AlleleFrequencyPosteriors[k] = (posteriorCache[mostLikelyAlleleIdx][k]);
|
||||
|
||||
}
|
||||
// todo -- REMOVE ME AFTER TESTING
|
||||
// todo -- REMOVE ME AFTER TESTING
|
||||
// todo -- REMOVE ME AFTER TESTING
|
||||
if ( COMPARE_TO_GS ) {
|
||||
gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, gsPosteriors, idxAA, idxAB, idxBB);
|
||||
|
||||
double log10thisPVar = Math.log10(MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors)[0]);
|
||||
double log10gsPVar = Math.log10(MathUtils.normalizeFromLog10(gsPosteriors)[0]);
|
||||
boolean eq = (log10thisPVar == Double.NEGATIVE_INFINITY && log10gsPVar == Double.NEGATIVE_INFINITY) || MathUtils.compareDoubles(log10thisPVar, log10gsPVar, 1e-4) == 0;
|
||||
|
||||
if ( ! eq || PRINT_LIKELIHOODS ) {
|
||||
System.out.printf("----------------------------------------%n");
|
||||
for (int k=0; k < log10AlleleFrequencyPosteriors.length; k++) {
|
||||
double x = log10AlleleFrequencyPosteriors[k];
|
||||
System.out.printf(" %d\t%.2f\t%.2f\t%b%n", k,
|
||||
x < -1e10 ? Double.NEGATIVE_INFINITY : x, gsPosteriors[k],
|
||||
log10AlleleFrequencyPosteriors[k] == gsPosteriors[k]);
|
||||
}
|
||||
System.out.printf("MAD_AC\t%d\t%d\t%.2f\t%.2f\t%.6f%n",
|
||||
ref.getLocus().getStart(), lastK, log10thisPVar, log10gsPVar, log10thisPVar - log10gsPVar);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static final ArrayList<double[]> getGLs(Map<String, Genotype> GLs) {
|
||||
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>();
|
||||
|
||||
//int j = 0;
|
||||
genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
|
||||
for ( Genotype sample : GLs.values() ) {
|
||||
if ( sample.hasLikelihoods() ) {
|
||||
//double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
|
||||
double[] gls = sample.getLikelihoods().getAsVector();
|
||||
|
||||
if (MathUtils.sum(gls) < SUM_GL_THRESH_NOCALL)
|
||||
|
|
@ -240,84 +162,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
}
|
||||
}
|
||||
|
||||
// now with banding
|
||||
// Banded variant of the linear exact calculation. NOT IMPLEMENTED: the only live statement
// is the throw below, so every call fails with NotImplementedException and none of the
// arguments are read or written.
//
// Everything after the throw is a commented-out draft kept for reference. As drafted, it would
// restrict the per-k update to samples j in [k - BAND_SIZE, k + BAND_SIZE] (clamped to
// 1..numSamples), fill log10AlleleFrequencyPosteriors[k], and return the last k evaluated.
public int linearExactBanded(Map<String, Genotype> GLs,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyPosteriors) {
|
||||
throw new NotImplementedException();
|
||||
// final int numSamples = GLs.size();
|
||||
// final int numChr = 2*numSamples;
|
||||
// final double[][] genotypeLikelihoods = getGLs(GLs);
|
||||
//
|
||||
// final ExactACCache logY = new ExactACCache(numSamples+1);
|
||||
// logY.getkMinus0()[0] = 0.0; // the zero case
|
||||
//
|
||||
// double maxLog10L = Double.NEGATIVE_INFINITY;
|
||||
// boolean done = false;
|
||||
// int lastK = -1;
|
||||
// final int BAND_SIZE = 10;
|
||||
//
|
||||
// for (int k=0; k <= numChr && ! done; k++ ) {
|
||||
// final double[] kMinus0 = logY.getkMinus0();
|
||||
// int jStart = Math.max(k - BAND_SIZE, 1);
|
||||
// int jStop = Math.min(k + BAND_SIZE, numSamples);
|
||||
//
|
||||
// if ( k == 0 ) { // special case for k = 0
|
||||
// for ( int j=1; j <= numSamples; j++ ) {
|
||||
// kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][GenotypeType.AA.ordinal()];
|
||||
// }
|
||||
// } else { // k > 0
|
||||
// final double[] kMinus1 = logY.getkMinus1();
|
||||
// final double[] kMinus2 = logY.getkMinus2();
|
||||
// Arrays.fill(kMinus0,0);
|
||||
//
|
||||
// for ( int j = jStart; j <= jStop; j++ ) {
|
||||
// final double[] gl = genotypeLikelihoods[j];
|
||||
// final double logDenominator = log10Cache[2*j] + log10Cache[2*j-1];
|
||||
//
|
||||
// double aa = Double.NEGATIVE_INFINITY;
|
||||
// double ab = Double.NEGATIVE_INFINITY;
|
||||
// if (k < 2*j-1)
|
||||
// aa = log10Cache[2*j-k] + log10Cache[2*j-k-1] + kMinus0[j-1] + gl[GenotypeType.AA.ordinal()];
|
||||
//
|
||||
// if (k < 2*j)
|
||||
// ab = log10Cache[2*k] + log10Cache[2*j-k]+ kMinus1[j-1] + gl[GenotypeType.AB.ordinal()];
|
||||
//
|
||||
// double log10Max;
|
||||
// if (k > 1) {
|
||||
// final double bb = log10Cache[k] + log10Cache[k-1] + kMinus2[j-1] + gl[GenotypeType.BB.ordinal()];
|
||||
// log10Max = approximateLog10SumLog10(aa, ab, bb);
|
||||
// } else {
|
||||
// // we know we aren't considering the BB case, so we can use an optimized log10 function
|
||||
// log10Max = approximateLog10SumLog10(aa, ab);
|
||||
// }
|
||||
//
|
||||
// // finally, update the L(j,k) value
|
||||
// kMinus0[j] = log10Max - logDenominator;
|
||||
//
|
||||
// String offset = Utils.dupString(' ',k);
|
||||
// System.out.printf("%s%3d %3d %.2f%n", offset, k, j, kMinus0[j]);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// // update the posteriors vector
|
||||
// final double log10LofK = kMinus0[jStop];
|
||||
// log10AlleleFrequencyPosteriors[k] = log10LofK + log10AlleleFrequencyPriors[k];
|
||||
//
|
||||
// // can we abort early?
|
||||
// lastK = k;
|
||||
// maxLog10L = Math.max(maxLog10L, log10LofK);
|
||||
// if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
||||
// if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
|
||||
// done = true;
|
||||
// }
|
||||
//
|
||||
// logY.rotate();
|
||||
// }
|
||||
//
|
||||
// return lastK;
|
||||
|
|
||||
}
|
||||
|
||||
public int linearExact(Map<String, Genotype> GLs,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
|
||||
|
|
@ -605,82 +449,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
return calls;
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// Gold standard, but O(N^2), implementation.
|
||||
//
|
||||
// TODO -- remove me for clarity in this code
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
// Reference ("gold standard") exact calculation that fills a full
// (numSamples+1) x (numChr+1) table in log10 space, where numChr = 2 * GLs.size().
//
// GLs                            per-sample genotypes; only the map values are read, in entrySet() order
// log10AlleleFrequencyPriors     read at indices 0..numChr
// log10AlleleFrequencyPosteriors output; indices 0..numChr are overwritten with
//                                (last table row) + prior
// idxAA, idxAB, idxBB            positions of the hom-ref, het and hom-alt entries inside
//                                each sample's likelihood vector
// returns                        numChr (always; there is no early exit)
public int gdaN2GoldStandard(Map<String, Genotype> GLs,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
|
||||
int numSamples = GLs.size();
|
||||
int numChr = 2*numSamples;
|
||||
|
||||
double[][] logYMatrix = new double[1+numSamples][1+numChr];
|
||||
|
||||
// Start every cell at negative infinity (log10 of zero).
for (int i=0; i <=numSamples; i++)
|
||||
for (int j=0; j <=numChr; j++)
|
||||
logYMatrix[i][j] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
//YMatrix[0][0] = 1.0;
|
||||
// Base case: zero samples, zero alt alleles has log10 value 0.
logYMatrix[0][0] = 0.0;
|
||||
// j is the 1-based table row of the current sample.
int j=0;
|
||||
|
||||
for ( Map.Entry<String, Genotype> sample : GLs.entrySet() ) {
|
||||
j++;
|
||||
|
||||
// NOTE(review): j has already advanced, so a sample without likelihoods leaves its whole
// row at negative infinity, and the next sample's row is computed from that row (j-1).
// Confirm this is the intended handling of missing likelihoods.
if ( !sample.getValue().hasLikelihoods() )
|
||||
continue;
|
||||
|
||||
//double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods());
|
||||
double[] genotypeLikelihoods = sample.getValue().getLikelihoods().getAsVector();
|
||||
//double logDenominator = Math.log10(2.0*j*(2.0*j-1));
|
||||
// Presumably MathUtils.log10Cache[i] == log10(i), matching the commented-out formula above -- confirm.
double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
||||
|
||||
// special treatment for k=0: iteration reduces to:
|
||||
//YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()];
|
||||
logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[idxAA];
|
||||
|
||||
// k ranges over every alt-allele count reachable with j samples (1..2j).
for (int k=1; k <= 2*j; k++ ) {
|
||||
|
||||
//double num = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[GenotypeType.AA.ordinal()];
|
||||
// Three log10 numerator terms: [0] uses idxAA, [1] uses idxAB, [2] uses idxBB.
double logNumerator[];
|
||||
logNumerator = new double[3];
|
||||
// idxAA term: reads the previous row at the same k; excluded when k >= 2j-1.
if (k < 2*j-1)
|
||||
logNumerator[0] = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + logYMatrix[j-1][k] +
|
||||
genotypeLikelihoods[idxAA];
|
||||
else
|
||||
logNumerator[0] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
|
||||
// idxAB term: reads the previous row at k-1; excluded when k == 2j.
if (k < 2*j)
|
||||
logNumerator[1] = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ logYMatrix[j-1][k-1] +
|
||||
genotypeLikelihoods[idxAB];
|
||||
else
|
||||
logNumerator[1] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
// idxBB term: reads the previous row at k-2; excluded when k == 1.
if (k > 1)
|
||||
logNumerator[2] = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + logYMatrix[j-1][k-2] +
|
||||
genotypeLikelihoods[idxBB];
|
||||
else
|
||||
logNumerator[2] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
// Presumably softMax combines the three terms as log10(sum of 10^x) -- confirm against MathUtils.
double logNum = MathUtils.softMax(logNumerator);
|
||||
|
||||
//YMatrix[j][k] = num/den;
|
||||
logYMatrix[j][k] = logNum - logDenominator;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// j now indexes the last sample's row; add the prior to each cell to form the output.
for (int k=0; k <= numChr; k++)
|
||||
log10AlleleFrequencyPosteriors[k] = logYMatrix[j][k] + log10AlleleFrequencyPriors[k];
|
||||
|
||||
return numChr;
|
||||
}
|
||||
|
||||
private final static void printLikelihoods(int numChr, double[][] logYMatrix, double[] log10AlleleFrequencyPriors) {
|
||||
int j = logYMatrix.length - 1;
|
||||
System.out.printf("-----------------------------------%n");
|
||||
|
|
@ -689,5 +457,4 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
System.out.printf(" %4d\t%8.2f\t%8.2f\t%8.2f%n", k, logYMatrix[j][k], log10AlleleFrequencyPriors[k], posterior);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -168,10 +168,6 @@ public class UnifiedArgumentCollection {
|
|||
// Optional flag, off by default; its own doc string warns against using it.
@Argument(fullName = "GSA_PRODUCTION_ONLY", shortName = "GSA_PRODUCTION_ONLY", doc = "don't ever use me", required = false)
|
||||
public boolean GSA_PRODUCTION_ONLY = false;
|
||||
|
||||
// Hidden, optional argument choosing the exact calculation variant. ExactAFCalculationModel
// reads it once in its constructor. Defaults to LINEAR_EXPERIMENTAL.
@Hidden
|
||||
@Argument(fullName = "exactCalculation", shortName = "exactCalculation", doc = "expt", required = false)
|
||||
public ExactAFCalculationModel.ExactCalculation EXACT_CALCULATION_TYPE = ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL;
|
||||
|
||||
// Hidden, optional flag, off by default; its effect is not visible in this excerpt.
@Hidden
|
||||
@Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false)
|
||||
public boolean IGNORE_SNP_ALLELES = false;
|
||||
|
|
@ -191,7 +187,6 @@ public class UnifiedArgumentCollection {
|
|||
|
||||
uac.GLmodel = GLmodel;
|
||||
uac.AFmodel = AFmodel;
|
||||
uac.EXACT_CALCULATION_TYPE = EXACT_CALCULATION_TYPE;
|
||||
uac.heterozygosity = heterozygosity;
|
||||
uac.PCR_error = PCR_error;
|
||||
uac.GenotypingMode = GenotypingMode;
|
||||
|
|
|
|||
Loading…
Reference in New Issue