Swapping the iteration order to run over AF conformations and then samples instead of the reverse minimizes calls to HashMap.get; instead of it being O(n) since we called it for each sample it's now O(1). Runtime on T2D GENES test set is reduced by 5-10%. More optimizations to follow.
This commit is contained in:
parent
34cf2fe43b
commit
410a340ef5
|
|
@ -35,7 +35,7 @@ import java.util.*;
|
||||||
|
|
||||||
public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
|
|
||||||
private final static boolean DEBUG = false;
|
// private final static boolean DEBUG = false;
|
||||||
|
|
||||||
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||||
|
|
||||||
|
|
@ -199,8 +199,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
final double[][] log10AlleleFrequencyPriors,
|
final double[][] log10AlleleFrequencyPriors,
|
||||||
final AlleleFrequencyCalculationResult result) {
|
final AlleleFrequencyCalculationResult result) {
|
||||||
|
|
||||||
if ( DEBUG )
|
//if ( DEBUG )
|
||||||
System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
|
// System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
|
||||||
|
|
||||||
// compute the log10Likelihoods
|
// compute the log10Likelihoods
|
||||||
computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, result);
|
computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, result);
|
||||||
|
|
@ -209,8 +209,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
if ( !preserveData ) {
|
if ( !preserveData ) {
|
||||||
for ( ExactACcounts index : set.dependentACsetsToDelete ) {
|
for ( ExactACcounts index : set.dependentACsetsToDelete ) {
|
||||||
indexesToACset.put(index, null);
|
indexesToACset.put(index, null);
|
||||||
if ( DEBUG )
|
//if ( DEBUG )
|
||||||
System.out.printf(" *** removing used set=%s after seeing final dependent set=%s%n", index, set.ACcounts);
|
// System.out.printf(" *** removing used set=%s after seeing final dependent set=%s%n", index, set.ACcounts);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -218,8 +218,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
|
|
||||||
// can we abort early because the log10Likelihoods are so small?
|
// can we abort early because the log10Likelihoods are so small?
|
||||||
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
||||||
if ( DEBUG )
|
//if ( DEBUG )
|
||||||
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
|
// System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
|
||||||
|
|
||||||
// no reason to keep this data around because nothing depends on it
|
// no reason to keep this data around because nothing depends on it
|
||||||
if ( !preserveData )
|
if ( !preserveData )
|
||||||
|
|
@ -262,8 +262,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
// if the last dependent set was not at the back of the queue (i.e. not just added), then we need to iterate
|
// if the last dependent set was not at the back of the queue (i.e. not just added), then we need to iterate
|
||||||
// over all the dependent sets to find the last one in the queue (otherwise it will be cleaned up too early)
|
// over all the dependent sets to find the last one in the queue (otherwise it will be cleaned up too early)
|
||||||
if ( !preserveData && lastSet == null ) {
|
if ( !preserveData && lastSet == null ) {
|
||||||
if ( DEBUG )
|
//if ( DEBUG )
|
||||||
System.out.printf(" *** iterating over dependent sets for set=%s%n", set.ACcounts);
|
// System.out.printf(" *** iterating over dependent sets for set=%s%n", set.ACcounts);
|
||||||
lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue);
|
lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue);
|
||||||
}
|
}
|
||||||
if ( lastSet != null )
|
if ( lastSet != null )
|
||||||
|
|
@ -323,36 +323,43 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
else {
|
else {
|
||||||
// all possible likelihoods for a given cell from which to choose the max
|
// all possible likelihoods for a given cell from which to choose the max
|
||||||
final int numPaths = set.ACsetIndexToPLIndex.size() + 1;
|
final int numPaths = set.ACsetIndexToPLIndex.size() + 1;
|
||||||
final double[] log10ConformationLikelihoods = new double[numPaths]; // TODO can be created just once, since you initialize it
|
final double[][] log10ConformationLikelihoods = new double[set.log10Likelihoods.length][numPaths]; // TODO can be created just once, since you initialize it
|
||||||
|
// initialize
|
||||||
|
for ( int i = 0; i < set.log10Likelihoods.length; i++ )
|
||||||
|
for ( int j = 0; j < numPaths; j++ )
|
||||||
|
// TODO -- Arrays.fill?
|
||||||
|
// todo -- is this even necessary? Why not have as else below?
|
||||||
|
log10ConformationLikelihoods[i][j] = Double.NEGATIVE_INFINITY;
|
||||||
|
|
||||||
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
|
// deal with the non-AA possible conformations
|
||||||
final double[] gl = genotypeLikelihoods.get(j);
|
int conformationIndex = 1;
|
||||||
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
for ( Map.Entry<ExactACcounts, Integer> mapping : set.ACsetIndexToPLIndex.entrySet() ) {
|
||||||
|
//if ( DEBUG )
|
||||||
|
// System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey());
|
||||||
|
|
||||||
// initialize
|
ExactACset dependent = indexesToACset.get(mapping.getKey());
|
||||||
for ( int i = 0; i < numPaths; i++ )
|
|
||||||
// TODO -- Arrays.fill?
|
|
||||||
// todo -- is this even necessary? Why not have as else below?
|
|
||||||
log10ConformationLikelihoods[i] = Double.NEGATIVE_INFINITY;
|
|
||||||
|
|
||||||
// deal with the AA case first
|
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
|
||||||
if ( totalK < 2*j-1 )
|
final double[] gl = genotypeLikelihoods.get(j);
|
||||||
log10ConformationLikelihoods[0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
|
|
||||||
|
|
||||||
// deal with the other possible conformations now
|
if ( totalK <= 2*j ) { // skip impossible conformations
|
||||||
if ( totalK <= 2*j ) { // skip impossible conformations
|
log10ConformationLikelihoods[j][conformationIndex] =
|
||||||
int conformationIndex = 1;
|
determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + dependent.log10Likelihoods[j-1] + gl[mapping.getValue()];
|
||||||
for ( Map.Entry<ExactACcounts, Integer> mapping : set.ACsetIndexToPLIndex.entrySet() ) {
|
|
||||||
if ( DEBUG )
|
|
||||||
System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey());
|
|
||||||
log10ConformationLikelihoods[conformationIndex++] =
|
|
||||||
determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + indexesToACset.get(mapping.getKey()).log10Likelihoods[j-1] + gl[mapping.getValue()];
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
final double log10Max = MathUtils.approximateLog10SumLog10(log10ConformationLikelihoods);
|
conformationIndex++;
|
||||||
|
}
|
||||||
|
|
||||||
// finally, update the L(j,k) value
|
// finally, deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
|
||||||
|
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
|
||||||
|
final double[] gl = genotypeLikelihoods.get(j);
|
||||||
|
|
||||||
|
if ( totalK < 2*j-1 )
|
||||||
|
log10ConformationLikelihoods[j][0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
|
||||||
|
|
||||||
|
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
||||||
|
final double log10Max = MathUtils.approximateLog10SumLog10(log10ConformationLikelihoods[j]);
|
||||||
set.log10Likelihoods[j] = log10Max - logDenominator;
|
set.log10Likelihoods[j] = log10Max - logDenominator;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -523,7 +530,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
lastK = k;
|
lastK = k;
|
||||||
maxLog10L = Math.max(maxLog10L, log10LofK);
|
maxLog10L = Math.max(maxLog10L, log10LofK);
|
||||||
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
||||||
if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
|
//if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
|
||||||
done = true;
|
done = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue