Exact model memory optimization: instead of having a later matrix column pull in data from earlier ones (requiring us to keep them around until all dependencies are hit), the earlier columns push data into their dependents immediately and then are removed. This does trade off speed a little bit (because we need to call approximateLog10Sum each time we add to a dependent instead of once in an array at the end). Note that this commit would normally not get pushed into stable, but I'm about to make a very disruptive push into stable that would make merging this from unstable a nightmare.
This commit is contained in:
parent
bb2c10b785
commit
f7c2c818fe
|
|
@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||||
|
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.utils.MathUtils;
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|
||||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||||
|
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
|
|
@ -61,7 +60,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
}
|
}
|
||||||
|
|
||||||
//linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
|
//linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
|
||||||
linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result, false);
|
linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result);
|
||||||
|
|
||||||
return alleles;
|
return alleles;
|
||||||
}
|
}
|
||||||
|
|
@ -189,24 +188,21 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
// the column of the matrix
|
// the column of the matrix
|
||||||
final double[] log10Likelihoods;
|
final double[] log10Likelihoods;
|
||||||
|
|
||||||
// mapping of column index for those columns upon which this one depends to the index into the PLs which is used as the transition to this column;
|
int sum = -1;
|
||||||
// for example, in the biallelic case, the transition from k=0 to k=1 would be AB while the transition to k=2 would be BB.
|
|
||||||
final HashMap<ExactACcounts, Integer> ACsetIndexToPLIndex = new HashMap<ExactACcounts, Integer>();
|
|
||||||
|
|
||||||
// to minimize memory consumption, we know we can delete any sets in this list because no further sets will depend on them
|
|
||||||
final ArrayList<ExactACcounts> dependentACsetsToDelete = new ArrayList<ExactACcounts>();
|
|
||||||
|
|
||||||
|
|
||||||
public ExactACset(final int size, final ExactACcounts ACcounts) {
|
public ExactACset(final int size, final ExactACcounts ACcounts) {
|
||||||
this.ACcounts = ACcounts;
|
this.ACcounts = ACcounts;
|
||||||
log10Likelihoods = new double[size];
|
log10Likelihoods = new double[size];
|
||||||
|
Arrays.fill(log10Likelihoods, Double.NEGATIVE_INFINITY);
|
||||||
}
|
}
|
||||||
|
|
||||||
// sum of all the non-reference alleles
|
// sum of all the non-reference alleles
|
||||||
public int getACsum() {
|
public int getACsum() {
|
||||||
int sum = 0;
|
if ( sum == -1 ) {
|
||||||
for ( int count : ACcounts.getCounts() )
|
sum = 0;
|
||||||
sum += count;
|
for ( int count : ACcounts.getCounts() )
|
||||||
|
sum += count;
|
||||||
|
}
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -215,11 +211,21 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO -- remove me
|
||||||
public static void linearExactMultiAllelic(final GenotypesContext GLs,
|
public static void linearExactMultiAllelic(final GenotypesContext GLs,
|
||||||
final int numAlternateAlleles,
|
final int numAlternateAlleles,
|
||||||
final double[][] log10AlleleFrequencyPriors,
|
final double[][] log10AlleleFrequencyPriors,
|
||||||
final AlleleFrequencyCalculationResult result,
|
final AlleleFrequencyCalculationResult result,
|
||||||
final boolean preserveData) {
|
final boolean foo) {
|
||||||
|
linearExactMultiAllelic(GLs, numAlternateAlleles, log10AlleleFrequencyPriors, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
public static void linearExactMultiAllelic(final GenotypesContext GLs,
|
||||||
|
final int numAlternateAlleles,
|
||||||
|
final double[][] log10AlleleFrequencyPriors,
|
||||||
|
final AlleleFrequencyCalculationResult result) {
|
||||||
|
|
||||||
// make sure the PL cache has been initialized
|
// make sure the PL cache has been initialized
|
||||||
if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null )
|
if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null )
|
||||||
|
|
@ -241,21 +247,20 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
ACqueue.add(zeroSet);
|
ACqueue.add(zeroSet);
|
||||||
indexesToACset.put(zeroSet.ACcounts, zeroSet);
|
indexesToACset.put(zeroSet.ACcounts, zeroSet);
|
||||||
|
|
||||||
// optimization: create the temporary storage for computing L(j,k) just once
|
|
||||||
final int maxPossibleDependencies = numAlternateAlleles + (numAlternateAlleles * (numAlternateAlleles + 1) / 2) + 1;
|
|
||||||
final double[][] tempLog10ConformationLikelihoods = new double[numSamples+1][maxPossibleDependencies];
|
|
||||||
for ( int i = 0; i < maxPossibleDependencies; i++ )
|
|
||||||
tempLog10ConformationLikelihoods[0][i] = Double.NEGATIVE_INFINITY;
|
|
||||||
|
|
||||||
// keep processing while we have AC conformations that need to be calculated
|
// keep processing while we have AC conformations that need to be calculated
|
||||||
double maxLog10L = Double.NEGATIVE_INFINITY;
|
double maxLog10L = Double.NEGATIVE_INFINITY;
|
||||||
while ( !ACqueue.isEmpty() ) {
|
while ( !ACqueue.isEmpty() ) {
|
||||||
// compute log10Likelihoods
|
// compute log10Likelihoods
|
||||||
final ExactACset set = ACqueue.remove();
|
final ExactACset set = ACqueue.remove();
|
||||||
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result, tempLog10ConformationLikelihoods);
|
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result);
|
||||||
|
|
||||||
// adjust max likelihood seen if needed
|
// adjust max likelihood seen if needed
|
||||||
maxLog10L = Math.max(maxLog10L, log10LofKs);
|
maxLog10L = Math.max(maxLog10L, log10LofKs);
|
||||||
|
|
||||||
|
// clean up memory
|
||||||
|
indexesToACset.remove(set.ACcounts);
|
||||||
|
//if ( DEBUG )
|
||||||
|
// System.out.printf(" *** removing used set=%s%n", set.ACcounts);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -273,27 +278,16 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
final ArrayList<double[]> genotypeLikelihoods,
|
final ArrayList<double[]> genotypeLikelihoods,
|
||||||
final double maxLog10L,
|
final double maxLog10L,
|
||||||
final int numChr,
|
final int numChr,
|
||||||
final boolean preserveData,
|
|
||||||
final LinkedList<ExactACset> ACqueue,
|
final LinkedList<ExactACset> ACqueue,
|
||||||
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
||||||
final double[][] log10AlleleFrequencyPriors,
|
final double[][] log10AlleleFrequencyPriors,
|
||||||
final AlleleFrequencyCalculationResult result,
|
final AlleleFrequencyCalculationResult result) {
|
||||||
final double[][] tempLog10ConformationLikelihoods) {
|
|
||||||
|
|
||||||
//if ( DEBUG )
|
//if ( DEBUG )
|
||||||
// System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
|
// System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
|
||||||
|
|
||||||
// compute the log10Likelihoods
|
// compute the log10Likelihoods
|
||||||
computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, result, tempLog10ConformationLikelihoods);
|
computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, result);
|
||||||
|
|
||||||
// clean up memory
|
|
||||||
if ( !preserveData ) {
|
|
||||||
for ( ExactACcounts index : set.dependentACsetsToDelete ) {
|
|
||||||
indexesToACset.remove(index);
|
|
||||||
//if ( DEBUG )
|
|
||||||
// System.out.printf(" *** removing used set=%s after seeing final dependent set=%s%n", index, set.ACcounts);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];
|
final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];
|
||||||
|
|
||||||
|
|
@ -301,11 +295,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
||||||
//if ( DEBUG )
|
//if ( DEBUG )
|
||||||
// System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
|
// System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
|
||||||
|
|
||||||
// no reason to keep this data around because nothing depends on it
|
|
||||||
if ( !preserveData )
|
|
||||||
indexesToACset.remove(set.ACcounts);
|
|
||||||
|
|
||||||
return log10LofK;
|
return log10LofK;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -324,7 +313,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
for ( int allele = 0; allele < numAltAlleles; allele++ ) {
|
for ( int allele = 0; allele < numAltAlleles; allele++ ) {
|
||||||
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
|
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
|
||||||
ACcountsClone[allele]++;
|
ACcountsClone[allele]++;
|
||||||
updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset);
|
updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
|
||||||
}
|
}
|
||||||
|
|
||||||
// add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
|
// add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
|
||||||
|
|
@ -347,62 +336,40 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
|
|
||||||
// IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
|
// IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
|
||||||
for ( DependentSet dependent : differentAlleles )
|
for ( DependentSet dependent : differentAlleles )
|
||||||
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset);
|
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
|
||||||
for ( DependentSet dependent : sameAlleles )
|
for ( DependentSet dependent : sameAlleles )
|
||||||
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset);
|
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
|
||||||
}
|
|
||||||
|
|
||||||
// determine which is the last dependent set in the queue (not necessarily the last one added above) so we can know when it is safe to clean up this column
|
|
||||||
if ( !preserveData ) {
|
|
||||||
final ExactACset lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue);
|
|
||||||
if ( lastSet != null )
|
|
||||||
lastSet.dependentACsetsToDelete.add(set.ACcounts);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return log10LofK;
|
return log10LofK;
|
||||||
}
|
}
|
||||||
|
|
||||||
// adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
|
// adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
|
||||||
// also adds it as a dependency to the given callingSetIndex.
|
// also pushes its value to the given callingSetIndex.
|
||||||
// returns the ExactACset if that set was not already in the queue and null otherwise.
|
private static void updateACset(final int[] newSetCounts,
|
||||||
private static void updateACset(final int[] ACcounts,
|
|
||||||
final int numChr,
|
final int numChr,
|
||||||
final ExactACset callingSet,
|
final ExactACset dependentSet,
|
||||||
final int PLsetIndex,
|
final int PLsetIndex,
|
||||||
final Queue<ExactACset> ACqueue,
|
final Queue<ExactACset> ACqueue,
|
||||||
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
|
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
||||||
final ExactACcounts index = new ExactACcounts(ACcounts);
|
final ArrayList<double[]> genotypeLikelihoods) {
|
||||||
|
final ExactACcounts index = new ExactACcounts(newSetCounts);
|
||||||
if ( !indexesToACset.containsKey(index) ) {
|
if ( !indexesToACset.containsKey(index) ) {
|
||||||
ExactACset set = new ExactACset(numChr/2 +1, index);
|
ExactACset set = new ExactACset(numChr/2 +1, index);
|
||||||
indexesToACset.put(index, set);
|
indexesToACset.put(index, set);
|
||||||
ACqueue.add(set);
|
ACqueue.add(set);
|
||||||
}
|
}
|
||||||
|
|
||||||
// add the given dependency to the set
|
// push data from the dependency to the new set
|
||||||
//if ( DEBUG )
|
//if ( DEBUG )
|
||||||
// System.out.println(" *** adding dependency from " + index + " to " + callingSet.ACcounts);
|
// System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts);
|
||||||
final ExactACset set = indexesToACset.get(index);
|
pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods);
|
||||||
set.ACsetIndexToPLIndex.put(callingSet.ACcounts, PLsetIndex);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final LinkedList<ExactACset> ACqueue) {
|
|
||||||
Iterator<ExactACset> reverseIterator = ACqueue.descendingIterator();
|
|
||||||
while ( reverseIterator.hasNext() ) {
|
|
||||||
final ExactACset queued = reverseIterator.next();
|
|
||||||
if ( queued.ACsetIndexToPLIndex.containsKey(callingSetIndex) )
|
|
||||||
return queued;
|
|
||||||
}
|
|
||||||
|
|
||||||
// shouldn't get here
|
|
||||||
throw new ReviewedStingException("Error: no sets in the queue currently hold " + callingSetIndex + " as a dependent!");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void computeLofK(final ExactACset set,
|
private static void computeLofK(final ExactACset set,
|
||||||
final ArrayList<double[]> genotypeLikelihoods,
|
final ArrayList<double[]> genotypeLikelihoods,
|
||||||
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
|
||||||
final double[][] log10AlleleFrequencyPriors,
|
final double[][] log10AlleleFrequencyPriors,
|
||||||
final AlleleFrequencyCalculationResult result,
|
final AlleleFrequencyCalculationResult result) {
|
||||||
final double[][] tempLog10ConformationLikelihoods) {
|
|
||||||
|
|
||||||
set.log10Likelihoods[0] = 0.0; // the zero case
|
set.log10Likelihoods[0] = 0.0; // the zero case
|
||||||
final int totalK = set.getACsum();
|
final int totalK = set.getACsum();
|
||||||
|
|
@ -414,42 +381,18 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
}
|
}
|
||||||
// k > 0 for at least one k
|
// k > 0 for at least one k
|
||||||
else {
|
else {
|
||||||
// deal with the non-AA possible conformations
|
// the non-AA possible conformations were dealt with by pushes from dependent sets;
|
||||||
int conformationIndex = 1;
|
// now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
|
||||||
for ( Map.Entry<ExactACcounts, Integer> mapping : set.ACsetIndexToPLIndex.entrySet() ) {
|
|
||||||
//if ( DEBUG )
|
|
||||||
// System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey());
|
|
||||||
|
|
||||||
ExactACset dependent = indexesToACset.get(mapping.getKey());
|
|
||||||
|
|
||||||
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
|
|
||||||
|
|
||||||
if ( totalK <= 2*j ) { // skip impossible conformations
|
|
||||||
final double[] gl = genotypeLikelihoods.get(j);
|
|
||||||
tempLog10ConformationLikelihoods[j][conformationIndex] =
|
|
||||||
determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + dependent.log10Likelihoods[j-1] + gl[mapping.getValue()];
|
|
||||||
} else {
|
|
||||||
tempLog10ConformationLikelihoods[j][conformationIndex] = Double.NEGATIVE_INFINITY;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
conformationIndex++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// finally, deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
|
|
||||||
final int numPaths = set.ACsetIndexToPLIndex.size() + 1;
|
|
||||||
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
|
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
|
||||||
|
|
||||||
if ( totalK < 2*j-1 ) {
|
if ( totalK < 2*j-1 ) {
|
||||||
final double[] gl = genotypeLikelihoods.get(j);
|
final double[] gl = genotypeLikelihoods.get(j);
|
||||||
tempLog10ConformationLikelihoods[j][0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
|
final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
|
||||||
} else {
|
set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue);
|
||||||
tempLog10ConformationLikelihoods[j][0] = Double.NEGATIVE_INFINITY;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
||||||
final double log10Max = MathUtils.approximateLog10SumLog10(tempLog10ConformationLikelihoods[j], numPaths);
|
set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator;
|
||||||
set.log10Likelihoods[j] = log10Max - logDenominator;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -478,6 +421,23 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static void pushData(final ExactACset targetSet,
|
||||||
|
final ExactACset dependentSet,
|
||||||
|
final int PLsetIndex,
|
||||||
|
final ArrayList<double[]> genotypeLikelihoods) {
|
||||||
|
final int totalK = targetSet.getACsum();
|
||||||
|
|
||||||
|
for ( int j = 1; j < targetSet.log10Likelihoods.length; j++ ) {
|
||||||
|
|
||||||
|
if ( totalK <= 2*j ) { // skip impossible conformations
|
||||||
|
final double[] gl = genotypeLikelihoods.get(j);
|
||||||
|
final double conformationValue =
|
||||||
|
determineCoefficient(PLsetIndex, j, targetSet.ACcounts.getCounts(), totalK) + dependentSet.log10Likelihoods[j-1] + gl[PLsetIndex];
|
||||||
|
targetSet.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(targetSet.log10Likelihoods[j], conformationValue);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
|
private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
|
||||||
|
|
||||||
// the closed form representation generalized for multiple alleles is as follows:
|
// the closed form representation generalized for multiple alleles is as follows:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue