diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index ed737064d..ecdcb4c26 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.*; import java.io.PrintStream; @@ -61,7 +60,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } //linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); - linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result, false); + linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result); return alleles; } @@ -189,24 +188,21 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // the column of the matrix final double[] log10Likelihoods; - // mapping of column index for those columns upon which this one depends to the index into the PLs which is used as the transition to this column; - // for example, in the biallelic case, the transition from k=0 to k=1 would be AB while the transition to k=2 would be BB. - final HashMap ACsetIndexToPLIndex = new HashMap(); - - // to minimize memory consumption, we know we can delete any sets in this list because no further sets will depend on them - final ArrayList dependentACsetsToDelete = new ArrayList(); - + int sum = -1; public ExactACset(final int size, final ExactACcounts ACcounts) { this.ACcounts = ACcounts; log10Likelihoods = new double[size]; + Arrays.fill(log10Likelihoods, Double.NEGATIVE_INFINITY); } // sum of all the non-reference alleles public int getACsum() { - int sum = 0; - for ( int count : ACcounts.getCounts() ) - sum += count; + if ( sum == -1 ) { + sum = 0; + for ( int count : ACcounts.getCounts() ) + sum += count; + } return sum; } @@ -215,11 +211,21 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } } + // TODO -- remove me public static void linearExactMultiAllelic(final GenotypesContext GLs, final int numAlternateAlleles, final double[][] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result, - final boolean preserveData) { + final boolean foo) { + linearExactMultiAllelic(GLs, numAlternateAlleles, log10AlleleFrequencyPriors, result); + } + + + + public static void linearExactMultiAllelic(final GenotypesContext GLs, + final int numAlternateAlleles, + final double[][] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { // make sure the PL cache has been initialized if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null ) @@ -241,21 +247,20 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { ACqueue.add(zeroSet); indexesToACset.put(zeroSet.ACcounts, zeroSet); - // optimization: create the temporary storage for computing L(j,k) just once - final int maxPossibleDependencies = numAlternateAlleles + (numAlternateAlleles * (numAlternateAlleles + 1) / 2) + 1; - final double[][] tempLog10ConformationLikelihoods = new double[numSamples+1][maxPossibleDependencies]; - for ( int i = 0; i < maxPossibleDependencies; i++ ) - tempLog10ConformationLikelihoods[0][i] = Double.NEGATIVE_INFINITY; - // keep processing while we have AC conformations that need to be calculated double maxLog10L = Double.NEGATIVE_INFINITY; while ( !ACqueue.isEmpty() ) { // compute log10Likelihoods final ExactACset set = ACqueue.remove(); - final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result, tempLog10ConformationLikelihoods); + final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result); // adjust max likelihood seen if needed maxLog10L = Math.max(maxLog10L, log10LofKs); + + // clean up memory + indexesToACset.remove(set.ACcounts); + //if ( DEBUG ) + // System.out.printf(" *** removing used set=%s%n", set.ACcounts); } } @@ -273,27 +278,16 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final ArrayList genotypeLikelihoods, final double maxLog10L, final int numChr, - final boolean preserveData, final LinkedList ACqueue, final HashMap indexesToACset, final double[][] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result, - final double[][] tempLog10ConformationLikelihoods) { + final AlleleFrequencyCalculationResult result) { //if ( DEBUG ) // System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); // compute the log10Likelihoods - computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, result, tempLog10ConformationLikelihoods); - - // clean up memory - if ( !preserveData ) { - for ( ExactACcounts index : set.dependentACsetsToDelete ) { - indexesToACset.remove(index); - //if ( DEBUG ) - // System.out.printf(" *** removing used set=%s after seeing final dependent set=%s%n", index, set.ACcounts); - } - } + computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, result); final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; @@ -301,11 +295,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { //if ( DEBUG ) // System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); - - // no reason to keep this data around because nothing depends on it - if ( !preserveData ) - indexesToACset.remove(set.ACcounts); - return log10LofK; } @@ -324,7 +313,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { for ( int allele = 0; allele < numAltAlleles; allele++ ) { final int[] ACcountsClone = set.ACcounts.getCounts().clone(); ACcountsClone[allele]++; - updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset); + updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset, genotypeLikelihoods); } // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different @@ -347,62 +336,40 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering for ( DependentSet dependent : differentAlleles ) - updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset); + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); for ( DependentSet dependent : sameAlleles ) - updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset); - } - - // determine which is the last dependent set in the queue (not necessarily the last one added above) so we can know when it is safe to clean up this column - if ( !preserveData ) { - final ExactACset lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue); - if ( lastSet != null ) - lastSet.dependentACsetsToDelete.add(set.ACcounts); + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); } return log10LofK; } // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and - // also adds it as a dependency to the given callingSetIndex. - // returns the ExactACset if that set was not already in the queue and null otherwise. - private static void updateACset(final int[] ACcounts, + // also pushes its value to the given callingSetIndex. + private static void updateACset(final int[] newSetCounts, final int numChr, - final ExactACset callingSet, + final ExactACset dependentSet, final int PLsetIndex, final Queue ACqueue, - final HashMap indexesToACset) { - final ExactACcounts index = new ExactACcounts(ACcounts); + final HashMap indexesToACset, + final ArrayList genotypeLikelihoods) { + final ExactACcounts index = new ExactACcounts(newSetCounts); if ( !indexesToACset.containsKey(index) ) { ExactACset set = new ExactACset(numChr/2 +1, index); indexesToACset.put(index, set); ACqueue.add(set); } - // add the given dependency to the set + // push data from the dependency to the new set //if ( DEBUG ) - // System.out.println(" *** adding dependency from " + index + " to " + callingSet.ACcounts); - final ExactACset set = indexesToACset.get(index); - set.ACsetIndexToPLIndex.put(callingSet.ACcounts, PLsetIndex); - } - - private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final LinkedList ACqueue) { - Iterator reverseIterator = ACqueue.descendingIterator(); - while ( reverseIterator.hasNext() ) { - final ExactACset queued = reverseIterator.next(); - if ( queued.ACsetIndexToPLIndex.containsKey(callingSetIndex) ) - return queued; - } - - // shouldn't get here - throw new ReviewedStingException("Error: no sets in the queue currently hold " + callingSetIndex + " as a dependent!"); + // System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts); + pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods); } private static void computeLofK(final ExactACset set, final ArrayList genotypeLikelihoods, - final HashMap indexesToACset, final double[][] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result, - final double[][] tempLog10ConformationLikelihoods) { + final AlleleFrequencyCalculationResult result) { set.log10Likelihoods[0] = 0.0; // the zero case final int totalK = set.getACsum(); @@ -414,42 +381,18 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } // k > 0 for at least one k else { - // deal with the non-AA possible conformations - int conformationIndex = 1; - for ( Map.Entry mapping : set.ACsetIndexToPLIndex.entrySet() ) { - //if ( DEBUG ) - // System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey()); - - ExactACset dependent = indexesToACset.get(mapping.getKey()); - - for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { - - if ( totalK <= 2*j ) { // skip impossible conformations - final double[] gl = genotypeLikelihoods.get(j); - tempLog10ConformationLikelihoods[j][conformationIndex] = - determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + dependent.log10Likelihoods[j-1] + gl[mapping.getValue()]; - } else { - tempLog10ConformationLikelihoods[j][conformationIndex] = Double.NEGATIVE_INFINITY; - } - } - - conformationIndex++; - } - - // finally, deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value - final int numPaths = set.ACsetIndexToPLIndex.size() + 1; + // the non-AA possible conformations were dealt with by pushes from dependent sets; + // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { if ( totalK < 2*j-1 ) { final double[] gl = genotypeLikelihoods.get(j); - tempLog10ConformationLikelihoods[j][0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; - } else { - tempLog10ConformationLikelihoods[j][0] = Double.NEGATIVE_INFINITY; + final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; + set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue); } final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; - final double log10Max = MathUtils.approximateLog10SumLog10(tempLog10ConformationLikelihoods[j], numPaths); - set.log10Likelihoods[j] = log10Max - logDenominator; + set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator; } } @@ -478,6 +421,23 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } } + private static void pushData(final ExactACset targetSet, + final ExactACset dependentSet, + final int PLsetIndex, + final ArrayList genotypeLikelihoods) { + final int totalK = targetSet.getACsum(); + + for ( int j = 1; j < targetSet.log10Likelihoods.length; j++ ) { + + if ( totalK <= 2*j ) { // skip impossible conformations + final double[] gl = genotypeLikelihoods.get(j); + final double conformationValue = + determineCoefficient(PLsetIndex, j, targetSet.ACcounts.getCounts(), totalK) + dependentSet.log10Likelihoods[j-1] + gl[PLsetIndex]; + targetSet.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(targetSet.log10Likelihoods[j], conformationValue); + } + } + } + private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) { // the closed form representation generalized for multiple alleles is as follows: