More efficient implementation of the sum of the allele frequency posteriors matrix using a pre-allocated cache as discussed in group meeting last week. Now, when the cache is filled, we safely collapse down to a single value in real space and put the un-re-centered log10 value back into the front of the cache. Thanks to all for the help and advice.

This commit is contained in:
Eric Banks 2012-04-09 11:46:16 -04:00
parent 87e6bea6c1
commit 6ddf2170b6
4 changed files with 34 additions and 48 deletions

View File

@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.MathUtils;
import java.util.ArrayList;
import java.util.Arrays;
/**
@ -42,12 +41,13 @@ public class AlleleFrequencyCalculationResult {
// These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles
private double log10MLE;
private double log10MAP;
final private int[] alleleCountsOfMLE;
final private int[] alleleCountsOfMAP;
private final int[] alleleCountsOfMLE;
private final int[] alleleCountsOfMAP;
// The posteriors seen, not including that of AF=0
// TODO -- better implementation needed here (see below)
private ArrayList<Double> log10PosteriorMatrixValues = new ArrayList<Double>(100000);
private static final int POSTERIORS_CACHE_SIZE = 5000;
private final double[] log10PosteriorMatrixValues = new double[POSTERIORS_CACHE_SIZE];
private int currentPosteriorsCacheIndex = 0;
private Double log10PosteriorMatrixSum = null;
// These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
@ -69,14 +69,9 @@ public class AlleleFrequencyCalculationResult {
return log10MAP;
}
public double getLog10PosteriorMatrixSum() {
public double getLog10PosteriorsMatrixSumWithoutAFzero() {
if ( log10PosteriorMatrixSum == null ) {
// TODO -- we absolutely need a better implementation here as we don't want to store all values from the matrix in memory;
// TODO -- will discuss with the team what the best option is
final double[] tmp = new double[log10PosteriorMatrixValues.size()];
for ( int i = 0; i < tmp.length; i++ )
tmp[i] = log10PosteriorMatrixValues.get(i);
log10PosteriorMatrixSum = MathUtils.log10sumLog10(tmp);
log10PosteriorMatrixSum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex);
}
return log10PosteriorMatrixSum;
}
@ -103,7 +98,7 @@ public class AlleleFrequencyCalculationResult {
alleleCountsOfMLE[i] = 0;
alleleCountsOfMAP[i] = 0;
}
log10PosteriorMatrixValues.clear();
currentPosteriorsCacheIndex = 0;
log10PosteriorMatrixSum = null;
}
@ -116,7 +111,8 @@ public class AlleleFrequencyCalculationResult {
}
public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
log10PosteriorMatrixValues.add(log10LofK);
addToPosteriorsCache(log10LofK);
if ( log10LofK > log10MAP ) {
log10MAP = log10LofK;
for ( int i = 0; i < alleleCountsForK.length; i++ )
@ -124,6 +120,18 @@ public class AlleleFrequencyCalculationResult {
}
}
private void addToPosteriorsCache(final double log10LofK) {
// add to the cache
log10PosteriorMatrixValues[currentPosteriorsCacheIndex++] = log10LofK;
// if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell
if ( currentPosteriorsCacheIndex == POSTERIORS_CACHE_SIZE ) {
final double temporarySum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex);
log10PosteriorMatrixValues[0] = temporarySum;
currentPosteriorsCacheIndex = 1;
}
}
public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;
if ( log10LikelihoodOfAFzero > log10MLE ) {

View File

@ -326,7 +326,7 @@ public class UnifiedGenotyperEngine {
} else {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
if ( Double.isInfinite(phredScaledConfidence) ) {
final double sum = AFresult.getLog10PosteriorMatrixSum();
final double sum = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
}
}
@ -369,7 +369,7 @@ public class UnifiedGenotyperEngine {
// the overall lod
//double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0];
double overallLog10PofF = AFresult.getLog10PosteriorMatrixSum();
double overallLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
//if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
List<Allele> alternateAllelesToUse = builder.make().getAlternateAlleles();
@ -380,7 +380,7 @@ public class UnifiedGenotyperEngine {
afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult);
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double forwardLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
double forwardLog10PofF = AFresult.getLog10PosteriorMatrixSum();
double forwardLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
//if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
// the reverse lod
@ -389,7 +389,7 @@ public class UnifiedGenotyperEngine {
afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult);
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
double reverseLog10PofF = AFresult.getLog10PosteriorMatrixSum();
double reverseLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
//if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF);
double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
@ -424,7 +424,7 @@ public class UnifiedGenotyperEngine {
public static double[] generateNormalizedPosteriors(final AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) {
normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero();
normalizedPosteriors[1] = AFresult.getLog10PosteriorMatrixSum();
normalizedPosteriors[1] = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
return MathUtils.normalizeFromLog10(normalizedPosteriors);
}

View File

@ -237,7 +237,7 @@ public class MathUtils {
public static double log10sumLog10(double[] log10p, int start, int finish) {
double sum = 0.0;
double maxValue = Utils.findMaxEntry(log10p);
double maxValue = arrayMax(log10p, finish);
if(maxValue == Double.NEGATIVE_INFINITY)
return sum;
@ -554,7 +554,7 @@ public class MathUtils {
// for precision purposes, we need to add (or really subtract, since they're
// all negative) the largest value; also, we need to convert to normal-space.
double maxValue = Utils.findMaxEntry(array);
double maxValue = arrayMax(array);
// we may decide to just normalize in log space without converting to linear space
if (keepInLogSpace) {
@ -627,10 +627,14 @@ public class MathUtils {
return maxI;
}
public static double arrayMax(double[] array) {
public static double arrayMax(final double[] array) {
return array[maxElementIndex(array)];
}
public static double arrayMax(final double[] array, final int endIndex) {
return array[maxElementIndex(array, endIndex)];
}
public static double arrayMin(double[] array) {
return array[minElementIndex(array)];
}

View File

@ -290,32 +290,6 @@ public class Utils {
return m;
}
// returns the maximum value in the array
public static double findMaxEntry(double[] array) {
return findIndexAndMaxEntry(array).first;
}
// returns the index of the maximum value in the array
public static int findIndexOfMaxEntry(double[] array) {
return findIndexAndMaxEntry(array).second;
}
// returns the the maximum value and its index in the array
private static Pair<Double, Integer> findIndexAndMaxEntry(double[] array) {
if ( array.length == 0 )
return new Pair<Double, Integer>(0.0, -1);
int index = 0;
double max = array[0];
for (int i = 1; i < array.length; i++) {
if ( array[i] > max ) {
max = array[i];
index = i;
}
}
return new Pair<Double, Integer>(max, index);
}
/**
* Splits expressions in command args by spaces and returns the array of expressions.
* Expressions may use single or double quotes to group any individual expression, but not both.