diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java index abfddab82..21eb94e15 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java @@ -144,30 +144,31 @@ public final class AlleleFrequencyCalculator extends AFCalculator { } final int ploidy = g.getPloidy() == 0 ? defaultPloidy : g.getPloidy(); final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, numAlleles); - final double[] genotypePosteriors = normalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies); + + final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies); //the total probability - log10PNoVariant += Math.log10(genotypePosteriors[HOM_REF_GENOTYPE_INDEX]); + log10PNoVariant += log10GenotypePosteriors[HOM_REF_GENOTYPE_INDEX]; // per allele non-log space probabilities of zero counts for this sample // for each allele calculate the total probability of genotypes containing at least one copy of the allele - final double[] pOfNonZeroAltAlleles = new double[numAlleles]; + final double[] log10ProbabilityOfNonZeroAltAlleles = new double[numAlleles]; + Arrays.fill(log10ProbabilityOfNonZeroAltAlleles, Double.NEGATIVE_INFINITY); for (int genotype = 0; genotype < glCalc.genotypeCount(); genotype++) { - final double genotypePosterior = genotypePosteriors[genotype]; + final double log10GenotypePosterior = log10GenotypePosteriors[genotype]; glCalc.genotypeAlleleCountsAt(genotype).forEachAlleleIndexAndCount((alleleIndex, count) -> - pOfNonZeroAltAlleles[alleleIndex] += genotypePosterior); + log10ProbabilityOfNonZeroAltAlleles[alleleIndex] = + MathUtils.log10SumLog10(log10ProbabilityOfNonZeroAltAlleles[alleleIndex], log10GenotypePosterior)); } - // Make sure that we handle appropriately pOfNonZeroAltAlleles that are close to 1; values just over 1.0 due to - // rounding error would result in NaN. - // As every allele is present in at least one genotype, the p-non-zero-count for - // any allele is bound above by 1.0 - minimum genotype posterior because at least one genotype - // does not contain this allele. - final double maximumPNonZeroCount = 1.0 - MathUtils.arrayMin(genotypePosteriors); - for (int allele = 0; allele < numAlleles; allele++) { - log10POfZeroCountsByAllele[allele] += Math.log10(1.0 - Math.min(maximumPNonZeroCount, pOfNonZeroAltAlleles[allele])); + // if prob of non hom ref == 1 up to numerical precision, short-circuit to avoid NaN + if (log10ProbabilityOfNonZeroAltAlleles[allele] >= 0) { + log10POfZeroCountsByAllele[allele] = Double.NEGATIVE_INFINITY; + } else { + log10POfZeroCountsByAllele[allele] += MathUtils.log10OneMinusPow10(log10ProbabilityOfNonZeroAltAlleles[allele]); + } } } @@ -183,37 +184,42 @@ public final class AlleleFrequencyCalculator extends AFCalculator { // we compute posteriors here and don't have the same prior that AFCalculationResult expects. Therefore, we // give it our posterior as its "likelihood" along with a flat dummy prior final double[] dummyFlatPrior = {-1e-10, -1e-10}; //TODO: HACK must be negative for AFCalcResult - final double[] log10PosteriorOfNoVariantYesVariant = {log10PNoVariant, Math.log10(1 - Math.pow(10, log10PNoVariant))}; + final double[] log10PosteriorOfNoVariantYesVariant = {log10PNoVariant, MathUtils.log10OneMinusPow10(log10PNoVariant)}; return new AFCalculationResult(integerAltAlleleCounts, DUMMY_N_EVALUATIONS, alleles, log10PosteriorOfNoVariantYesVariant, dummyFlatPrior, log10PRefByAllele); } + // effectiveAlleleCounts[allele a] = SUM_{genotypes g} (posterior_probability(g) * num_copies of a in g), which we denote as SUM [n_g p_g] + // for numerical stability we will do this in log space: + // count = SUM 10^(log (n_g p_g)) = SUM 10^(log n_g + log p_g) + // thanks to the log-sum-exp trick this lets us work with log posteriors alone private double[] effectiveAlleleCounts(final VariantContext vc, final double[] log10AlleleFrequencies) { final int numAlleles = vc.getNAlleles(); Utils.validateArg(numAlleles == log10AlleleFrequencies.length, "number of alleles inconsistent"); - final double[] result = new double[numAlleles]; + final double[] log10Result = new double[numAlleles]; + Arrays.fill(log10Result, Double.NEGATIVE_INFINITY); for (final Genotype g : vc.getGenotypes()) { if (!g.hasLikelihoods()) { continue; } final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(g.getPloidy(), numAlleles); - final double[] genotypePosteriors = normalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies); + final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies); new IndexRange(0, glCalc.genotypeCount()).forEach(genotypeIndex -> glCalc.genotypeAlleleCountsAt(genotypeIndex).forEachAlleleIndexAndCount((alleleIndex, count) -> - result[alleleIndex] += genotypePosteriors[genotypeIndex] * count)); + log10Result[alleleIndex] = MathUtils.log10SumLog10(log10Result[alleleIndex], log10GenotypePosteriors[genotypeIndex] + MathUtils.Log10Cache.get(count)))); } - return result; + return MathUtils.applyToArrayInPlace(log10Result, x -> Math.pow(10.0, x)); } - private static double[] normalizedGenotypePosteriors(final Genotype g, final GenotypeLikelihoodCalculator glCalc, final double[] log10AlleleFrequencies) { + private static double[] log10NormalizedGenotypePosteriors(final Genotype g, final GenotypeLikelihoodCalculator glCalc, final double[] log10AlleleFrequencies) { final double[] log10Likelihoods = g.getLikelihoods().getAsVector(); - final double[] unnormalizedLog10Likelihoods = new IndexRange(0, glCalc.genotypeCount()).mapToDouble(genotypeIndex -> { + final double[] log10Posteriors = new IndexRange(0, glCalc.genotypeCount()).mapToDouble(genotypeIndex -> { final GenotypeAlleleCounts gac = glCalc.genotypeAlleleCountsAt(genotypeIndex); return gac.log10CombinationCount() + log10Likelihoods[genotypeIndex] + gac.sumOverAlleleIndicesAndCounts((index, count) -> count * log10AlleleFrequencies[index]); }); - return MathUtils.normalizeFromLog10(unnormalizedLog10Likelihoods); + return MathUtils.normalizeFromLog10(log10Posteriors, true); } @Override //Note: unused diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MathUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MathUtils.java index 27cb6a6aa..009d4eaae 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MathUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MathUtils.java @@ -330,6 +330,10 @@ public class MathUtils { return log10sumLog10(log10values, 0); } + public static double log10SumLog10(final double a, final double b) { + return a > b ? a + Math.log10(1 + Math.pow(10.0, b - a)) : b + Math.log10(1 + Math.pow(10.0, a - b)); + } + public static boolean wellFormedDouble(final double val) { return !Double.isInfinite(val) && !Double.isNaN(val); }