Merge pull request #1502 from broadinstitute/db_port_qual_tweaking

Backport numerics changes in new qual
This commit is contained in:
Valentin Ruano Rubio 2016-11-01 10:26:26 -04:00 committed by GitHub
commit 40ae430a70
2 changed files with 31 additions and 21 deletions

View File

@ -144,30 +144,31 @@ public final class AlleleFrequencyCalculator extends AFCalculator {
} }
final int ploidy = g.getPloidy() == 0 ? defaultPloidy : g.getPloidy(); final int ploidy = g.getPloidy() == 0 ? defaultPloidy : g.getPloidy();
final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, numAlleles); final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, numAlleles);
final double[] genotypePosteriors = normalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);
final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);
//the total probability //the total probability
log10PNoVariant += Math.log10(genotypePosteriors[HOM_REF_GENOTYPE_INDEX]); log10PNoVariant += log10GenotypePosteriors[HOM_REF_GENOTYPE_INDEX];
// per allele non-log space probabilities of zero counts for this sample // per allele non-log space probabilities of zero counts for this sample
// for each allele calculate the total probability of genotypes containing at least one copy of the allele // for each allele calculate the total probability of genotypes containing at least one copy of the allele
final double[] pOfNonZeroAltAlleles = new double[numAlleles]; final double[] log10ProbabilityOfNonZeroAltAlleles = new double[numAlleles];
Arrays.fill(log10ProbabilityOfNonZeroAltAlleles, Double.NEGATIVE_INFINITY);
for (int genotype = 0; genotype < glCalc.genotypeCount(); genotype++) { for (int genotype = 0; genotype < glCalc.genotypeCount(); genotype++) {
final double genotypePosterior = genotypePosteriors[genotype]; final double log10GenotypePosterior = log10GenotypePosteriors[genotype];
glCalc.genotypeAlleleCountsAt(genotype).forEachAlleleIndexAndCount((alleleIndex, count) -> glCalc.genotypeAlleleCountsAt(genotype).forEachAlleleIndexAndCount((alleleIndex, count) ->
pOfNonZeroAltAlleles[alleleIndex] += genotypePosterior); log10ProbabilityOfNonZeroAltAlleles[alleleIndex] =
MathUtils.log10SumLog10(log10ProbabilityOfNonZeroAltAlleles[alleleIndex], log10GenotypePosterior));
} }
// Make sure that we handle appropriately pOfNonZeroAltAlleles that are close to 1; values just over 1.0 due to
// rounding error would result in NaN.
// As every allele is present in at least one genotype, the p-non-zero-count for
// any allele is bound above by 1.0 - minimum genotype posterior because at least one genotype
// does not contain this allele.
final double maximumPNonZeroCount = 1.0 - MathUtils.arrayMin(genotypePosteriors);
for (int allele = 0; allele < numAlleles; allele++) { for (int allele = 0; allele < numAlleles; allele++) {
log10POfZeroCountsByAllele[allele] += Math.log10(1.0 - Math.min(maximumPNonZeroCount, pOfNonZeroAltAlleles[allele])); // if prob of non hom ref == 1 up to numerical precision, short-circuit to avoid NaN
if (log10ProbabilityOfNonZeroAltAlleles[allele] >= 0) {
log10POfZeroCountsByAllele[allele] = Double.NEGATIVE_INFINITY;
} else {
log10POfZeroCountsByAllele[allele] += MathUtils.log10OneMinusPow10(log10ProbabilityOfNonZeroAltAlleles[allele]);
}
} }
} }
@ -183,37 +184,42 @@ public final class AlleleFrequencyCalculator extends AFCalculator {
// we compute posteriors here and don't have the same prior that AFCalculationResult expects. Therefore, we // we compute posteriors here and don't have the same prior that AFCalculationResult expects. Therefore, we
// give it our posterior as its "likelihood" along with a flat dummy prior // give it our posterior as its "likelihood" along with a flat dummy prior
final double[] dummyFlatPrior = {-1e-10, -1e-10}; //TODO: HACK must be negative for AFCalcResult final double[] dummyFlatPrior = {-1e-10, -1e-10}; //TODO: HACK must be negative for AFCalcResult
final double[] log10PosteriorOfNoVariantYesVariant = {log10PNoVariant, Math.log10(1 - Math.pow(10, log10PNoVariant))}; final double[] log10PosteriorOfNoVariantYesVariant = {log10PNoVariant, MathUtils.log10OneMinusPow10(log10PNoVariant)};
return new AFCalculationResult(integerAltAlleleCounts, DUMMY_N_EVALUATIONS, alleles, log10PosteriorOfNoVariantYesVariant, dummyFlatPrior, log10PRefByAllele); return new AFCalculationResult(integerAltAlleleCounts, DUMMY_N_EVALUATIONS, alleles, log10PosteriorOfNoVariantYesVariant, dummyFlatPrior, log10PRefByAllele);
} }
// effectiveAlleleCounts[allele a] = SUM_{genotypes g} (posterior_probability(g) * num_copies of a in g), which we denote as SUM [n_g p_g]
// for numerical stability we will do this in log space:
// count = SUM 10^(log (n_g p_g)) = SUM 10^(log n_g + log p_g)
// thanks to the log-sum-exp trick this lets us work with log posteriors alone
private double[] effectiveAlleleCounts(final VariantContext vc, final double[] log10AlleleFrequencies) { private double[] effectiveAlleleCounts(final VariantContext vc, final double[] log10AlleleFrequencies) {
final int numAlleles = vc.getNAlleles(); final int numAlleles = vc.getNAlleles();
Utils.validateArg(numAlleles == log10AlleleFrequencies.length, "number of alleles inconsistent"); Utils.validateArg(numAlleles == log10AlleleFrequencies.length, "number of alleles inconsistent");
final double[] result = new double[numAlleles]; final double[] log10Result = new double[numAlleles];
Arrays.fill(log10Result, Double.NEGATIVE_INFINITY);
for (final Genotype g : vc.getGenotypes()) { for (final Genotype g : vc.getGenotypes()) {
if (!g.hasLikelihoods()) { if (!g.hasLikelihoods()) {
continue; continue;
} }
final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(g.getPloidy(), numAlleles); final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(g.getPloidy(), numAlleles);
final double[] genotypePosteriors = normalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies); final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);
new IndexRange(0, glCalc.genotypeCount()).forEach(genotypeIndex -> new IndexRange(0, glCalc.genotypeCount()).forEach(genotypeIndex ->
glCalc.genotypeAlleleCountsAt(genotypeIndex).forEachAlleleIndexAndCount((alleleIndex, count) -> glCalc.genotypeAlleleCountsAt(genotypeIndex).forEachAlleleIndexAndCount((alleleIndex, count) ->
result[alleleIndex] += genotypePosteriors[genotypeIndex] * count)); log10Result[alleleIndex] = MathUtils.log10SumLog10(log10Result[alleleIndex], log10GenotypePosteriors[genotypeIndex] + MathUtils.Log10Cache.get(count))));
} }
return result; return MathUtils.applyToArrayInPlace(log10Result, x -> Math.pow(10.0, x));
} }
private static double[] normalizedGenotypePosteriors(final Genotype g, final GenotypeLikelihoodCalculator glCalc, final double[] log10AlleleFrequencies) { private static double[] log10NormalizedGenotypePosteriors(final Genotype g, final GenotypeLikelihoodCalculator glCalc, final double[] log10AlleleFrequencies) {
final double[] log10Likelihoods = g.getLikelihoods().getAsVector(); final double[] log10Likelihoods = g.getLikelihoods().getAsVector();
final double[] unnormalizedLog10Likelihoods = new IndexRange(0, glCalc.genotypeCount()).mapToDouble(genotypeIndex -> { final double[] log10Posteriors = new IndexRange(0, glCalc.genotypeCount()).mapToDouble(genotypeIndex -> {
final GenotypeAlleleCounts gac = glCalc.genotypeAlleleCountsAt(genotypeIndex); final GenotypeAlleleCounts gac = glCalc.genotypeAlleleCountsAt(genotypeIndex);
return gac.log10CombinationCount() + log10Likelihoods[genotypeIndex] return gac.log10CombinationCount() + log10Likelihoods[genotypeIndex]
+ gac.sumOverAlleleIndicesAndCounts((index, count) -> count * log10AlleleFrequencies[index]); + gac.sumOverAlleleIndicesAndCounts((index, count) -> count * log10AlleleFrequencies[index]);
}); });
return MathUtils.normalizeFromLog10(unnormalizedLog10Likelihoods); return MathUtils.normalizeFromLog10(log10Posteriors, true);
} }
@Override //Note: unused @Override //Note: unused

View File

@ -330,6 +330,10 @@ public class MathUtils {
return log10sumLog10(log10values, 0); return log10sumLog10(log10values, 0);
} }
public static double log10SumLog10(final double a, final double b) {
return a > b ? a + Math.log10(1 + Math.pow(10.0, b - a)) : b + Math.log10(1 + Math.pow(10.0, a - b));
}
public static boolean wellFormedDouble(final double val) { public static boolean wellFormedDouble(final double val) {
return !Double.isInfinite(val) && !Double.isNaN(val); return !Double.isInfinite(val) && !Double.isNaN(val);
} }