Merge pull request #1502 from broadinstitute/db_port_qual_tweaking

Backport numerics changes in new qual
This commit is contained in:
Valentin Ruano Rubio 2016-11-01 10:26:26 -04:00 committed by GitHub
commit 40ae430a70
2 changed files with 31 additions and 21 deletions

View File

@ -144,30 +144,31 @@ public final class AlleleFrequencyCalculator extends AFCalculator {
}
final int ploidy = g.getPloidy() == 0 ? defaultPloidy : g.getPloidy();
final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, numAlleles);
final double[] genotypePosteriors = normalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);
final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);
//the total probability
log10PNoVariant += Math.log10(genotypePosteriors[HOM_REF_GENOTYPE_INDEX]);
log10PNoVariant += log10GenotypePosteriors[HOM_REF_GENOTYPE_INDEX];
// per allele non-log space probabilities of zero counts for this sample
// for each allele calculate the total probability of genotypes containing at least one copy of the allele
final double[] pOfNonZeroAltAlleles = new double[numAlleles];
final double[] log10ProbabilityOfNonZeroAltAlleles = new double[numAlleles];
Arrays.fill(log10ProbabilityOfNonZeroAltAlleles, Double.NEGATIVE_INFINITY);
for (int genotype = 0; genotype < glCalc.genotypeCount(); genotype++) {
final double genotypePosterior = genotypePosteriors[genotype];
final double log10GenotypePosterior = log10GenotypePosteriors[genotype];
glCalc.genotypeAlleleCountsAt(genotype).forEachAlleleIndexAndCount((alleleIndex, count) ->
pOfNonZeroAltAlleles[alleleIndex] += genotypePosterior);
log10ProbabilityOfNonZeroAltAlleles[alleleIndex] =
MathUtils.log10SumLog10(log10ProbabilityOfNonZeroAltAlleles[alleleIndex], log10GenotypePosterior));
}
// Make sure that we handle appropriately pOfNonZeroAltAlleles that are close to 1; values just over 1.0 due to
// rounding error would result in NaN.
// As every allele is present in at least one genotype, the p-non-zero-count for
// any allele is bound above by 1.0 - minimum genotype posterior because at least one genotype
// does not contain this allele.
final double maximumPNonZeroCount = 1.0 - MathUtils.arrayMin(genotypePosteriors);
for (int allele = 0; allele < numAlleles; allele++) {
log10POfZeroCountsByAllele[allele] += Math.log10(1.0 - Math.min(maximumPNonZeroCount, pOfNonZeroAltAlleles[allele]));
// if prob of non hom ref == 1 up to numerical precision, short-circuit to avoid NaN
if (log10ProbabilityOfNonZeroAltAlleles[allele] >= 0) {
log10POfZeroCountsByAllele[allele] = Double.NEGATIVE_INFINITY;
} else {
log10POfZeroCountsByAllele[allele] += MathUtils.log10OneMinusPow10(log10ProbabilityOfNonZeroAltAlleles[allele]);
}
}
}
@ -183,37 +184,42 @@ public final class AlleleFrequencyCalculator extends AFCalculator {
// we compute posteriors here and don't have the same prior that AFCalculationResult expects. Therefore, we
// give it our posterior as its "likelihood" along with a flat dummy prior
final double[] dummyFlatPrior = {-1e-10, -1e-10}; //TODO: HACK must be negative for AFCalcResult
final double[] log10PosteriorOfNoVariantYesVariant = {log10PNoVariant, Math.log10(1 - Math.pow(10, log10PNoVariant))};
final double[] log10PosteriorOfNoVariantYesVariant = {log10PNoVariant, MathUtils.log10OneMinusPow10(log10PNoVariant)};
return new AFCalculationResult(integerAltAlleleCounts, DUMMY_N_EVALUATIONS, alleles, log10PosteriorOfNoVariantYesVariant, dummyFlatPrior, log10PRefByAllele);
}
// effectiveAlleleCounts[allele a] = SUM_{genotypes g} (posterior_probability(g) * num_copies of a in g), which we denote as SUM [n_g p_g]
// for numerical stability we will do this in log space:
// count = SUM 10^(log (n_g p_g)) = SUM 10^(log n_g + log p_g)
// thanks to the log-sum-exp trick this lets us work with log posteriors alone
private double[] effectiveAlleleCounts(final VariantContext vc, final double[] log10AlleleFrequencies) {
final int numAlleles = vc.getNAlleles();
Utils.validateArg(numAlleles == log10AlleleFrequencies.length, "number of alleles inconsistent");
final double[] result = new double[numAlleles];
final double[] log10Result = new double[numAlleles];
Arrays.fill(log10Result, Double.NEGATIVE_INFINITY);
for (final Genotype g : vc.getGenotypes()) {
if (!g.hasLikelihoods()) {
continue;
}
final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(g.getPloidy(), numAlleles);
final double[] genotypePosteriors = normalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);
final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);
new IndexRange(0, glCalc.genotypeCount()).forEach(genotypeIndex ->
glCalc.genotypeAlleleCountsAt(genotypeIndex).forEachAlleleIndexAndCount((alleleIndex, count) ->
result[alleleIndex] += genotypePosteriors[genotypeIndex] * count));
log10Result[alleleIndex] = MathUtils.log10SumLog10(log10Result[alleleIndex], log10GenotypePosteriors[genotypeIndex] + MathUtils.Log10Cache.get(count))));
}
return result;
return MathUtils.applyToArrayInPlace(log10Result, x -> Math.pow(10.0, x));
}
private static double[] normalizedGenotypePosteriors(final Genotype g, final GenotypeLikelihoodCalculator glCalc, final double[] log10AlleleFrequencies) {
private static double[] log10NormalizedGenotypePosteriors(final Genotype g, final GenotypeLikelihoodCalculator glCalc, final double[] log10AlleleFrequencies) {
final double[] log10Likelihoods = g.getLikelihoods().getAsVector();
final double[] unnormalizedLog10Likelihoods = new IndexRange(0, glCalc.genotypeCount()).mapToDouble(genotypeIndex -> {
final double[] log10Posteriors = new IndexRange(0, glCalc.genotypeCount()).mapToDouble(genotypeIndex -> {
final GenotypeAlleleCounts gac = glCalc.genotypeAlleleCountsAt(genotypeIndex);
return gac.log10CombinationCount() + log10Likelihoods[genotypeIndex]
+ gac.sumOverAlleleIndicesAndCounts((index, count) -> count * log10AlleleFrequencies[index]);
});
return MathUtils.normalizeFromLog10(unnormalizedLog10Likelihoods);
return MathUtils.normalizeFromLog10(log10Posteriors, true);
}
@Override //Note: unused

View File

@ -330,6 +330,10 @@ public class MathUtils {
return log10sumLog10(log10values, 0);
}
public static double log10SumLog10(final double a, final double b) {
return a > b ? a + Math.log10(1 + Math.pow(10.0, b - a)) : b + Math.log10(1 + Math.pow(10.0, a - b));
}
public static boolean wellFormedDouble(final double val) {
return !Double.isInfinite(val) && !Double.isNaN(val);
}