backport numerics changes in new qual
This commit is contained in:
parent
cc91052e69
commit
029632eb1c
|
|
@ -144,30 +144,31 @@ public final class AlleleFrequencyCalculator extends AFCalculator {
|
|||
}
|
||||
final int ploidy = g.getPloidy() == 0 ? defaultPloidy : g.getPloidy();
|
||||
final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, numAlleles);
|
||||
final double[] genotypePosteriors = normalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);
|
||||
|
||||
final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);
|
||||
|
||||
//the total probability
|
||||
log10PNoVariant += Math.log10(genotypePosteriors[HOM_REF_GENOTYPE_INDEX]);
|
||||
log10PNoVariant += log10GenotypePosteriors[HOM_REF_GENOTYPE_INDEX];
|
||||
|
||||
// per allele non-log space probabilities of zero counts for this sample
|
||||
// for each allele calculate the total probability of genotypes containing at least one copy of the allele
|
||||
final double[] pOfNonZeroAltAlleles = new double[numAlleles];
|
||||
final double[] log10ProbabilityOfNonZeroAltAlleles = new double[numAlleles];
|
||||
Arrays.fill(log10ProbabilityOfNonZeroAltAlleles, Double.NEGATIVE_INFINITY);
|
||||
|
||||
for (int genotype = 0; genotype < glCalc.genotypeCount(); genotype++) {
|
||||
final double genotypePosterior = genotypePosteriors[genotype];
|
||||
final double log10GenotypePosterior = log10GenotypePosteriors[genotype];
|
||||
glCalc.genotypeAlleleCountsAt(genotype).forEachAlleleIndexAndCount((alleleIndex, count) ->
|
||||
pOfNonZeroAltAlleles[alleleIndex] += genotypePosterior);
|
||||
log10ProbabilityOfNonZeroAltAlleles[alleleIndex] =
|
||||
MathUtils.log10SumLog10(log10ProbabilityOfNonZeroAltAlleles[alleleIndex], log10GenotypePosterior));
|
||||
}
|
||||
|
||||
// Make sure that we handle appropriately pOfNonZeroAltAlleles that are close to 1; values just over 1.0 due to
|
||||
// rounding error would result in NaN.
|
||||
// As every allele is present in at least one genotype, the p-non-zero-count for
|
||||
// any allele is bound above by 1.0 - minimum genotype posterior because at least one genotype
|
||||
// does not contain this allele.
|
||||
final double maximumPNonZeroCount = 1.0 - MathUtils.arrayMin(genotypePosteriors);
|
||||
|
||||
for (int allele = 0; allele < numAlleles; allele++) {
|
||||
log10POfZeroCountsByAllele[allele] += Math.log10(1.0 - Math.min(maximumPNonZeroCount, pOfNonZeroAltAlleles[allele]));
|
||||
// if prob of non hom ref == 1 up to numerical precision, short-circuit to avoid NaN
|
||||
if (log10ProbabilityOfNonZeroAltAlleles[allele] >= 0) {
|
||||
log10POfZeroCountsByAllele[allele] = Double.NEGATIVE_INFINITY;
|
||||
} else {
|
||||
log10POfZeroCountsByAllele[allele] += MathUtils.log10OneMinusPow10(log10ProbabilityOfNonZeroAltAlleles[allele]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -183,37 +184,42 @@ public final class AlleleFrequencyCalculator extends AFCalculator {
|
|||
// we compute posteriors here and don't have the same prior that AFCalculationResult expects. Therefore, we
|
||||
// give it our posterior as its "likelihood" along with a flat dummy prior
|
||||
final double[] dummyFlatPrior = {-1e-10, -1e-10}; //TODO: HACK must be negative for AFCalcResult
|
||||
final double[] log10PosteriorOfNoVariantYesVariant = {log10PNoVariant, Math.log10(1 - Math.pow(10, log10PNoVariant))};
|
||||
final double[] log10PosteriorOfNoVariantYesVariant = {log10PNoVariant, MathUtils.log10OneMinusPow10(log10PNoVariant)};
|
||||
|
||||
return new AFCalculationResult(integerAltAlleleCounts, DUMMY_N_EVALUATIONS, alleles, log10PosteriorOfNoVariantYesVariant, dummyFlatPrior, log10PRefByAllele);
|
||||
}
|
||||
|
||||
// effectiveAlleleCounts[allele a] = SUM_{genotypes g} (posterior_probability(g) * num_copies of a in g), which we denote as SUM [n_g p_g]
|
||||
// for numerical stability we will do this in log space:
|
||||
// count = SUM 10^(log (n_g p_g)) = SUM 10^(log n_g + log p_g)
|
||||
// thanks to the log-sum-exp trick this lets us work with log posteriors alone
|
||||
private double[] effectiveAlleleCounts(final VariantContext vc, final double[] log10AlleleFrequencies) {
|
||||
final int numAlleles = vc.getNAlleles();
|
||||
Utils.validateArg(numAlleles == log10AlleleFrequencies.length, "number of alleles inconsistent");
|
||||
final double[] result = new double[numAlleles];
|
||||
final double[] log10Result = new double[numAlleles];
|
||||
Arrays.fill(log10Result, Double.NEGATIVE_INFINITY);
|
||||
for (final Genotype g : vc.getGenotypes()) {
|
||||
if (!g.hasLikelihoods()) {
|
||||
continue;
|
||||
}
|
||||
final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(g.getPloidy(), numAlleles);
|
||||
final double[] genotypePosteriors = normalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);
|
||||
final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, glCalc, log10AlleleFrequencies);
|
||||
|
||||
new IndexRange(0, glCalc.genotypeCount()).forEach(genotypeIndex ->
|
||||
glCalc.genotypeAlleleCountsAt(genotypeIndex).forEachAlleleIndexAndCount((alleleIndex, count) ->
|
||||
result[alleleIndex] += genotypePosteriors[genotypeIndex] * count));
|
||||
log10Result[alleleIndex] = MathUtils.log10SumLog10(log10Result[alleleIndex], log10GenotypePosteriors[genotypeIndex] + MathUtils.Log10Cache.get(count))));
|
||||
}
|
||||
return result;
|
||||
return MathUtils.applyToArrayInPlace(log10Result, x -> Math.pow(10.0, x));
|
||||
}
|
||||
|
||||
private static double[] normalizedGenotypePosteriors(final Genotype g, final GenotypeLikelihoodCalculator glCalc, final double[] log10AlleleFrequencies) {
|
||||
private static double[] log10NormalizedGenotypePosteriors(final Genotype g, final GenotypeLikelihoodCalculator glCalc, final double[] log10AlleleFrequencies) {
|
||||
final double[] log10Likelihoods = g.getLikelihoods().getAsVector();
|
||||
final double[] unnormalizedLog10Likelihoods = new IndexRange(0, glCalc.genotypeCount()).mapToDouble(genotypeIndex -> {
|
||||
final double[] log10Posteriors = new IndexRange(0, glCalc.genotypeCount()).mapToDouble(genotypeIndex -> {
|
||||
final GenotypeAlleleCounts gac = glCalc.genotypeAlleleCountsAt(genotypeIndex);
|
||||
return gac.log10CombinationCount() + log10Likelihoods[genotypeIndex]
|
||||
+ gac.sumOverAlleleIndicesAndCounts((index, count) -> count * log10AlleleFrequencies[index]);
|
||||
});
|
||||
return MathUtils.normalizeFromLog10(unnormalizedLog10Likelihoods);
|
||||
return MathUtils.normalizeFromLog10(log10Posteriors, true);
|
||||
}
|
||||
|
||||
@Override //Note: unused
|
||||
|
|
|
|||
|
|
@ -330,6 +330,10 @@ public class MathUtils {
|
|||
return log10sumLog10(log10values, 0);
|
||||
}
|
||||
|
||||
public static double log10SumLog10(final double a, final double b) {
|
||||
return a > b ? a + Math.log10(1 + Math.pow(10.0, b - a)) : b + Math.log10(1 + Math.pow(10.0, a - b));
|
||||
}
|
||||
|
||||
public static boolean wellFormedDouble(final double val) {
|
||||
return !Double.isInfinite(val) && !Double.isNaN(val);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue