Faster implementation of the active state profile value calculation when running HC with a single sample.

Find out about a dev-bug and added TODOs (reported in #1096).

Addresses issue #1095.

Conflicts:
	protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java
This commit is contained in:
vruano 2015-07-29 16:52:11 -04:00
parent 9819624801
commit 604fb7aaf8
3 changed files with 57 additions and 3 deletions

View File

@ -504,7 +504,7 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
* @return never {@code null}, an array with exactly <code>total-ploidy(vc) + 1</code> positions.
*/
protected final double[] getAlleleFrequencyPriors( final VariantContext vc, final int defaultPloidy, final GenotypeLikelihoodsCalculationModel.Model model ) {
final int totalPloidy = GATKVariantContextUtils.totalPloidy(vc,defaultPloidy);
final int totalPloidy = GATKVariantContextUtils.totalPloidy(vc, defaultPloidy);
switch (model) {
case SNP:
case GENERALPLOIDYSNP:
@ -668,4 +668,50 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
return MLEfrequencies;
}
/**
* Calculates the active state profile value for a single sample.
*
* @param log10GenotypeLikelihoods the single sample genotype likelihoods.
* @return log10 probability from 0 to -Infinity.
*/
public double calculateSingleSampleRefVsAnyActiveStateProfileValue(final double[] log10GenotypeLikelihoods) {
if (log10GenotypeLikelihoods == null) {
throw new IllegalArgumentException("the input likelihoods cannot be null");
} else if (log10GenotypeLikelihoods.length != this.configuration.genotypeArgs.samplePloidy + 1) {
throw new IllegalArgumentException("wrong likelihoods dimensions");
} else {
final double[] log10Priors = log10AlleleFrequencyPriorsSNPs.forTotalPloidy(this.configuration.genotypeArgs.samplePloidy);
final double log10ACeq0Likelihood = log10GenotypeLikelihoods[0];
final double log10ACeq0Prior = log10Priors[0];
final double log10ACeq0Posterior = log10ACeq0Likelihood + log10ACeq0Prior;
// If the Maximum a-posteriori AC is 0 then the profile value must be 0.0 as per existing code; it does
// not matter whether a AC > 0 is at all plausible.
boolean mapACeq0 = true;
for (int AC = 1; AC < log10Priors.length; AC++)
if (log10Priors[AC] + log10GenotypeLikelihoods[AC] > log10ACeq0Posterior) {
mapACeq0 = false;
break;
}
if (mapACeq0)
return 0.0;
//TODO bad way to calculate AC > 0 posterior that follows the current behaviour of ExactAFCalculator (StateTracker)
//TODO this is the lousy part... this code just adds up lks and priors of AC != 0 before as if
//TODO Sum(a_i * b_i) is equivalent to Sum(a_i) * Sum(b_i)
//TODO This has to be changed not just here but also in the AFCalculators (StateTracker).
final double log10ACgt0Likelihood = MathUtils.approximateLog10SumLog10(log10GenotypeLikelihoods, 1, log10GenotypeLikelihoods.length);
final double log10ACgt0Prior = MathUtils.approximateLog10SumLog10(log10Priors, 1, log10Priors.length);
final double log10ACgt0Posterior = log10ACgt0Likelihood + log10ACgt0Prior;
final double log10PosteriorNormalizationConstant = MathUtils.approximateLog10SumLog10(log10ACeq0Posterior, log10ACgt0Posterior);
//TODO End of lousy part.
final double normalizedLog10ACeq0Posterior = log10ACeq0Posterior - log10PosteriorNormalizationConstant;
// This is another condition to return a 0.0 also present in AFCalculator code as well.
if (normalizedLog10ACeq0Posterior >= QualityUtils.qualToErrorProbLog10(configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING))
return 0.0;
return 1.0 - Math.pow(10.0, normalizedLog10ACeq0Posterior);
}
}
}

View File

@ -222,6 +222,7 @@ final class StateTracker {
@Requires("allelesUsedInGenotyping != null")
protected AFCalculationResult toAFCalculationResult(final double[] log10PriorsByAC) {
final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1);
//TODO bad calculation of normalized log10 ACeq0 and ACgt0 likelihoods, priors and consequently posteriors calculated in AFCalculationResult constructor.
final double[] log10Likelihoods = MathUtils.normalizeFromLog10(new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero()}, true);
final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true);

View File

@ -811,9 +811,16 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
}
final List<Allele> alleles = Arrays.asList(FAKE_REF_ALLELE , FAKE_ALT_ALLELE);
final VariantCallContext vcOut = activeRegionEvaluationGenotyperEngine.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.SNP);
final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() );
final double isActiveProb;
if (genotypes.size() == 1) {
// Faster implementation avoiding the costly and over complicated Exact AFCalculator machinery:
// This is the case when doing GVCF output.
isActiveProb = activeRegionEvaluationGenotyperEngine.calculateSingleSampleRefVsAnyActiveStateProfileValue(genotypes.get(0).getLikelihoods().getAsVector());
} else {
final VariantCallContext vcOut = activeRegionEvaluationGenotyperEngine.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.SNP);
isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb(vcOut.getPhredScaledQual());
}
return new ActivityProfileState( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > AVERAGE_HQ_SOFTCLIPS_HQ_BASES_THRESHOLD ? ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileState.Type.NONE, averageHQSoftClips.mean() );
}