From f5d910b8a54b1dbceab506a7260ebaf1d9c4796b Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Sun, 23 Oct 2011 13:29:08 -0400 Subject: [PATCH] Haplotype caller now sends genotype likelihoods to the exact model to genotype the events found in the best haplotypes. --- .../AlleleFrequencyCalculationModel.java | 6 +- .../genotyper/ExactAFCalculationModel.java | 4 +- .../genotyper/GridSearchAFEstimation.java | 4 +- .../MultiallelicGenotypeLikelihoods.java | 7 +- .../genotyper/UnifiedGenotyperEngine.java | 79 ++++++++++++++++++- 5 files changed, 82 insertions(+), 18 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index 1bb697056..35a9fe31d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -68,16 +68,12 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { /** * Must be overridden by concrete subclasses - * @param tracker rod data - * @param ref reference context * @param GLs genotype likelihoods * @param Alleles Alleles corresponding to GLs * @param log10AlleleFrequencyPriors priors * @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results */ - protected abstract void getLog10PNonRef(RefMetaDataTracker tracker, - ReferenceContext ref, - Map GLs, List Alleles, + protected abstract void getLog10PNonRef(Map GLs, List Alleles, double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPosteriors); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index f87eae781..1c2d82ab7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -51,9 +51,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { super(UAC, N, logger, verboseWriter); } - public void getLog10PNonRef(RefMetaDataTracker tracker, - ReferenceContext ref, - Map GLs, List alleles, + public void getLog10PNonRef(Map GLs, List alleles, double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPosteriors) { final int numAlleles = alleles.size(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java index f4195e5f0..27842a8bf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java @@ -52,9 +52,7 @@ public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel { AFMatrix = new AlleleFrequencyMatrix(N); } - protected void getLog10PNonRef(RefMetaDataTracker tracker, - ReferenceContext ref, - Map GLs, List alleles, + protected void getLog10PNonRef(Map GLs, List alleles, double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPosteriors) { initializeAFMatrix(GLs); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java index 3652763de..4f378b24a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java @@ -4,6 +4,7 @@ import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.variantcontext.Allele; import java.util.ArrayList; +import java.util.List; /** * Created by IntelliJ IDEA. @@ -15,11 +16,11 @@ import java.util.ArrayList; public class MultiallelicGenotypeLikelihoods { private String sample; private double[] GLs; - private ArrayList alleleList; + private List alleleList; private int depth; public MultiallelicGenotypeLikelihoods(String sample, - ArrayList A, + List A, double[] log10Likelihoods, int depth) { /* Check for consistency between likelihood vector and number of alleles */ int numAlleles = A.size(); @@ -40,7 +41,7 @@ public class MultiallelicGenotypeLikelihoods { return GLs; } - public ArrayList getAlleles() { + public List getAlleles() { return alleleList; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index b72b68f9f..adb423145 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -325,7 +325,7 @@ public class UnifiedGenotyperEngine { // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position) clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(tracker, refContext, vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); + afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); // find the most likely frequency int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()); @@ -383,7 +383,7 @@ public class UnifiedGenotyperEngine { // the overall lod VariantContext vcOverall = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, vc.getAlternateAllele(0), false, model); clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(tracker, refContext, vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); + afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); //double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); @@ -391,7 +391,7 @@ public class UnifiedGenotyperEngine { // the forward lod VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, vc.getAlternateAllele(0), false, model); clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(tracker, refContext, vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); + afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); @@ -400,7 +400,7 @@ public class UnifiedGenotyperEngine { // the reverse lod VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, vc.getAlternateAllele(0), false, model); clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(tracker, refContext, vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); + afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); @@ -447,6 +447,77 @@ public class UnifiedGenotyperEngine { return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); } + public VariantCallContext calculateGenotypes(final VariantContext vc, final GenomeLoc loc, final GenotypeLikelihoodsCalculationModel.Model model) { + + // initialize the data for this thread if that hasn't been done yet + if ( afcm.get() == null ) { + log10AlleleFrequencyPosteriors.set(new double[N+1]); + afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); + } + + // estimate our confidence in a reference call and return + if ( vc.getNSamples() == 0 ) + return null; + + // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position) + clearAFarray(log10AlleleFrequencyPosteriors.get()); + afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); + + // find the most likely frequency + int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()); + + // calculate p(f>0) + double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get()); + double sum = 0.0; + for (int i = 1; i <= N; i++) + sum += normalizedPosteriors[i]; + double PofF = Math.min(sum, 1.0); // deal with precision errors + + double phredScaledConfidence; + if ( bestAFguess != 0 || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); + if ( Double.isInfinite(phredScaledConfidence) ) + phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0]; + } else { + phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); + if ( Double.isInfinite(phredScaledConfidence) ) { + sum = 0.0; + for (int i = 1; i <= N; i++) { + if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) + break; + sum += log10AlleleFrequencyPosteriors.get()[i]; + } + phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); + } + } + + // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero + if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestAFguess) ) { + // technically, at this point our confidence in a reference call isn't accurately estimated + // because it didn't take into account samples with no data, so let's get a better estimate + return null; + } + + // create the genotypes + Map genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess); + + // *** note that calculating strand bias involves overwriting data structures, so we do that last + HashMap attributes = new HashMap(); + + int endLoc = calculateEndPos(vc.getAlleles(), vc.getReference(), loc); + + Set myAlleles = new HashSet(vc.getAlleles()); + // strip out the alternate allele if it's a ref call + if ( bestAFguess == 0 && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) { + myAlleles = new HashSet(1); + myAlleles.add(vc.getReference()); + } + VariantContext vcCall = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc, + myAlleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence) ? null : filter, attributes, vc.getReferenceBaseForIndel()); + + return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); + } + private int calculateEndPos(Collection alleles, Allele refAllele, GenomeLoc loc) { // TODO - temp fix until we can deal with extended events properly // for indels, stop location is one more than ref allele length