Support for EXPERIMENT sampling-based genotype likelihoods

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3044 348d0f76-0448-11de-a6fe-93d51630548a
2010-03-19 13:19:40 +00:00 · 2010-03-19 13:19:40 +00:00 · d8ff552311
parent 7b17bcd0af
commit d8ff552311
4 changed files with 190 additions and 4 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java
@ -12,8 +12,14 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
 import java.util.*;

 public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalculationModel {
+    /**
+     * Should we enable the experimental genotype likelihood calculations?
+     */
+    protected boolean useExptGenotypeLikelihoods = false;

-    protected DiploidGenotypeCalculationModel() {}
+    protected DiploidGenotypeCalculationModel(boolean useExptGenotypeLikelihoods) {
+        this.useExptGenotypeLikelihoods = useExptGenotypeLikelihoods;
+    }

    // the GenotypeLikelihoods map
    private HashMap<String, GenotypeLikelihoods> GLs = new HashMap<String, GenotypeLikelihoods>();
@ -21,7 +27,9 @@ public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalcul

    private enum GenotypeType { REF, HET, HOM }

-    protected void initialize(char ref, Map<String, StratifiedAlignmentContext> contexts, StratifiedAlignmentContext.StratifiedContextType contextType) {
+    protected void initialize(char ref,
+                              Map<String, StratifiedAlignmentContext> contexts,
+                              StratifiedAlignmentContext.StratifiedContextType contextType) {
        // initialize the GenotypeLikelihoods
        GLs.clear();
        AFMatrixMap.clear();
@ -40,7 +48,13 @@ public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalcul
            ReadBackedPileup pileup = context.getContext(contextType).getBasePileup();

            // create the GenotypeLikelihoods object
-            GenotypeLikelihoods GL = new GenotypeLikelihoods(baseModel, priors, defaultPlatform);
+            GenotypeLikelihoods GL;
+            if ( useExptGenotypeLikelihoods ) {
+                GL = new SamplingGenotypeLikelihoods(baseModel, priors, defaultPlatform);
+            } else {
+                GL = new GenotypeLikelihoods(baseModel, priors, defaultPlatform);
+            }
+
            GL.add(pileup, true);
            GLs.put(sample, GL);

--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java
@ -19,6 +19,7 @@ public abstract class GenotypeCalculationModel implements Cloneable {

    public enum Model {
        JOINT_ESTIMATE,
+        JOINT_ESTIMATE_EXPT_GL,
        POOLED,
        INDELS
    }
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java
@ -65,7 +65,9 @@ public class GenotypeCalculationModelFactory {
        GenotypeCalculationModel gcm;
        switch ( UAC.genotypeModel ) {
            case JOINT_ESTIMATE:
-                gcm = new DiploidGenotypeCalculationModel();
+            case JOINT_ESTIMATE_EXPT_GL:
+                boolean useExptGenotypeLikelihoods = UAC.genotypeModel == JOINT_ESTIMATE_EXPT_GL;
+                gcm = new DiploidGenotypeCalculationModel(useExptGenotypeLikelihoods);
                break;
            case POOLED:
                gcm = new PooledCalculationModel();
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SamplingGenotypeLikelihoods.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SamplingGenotypeLikelihoods.java
@ -0,0 +1,169 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.utils.*;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
+
+import static java.lang.Math.log10;
+import static java.lang.Math.pow;
+import java.util.Arrays;
+import java.util.Comparator;
+
+public class SamplingGenotypeLikelihoods extends GenotypeLikelihoods {
+    /**
+     * Create a new GenotypeLikelhoods object with flat priors for each diploid genotype
+     *
+     * @param m base model
+     */
+    public SamplingGenotypeLikelihoods(BaseMismatchModel m) {
+        super(m);
+        enableCacheFlag = false;
+    }
+
+    /**
+     * Create a new GenotypeLikelhoods object with flat priors for each diploid genotype
+     *
+     * @param m base model
+     * @param pl default platform
+     */
+    public SamplingGenotypeLikelihoods(BaseMismatchModel m, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) {
+        super(m, pl);
+        enableCacheFlag = false;
+    }
+
+    /**
+     * Create a new GenotypeLikelhoods object with given priors for each diploid genotype
+     *
+     * @param m base model
+     * @param priors priors
+     */
+    public SamplingGenotypeLikelihoods(BaseMismatchModel m, DiploidGenotypePriors priors) {
+        super(m, priors);
+        enableCacheFlag = false;
+    }
+
+    /**
+     * Create a new GenotypeLikelhoods object with given priors for each diploid genotype
+     *
+     * @param m base model
+     * @param priors priors
+     * @param pl default platform
+     */
+    public SamplingGenotypeLikelihoods(BaseMismatchModel m, DiploidGenotypePriors priors, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) {
+        super(m, priors, pl);
+        enableCacheFlag = false;
+    }
+
+    /**
+     * Cloning of the object
+     * @return clone
+     * @throws CloneNotSupportedException
+     */
+    protected Object clone() throws CloneNotSupportedException {
+        return super.clone();
+    }
+
+    /**
+     * Updates likelihoods and posteriors to reflect the additional observations contained within the
+     * read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the
+     * pileup
+     *
+     * @param pileup         read pileup
+     * @param ignoreBadBases should we ignore bad bases
+     * @return the number of good bases found in the pileup
+     */
+    public int add(ReadBackedPileup pileup, boolean ignoreBadBases) {
+        // we're actually going to be happy with our homozygous theories
+        int n = super.add(pileup, ignoreBadBases);
+
+        // Now, loop over the heterygous theories and do our fancy sampling calculation
+        FourBaseProbabilities[] probs = fourBaseProbVector(n, pileup, ignoreBadBases);
+        for ( DiploidGenotype g : DiploidGenotype.values() ) {
+            if ( g.isHet() ) {
+                //System.out.printf("DEBUG: computing best k for %s at %s%n", g, pileup.getLocation());
+                double likelihood = calculateSamplingHetGenotypeLikelihood(probs, g, n);
+                //System.out.printf("DEBUG: L was %.2f%n", likelihood);
+                if ( likelihood != 1 ) {
+                    log10Likelihoods[g.ordinal()] = likelihood;
+                    log10Posteriors[g.ordinal()] = likelihood + priors.getPrior(g);
+                }
+            }
+        }
+
+        return n;
+    }
+
+    public int add(char observedBase, byte qualityScore, SAMRecord read, int offset) {
+        throw new UnsupportedOperationException("BUG: Sampling genotype likelihoods does not support sequential addition of bases; use add(ReadBackedPileup) instead");
+    }
+
+
+    private double calculateSamplingHetGenotypeLikelihood(FourBaseProbabilities[] probs, DiploidGenotype g, int goodBaseDepth) {
+        double bestLog10L = 1;
+        //int bestK = -1;
+
+        for ( int k = 0; k < goodBaseDepth; k++ ) {
+            // for every possible sampling depth of the A allele
+            double log10BinomialSampling = Math.log10(MathUtils.binomialProbabilityLog(k, goodBaseDepth, 0.5));
+            double log10Het = mostLikelyKBases(probs, k, BaseUtils.simpleBaseToBaseIndex(g.base1), BaseUtils.simpleBaseToBaseIndex(g.base2));
+
+            double log10L = log10BinomialSampling + log10Het;
+            //System.out.printf("  DEBUG: computing best k = %d, bin = %.2f, het = %.2f, log10L = %.2f%n", k, log10BinomialSampling, log10Het, log10L);
+
+            if ( log10L > bestLog10L || bestLog10L == 1 ) {
+                bestLog10L = log10L;
+                //bestK = k;
+            }
+        }
+
+        //System.out.printf("  DEBUG: best k = %d with log10L of %.2f%n", bestK, bestLog10L);
+        return bestLog10L;
+    }
+
+    FourBaseProbabilities[] fourBaseProbVector(int n, ReadBackedPileup pileup, boolean ignoreBadBases) {
+        FourBaseProbabilities[] probs = new FourBaseProbabilities[n];
+
+        int i = 0;
+        for ( PileupElement p : pileup ) {
+            char observedBase = (char)p.getBase();
+            byte qualityScore = p.getQual();
+            SAMRecord read = p.getRead();
+            int offset = p.getOffset();
+
+            if ( ! ignoreBadBases || ! badBase(observedBase) ) {
+                FourBaseProbabilities fbl = fourBaseLikelihoods.computeLog10Likelihoods(observedBase, qualityScore, read, offset);
+
+                if ( fbl != null ) {
+                    probs[i++] = fbl;
+                }
+            }
+        }
+
+        return probs;
+    }
+
+    class FourBaseComparator implements Comparator<FourBaseProbabilities> {
+        int index = 0;
+        public FourBaseComparator(int i) { index = i; }
+        public int compare(FourBaseProbabilities a, FourBaseProbabilities b) {
+            return -1 * Double.compare(a.getLog10Likelihood(index), b.getLog10Likelihood(index));
+        }
+    }
+
+    protected static final double log103 = log10(3.0);
+    double mostLikelyKBases(FourBaseProbabilities[] probs, int k, int base1, int base2) {
+        if ( k == 0 ) // optimization -- if k > 1, we've already sorted the vector
+            Arrays.sort(probs, new FourBaseComparator(base1));
+
+        double log10L = 0;
+        for ( int i = 0; i < probs.length; i++ ) {
+            int base = i < k ? base1 : base2;
+            log10L += probs[i].getLog10Likelihood(base) - log103;
+            //System.out.printf("%3d %d %.2f %.2f %.2f%n", i, base, probs[i].getLog10Likelihood(base), probs[i].getLog10Likelihood(base1), probs[i].getLog10Likelihood(base2));
+        }
+
+        return log10L;
+    }
+}