Optimization to pooled calculation model: stop calculating P(D|AF) if we are beyond the max likelihood such that subsequent likelihoods won't factor into the confidence score. Also, use new Pileup interface.

Pooled calling now takes less than half the time it used to. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2449 348d0f76-0448-11de-a6fe-93d51630548a
2009-12-28 18:39:55 +00:00 · 2009-12-28 18:39:55 +00:00 · c8d0e6e004
parent b1ac4b81d5
commit c8d0e6e004
1 changed files with 25 additions and 13 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PooledCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PooledCalculationModel.java
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;

 import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
 import org.broadinstitute.sting.utils.BaseUtils;
 import org.broadinstitute.sting.utils.QualityUtils;

@ -12,6 +13,8 @@ import net.sf.samtools.SAMRecord;
 public class PooledCalculationModel extends JointEstimateGenotypeCalculationModel {
    protected static final String POOL_SAMPLE_NAME = "POOL";

+    private static final Double LOG10_OPTIMIZATION_EPSILON = 8.0;
+
    private static FourBaseProbabilities fourBaseLikelihoods = null;
    private static boolean USE_CACHE = true;

@ -52,24 +55,33 @@ public class PooledCalculationModel extends JointEstimateGenotypeCalculationMode
        int refIndex = BaseUtils.simpleBaseToBaseIndex(ref);
        int altIndex = BaseUtils.simpleBaseToBaseIndex(alt);

-        for (int i = 0; i < pileup.getReads().size(); i++) {
-            int offset = pileup.getOffsets().get(i);
+        double maxLikelihoodSeen = -1.0 * Double.MAX_VALUE;

-            // ignore deletions
-            if ( offset == -1 )
-                continue;
+        // for each minor allele frequency, calculate log10PofDgivenAFi
+        for (int frequency = 0; frequency <= nChromosomes; frequency++) {

-            SAMRecord read = pileup.getReads().get(i);
-            char base = (char)read.getReadBases()[offset];
-            int bIndex = BaseUtils.simpleBaseToBaseIndex(base);
-            byte qual = read.getBaseQualities()[offset];
+            for ( PileupElement p : pileup ) {
+                // ignore deletions
+                if ( p.isDeletion() )
+                    continue;

-            if ( qual > 0 && bIndex != -1 ) {
+                char base = (char)p.getBase();
+                byte qual = p.getQual();

-                // for each minor allele frequency, calculate log10PofDgivenAFi
-                for (int frequency = 0; frequency <= nChromosomes; frequency++)
-                    log10PofDgivenAFi[altIndex][frequency] += calcPBGivenH(refIndex, altIndex, frequency, nChromosomes, base, qual, read, offset);
+                if ( qual > 0 && BaseUtils.simpleBaseToBaseIndex(base) != -1 )
+                    log10PofDgivenAFi[altIndex][frequency] += calcPBGivenH(refIndex, altIndex, frequency, nChromosomes, base, qual, p.getRead(), p.getOffset());
            }
+
+            // an optimization to speed up the calculation: if we are beyond the local maximum such
+            //  that subsequent likelihoods won't factor into the confidence score, just quit
+            if ( frequency > 0 && maxLikelihoodSeen - log10PofDgivenAFi[altIndex][frequency] > LOG10_OPTIMIZATION_EPSILON ) {
+                while ( ++frequency <= nChromosomes )
+                    log10PofDgivenAFi[altIndex][frequency] = -1.0 * Double.MAX_VALUE;
+                return;
+            }
+
+            if ( log10PofDgivenAFi[altIndex][frequency] > maxLikelihoodSeen )
+                maxLikelihoodSeen = log10PofDgivenAFi[altIndex][frequency];
        }
    }