A clean, fast way to compute fragment pileups. Now consumes no CPU time at all. Ready for general use.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5524 348d0f76-0448-11de-a6fe-93d51630548a
2011-03-27 14:26:29 +00:00 · 2011-03-27 14:26:29 +00:00 · 231d095316
parent bae0b6cba8
commit 231d095316
2 changed files with 99 additions and 70 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java
@ -34,10 +34,6 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.utils.pileup.PileupElement;
 import org.broadinstitute.sting.utils.genotype.DiploidGenotype;

-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Set;
-
 import static java.lang.Math.log10;
 import static java.lang.Math.pow;

@ -240,6 +236,14 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
        return getPriors()[g.ordinal()];
    }

+    // -------------------------------------------------------------------------------------
+    //
+    // add() routines.  These are the workhorse routines for calculating the overall genotype
+    // likelihoods given observed bases and reads.  Includes high-level operators all the
+    // way down to single base and qual functions.
+    //
+    // -------------------------------------------------------------------------------------
+
    public int add(ReadBackedPileup pileup) {
        return add(pileup, false, false);
    }
@ -258,59 +262,58 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
        int n = 0;

        // for each fragment, add to the likelihoods
-        for ( FragmentPileup.PerFragmentPileupElement fragment : new FragmentPileup(pileup) ) {
-            n += add(fragment, ignoreBadBases, capBaseQualsAtMappingQual);
-        }
+        FragmentPileup fpile = new FragmentPileup(pileup);
+
+        for ( PileupElement p : fpile.getOneReadPileup() )
+            n += add(p, ignoreBadBases, capBaseQualsAtMappingQual);
+
+        for ( FragmentPileup.TwoReadPileupElement twoRead : fpile.getTwoReadPileup() )
+            n += add(twoRead, ignoreBadBases, capBaseQualsAtMappingQual);

        return n;
    }
    public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
-        return add(new FragmentPileup.PerFragmentPileupElement(elt), ignoreBadBases, capBaseQualsAtMappingQual);
+        byte obsBase = elt.getBase();
+        byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual);
+        return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0) : 0;
    }

-    private final static byte qualToUse(PileupElement p, boolean capBaseQualsAtMappingQual) {
-        byte qual = p.getQual();
+    public int add(FragmentPileup.TwoReadPileupElement twoRead, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
+        final byte observedBase1 = twoRead.getFirst().getBase();
+        final byte qualityScore1 = qualToUse(twoRead.getFirst(), ignoreBadBases, capBaseQualsAtMappingQual);
+        final byte observedBase2 = twoRead.getSecond().getBase();
+        final byte qualityScore2 = qualToUse(twoRead.getSecond(), ignoreBadBases, capBaseQualsAtMappingQual);

-        if ( qual > SAMUtils.MAX_PHRED_SCORE )
-            throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s.  Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
-        if ( capBaseQualsAtMappingQual )
-            qual = (byte)Math.min((int)p.getQual(), p.getMappingQual());
-
-        return qual;
+        if ( qualityScore1 == 0 ) {
+            if ( qualityScore2 == 0 ) // abort early if we didn't see any good bases
+                return 0;
+            else {
+                return add(observedBase2, qualityScore2, (byte)0, (byte)0);
+            }
+        } else {
+            return add(observedBase1, qualityScore1, observedBase2, qualityScore2);
+        }
    }

-    public int add(FragmentPileup.PerFragmentPileupElement fragment, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
+    /**
+     *
+     * @param obsBase1
+     * @param qual1
+     * @param obsBase2
+     * @param qual2 can be 0, indicating no second base was observed for this fragment
+     * @return
+     */
+    private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) {
        // TODO-- Right now we assume that there are at most 2 reads per fragment.  This assumption is fine
        // TODO--   given the current state of next-gen sequencing, but may need to be fixed in the future.
        // TODO--   However, when that happens, we'll need to be a lot smarter about the caching we do here.
-        byte observedBase1 = 0, observedBase2 = 0, qualityScore1 = 0, qualityScore2 = 0;
-
-        if ( usableBase(fragment.getFirst(), ignoreBadBases) ) {
-            observedBase1 = fragment.getFirst().getBase();
-            qualityScore1 = qualToUse(fragment.getFirst(), capBaseQualsAtMappingQual);
-        }
-
-        if ( fragment.hasSecond() && usableBase(fragment.getSecond(), ignoreBadBases) ) {
-            observedBase2 = fragment.getSecond().getBase();
-            qualityScore2 = qualToUse(fragment.getSecond(), capBaseQualsAtMappingQual);
-        }
-
-        if ( observedBase1 == 0 ) {
-            if ( observedBase2 == 0 ) // abort early if we didn't see any good bases
-                return 0;
-            else { // otherwise make 2 1
-                observedBase1 = observedBase2;
-                qualityScore1 = qualityScore2;
-                observedBase2 = qualityScore2 = 0;
-            }
-        }

        // Just look up the cached result if it's available, or compute and store it
        DiploidSNPGenotypeLikelihoods gl;
-        if ( ! inCache(observedBase1, qualityScore1, observedBase2, qualityScore2, FIXED_PLOIDY) ) {
-            gl = calculateCachedGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2, FIXED_PLOIDY);
+        if ( ! inCache(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY) ) {
+            gl = calculateCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY);
        } else {
-            gl = getCachedGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2, FIXED_PLOIDY);
+            gl = getCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY);
        }

        // for bad bases, there are no likelihoods
@ -334,6 +337,12 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
        return 1;
    }

+    // -------------------------------------------------------------------------------------
+    //
+    // Dealing with the cache routines
+    //
+    // -------------------------------------------------------------------------------------
+
    static DiploidSNPGenotypeLikelihoods[][][][][] CACHE = new DiploidSNPGenotypeLikelihoods[BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][BaseUtils.BASES.length+1][QualityUtils.MAX_QUAL_SCORE+1][MAX_PLOIDY];

    protected boolean inCache(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
@ -475,6 +484,31 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
        return logP;
    }

+    /**
+     * Helper function that returns the phred-scaled base quality score we should use for calculating
+     * likelihoods for a pileup element.  May return 0 to indicate that the observation is bad, and may
+     * cap the quality score by the mapping quality of the read itself.
+     *
+     * @param p
+     * @param ignoreBadBases
+     * @param capBaseQualsAtMappingQual
+     * @return
+     */
+    private final static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
+        if ( ! usableBase(p, ignoreBadBases) ) {
+            return 0;
+        } else {
+            byte qual = p.getQual();
+
+            if ( qual > SAMUtils.MAX_PHRED_SCORE )
+                throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s.  Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
+            if ( capBaseQualsAtMappingQual )
+                qual = (byte)Math.min((int)p.getQual(), p.getMappingQual());
+
+            return qual;
+        }
+    }
+
    // -----------------------------------------------------------------------------------------------------------------
    //
    //
--- a/java/src/org/broadinstitute/sting/utils/pileup/FragmentPileup.java
+++ b/java/src/org/broadinstitute/sting/utils/pileup/FragmentPileup.java
@ -3,17 +3,23 @@ package org.broadinstitute.sting.utils.pileup;
 import java.util.*;

 /**
- * An easy to access fragment-based pileup.  new FragmentPileup(RBP) creates one, and you
- * can either iterate over or get the collection of PerFragmentPileupElements.
+ * An easy to access fragment-based pileup, which contains two separate pileups.  The first
+ * is a regular collection of PileupElements containing all of the reads in the original RBP
+ * that uniquely info about a fragment.  The second are TwoReadPileupElements that, as the
+ * name suggests, contain two reads that are sequenced from the same underlying fragment.
 *
 * Based on the original code by E. Banks
 *
+ * TODO -- technically we could generalize this code to support a pseudo-duplicate marking
+ * TODO -- algorithm that could collect all duplicates into a single super pileup element
+ *
 * User: depristo
 * Date: 3/26/11
 * Time: 10:09 PM
 */
-public class FragmentPileup implements Iterable<FragmentPileup.PerFragmentPileupElement> {
-    final Collection<PerFragmentPileupElement> fragments = new ArrayList<PerFragmentPileupElement>();
+public class FragmentPileup {
+    final Collection<PileupElement> oneReadPile;
+    final Collection<TwoReadPileupElement> twoReadPile = new ArrayList<TwoReadPileupElement>();

    /**
     * Create a new Fragment-based pileup from the standard read-based pileup
@ -29,58 +35,50 @@ public class FragmentPileup implements Iterable<FragmentPileup.PerFragmentPileup
            PileupElement pe1 = nameMap.get(readName);
            if ( pe1 != null ) {
                // assumes we have at most 2 reads per fragment
-                fragments.add(new PerFragmentPileupElement(pe1, p));
+                twoReadPile.add(new TwoReadPileupElement(pe1, p));
                nameMap.remove(readName);
            } else {
                nameMap.put(readName, p);
            }
        }

-        // now go through the values in the nameMap to get the fragments with only a single read
-        for ( PileupElement p : nameMap.values() )
-            fragments.add(new PerFragmentPileupElement(p));
+        // now set the one Read pile to the values in the nameMap with only a single read
+        oneReadPile = nameMap.values();
    }

    /**
-     * Gets the fragments, in no particular order
+     * Gets the pileup elements containing two reads, in no particular order
     *
     * @return
     */
-    public Collection<PerFragmentPileupElement> getFragments() {
-        return fragments;
+    public Collection<TwoReadPileupElement> getTwoReadPileup() {
+        return twoReadPile;
    }

    /**
-     * Returns an iterator over the fragments.  No specific order of fragments is assumed
+     * Gets the pileup elements containing one read, in no particular order
+     *
     * @return
     */
-    public Iterator<PerFragmentPileupElement> iterator() {
-        return fragments.iterator();
+    public Collection<PileupElement> getOneReadPileup() {
+        return oneReadPile;
    }

    /**
     * Useful helper class to represent a full read pair at a position
     *
-     * User: ebanks
+     * User: ebanks, depristo
     * Date: Jan 10, 2011
     */
-    public static class PerFragmentPileupElement {
-        protected PileupElement PE1 = null, PE2 = null;
-
-        /**
-         * Creates a fragment element that only contains a single read
-         * @param PE
-         */
-        public PerFragmentPileupElement(PileupElement PE) {
-            PE1 = PE;
-        }
+    public static class TwoReadPileupElement {
+        final protected PileupElement PE1, PE2;

        /**
         * Creates a fragment element that contains both ends of a paired end read
         * @param PE1
         * @param PE2
         */
-        public PerFragmentPileupElement(PileupElement PE1, PileupElement PE2) {
+        public TwoReadPileupElement(PileupElement PE1, PileupElement PE2) {
            this.PE1 = PE1;
            this.PE2 = PE2;
        }
@ -88,9 +86,6 @@ public class FragmentPileup implements Iterable<FragmentPileup.PerFragmentPileup
        /** Returns the first pileup element -- never null */
        public PileupElement getFirst() { return PE1; }

-        /** Is there a second read in this fragment element? */
-        public boolean hasSecond() { return PE2 != null; }
-
        /** Returns the second read in this fragment element.  May be null */
        public PileupElement getSecond() { return PE2; }
    }