Refactoring of PairHMM to support reduced reads

2011-09-26 13:28:56 -04:00 · 2011-09-26 13:28:56 -04:00 · fa0efbc4ca
parent a6b65d6347
commit fa0efbc4ca
1 changed files with 151 additions and 144 deletions
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.indels;
 import net.sf.samtools.Cigar;
 import net.sf.samtools.CigarElement;
 import net.sf.samtools.CigarOperator;
+import net.sf.samtools.SAMRecord;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.utils.Haplotype;
 import org.broadinstitute.sting.utils.MathUtils;
@ -711,8 +712,8 @@ public class PairHMMIndelErrorModel {
                                                                 HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){

        int numHaplotypes = haplotypeMap.size();
-        double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
-        double readLikelihoods[][] = new double[pileup.getReads().size()][numHaplotypes];
+        final double readLikelihoods[][] = new double[pileup.size()][numHaplotypes];
+        final int readCounts[] = new int[pileup.size()];
        int readIdx=0;

        LinkedHashMap<Allele,double[]> gapOpenProbabilityMap = new LinkedHashMap<Allele,double[]>();
@ -751,6 +752,9 @@ public class PairHMMIndelErrorModel {
            }
        }
        for (PileupElement p: pileup) {
+            // > 1 when the read is a consensus read representing multiple independent observations
+            final boolean isReduced = ReadUtils.isReducedRead(p.getRead());
+            readCounts[readIdx] = isReduced ? p.getReducedCount() : 1;

            // check if we've already computed likelihoods for this pileup element (i.e. for this read at this location)
            if (indelLikelihoodMap.containsKey(p)) {
@ -762,10 +766,14 @@ public class PairHMMIndelErrorModel {
            }
            else {
                //System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName());
-                GATKSAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead());
+                SAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead());
                if (read == null)
                    continue;

+                if ( isReduced ) {
+                    read = ReadUtils.reducedReadWithReducedQuals(read);
+                }
+
                if(ReadUtils.is454Read(read) && !getGapPenaltiesFromFile) {
                    continue;
                }
@ -971,7 +979,7 @@ public class PairHMMIndelErrorModel {
                            System.out.println(new String(haplotypeBases));
                        }

-                        Double readLikelihood = 0.0;
+                        double readLikelihood = 0.0;
                        if (useAffineGapModel) {

                            double[] currentContextGOP = null;
@ -1004,7 +1012,7 @@ public class PairHMMIndelErrorModel {

        if (DEBUG) {
            System.out.println("\nLikelihood summary");
-            for (readIdx=0; readIdx < pileup.getReads().size(); readIdx++) {
+            for (readIdx=0; readIdx < pileup.size(); readIdx++) {
                System.out.format("Read Index: %d ",readIdx);
                for (int i=0; i < readLikelihoods[readIdx].length; i++)
                    System.out.format("L%d: %f ",i,readLikelihoods[readIdx][i]);
@ -1012,36 +1020,35 @@ public class PairHMMIndelErrorModel {
            }

        }
+
+        return getHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
+    }
+
+    private final static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
+        final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
+
+        // todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplied to just a single loop without the intermediate NxN matrix
        for (int i=0; i < numHaplotypes; i++) {
            for (int j=i; j < numHaplotypes; j++){
                // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j]
                // L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2)
                //readLikelihoods[k][j] has log10(Pr(R_k) | H[j] )
-                 for (readIdx=0; readIdx < pileup.getReads().size(); readIdx++) {
-
+                for (int readIdx = 0; readIdx < readLikelihoods.length; readIdx++) {
                    // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
                    // First term is approximated by Jacobian log with table lookup.
                    if (Double.isInfinite(readLikelihoods[readIdx][i]) && Double.isInfinite(readLikelihoods[readIdx][j]))
                        continue;
-                    haplotypeLikehoodMatrix[i][j] += ( MathUtils.softMax(readLikelihoods[readIdx][i],
-                            readLikelihoods[readIdx][j]) + LOG_ONE_HALF);
-
+                    final double li = readLikelihoods[readIdx][i];
+                    final double lj = readLikelihoods[readIdx][j];
+                    final int readCount = readCounts[readIdx];
+                    haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.softMax(li, lj) + LOG_ONE_HALF);
                }
-
-
            }
        }

-        return getHaplotypeLikelihoods(haplotypeLikehoodMatrix);
-
-    }
-
-    public static double[] getHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) {
-        int hSize = haplotypeLikehoodMatrix.length;
-        double[] genotypeLikelihoods = new double[hSize*(hSize+1)/2];
-
+        final double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2];
        int k=0;
-        for (int j=0; j < hSize; j++) {
+        for (int j=0; j < numHaplotypes; j++) {
            for (int i=0; i <= j; i++){
                genotypeLikelihoods[k++] = haplotypeLikehoodMatrix[i][j];
            }