Merged bug fix from Stable into Unstable

This commit is contained in:
Mauricio Carneiro 2012-05-25 13:03:05 -04:00
commit 4109fcbb08
1 changed file with 37 additions and 105 deletions

View File

@ -25,15 +25,11 @@
package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.PairHMM;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -43,7 +39,6 @@ import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
public class PairHMMIndelErrorModel {
@ -58,7 +53,6 @@ public class PairHMMIndelErrorModel {
private static final double baseMismatchArray[];
private final static double LOG_ONE_HALF;
private final static double END_GAP_COST;
private static final int START_HRUN_GAP_IDX = 4;
private static final int MAX_HRUN_GAP_IDX = 20;
@ -76,7 +70,6 @@ public class PairHMMIndelErrorModel {
static {
LOG_ONE_HALF= -Math.log10(2.0);
END_GAP_COST = LOG_ONE_HALF;
baseMatchArray = new double[MAX_CACHED_QUAL+1];
baseMismatchArray = new double[MAX_CACHED_QUAL+1];
@ -128,7 +121,6 @@ public class PairHMMIndelErrorModel {
// compute forward hrun length, example:
// AGGTGACCCCCCTGAGAG
// 001000012345000000
int runCount = 0;
hrunArray[0] = 0;
int[] hforward = new int[hrunArray.length];
int[] hreverse = new int[hrunArray.length];
@ -172,17 +164,13 @@ public class PairHMMIndelErrorModel {
}
}
}
public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele,Haplotype> haplotypeMap,
ReferenceContext ref, int eventLength,
HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){
int numHaplotypes = haplotypeMap.size();
public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele,Haplotype> haplotypeMap, ReferenceContext ref, int eventLength, HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){
final int numHaplotypes = haplotypeMap.size();
final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][numHaplotypes];
final int readCounts[] = new int[pileup.getNumberOfElements()];
final PairHMM pairHMM = new PairHMM(bandedLikelihoods);
int readIdx=0;
PairHMM pairHMM = new PairHMM(bandedLikelihoods);
for (PileupElement p: pileup) {
// > 1 when the read is a consensus read representing multiple independent observations
readCounts[readIdx] = p.getRepresentativeCount();
@ -199,84 +187,46 @@ public class PairHMMIndelErrorModel {
if (DEBUG) {
System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString());
}
// System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName());
GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
if (read.isEmpty())
continue;
if (read.getUnclippedEnd() > ref.getWindow().getStop())
if (read.getSoftEnd() > ref.getWindow().getStop())
read = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, ref.getWindow().getStop());
if (read.isEmpty())
continue;
if (read.getUnclippedStart() < ref.getWindow().getStart())
if (read.getSoftStart() < ref.getWindow().getStart())
read = ReadClipper.hardClipByReferenceCoordinatesLeftTail (read, ref.getWindow().getStart());
if (read.isEmpty())
continue;
// hard-clip low quality ends - this may introduce extra H elements in CIGAR string
read = ReadClipper.hardClipLowQualEnds(read,(byte)BASE_QUAL_THRESHOLD );
read = ReadClipper.hardClipLowQualEnds(read, (byte) BASE_QUAL_THRESHOLD );
if (read.isEmpty())
continue;
// get bases of candidate haplotypes that overlap with reads
final int trailingBases = 3;
long readStart = read.getUnclippedStart();
long readEnd = read.getUnclippedEnd();
int numStartSoftClippedBases, numEndSoftClippedBases;
final long readStart = read.getSoftStart();
final long readEnd = read.getSoftEnd();
// see if we want to use soft clipped bases. Aligners may soft clip all bases at insertions because they don't match,
// but they're actually consistent with the insertion!
// Rule: if a read starts in interval [eventStart-eventLength,eventStart+1] and we are at an insertion, we'll use all soft clipped bases at the beginning.
// Conversely, if a read ends at [eventStart,eventStart+eventLength] we'll use all soft clipped bases in the end of the read.
long eventStartPos = ref.getLocus().getStart();
// compute total number of clipped bases (soft or hard clipped)
numStartSoftClippedBases = read.getAlignmentStart()- read.getUnclippedStart();
numEndSoftClippedBases = read.getUnclippedEnd()- read.getAlignmentEnd();
// check for hard clips (never consider these bases):
Cigar c = read.getCigar();
CigarElement first = c.getCigarElement(0);
CigarElement last = c.getCigarElement(c.numCigarElements()-1);
int numStartHardClippedBases = 0, numEndHardClippedBases = 0;
if (first.getOperator() == CigarOperator.H) {
numStartHardClippedBases = first.getLength();
}
if (last.getOperator() == CigarOperator.H) {
numEndHardClippedBases = last.getLength();
}
// correct for hard clips
numStartSoftClippedBases -= numStartHardClippedBases;
numEndSoftClippedBases -= numEndHardClippedBases;
readStart += numStartHardClippedBases;
readEnd -= numEndHardClippedBases;
// remove soft clips if necessary
if ((read.getAlignmentStart()>=eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) ||
(read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)) {
numStartSoftClippedBases = 0;
numEndSoftClippedBases = 0;
}
byte[] unclippedReadBases, unclippedReadQuals;
int numStartClippedBases = numStartSoftClippedBases;
int numEndClippedBases = numEndSoftClippedBases;
unclippedReadBases = read.getReadBases();
unclippedReadQuals = read.getBaseQualities();
final long eventStartPos = ref.getLocus().getStart();
// compute total number of clipped bases (soft or hard clipped) and only use them if necessary
final boolean softClips = useSoftClippedBases(read, eventStartPos, eventLength);
final int numStartSoftClippedBases = softClips ? read.getAlignmentStart()- read.getSoftStart() : 0;
final int numEndSoftClippedBases = softClips ? read.getSoftEnd()- read.getAlignmentEnd() : 0 ;
final byte [] unclippedReadBases = read.getReadBases();
final byte [] unclippedReadQuals = read.getBaseQualities();
final int extraOffset = Math.abs(eventLength);
/**
@ -284,64 +234,53 @@ public class PairHMMIndelErrorModel {
* Read start and stop locations (variables readStart and readEnd) are the original unclipped positions from SAMRecord,
* adjusted by hard clips from Cigar string and by qual-based soft-clipping performed above.
* We will propose haplotypes that overlap the read with some padding.
* True read start = readStart + numStartClippedBases - ReadUtils.getFirstInsertionOffset(read)
* True read start = readStart + numStartSoftClippedBases - ReadUtils.getFirstInsertionOffset(read)
* Last term is because if a read starts with an insertion then these bases are not accounted for in readStart.
* trailingBases is a padding constant(=3) and we additionally add abs(eventLength) to both sides of read to be able to
* differentiate context between two haplotypes
*/
long startLocationInRefForHaplotypes = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0);
long stopLocationInRefForHaplotypes = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset;
long startLocationInRefForHaplotypes = Math.max(readStart + numStartSoftClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0);
long stopLocationInRefForHaplotypes = readEnd -numEndSoftClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset;
if (DEBUG)
System.out.format("orig Start:%d orig stop: %d\n", startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes);
int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases;
// check if start of read will be before start of reference context
if (startLocationInRefForHaplotypes < ref.getWindow().getStart()) {
// read starts before haplotype: read will have to be cut
//numStartClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes;
startLocationInRefForHaplotypes = ref.getWindow().getStart();
startLocationInRefForHaplotypes = ref.getWindow().getStart(); // read starts before haplotype: read will have to be cut numStartSoftClippedBases += ref.getWindow().getStart() - startLocationInRefForHaplotypes;
}
// check also if end of read will go beyond reference context
if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) {
//numEndClippedBases += stopLocationInRefForHaplotypes - ref.getWindow().getStop();
stopLocationInRefForHaplotypes = ref.getWindow().getStop();
stopLocationInRefForHaplotypes = ref.getWindow().getStop(); // check also if end of read will go beyond reference context
}
// if there's an insertion in the read, the read stop position will be less than start + read length,
// but we want to compute likelihoods in the whole region that a read might overlap
if (stopLocationInRefForHaplotypes <= startLocationInRefForHaplotypes + readLength) {
stopLocationInRefForHaplotypes = startLocationInRefForHaplotypes + readLength-1;
stopLocationInRefForHaplotypes = startLocationInRefForHaplotypes + readLength-1; // if there's an insertion in the read, the read stop position will be less than start + read length, but we want to compute likelihoods in the whole region that a read might overlap
}
// ok, we now figured out total number of clipped bases on both ends.
// ok, we now figured out the total number of clipped bases on both ends.
// Figure out where we want to place the haplotype to score read against
if (DEBUG)
System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength());
System.out.format("numStartSoftClippedBases: %d numEndSoftClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
numStartSoftClippedBases, numEndSoftClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength());
LinkedHashMap<Allele,Double> readEl = new LinkedHashMap<Allele,Double>();
/**
* Check if we'll end up with an empty read once all clipping is done
*/
if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) {
if (numStartSoftClippedBases + numEndSoftClippedBases >= unclippedReadBases.length) {
int j=0;
for (Allele a: haplotypeMap.keySet()) {
readEl.put(a,0.0);
readLikelihoods[readIdx][j++] = 0.0;
}
}
else {
final byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartClippedBases,
unclippedReadBases.length-numEndClippedBases);
final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartClippedBases,
unclippedReadBases.length-numEndClippedBases);
final byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartSoftClippedBases, unclippedReadBases.length-numEndSoftClippedBases);
final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartSoftClippedBases, unclippedReadBases.length-numEndSoftClippedBases);
int j=0;
// initialize path metric and traceback memories for likelihood computation
@ -351,7 +290,7 @@ public class PairHMMIndelErrorModel {
final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length];
// get homopolymer length profile for current haplotype
int[] hrunProfile = new int[readBases.length];
final int[] hrunProfile = new int[readBases.length];
getContextHomopolymerLength(readBases,hrunProfile);
fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);
@ -431,6 +370,10 @@ public class PairHMMIndelErrorModel {
return getHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
}
/**
 * Decides whether the soft-clipped bases of {@code read} should be treated as real bases.
 *
 * Aligners may soft clip all bases at an insertion because they don't match the
 * reference, even though they are actually consistent with the insertion. A read
 * whose alignment start falls in [eventStartPos - eventLength, eventStartPos + 1],
 * or whose alignment end falls in [eventStartPos, eventStartPos + eventLength],
 * is considered adjacent to the event.
 *
 * NOTE(review): this returns {@code true} only when the read is NOT adjacent to the
 * event — callers use the result to decide whether to count (and strip) soft-clipped
 * bases; confirm the intended polarity against the caller before renaming.
 *
 * @param read          the (possibly clipped) read under consideration
 * @param eventStartPos reference coordinate where the candidate event starts
 * @param eventLength   length of the candidate indel event
 * @return true when the read does not start near or end within the event window
 */
private boolean useSoftClippedBases(GATKSAMRecord read, long eventStartPos, int eventLength) {
    final long alnStart = read.getAlignmentStart();
    final long alnEnd = read.getAlignmentEnd();
    final boolean startsNearEvent = alnStart >= eventStartPos - eventLength && alnStart <= eventStartPos + 1;
    final boolean endsWithinEvent = alnEnd >= eventStartPos && alnEnd <= eventStartPos + eventLength;
    return !(startsNearEvent || endsWithinEvent);
}
private int computeFirstDifferingPosition(byte[] b1, byte[] b2) {
if (b1.length != b2.length)
return 0; // sanity check
@ -442,18 +385,7 @@ public class PairHMMIndelErrorModel {
return b1.length;
}
/**
 * Finds the index of the first element at which the two double arrays differ,
 * using {@link MathUtils#compareDoubles} for tolerant comparison.
 *
 * @param b1 first array
 * @param b2 second array
 * @return the first differing index; {@code b1.length} when the arrays are
 *         element-wise equal; 0 when the lengths differ (sanity fallback)
 */
private int computeFirstDifferingPosition(double[] b1, double[] b2) {
    if (b1.length != b2.length)
        return 0; // sanity check
    int idx = 0;
    // advance while elements compare equal; stops at the first mismatch
    while (idx < b1.length && MathUtils.compareDoubles(b1[idx], b2[idx]) == 0)
        idx++;
    return idx;
}
private final static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
private static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
// todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix