First step in refactoring the way the UnifiedGenotyper (UG) stores indel likelihoods. The main motivation is that rank-sum annotations require per-read quality or likelihood information, and even the question "which allele of a variant is present in a read", which is trivial for SNPs, may not be so straightforward for indels.
This step only changes how likelihoods are stored: instead of an internal matrix, a class member now holds a hash table mapping each pileup element to its per-allele likelihoods. There is no functional change beyond the internal data storage. As a bonus, we get a 2-3x speedup in calling for free, because redundant likelihood computations are removed. The next step will hook this up to the annotation engine and redefine how it interacts with UG for the indel case.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5809 348d0f76-0448-11de-a6fe-93d51630548a
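For illustration, a minimal sketch of the new storage scheme: PileupElement and Allele are simplified stand-ins for the real GATK classes, and only the map shape and the lookup-before-recompute pattern follow the actual diff below.

    import java.util.HashMap;
    import java.util.LinkedHashMap;

    // Simplified stand-ins for the GATK types referenced in the diff below.
    class PileupElement { /* one read's observation at the current locus */ }
    class Allele { /* a candidate reference or alternate allele */ }

    class IndelLikelihoodStore {
        // One entry per pileup element (i.e. per read at this site); each entry
        // maps every candidate allele to that read's log10 likelihood. The
        // LinkedHashMap keeps the alleles in a stable iteration order.
        private final HashMap<PileupElement, LinkedHashMap<Allele, Double>> indelLikelihoodMap =
                new HashMap<PileupElement, LinkedHashMap<Allele, Double>>();

        // If the read was already scored at this site, its likelihoods are
        // reused instead of recomputed; that is where the speedup comes from.
        LinkedHashMap<Allele, Double> cached(PileupElement p) {
            return indelLikelihoodMap.get(p);
        }

        void store(PileupElement p, LinkedHashMap<Allele, Double> perAlleleLikelihoods) {
            indelLikelihoodMap.put(p, perAlleleLikelihoods);
        }

        // Cleared whenever the calculation model moves to a new site.
        void clear() {
            indelLikelihoodMap.clear();
        }
    }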
parent 3ccc08ace4
commit 5a7444e186
@@ -34,10 +34,12 @@ import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
 import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
 import org.broadinstitute.sting.utils.BaseUtils;
 import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.collections.Pair;
 import org.broadinstitute.sting.utils.exceptions.StingException;
 import org.broadinstitute.sting.utils.genotype.Haplotype;
 import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
 import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@@ -57,9 +59,13 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
     private boolean DEBUG = false;

     private PairHMMIndelErrorModel pairModel;
-    // gdebug removeme
-    // todo -cleanup
-    private HaplotypeIndelErrorModel model;
+    private HashMap<PileupElement,LinkedHashMap<Allele,Double>> indelLikelihoodMap;
+    private LinkedHashMap<Allele,Haplotype> haplotypeMap;
+
+    // gdebug removeme
+    // todo -cleanup
+    private HaplotypeIndelErrorModel model;
+
     private boolean useOldWrongHorribleHackedUpLikelihoodModel = false;
     //
     private GenomeLoc lastSiteVisited;
@@ -84,13 +90,18 @@ private HaplotypeIndelErrorModel model;
             model = new HaplotypeIndelErrorModel(3, INSERTION_START_PROBABILITY,
                     INSERTION_END_PROBABILITY,ALPHA_DELETION_PROBABILITY,UAC.INDEL_HAPLOTYPE_SIZE, false, UAC.OUTPUT_DEBUG_INDEL_INFO);
         }
+
         pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY,
                 UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.DO_CONTEXT_DEPENDENT_PENALTIES, UAC.dovit, UAC.GET_GAP_PENALTIES_FROM_DATA, UAC.INDEL_RECAL_FILE);
         alleleList = new ArrayList<Allele>();
         getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
         minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING;
         HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE;
         DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;

+        indelLikelihoodMap = new HashMap<PileupElement,LinkedHashMap<Allele,Double>>();
+        haplotypeMap = new LinkedHashMap<Allele,Haplotype>();
+
     }

@@ -215,7 +226,7 @@ private HaplotypeIndelErrorModel model;

         if (DEBUG) {
             int icount = indelPileup.getNumberOfInsertions();
             int dcount = indelPileup.getNumberOfDeletions();
             if (icount + dcount > 0)
             {
                 List<Pair<String,Integer>> eventStrings = indelPileup.getEventStringsWithCounts(ref.getBases());
@@ -294,7 +305,8 @@ private HaplotypeIndelErrorModel model;
             // starting a new site: clear allele list
             alleleList.clear();
             lastSiteVisited = ref.getLocus().clone();
+            indelLikelihoodMap.clear();
+            haplotypeMap.clear();

             if (getAlleleListFromVCF) {

@@ -341,7 +353,7 @@ private HaplotypeIndelErrorModel model;
         int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length();
         // assume only one alt allele for now

-        List<Haplotype> haplotypesInVC;
+        //List<Haplotype> haplotypesInVC;

         int hsize = (int)ref.getWindow().size()-Math.abs(eventLength)-1;
         int numPrefBases= ref.getLocus().getStart()-ref.getWindow().getStart()+1;
@@ -354,7 +366,7 @@ private HaplotypeIndelErrorModel model;
             System.out.format("hsize: %d eventLength: %d refSize: %d, locStart: %d numpr: %d\n",hsize,eventLength,
                     (int)ref.getWindow().size(), loc.getStart(), numPrefBases);

-        haplotypesInVC = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(),
+        haplotypeMap = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(),
                 ref, hsize, numPrefBases);

         // For each sample, get genotype likelihoods based on pileup
@@ -362,9 +374,6 @@ private HaplotypeIndelErrorModel model;
         // initialize the GenotypeLikelihoods
         GLs.clear();

-        double[][] haplotypeLikehoodMatrix;
-
-
         for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
             AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);

@@ -375,15 +384,14 @@ private HaplotypeIndelErrorModel model;
             pileup = context.getBasePileup();

             if (pileup != null ) {
+                double[] genotypeLikelihoods;
                 if (useOldWrongHorribleHackedUpLikelihoodModel)
-                    haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC);
+                    genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap);
                 else
-                    haplotypeLikehoodMatrix = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC, ref, HAPLOTYPE_SIZE, eventLength);
-
-                double[] genotypeLikelihoods = HaplotypeIndelErrorModel.getHaplotypeLikelihoods( haplotypeLikehoodMatrix);
+                    genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, HAPLOTYPE_SIZE, eventLength, indelLikelihoodMap);

                 GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(),
                         refAllele,
                         altAllele,
@@ -398,4 +406,7 @@ private HaplotypeIndelErrorModel model;

         return refAllele;
     }
+
+
+
 }
@@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.indels;

 import net.sf.samtools.AlignmentBlock;
 import net.sf.samtools.SAMRecord;
+import org.broad.tribble.util.variantcontext.Allele;
 import org.broad.tribble.util.variantcontext.VariantContext;
 import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel;
 import org.broadinstitute.sting.utils.MathUtils;
@@ -37,6 +38,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.utils.sam.ReadUtils;

 import java.util.Arrays;
+import java.util.HashMap;
 import java.util.List;

 public class HaplotypeIndelErrorModel {
@@ -419,7 +421,7 @@ public class HaplotypeIndelErrorModel {

     }

-    public double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List<Haplotype> haplotypesInVC){
+    public double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, HashMap<Allele,Haplotype> haplotypesInVC){
         double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()];
         double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()];
         int i=0;
@@ -429,7 +431,8 @@ public class HaplotypeIndelErrorModel {
         }
         // for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
         // = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
-        for (int j=0; j < haplotypesInVC.size(); j++) {
+        int j=0;
+        for (Allele a: haplotypesInVC.keySet()) {
             readLikelihoods[i][j]= computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read);
             if (DEBUG) {
                 System.out.print(read.getReadName()+" ");
@@ -438,7 +441,7 @@ public class HaplotypeIndelErrorModel {
                         read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
                         read.getCigarString(), readLikelihoods[i][j]);
             }
+            j++;
         }
         i++;
     }
@@ -465,11 +468,11 @@ public class HaplotypeIndelErrorModel {
             }
         }

-        return haplotypeLikehoodMatrix;
+        return getHaplotypeLikelihoods(haplotypeLikehoodMatrix);

     }

-    public static double[] getHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) {
+    private double[] getHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) {
         int hSize = haplotypeLikehoodMatrix.length;
         double[] genotypeLikelihoods = new double[hSize*(hSize+1)/2];

@@ -29,6 +29,7 @@ import net.sf.samtools.Cigar;
 import net.sf.samtools.CigarElement;
 import net.sf.samtools.CigarOperator;
 import net.sf.samtools.SAMRecord;
+import org.broad.tribble.util.variantcontext.Allele;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.Covariate;
 import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDataManager;
@@ -41,6 +42,7 @@ import org.broadinstitute.sting.utils.collections.NestedHashMap;
 import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
 import org.broadinstitute.sting.utils.exceptions.UserException;
 import org.broadinstitute.sting.utils.genotype.Haplotype;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 import org.broadinstitute.sting.utils.sam.ReadUtils;
@@ -48,9 +50,7 @@ import org.broadinstitute.sting.utils.text.XReadLines;

 import java.io.File;
 import java.io.FileNotFoundException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
+import java.util.*;
 import java.util.regex.Pattern;

@@ -697,29 +697,31 @@ public class PairHMMIndelErrorModel {

     }

-    private void fillGapProbabilities(int hIndex, int[] hrunProfile,
-                                      double[][] contextLogGapOpenProbabilities, double[][] contextLogGapContinuationProbabilities) {
+    private void fillGapProbabilities(int[] hrunProfile,
+                                      double[] contextLogGapOpenProbabilities, double[] contextLogGapContinuationProbabilities) {
         // fill based on lookup table
         for (int i = 0; i < hrunProfile.length; i++) {
             if (hrunProfile[i] >= MAX_HRUN_GAP_IDX) {
-                contextLogGapOpenProbabilities[hIndex][i] = GAP_OPEN_PROB_TABLE[MAX_HRUN_GAP_IDX-1];
-                contextLogGapContinuationProbabilities[hIndex][i] = GAP_CONT_PROB_TABLE[MAX_HRUN_GAP_IDX-1];
+                contextLogGapOpenProbabilities[i] = GAP_OPEN_PROB_TABLE[MAX_HRUN_GAP_IDX-1];
+                contextLogGapContinuationProbabilities[i] = GAP_CONT_PROB_TABLE[MAX_HRUN_GAP_IDX-1];
             }
             else {
-                contextLogGapOpenProbabilities[hIndex][i] = GAP_OPEN_PROB_TABLE[hrunProfile[i]];
-                contextLogGapContinuationProbabilities[hIndex][i] = GAP_CONT_PROB_TABLE[hrunProfile[i]];
+                contextLogGapOpenProbabilities[i] = GAP_OPEN_PROB_TABLE[hrunProfile[i]];
+                contextLogGapContinuationProbabilities[i] = GAP_CONT_PROB_TABLE[hrunProfile[i]];
             }
         }
     }
-    public synchronized double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List<Haplotype> haplotypesInVC,
-                                                                   ReferenceContext ref, int haplotypeSize, int eventLength){
-        double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()];
-        double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()];
+    public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele,Haplotype> haplotypeMap,
+                                                                 ReferenceContext ref, int haplotypeSize, int eventLength,
+                                                                 HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){
+
+        int numHaplotypes = haplotypeMap.size();
+        double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
+        double readLikelihoods[][] = new double[pileup.getReads().size()][numHaplotypes];
         int readIdx=0;

-        double[][] contextLogGapOpenProbabilities = null;
-        double[][] contextLogGapContinuationProbabilities = null;
+        LinkedHashMap<Allele,double[]> gapOpenProbabilityMap = new LinkedHashMap<Allele,double[]>();
+        LinkedHashMap<Allele,double[]> gapContProbabilityMap = new LinkedHashMap<Allele,double[]>();


         if (DEBUG) {
             System.out.println("Reference bases:");
@@ -727,15 +729,15 @@ public class PairHMMIndelErrorModel {
         }

         if (doContextDependentPenalties && !getGapPenaltiesFromFile) {
-            // will context dependent probabilities based on homopolymet run. Probabilities are filled based on total complete haplotypes.
+            // will context dependent probabilities based on homopolymer run. Probabilities are filled based on total complete haplotypes.

-            for (int j=0; j < haplotypesInVC.size(); j++) {
-                Haplotype haplotype = haplotypesInVC.get(j);
+            for (Allele a: haplotypeMap.keySet()) {
+                Haplotype haplotype = haplotypeMap.get(a);
                 byte[] haplotypeBases = haplotype.getBasesAsBytes();
-                if (contextLogGapOpenProbabilities == null) {
-                    contextLogGapOpenProbabilities = new double[haplotypesInVC.size()][haplotypeBases.length];
-                    contextLogGapContinuationProbabilities = new double[haplotypesInVC.size()][haplotypeBases.length];
-                }
+                double[] contextLogGapOpenProbabilities = new double[haplotypeBases.length];
+                double[] contextLogGapContinuationProbabilities = new double[haplotypeBases.length];

                 // get homopolymer length profile for current haplotype
                 int[] hrunProfile = new int[haplotypeBases.length];
                 getContextHomopolymerLength(haplotypeBases,hrunProfile);
@@ -746,239 +748,261 @@ public class PairHMMIndelErrorModel {
                         System.out.format("%d",hrunProfile[i]);
                     System.out.println();
                 }
-                fillGapProbabilities(j, hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);
+                fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities);

+                gapOpenProbabilityMap.put(a,contextLogGapOpenProbabilities);
+                gapContProbabilityMap.put(a,contextLogGapContinuationProbabilities);

             }
         }
-        for (SAMRecord pread : pileup.getReads()) {
-            GATKSAMRecord read = ReadUtils.hardClipAdaptorSequence(pread);
-            if (read == null)
-                continue;
-
-            if(ReadUtils.is454Read(read) && !getGapPenaltiesFromFile) {
-                continue;
-            }
-
-            double[] recalQuals = null;
-            if (getGapPenaltiesFromFile) {
-                RecalDataManager.parseSAMRecord( read, RAC );
-
-                recalQuals = new double[read.getReadLength()];
-
-                //compute all covariate values for this read
-                final Comparable[][] covariateValues_offset_x_covar =
-                        RecalDataManager.computeCovariates((GATKSAMRecord) read, requestedCovariates);
-                // For each base in the read
-                for( int offset = 0; offset < read.getReadLength(); offset++ ) {
-
-                    final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
-
-                    Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
-                    if(qualityScore == null)
-                    {
-                        qualityScore = performSequentialQualityCalculation( fullCovariateKey );
-                        qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
-                    }
-
-                    recalQuals[offset] = -((double)qualityScore)/10.0;
-                }
-
-                // for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
-                // = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
-                if (DEBUG) {
-                    System.out.format("\n\nStarting read:%s S:%d US:%d E:%d UE:%d C:%s\n",read.getReadName(),
-                            read.getAlignmentStart(),
-                            read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
-                            read.getCigarString());
-
-                    byte[] bases = read.getReadBases();
-                    for (int k = 0; k < recalQuals.length; k++) {
-                        System.out.format("%c",bases[k]);
-                    }
-                    System.out.println();
-
-                    for (int k = 0; k < recalQuals.length; k++) {
-                        System.out.format("%.0f ",recalQuals[k]);
-                    }
-                    System.out.println();
-                }
-            }
-            // get bases of candidate haplotypes that overlap with reads
-            final int trailingBases = 3;
-
-            long readStart = read.getUnclippedStart();
-            long readEnd = read.getUnclippedEnd();
-
-            int numStartSoftClippedBases, numEndSoftClippedBases;
-
-            // see if we want to use soft clipped bases. Aligners may soft clip all bases at insertions because they don't match,
-            // but they're actually consistent with the insertion!
-            // Rule: if a read starts in interval [eventStart-eventLength,eventStart+1] and we are at an insertion, we'll use all soft clipped bases at the beginning.
-            // Conversely, if a read ends at [eventStart,eventStart+eventLength] we'll use all soft clipped bases in the end of the read.
-            long eventStartPos = ref.getLocus().getStart();
-
-            // compute total number of clipped bases (soft or hard clipped)
-            numStartSoftClippedBases = read.getAlignmentStart()- read.getUnclippedStart();
-            numEndSoftClippedBases = read.getUnclippedEnd()- read.getAlignmentEnd();
-
-            // check for hard clips (never consider these bases):
-            /* Cigar c = read.getCigar();
-            CigarElement first = c.getCigarElement(0);
-            CigarElement last = c.getCigarElement(c.numCigarElements()-1);
-            int numStartHardClippedBases = 0, numEndHardClippedBases = 0;
-
-            if (first.getOperator() == CigarOperator.H) {
-                numStartHardClippedBases = first.getLength();
-            }
-
-            if (last.getOperator() == CigarOperator.H) {
-                numEndHardClippedBases = last.getLength();
-            }
-
-            // correct for hard clips
-            numStartSoftClippedBases -= numStartHardClippedBases;
-            numEndSoftClippedBases -= numEndHardClippedBases;
-            readStart += numStartHardClippedBases;
-            readEnd -= numEndHardClippedBases;
-            */
-            // remove soft clips if necessary
-            if ((read.getAlignmentStart()>=eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) ||
-                    (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)) {
-                numStartSoftClippedBases = 0;
-                numEndSoftClippedBases = 0;
-            }
-
-            byte[] unclippedReadBases, unclippedReadQuals;
-
-            int numStartClippedBases = numStartSoftClippedBases;
-            int numEndClippedBases = numEndSoftClippedBases;
-            unclippedReadBases = read.getReadBases();
-            unclippedReadQuals = read.getBaseQualities();
-
-            // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
-            // and may leave a string of Q2 bases still hanging off the reads.
-            for (int i=numStartSoftClippedBases; i < unclippedReadBases.length; i++) {
-                if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
-                    numStartClippedBases++;
-                else
-                    break;
-            }
-            for (int i=unclippedReadBases.length-numEndSoftClippedBases-1; i >= 0; i-- ){
-                if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
-                    numEndClippedBases++;
-                else
-                    break;
-            }
-
-            int extraOffset = Math.abs(eventLength);
-
-            long start = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0);
-            long stop = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset;
-
-            // Variables start and stop are coordinates (inclusive) where we want to get the haplotype from.
-            int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases;
-            // check if start of read will be before start of reference context
-            if (start < ref.getWindow().getStart())// read starts before haplotype: read will have to be cut
-                start = ref.getWindow().getStart();
-
-            // check also if end of read will go beyond reference context
-            if (stop > ref.getWindow().getStop())
-                stop = ref.getWindow().getStop();
-
-            // if there's an insertion in the read, the read stop position will be less than start + read legnth,
-            // but we want to compute likelihoods in the whole region that a read might overlap
-            if (stop <= start + readLength) {
-                stop = start + readLength-1;
-            }
-
-            // ok, we now figured out total number of clipped bases on both ends.
-            // Figure out where we want to place the haplotype to score read against
-            if (DEBUG)
-                System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
-                        numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength());
-
-            if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) {
-                if (DEBUG)
-                    System.out.println("BAD READ!!");
-
-                for (int j=0; j < haplotypesInVC.size(); j++) {
-                    readLikelihoods[readIdx][j]= 0;
+        for (PileupElement p: pileup) {
+            // check if we've already computed likelihoods for this pileup element (i.e. for this read at this location)
+            if (indelLikelihoodMap.containsKey(p)) {
+                HashMap<Allele,Double> el = indelLikelihoodMap.get(p);
+                int j=0;
+                for (Allele a: haplotypeMap.keySet()) {
+                    readLikelihoods[readIdx][j++] = el.get(a);
                 }
             }
             else {
-                byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartClippedBases,
-                        unclippedReadBases.length-numEndClippedBases);
-
-                byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartClippedBases,
-                        unclippedReadBases.length-numEndClippedBases);
-
-                double[] recalCDP = null;
+                GATKSAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead());
+                if (read == null)
+                    continue;
+
+                if(ReadUtils.is454Read(read) && !getGapPenaltiesFromFile) {
+                    continue;
+                }
+
+                double[] recalQuals = null;
+
                 if (getGapPenaltiesFromFile) {
-                    recalCDP = Arrays.copyOfRange(recalQuals,numStartClippedBases,
+                    RecalDataManager.parseSAMRecord( read, RAC );
+
+                    recalQuals = new double[read.getReadLength()];
+
+                    //compute all covariate values for this read
+                    final Comparable[][] covariateValues_offset_x_covar =
+                            RecalDataManager.computeCovariates((GATKSAMRecord) read, requestedCovariates);
+                    // For each base in the read
+                    for( int offset = 0; offset < read.getReadLength(); offset++ ) {
+
+                        final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
+
+                        Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
+                        if(qualityScore == null)
+                        {
+                            qualityScore = performSequentialQualityCalculation( fullCovariateKey );
+                            qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
+                        }
+
+                        recalQuals[offset] = -((double)qualityScore)/10.0;
+                    }
+
+                    // for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
+                    // = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
+                    if (DEBUG) {
+                        System.out.format("\n\nStarting read:%s S:%d US:%d E:%d UE:%d C:%s\n",read.getReadName(),
+                                read.getAlignmentStart(),
+                                read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
+                                read.getCigarString());
+
+                        byte[] bases = read.getReadBases();
+                        for (int k = 0; k < recalQuals.length; k++) {
+                            System.out.format("%c",bases[k]);
+                        }
+                        System.out.println();
+
+                        for (int k = 0; k < recalQuals.length; k++) {
+                            System.out.format("%.0f ",recalQuals[k]);
+                        }
+                        System.out.println();
+                    }
+                }
+                // get bases of candidate haplotypes that overlap with reads
+                final int trailingBases = 3;
+
+                long readStart = read.getUnclippedStart();
+                long readEnd = read.getUnclippedEnd();
+
+                int numStartSoftClippedBases, numEndSoftClippedBases;
+
+                // see if we want to use soft clipped bases. Aligners may soft clip all bases at insertions because they don't match,
+                // but they're actually consistent with the insertion!
+                // Rule: if a read starts in interval [eventStart-eventLength,eventStart+1] and we are at an insertion, we'll use all soft clipped bases at the beginning.
+                // Conversely, if a read ends at [eventStart,eventStart+eventLength] we'll use all soft clipped bases in the end of the read.
+                long eventStartPos = ref.getLocus().getStart();
+
+                // compute total number of clipped bases (soft or hard clipped)
+                numStartSoftClippedBases = read.getAlignmentStart()- read.getUnclippedStart();
+                numEndSoftClippedBases = read.getUnclippedEnd()- read.getAlignmentEnd();
+
+                // check for hard clips (never consider these bases):
+                /* Cigar c = read.getCigar();
+                CigarElement first = c.getCigarElement(0);
+                CigarElement last = c.getCigarElement(c.numCigarElements()-1);
+                int numStartHardClippedBases = 0, numEndHardClippedBases = 0;
+
+                if (first.getOperator() == CigarOperator.H) {
+                    numStartHardClippedBases = first.getLength();
+                }
+
+                if (last.getOperator() == CigarOperator.H) {
+                    numEndHardClippedBases = last.getLength();
+                }
+
+                // correct for hard clips
+                numStartSoftClippedBases -= numStartHardClippedBases;
+                numEndSoftClippedBases -= numEndHardClippedBases;
+                readStart += numStartHardClippedBases;
+                readEnd -= numEndHardClippedBases;
+                */
+                // remove soft clips if necessary
+                if ((read.getAlignmentStart()>=eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) ||
+                        (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)) {
+                    numStartSoftClippedBases = 0;
+                    numEndSoftClippedBases = 0;
+                }
+
+                byte[] unclippedReadBases, unclippedReadQuals;
+
+                int numStartClippedBases = numStartSoftClippedBases;
+                int numEndClippedBases = numEndSoftClippedBases;
+                unclippedReadBases = read.getReadBases();
+                unclippedReadQuals = read.getBaseQualities();
+
+                // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
+                // and may leave a string of Q2 bases still hanging off the reads.
+                for (int i=numStartSoftClippedBases; i < unclippedReadBases.length; i++) {
+                    if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
+                        numStartClippedBases++;
+                    else
+                        break;
+                }
+                for (int i=unclippedReadBases.length-numEndSoftClippedBases-1; i >= 0; i-- ){
+                    if (unclippedReadQuals[i] < BASE_QUAL_THRESHOLD)
+                        numEndClippedBases++;
+                    else
+                        break;
+                }
+
+                int extraOffset = Math.abs(eventLength);
+
+                long start = Math.max(readStart + numStartClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0);
+                long stop = readEnd -numEndClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset;
+
+                // Variables start and stop are coordinates (inclusive) where we want to get the haplotype from.
+                int readLength = read.getReadLength()-numStartSoftClippedBases-numEndSoftClippedBases;
+                // check if start of read will be before start of reference context
+                if (start < ref.getWindow().getStart())// read starts before haplotype: read will have to be cut
+                    start = ref.getWindow().getStart();
+
+                // check also if end of read will go beyond reference context
+                if (stop > ref.getWindow().getStop())
+                    stop = ref.getWindow().getStop();
+
+                // if there's an insertion in the read, the read stop position will be less than start + read legnth,
+                // but we want to compute likelihoods in the whole region that a read might overlap
+                if (stop <= start + readLength) {
+                    stop = start + readLength-1;
+                }
+
+                // ok, we now figured out total number of clipped bases on both ends.
+                // Figure out where we want to place the haplotype to score read against
+                if (DEBUG)
+                    System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n",
+                            numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength());
+
+                LinkedHashMap<Allele,Double> readEl = new LinkedHashMap<Allele,Double>();
+
+                if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) {
+                    if (DEBUG)
+                        System.out.println("BAD READ!!");
+
+                    int j=0;
+                    for (Allele a: haplotypeMap.keySet()) {
+                        readEl.put(a,0.0);
+                        readLikelihoods[readIdx][j++] = 0.0;
+                    }
+
+                }
+                else {
+                    byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartClippedBases,
                             unclippedReadBases.length-numEndClippedBases);

-                }
-
-                if (DEBUG) {
-                    System.out.println("Read bases:");
-                    System.out.println(new String(readBases));
-                }
-
-                // start and stop have indices into
-
-                for (int j=0; j < haplotypesInVC.size(); j++) {
-                    Haplotype haplotype = haplotypesInVC.get(j);
-
-                    if (stop > haplotype.getStopPosition())
-                        stop = haplotype.getStopPosition();
-
-                    if (start < haplotype.getStartPosition())
-                        start = haplotype.getStartPosition();
-
-                    // cut haplotype bases
-                    long indStart = start - haplotype.getStartPosition();
-                    long indStop = stop - haplotype.getStartPosition();
-
-                    byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(),
-                            (int)indStart, (int)indStop);
+                    byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartClippedBases,
+                            unclippedReadBases.length-numEndClippedBases);
+
+                    double[] recalCDP = null;
+                    if (getGapPenaltiesFromFile) {
+                        recalCDP = Arrays.copyOfRange(recalQuals,numStartClippedBases,
+                                unclippedReadBases.length-numEndClippedBases);
+
+                    }

                     if (DEBUG) {
-                        System.out.println("Haplotype to test:");
-                        System.out.println(new String(haplotypeBases));
+                        System.out.println("Read bases:");
+                        System.out.println(new String(readBases));
                     }

-                    if (useAffineGapModel) {
+                    int j=0;
+                    for (Allele a: haplotypeMap.keySet()) {

-                        double[] currentContextGOP = null;
-                        double[] currentContextGCP = null;

-                        if (doContextDependentPenalties) {
+                        Haplotype haplotype = haplotypeMap.get(a);
+                        if (stop > haplotype.getStopPosition())
+                            stop = haplotype.getStopPosition();

-                            if (getGapPenaltiesFromFile) {
-                                readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, recalCDP, null);
+                        if (start < haplotype.getStartPosition())
+                            start = haplotype.getStartPosition();

-                            } else {
-                                currentContextGOP = Arrays.copyOfRange(contextLogGapOpenProbabilities[j], (int)indStart, (int)indStop);
-                                currentContextGCP = Arrays.copyOfRange(contextLogGapContinuationProbabilities[j], (int)indStart, (int)indStop);
-                                readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP);
-                            }
+                        // cut haplotype bases
+                        long indStart = start - haplotype.getStartPosition();
+                        long indStop = stop - haplotype.getStartPosition();
+
+                        byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(),
+                                (int)indStart, (int)indStop);
+
+                        if (DEBUG) {
+                            System.out.println("Haplotype to test:");
+                            System.out.println(new String(haplotypeBases));
                         }

+                        Double readLikelihood = 0.0;
+                        if (useAffineGapModel) {
+
+                            double[] currentContextGOP = null;
+                            double[] currentContextGCP = null;
+
+                            if (doContextDependentPenalties) {
+
+                                if (getGapPenaltiesFromFile) {
+                                    readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, recalCDP, null);
+
+                                } else {
+                                    currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop);
+                                    currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop);
+                                    readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP);
+                                }
+                            }
+
+                        }
+                        else
+                            readLikelihood = computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals);
+
+                        readEl.put(a,readLikelihood);
+                        readLikelihoods[readIdx][j++] = readLikelihood;
                     }
-                    else
-                        readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals);

                 }
+                indelLikelihoodMap.put(p,readEl);
             }

             readIdx++;
         }

@@ -992,8 +1016,8 @@ public class PairHMMIndelErrorModel {
             }

         }
-        for (int i=0; i < haplotypesInVC.size(); i++) {
-            for (int j=i; j < haplotypesInVC.size(); j++){
+        for (int i=0; i < numHaplotypes; i++) {
+            for (int j=i; j < numHaplotypes; j++){
                 // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j]
                 // L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2)
                 //readLikelihoods[k][j] has log10(Pr(R_k) | H[j] )
@@ -1002,7 +1026,7 @@ public class PairHMMIndelErrorModel {

                 // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
                 // First term is approximated by Jacobian log with table lookup.
-                if (Double.isInfinite(readLikelihoods[readIdx][i]) || Double.isInfinite(readLikelihoods[readIdx][j]))
+                if (Double.isInfinite(readLikelihoods[readIdx][i]) && Double.isInfinite(readLikelihoods[readIdx][j]))
                     continue;
                 haplotypeLikehoodMatrix[i][j] += ( MathUtils.softMax(readLikelihoods[readIdx][i],
                         readLikelihoods[readIdx][j]) + LOG_ONE_HALF);
@@ -1013,7 +1037,7 @@ public class PairHMMIndelErrorModel {
             }
         }

-        return haplotypeLikehoodMatrix;
+        return getHaplotypeLikelihoods(haplotypeLikehoodMatrix);

     }

@@ -31,9 +31,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.GenomeLocParser;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;

-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
+import java.util.*;

 public class Haplotype {
     protected byte[] bases = null;
@@ -108,11 +106,11 @@ public class Haplotype {
         return isReference;
     }

-    public static List<Haplotype> makeHaplotypeListFromAlleles(List<Allele> alleleList, int startPos, ReferenceContext ref,
+    public static LinkedHashMap<Allele,Haplotype> makeHaplotypeListFromAlleles(List<Allele> alleleList, int startPos, ReferenceContext ref,
                                                                final int haplotypeSize, final int numPrefBases) {

-        List<Haplotype> haplotypeList = new ArrayList<Haplotype>();
+        LinkedHashMap<Allele,Haplotype> haplotypeMap = new LinkedHashMap<Allele,Haplotype>();

         Allele refAllele = null;

@@ -153,11 +151,11 @@ public class Haplotype {
             String haplotypeString = new String(basesBeforeVariant) + new String(alleleBases) + new String(basesAfterVariant);
             haplotypeString = haplotypeString.substring(0,haplotypeSize);

-            haplotypeList.add(new Haplotype(haplotypeString.getBytes(), locus, a.isReference()));
+            haplotypeMap.put(a,new Haplotype(haplotypeString.getBytes(), locus, a.isReference()));

         }

-        return haplotypeList;
+        return haplotypeMap;
     }

 }