Merge pull request #417 from broadinstitute/bt_pairhmm_api_cleanup2

Improve the PairHMM API for better FPGA integration
This commit is contained in:
MauricioCarneiro 2013-11-14 10:47:07 -08:00
commit 7f08250870
10 changed files with 547 additions and 401 deletions

View File

@ -52,18 +52,10 @@ import net.sf.samtools.SAMUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.haplotype.HaplotypeScoreComparator;
import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM;
import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM;
import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM;
import org.broadinstitute.sting.utils.pairhmm.CnyPairHMM;
import org.broadinstitute.sting.utils.pairhmm.BatchPairHMM;
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
import org.broadinstitute.sting.utils.pairhmm.*;
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate;
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -88,7 +80,7 @@ public class LikelihoodCalculationEngine {
private final PairHMM.HMM_IMPLEMENTATION hmmType;
private final boolean noFpga;
private final ThreadLocal<PairHMM> pairHMM = new ThreadLocal<PairHMM>() {
private final ThreadLocal<PairHMM> pairHMMThreadLocal = new ThreadLocal<PairHMM>() {
@Override
protected PairHMM initialValue() {
switch (hmmType) {
@ -109,6 +101,8 @@ public class LikelihoodCalculationEngine {
}
}
};
// Attempted to do as below, to avoid calling pairHMMThreadLocal.get() later on, but it resulted in a NullPointerException
// private final PairHMM pairHMM = pairHMMThreadLocal.get();
private final static boolean WRITE_LIKELIHOODS_TO_FILE = false;
private final static String LIKELIHOODS_FILENAME = "likelihoods.txt";
@ -173,36 +167,145 @@ public class LikelihoodCalculationEngine {
if ( likelihoodsStream != null ) likelihoodsStream.close();
}
/**
* Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate
*
* After calling this routine the PairHMM will be configured to best evaluate all reads in the samples
* against the set of haplotypes
*
* @param haplotypes a non-null list of haplotypes
* @param perSampleReadList a mapping from sample -> reads
*/
private void initializePairHMM(final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList) {
int X_METRIC_LENGTH = 0;
for( final Map.Entry<String, List<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
for( final GATKSAMRecord read : sample.getValue() ) {
final int readLength = read.getReadLength();
if( readLength > X_METRIC_LENGTH ) { X_METRIC_LENGTH = readLength; }
}
private void writeDebugLikelihoods(final GATKSAMRecord processedRead, final Haplotype haplotype, final double log10l){
if ( WRITE_LIKELIHOODS_TO_FILE ) {
likelihoodsStream.printf("%s %s %s %s %s %s %f%n",
haplotype.getBaseString(),
new String(processedRead.getReadBases() ),
SAMUtils.phredToFastq(processedRead.getBaseQualities() ),
SAMUtils.phredToFastq(processedRead.getBaseInsertionQualities() ),
SAMUtils.phredToFastq(processedRead.getBaseDeletionQualities() ),
SAMUtils.phredToFastq(constantGCP),
log10l);
}
int Y_METRIC_LENGTH = 0;
for( final Haplotype h : haplotypes ) {
final int haplotypeLength = h.getBases().length;
if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; }
}
// initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
pairHMM.get().initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
}
/**
 * Build a map from the Allele representation of each haplotype to the haplotype itself.
 *
 * Each haplotype is wrapped via {@code Allele.create(haplotype, true)}; iteration order of the
 * returned map follows the input list order (LinkedHashMap).
 *
 * @param haplotypes the haplotypes to wrap as alleles
 * @return a map from the allele form of each haplotype to the haplotype, in input order
 */
private Map<Allele, Haplotype> createAlleleMap(List<Haplotype> haplotypes){
    // LinkedHashMap preserves the caller's haplotype ordering; presized to avoid rehashing
    final Map<Allele, Haplotype> result = new LinkedHashMap<>(haplotypes.size());
    for ( final Haplotype h : haplotypes ) {
        result.put(Allele.create(h, true), h);
    }
    return result;
}
/**
 * Create, for every read, a gap continuation penalty (GCP) array of the same length as the
 * read's bases, uniformly filled with the engine-wide constantGCP value.
 *
 * @param reads the reads to build GCP arrays for
 * @return a map from each read to its constant-filled GCP array, in input order
 */
private Map<GATKSAMRecord, byte[]> fillGCPArrays(List<GATKSAMRecord> reads){
    final Map<GATKSAMRecord, byte[]> gcpByRead = new LinkedHashMap<>();
    for ( final GATKSAMRecord read : reads ) {
        // one GCP entry per base, all set to the constant penalty
        final byte[] gcp = new byte[read.getReadBases().length];
        Arrays.fill(gcp, constantGCP); // Is there a way to derive empirical estimates for this from the data?
        gcpByRead.put(read, gcp);
    }
    return gcpByRead;
}
/**
 * In-place quality capping for one read's quality arrays.
 *
 * For each position: the base quality is first capped by the read's mapping quality; any
 * base quality that is then below BASE_QUALITY_SCORE_THRESHOLD is floored to
 * QualityUtils.MIN_USABLE_Q_SCORE. Insertion and deletion qualities below
 * MIN_USABLE_Q_SCORE are likewise floored. All three arrays are mutated in place.
 *
 * @param read the read supplying the mapping quality used as the cap
 * @param readQuals base qualities, modified in place (assumed same length as the other arrays — TODO confirm)
 * @param readInsQuals base insertion qualities, modified in place
 * @param readDelQuals base deletion qualities, modified in place
 */
private void capMinimumReadQualities(GATKSAMRecord read, byte[] readQuals, byte[] readInsQuals, byte[] readDelQuals) {
for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
// 0xff & treats the stored byte as unsigned before taking the min with mapping quality
readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG
readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] );
readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] );
}
}
/**
* Pre-processing of the reads to be evaluated at the current location from the current sample.
* We apply the PCR Error Model, and cap the minimum base, insertion, and deletion qualities of each read.
* Modified copies of reads are packed into a new list, while original reads are retained for downstream use
*
* @param reads The original list of unmodified reads
* @return processedReads. A new list of reads, in the same order, whose qualities have been altered by PCR error model and minimal quality thresholding
*/
/**
 * Pre-processing of the reads to be evaluated at the current location from the current sample.
 * Applies the PCR error model and caps the minimum base, insertion, and deletion qualities of
 * each read. The originals are never mutated: quality arrays are cloned before modification
 * and a fresh quality-modified read is created for each input.
 *
 * @param reads the original, unmodified reads
 * @return a new list of quality-modified reads, in the same order as the input
 */
private List<GATKSAMRecord> modifyReadQualities(final List<GATKSAMRecord> reads) {
    final List<GATKSAMRecord> result = new LinkedList<>();
    for ( final GATKSAMRecord original : reads ) {
        final byte[] bases = original.getReadBases();
        // NOTE -- clone everything that gets modified so future uses of the read are unaffected
        final byte[] baseQuals = original.getBaseQualities().clone();
        final byte[] insQuals = original.getBaseInsertionQualities().clone();
        final byte[] delQuals = original.getBaseDeletionQualities().clone();
        applyPCRErrorModel(bases, insQuals, delQuals);
        capMinimumReadQualities(original, baseQuals, insQuals, delQuals);
        // pack a fresh copy of the read, carrying the modified qualities, into the output list
        result.add(GATKSAMRecord.createQualityModifiedRead(original, bases, baseQuals, insQuals, delQuals));
    }
    return result;
}
/**
* Post-processing of the read/allele likelihoods.
*
* We send quality-capped reads to the pairHMM for evaluation, and it returns a map containing these capped reads.
* We wish to return a map containing the original, unmodified reads.
*
* At the same time, we want to effectively set a lower cap on the reference score, based on the global mis-mapping rate.
* This protects us from the case where the assembly has produced haplotypes
* that are very divergent from reference, but are supported by only one read. In effect
* we cap how badly the reference can score for any read, by the chance that the read
* itself just doesn't belong here
*
* @param perReadAlleleLikelihoodMap the original map returned by the PairHMM. Contains the processed reads, the haplotype Alleles, and their log10ls
* @param reads Our original, unmodified reads
* @param processedReads Reads whose minimum base,insertion,deletion qualities have been capped; these were actually used to derive log10ls
* @param alleleHaplotypeMap The map associating the Allele and Haplotype versions of each haplotype
*
* @return processedReadAlleleLikelihoodMap; a new PRALM containing the original reads, and their haplotype log10ls including capped reference log10ls
*/
private PerReadAlleleLikelihoodMap capReferenceHaplotypeLikelihoods(PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, List<GATKSAMRecord> reads, List<GATKSAMRecord> processedReads, Map<Allele, Haplotype> alleleHaplotypeMap){
// a new read/allele map, to contain the uncapped reads, haplotypes, and potentially the capped reference log10ls
final PerReadAlleleLikelihoodMap processedReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
// set when the reference haplotype's allele is encountered in the inner loop
// NOTE(review): if alleleHaplotypeMap contains no reference haplotype this stays null — confirm callers always supply one
Allele refAllele = null;
final int numReads = reads.size();
for (int readIndex = 0; readIndex < numReads; readIndex++) {
// Get the original and quality-modified read from their respective lists
// Note that this requires both lists to have reads in the same order
final GATKSAMRecord originalRead = reads.get(readIndex);
final GATKSAMRecord processedRead = processedReads.get(readIndex);
// keep track of the reference likelihood and the best non-ref likelihood
double refLog10l = Double.NEGATIVE_INFINITY;
double bestNonReflog10L = Double.NEGATIVE_INFINITY;
for ( Allele allele : alleleHaplotypeMap.keySet() ) {
// the likelihood was computed against the quality-capped (processed) version of the read
final double log10l = perReadAlleleLikelihoodMap.getLikelihoodAssociatedWithReadAndAllele(processedRead, allele);
final Haplotype haplotype = alleleHaplotypeMap.get(allele);
if ( haplotype.isNonReference() )
bestNonReflog10L = Math.max(bestNonReflog10L, log10l);
else {
refAllele = allele;
refLog10l = log10l;
}
writeDebugLikelihoods(processedRead, haplotype, log10l);
// add the ORIGINAL (non-capped) read to the final map, along with the current haplotype and associated log10l
processedReadAlleleLikelihoodMap.add(originalRead, allele, log10l);
}
// ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global
// mismapping rate. This protects us from the case where the assembly has produced haplotypes
// that are very divergent from reference, but are supported by only one read. In effect
// we cap how badly the reference can score for any read, by the chance that the read
// itself just doesn't belong here
final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate;
if ( refLog10l < (worstRefLog10Allowed) ) {
processedReadAlleleLikelihoodMap.add(originalRead, refAllele, worstRefLog10Allowed);
}
}
return processedReadAlleleLikelihoodMap;
}
public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods( final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList ) {
// configure the HMM
initializePairHMM(haplotypes, perSampleReadList);
// Add likelihoods for each sample's reads to our stratifiedReadMap
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = new LinkedHashMap<>();
@ -218,137 +321,22 @@ public class LikelihoodCalculationEngine {
}
private PerReadAlleleLikelihoodMap computeReadLikelihoods( final List<Haplotype> haplotypes, final List<GATKSAMRecord> reads) {
// first, a little set up to get copies of the Haplotypes that are Alleles (more efficient than creating them each time)
final BatchPairHMM batchPairHMM = (pairHMM.get() instanceof BatchPairHMM) ? (BatchPairHMM)pairHMM.get() : null;
final Vector<GATKSAMRecord> batchedReads = new Vector<GATKSAMRecord>(reads.size());
final int numHaplotypes = haplotypes.size();
final Map<Haplotype, Allele> alleleVersions = new LinkedHashMap<>(numHaplotypes);
Allele refAllele = null;
for ( final Haplotype haplotype : haplotypes ) {
final Allele allele = Allele.create(haplotype, true);
alleleVersions.put(haplotype, allele);
if ( haplotype.isReference() ) refAllele = allele;
}
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
for( final GATKSAMRecord read : reads ) {
// Modify the read qualities by applying the PCR error model and capping the minimum base,insertion,deletion qualities
List<GATKSAMRecord> processedReads = modifyReadQualities(reads);
final byte[] readBases = read.getReadBases();
final byte[] overallGCP = new byte[read.getReadLength()];
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
// Get alleles corresponding to our haplotypes
Map<Allele, Haplotype> alleleHaplotypeMap = createAlleleMap(haplotypes);
// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
final byte[] readQuals = read.getBaseQualities().clone();
final byte[] readInsQuals = read.getBaseInsertionQualities().clone();
final byte[] readDelQuals = read.getBaseDeletionQualities().clone();
// Get an array containing the constantGCP for each read in our modified read list
Map<GATKSAMRecord,byte[]> GCPArrayMap = fillGCPArrays(processedReads);
applyPCRErrorModel(readBases, readInsQuals, readDelQuals);
// Run the PairHMM to calculate the log10 likelihood of each (processed) reads' arising from each haplotype
PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = pairHMMThreadLocal.get().computeLikelihoods(processedReads, alleleHaplotypeMap, GCPArrayMap);
for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG
readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] );
readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] );
}
// Generate a new map containing the original, unmodified reads, and with minimal reference haplotype log10ls determined from the global mis-mapping rate
if ( batchPairHMM != null ) {
batchPairHMM.batchAdd(haplotypes, readBases, readQuals, readInsQuals, readDelQuals, overallGCP);
batchedReads.add(read);
continue;
}
// keep track of the reference likelihood and the best non-ref likelihood
double refLog10l = Double.NEGATIVE_INFINITY;
double bestNonReflog10L = Double.NEGATIVE_INFINITY;
// iterate over all haplotypes, calculating the likelihood of the read for each haplotype
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
final Haplotype haplotype = haplotypes.get(jjj);
final byte[] nextHaplotypeBases = (jjj == numHaplotypes - 1) ? null : haplotypes.get(jjj+1).getBases();
final boolean isFirstHaplotype = jjj == 0;
final double log10l = pairHMM.get().computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(),
readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases);
if ( WRITE_LIKELIHOODS_TO_FILE ) {
likelihoodsStream.printf("%s %s %s %s %s %s %f%n",
haplotype.getBaseString(),
new String(readBases),
SAMUtils.phredToFastq(readQuals),
SAMUtils.phredToFastq(readInsQuals),
SAMUtils.phredToFastq(readDelQuals),
SAMUtils.phredToFastq(overallGCP),
log10l);
}
if ( haplotype.isNonReference() )
bestNonReflog10L = Math.max(bestNonReflog10L, log10l);
else
refLog10l = log10l;
perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l);
}
// ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global
// mismapping rate. This protects us from the case where the assembly has produced haplotypes
// that are very divergent from reference, but are supported by only one read. In effect
// we capping how badly scoring the reference can be for any read by the chance that the read
// itself just doesn't belong here
final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate;
if ( refLog10l < (worstRefLog10Allowed) ) {
perReadAlleleLikelihoodMap.add(read, refAllele, worstRefLog10Allowed);
}
}
if ( batchPairHMM != null ) {
for( final GATKSAMRecord read : batchedReads ) {
double refLog10l = Double.NEGATIVE_INFINITY;
double bestNonReflog10L = Double.NEGATIVE_INFINITY;
final double[] likelihoods = batchPairHMM.batchGetResult();
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
final Haplotype haplotype = haplotypes.get(jjj);
final double log10l = likelihoods[jjj];
if ( WRITE_LIKELIHOODS_TO_FILE ) {
final byte[] overallGCP = new byte[read.getReadLength()];
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
final byte[] readQuals = read.getBaseQualities().clone();
final byte[] readInsQuals = read.getBaseInsertionQualities();
final byte[] readDelQuals = read.getBaseDeletionQualities();
for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG
//readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated
//readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated
// TODO -- why is Q18 hard-coded here???
readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
}
likelihoodsStream.printf("%s %s %s %s %s %s %f%n",
haplotype.getBaseString(),
new String(read.getReadBases()),
SAMUtils.phredToFastq(readQuals),
SAMUtils.phredToFastq(readInsQuals),
SAMUtils.phredToFastq(readDelQuals),
SAMUtils.phredToFastq(overallGCP),
log10l);
}
if ( haplotype.isNonReference() )
bestNonReflog10L = Math.max(bestNonReflog10L, log10l);
else
refLog10l = log10l;
perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l);
}
final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate;
if ( refLog10l < (worstRefLog10Allowed) ) {
perReadAlleleLikelihoodMap.add(read, refAllele, worstRefLog10Allowed);
}
}
}
return perReadAlleleLikelihoodMap;
return capReferenceHaplotypeLikelihoods(perReadAlleleLikelihoodMap, reads, processedReads, alleleHaplotypeMap);
}
@Requires({"alleleOrdering.size() > 0"})
@ -421,125 +409,125 @@ public class LikelihoodCalculationEngine {
// System to compute the best N haplotypes for genotyping
//
// --------------------------------------------------------------------------------
/**
* Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele
* @param map an annoying map object that moves us between the allele and haplotype representation
* @param haplotypeAsAllele the allele version of the haplotype
* @return the haplotype version, with its score incremented by 1 if its non-reference
*/
/**
 * Helper function for selectBestHaplotypesFromEachSample that updates the score of the
 * haplotype corresponding to haplotypeAsAllele.
 *
 * @param map an annoying map object that moves us between the allele and haplotype representation
 * @param haplotypeAsAllele the allele version of the haplotype
 * @return the haplotype version, with its score incremented by 1 if it is non-reference
 */
private Haplotype updateSelectHaplotype(final Map<Allele, Haplotype> map, final Allele haplotypeAsAllele) {
    final Haplotype haplotype = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic
    if ( haplotype.isNonReference() ) {
        // the reference haplotype's score is already pinned at its max value, so only bump non-ref
        haplotype.setScore(haplotype.getScore() + 1);
    }
    return haplotype;
}
/**
* Take the best N haplotypes and return them as a list
*
* Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample
* as its preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing
* order of score (so higher score haplotypes are preferred). The N we take is determined by
*
* N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation)
*
* where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is
* bounded by maxNumHaplotypesInPopulation as that number can grow without bound
*
* @param selectedHaplotypes a non-null set of haplotypes with scores >= 1
* @param nSamples the number of samples used to select the haplotypes
* @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples
* @return a list of N or fewer haplotypes, with the reference haplotype first
*/
/**
 * Take the best N haplotypes, by score, and return them as a list.
 *
 * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation), so the workload stays bounded
 * by maxNumHaplotypesInPopulation regardless of the number of samples.
 *
 * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1
 * @param nSamples the number of samples used to select the haplotypes
 * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples
 * @return a list of N or fewer haplotypes, with the reference haplotype first
 */
private List<Haplotype> selectBestHaplotypesAccordingToScore(final Set<Haplotype> selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) {
    // order candidates by decreasing score
    final List<Haplotype> sorted = new ArrayList<>(selectedHaplotypes);
    Collections.sort(sorted, new HaplotypeScoreComparator());
    final int nChromosomesPlusRef = 2 * nSamples + 1;
    final int nToKeep = Math.min(nChromosomesPlusRef, maxNumHaplotypesInPopulation);
    final List<Haplotype> best = sorted.size() <= nToKeep ? sorted : sorted.subList(0, nToKeep);
    // the reference haplotype carries the maximal score, so it must sort first
    if ( best.get(0).isNonReference() )
        throw new IllegalStateException("BUG: reference haplotype should be first in list");
    return best;
}
/**
* Select the best haplotypes for genotyping the samples in stratifiedReadMap
*
* Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely
* haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for
* all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get
* one extra count for each haplotype (so hom-var haplotypes get two counts). After performing this calculation
* the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the
* haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference.
*
* @param haplotypes a list of all haplotypes we're considering
* @param stratifiedReadMap a map from sample -> read likelihoods per haplotype
* @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes
* @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation
*/
public List<Haplotype> selectBestHaplotypesFromEachSample(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final int maxNumHaplotypesInPopulation) {
if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes);
if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes
// all of the haplotypes that at least one sample called as one of the most likely
final Set<Haplotype> selectedHaplotypes = new HashSet<>();
selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected
// our annoying map from allele -> haplotype
final Map<Allele, Haplotype> allele2Haplotype = new HashMap<>();
for ( final Haplotype h : haplotypes ) {
h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes
allele2Haplotype.put(Allele.create(h, h.isReference()), h);
}
// for each sample, compute the most likely pair of haplotypes
for ( final Map.Entry<String, PerReadAlleleLikelihoodMap> entry : stratifiedReadMap.entrySet() ) {
// get the two most likely haplotypes under a diploid model for this sample
final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles();
if ( mla != null ) { // there was something to evaluate in this sample
// note that there must be at least 2 haplotypes
final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele());
final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele());
// if ( DEBUG ) {
// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey());
//
// /**
// * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele
// * @param map an annoying map object that moves us between the allele and haplotype representation
// * @param haplotypeAsAllele the allele version of the haplotype
// * @return the haplotype version, with its score incremented by 1 if its non-reference
// */
// private Haplotype updateSelectHaplotype(final Map<Allele, Haplotype> map, final Allele haplotypeAsAllele) {
// final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic
// if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value
// return h;
// }
//
// /**
// * Take the best N haplotypes and return them as a list
// *
// * Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample
// * as it's preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing
// * order of score (so higher score haplotypes are preferred). The N we take is determined by
// *
// * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation)
// *
// * where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is
// * bounded by maxNumHaplotypesInPopulation as that number can grow without bound
// *
// * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1
// * @param nSamples the number of samples used to select the haplotypes
// * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples
// * @return a list of N or fewer haplotypes, with the reference haplotype first
// */
// private List<Haplotype> selectBestHaplotypesAccordingToScore(final Set<Haplotype> selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) {
// final List<Haplotype> selectedHaplotypesList = new ArrayList<>(selectedHaplotypes);
// Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator());
// final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1;
// final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation);
// final List<Haplotype> bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep);
// if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list");
// return bestHaplotypes;
// }
//
// /**
// * Select the best haplotypes for genotyping the samples in stratifiedReadMap
// *
// * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely
// * haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for
// * all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get
// * one extra count for each haplotype (so hom-var haplotypes get two counts). After performing this calculation
// * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the
// * haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference.
// *
// * @param haplotypes a list of all haplotypes we're considering
// * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype
// * @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes
// * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation
// */
// public List<Haplotype> selectBestHaplotypesFromEachSample(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final int maxNumHaplotypesInPopulation) {
// if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes);
//
// if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes
//
// // all of the haplotypes that at least one sample called as one of the most likely
// final Set<Haplotype> selectedHaplotypes = new HashSet<>();
// selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected
//
// // our annoying map from allele -> haplotype
// final Map<Allele, Haplotype> allele2Haplotype = new HashMap<>();
// for ( final Haplotype h : haplotypes ) {
// h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes
// allele2Haplotype.put(Allele.create(h, h.isReference()), h);
// }
//
// // for each sample, compute the most likely pair of haplotypes
// for ( final Map.Entry<String, PerReadAlleleLikelihoodMap> entry : stratifiedReadMap.entrySet() ) {
// // get the two most likely haplotypes under a diploid model for this sample
// final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles();
//
// if ( mla != null ) { // there was something to evaluate in this sample
// // note that there must be at least 2 haplotypes
// final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele());
// final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele());
//
//// if ( DEBUG ) {
//// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey());
//// }
//
// // add these two haplotypes to the set of haplotypes that have been selected
// selectedHaplotypes.add(best);
// selectedHaplotypes.add(second);
//
// // we've already selected all of our haplotypes, and we don't need to prune them down
// if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation )
// break;
// }
// add these two haplotypes to the set of haplotypes that have been selected
selectedHaplotypes.add(best);
selectedHaplotypes.add(second);
// we've already selected all of our haplotypes, and we don't need to prune them down
if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation )
break;
}
}
// take the best N haplotypes forward, in order of the number of samples that choose them
final int nSamples = stratifiedReadMap.size();
final List<Haplotype> bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation);
if ( DEBUG ) {
logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples.");
for ( final Haplotype h : bestHaplotypes ) {
logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype"));
}
}
return bestHaplotypes;
}
/**
* Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found
* @param haplotypes non-null list of haplotypes
* @return the reference haplotype
*/
/**
 * Find the haplotype that isReference(), or throw ReviewedStingException if none is found.
 *
 * @param haplotypes non-null list of haplotypes
 * @return the reference haplotype
 */
private static Haplotype findReferenceHaplotype( final List<Haplotype> haplotypes ) {
    for ( final Haplotype candidate : haplotypes ) {
        if ( candidate.isReference() )
            return candidate;
    }
    throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
}
// }
//
// // take the best N haplotypes forward, in order of the number of samples that choose them
// final int nSamples = stratifiedReadMap.size();
// final List<Haplotype> bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation);
//
// if ( DEBUG ) {
// logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples.");
// for ( final Haplotype h : bestHaplotypes ) {
// logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype"));
// }
// }
// return bestHaplotypes;
// }
//
// /**
// * Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found
// * @param haplotypes non-null list of haplotypes
// * @return the reference haplotype
// */
// private static Haplotype findReferenceHaplotype( final List<Haplotype> haplotypes ) {
// for( final Haplotype h : haplotypes ) {
// if( h.isReference() ) return h;
// }
// throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
// }
// --------------------------------------------------------------------------------
//

View File

@ -48,11 +48,11 @@ package org.broadinstitute.sting.gatk.walkers.indels;
import com.google.java.contract.Ensures;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM;
import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM;
import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM;
@ -65,6 +65,7 @@ import org.broadinstitute.variant.variantcontext.Allele;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.Map;
//import org.broadinstitute.sting.utils.pairhmm.LoglessCachingPairHMM;
@ -206,6 +207,39 @@ public class PairHMMIndelErrorModel {
}
}
/**
 * Trim every haplotype in the map down to the reference window
 * [startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes), clamped to each
 * haplotype's own start/stop positions.
 *
 * NOTE(review): the window parameters are mutated inside the loop, so clamping performed
 * for one haplotype carries over to all subsequent haplotypes (LinkedHashMap iteration
 * order therefore matters) -- confirm this cross-iteration behavior is intended.
 *
 * @param haplotypeMap map from each allele to its untrimmed haplotype
 * @param startLocationInRefForHaplotypes desired window start in reference coordinates
 * @param stopLocationInRefForHaplotypes desired window stop in reference coordinates
 * @param ref the reference context (used only for debug output)
 * @return a new LinkedHashMap with the same alleles mapped to trimmed haplotypes
 */
private LinkedHashMap<Allele, Haplotype> trimHaplotypes(final LinkedHashMap<Allele, Haplotype> haplotypeMap,
                                                        long startLocationInRefForHaplotypes,
                                                        long stopLocationInRefForHaplotypes,
                                                        final ReferenceContext ref){
    final LinkedHashMap<Allele, Haplotype> trimmedHaplotypeMap = new LinkedHashMap<>();
    for (final Allele a: haplotypeMap.keySet()) {
        final Haplotype haplotype = haplotypeMap.get(a);

        // shrink the window stop so it never extends past this haplotype's end
        if (stopLocationInRefForHaplotypes > haplotype.getStopPosition())
            stopLocationInRefForHaplotypes = haplotype.getStopPosition();

        // clamp the window start into [haplotype start, haplotype stop]
        if (startLocationInRefForHaplotypes < haplotype.getStartPosition())
            startLocationInRefForHaplotypes = haplotype.getStartPosition();
        else if (startLocationInRefForHaplotypes > haplotype.getStopPosition())
            startLocationInRefForHaplotypes = haplotype.getStopPosition();

        // convert the reference-coordinate window into offsets within the haplotype's own bases
        final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition();
        final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition();
        if (DEBUG)
            System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d\n",
                    indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes);

        // get the trimmed haplotype-bases array and create a new haplotype based on it. Pack this into the new map
        final byte[] trimmedHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop);
        final Haplotype trimmedHaplotype = new Haplotype(trimmedHaplotypeBases, haplotype.isReference());
        trimmedHaplotypeMap.put(a, trimmedHaplotype);
    }
    return trimmedHaplotypeMap;
}
public synchronized double[] computeDiploidReadHaplotypeLikelihoods(final ReadBackedPileup pileup,
final LinkedHashMap<Allele, Haplotype> haplotypeMap,
@ -231,6 +265,8 @@ public class PairHMMIndelErrorModel {
final int[] readCounts) {
final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()];
final LinkedList<GATKSAMRecord> readList = new LinkedList<>();
final Map<GATKSAMRecord, byte[]> readGCPArrayMap = new LinkedHashMap<>();
int readIdx=0;
for (PileupElement p: pileup) {
// > 1 when the read is a consensus read representing multiple independent observations
@ -369,86 +405,30 @@ public class PairHMMIndelErrorModel {
baseDeletionQualities = contextLogGapOpenProbabilities;
}
byte[] currentHaplotypeBases = null;
boolean firstHap = true;
double readLikelihood;
Allele currentAllele = null;
for (Allele a: haplotypeMap.keySet()) {
// Create a new read based on the current one, but with trimmed bases/quals, for use in the HMM
final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, baseInsertionQualities, baseDeletionQualities);
readList.add(processedRead);
Haplotype haplotype = haplotypeMap.get(a);
// Pack the shortened read and its associated gap-continuation-penalty array into a map, as required by PairHMM
readGCPArrayMap.put(processedRead,contextLogGapContinuationProbabilities);
if (stopLocationInRefForHaplotypes > haplotype.getStopPosition())
stopLocationInRefForHaplotypes = haplotype.getStopPosition();
// Create a map of alleles to a new set of haplotypes, whose bases have been trimmed to the appropriate genomic locations
final Map<Allele, Haplotype> trimmedHaplotypeMap = trimHaplotypes(haplotypeMap, startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, ref);
if (startLocationInRefForHaplotypes < haplotype.getStartPosition())
startLocationInRefForHaplotypes = haplotype.getStartPosition();
else if (startLocationInRefForHaplotypes > haplotype.getStopPosition())
startLocationInRefForHaplotypes = haplotype.getStopPosition();
// Get the likelihoods for our clipped read against each of our trimmed haplotypes.
final PerReadAlleleLikelihoodMap singleReadRawLikelihoods = pairHMM.computeLikelihoods(readList, trimmedHaplotypeMap, readGCPArrayMap);
final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition();
final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition();
if (DEBUG)
System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n",
indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString());
// peek at the next haplotype in the list
final byte[] nextHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop);
// process the current haplotype in the list
if (currentHaplotypeBases != null) {
// it's possible that the indel starts at the last base of the haplotypes
if ( currentHaplotypeBases.length == 0 ) {
readLikelihood = -Double.MAX_VALUE;
} else {
if (firstHap) {
//no need to reallocate arrays for each new haplotype, as length won't change
pairHMM.initialize(readBases.length, currentHaplotypeBases.length);
firstHap = false;
}
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, readBases, readQuals,
baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap, nextHaplotypeBases);
}
if (DEBUG) {
System.out.println("H:"+new String(currentHaplotypeBases));
System.out.println("R:"+new String(readBases));
System.out.format("L:%4.2f\n",readLikelihood);
}
perReadAlleleLikelihoodMap.add(p, currentAllele, readLikelihood);
readLikelihoods[readIdx][j++] = readLikelihood;
}
// update the current haplotype
currentHaplotypeBases = nextHaplotypeBases;
currentAllele = a;
}
// process the final haplotype
if (currentHaplotypeBases != null) {
// it's possible that the indel starts at the last base of the haplotypes
if ( currentHaplotypeBases.length == 0 ) {
readLikelihood = -Double.MAX_VALUE;
} else {
if (firstHap) {
//no need to reallocate arrays for each new haplotype, as length won't change
pairHMM.initialize(readBases.length, currentHaplotypeBases.length);
firstHap = false;
}
// there is no next haplotype, so pass null for nextHaplotypeBases.
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, readBases, readQuals,
baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap, null);
}
if (DEBUG) {
System.out.println("H:"+new String(currentHaplotypeBases));
System.out.println("R:"+new String(readBases));
System.out.format("L:%4.2f\n",readLikelihood);
}
perReadAlleleLikelihoodMap.add(p, currentAllele, readLikelihood);
// Pack the original pileup element, each allele, and each associated log10 likelihood into a final map, and add each likelihood to the array
for (Allele a: trimmedHaplotypeMap.keySet()){
double readLikelihood = singleReadRawLikelihoods.getLikelihoodAssociatedWithReadAndAllele(processedRead, a);
perReadAlleleLikelihoodMap.add(p, a, readLikelihood);
readLikelihoods[readIdx][j++] = readLikelihood;
}
// The readList for sending to the HMM should only ever contain 1 read, as each must be clipped individually
readList.remove(processedRead);
// The same is true for the read/GCP-array map
readGCPArrayMap.remove(processedRead);
}
}
readIdx++;
@ -472,16 +452,16 @@ public class PairHMMIndelErrorModel {
return !((read.getAlignmentStart() >= eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) || (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength));
}
/**
 * Returns the index of the first position at which the two byte arrays differ,
 * or b1.length when they are identical.
 *
 * @param b1 first array
 * @param b2 second array
 * @return first differing index; 0 when the arrays have different lengths (sanity check)
 */
private int computeFirstDifferingPosition(byte[] b1, byte[] b2) {
    if (b1.length != b2.length) {
        return 0; // sanity check: length mismatch is reported as differing at position 0
    }
    int position = 0;
    while (position < b1.length && b1[position] == b2[position]) {
        position++;
    }
    return position;
}
// private int computeFirstDifferingPosition(byte[] b1, byte[] b2) {
// if (b1.length != b2.length)
// return 0; // sanity check
//
// for (int i=0; i < b1.length; i++ ){
// if ( b1[i]!= b2[i] )
// return i;
// }
// return b1.length;
// }
private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];

View File

@ -49,6 +49,7 @@ package org.broadinstitute.sting.utils.pairhmm;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.Arrays;
/**

View File

@ -47,12 +47,16 @@
package org.broadinstitute.sting.utils.pairhmm;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.variant.variantcontext.Allele;
import java.io.File;
import java.lang.reflect.Field;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
private static class HmmInput {
@ -62,14 +66,14 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
public byte[] deletionGOP;
public byte[] overallGCP;
public List<Haplotype> haplotypes;
};
}
public static class ResultQueue {
private int offset;
private List<double[]> batchResults;
public ResultQueue() {
batchResults = new LinkedList<double[]>();
batchResults = new LinkedList<>();
offset = 0;
}
@ -92,7 +96,7 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
final static String libName = "gmvhdl_gatk_hmm";
private static boolean loaded = false;
private List<HmmInput> batchRequests = new LinkedList<HmmInput>();
private List<HmmInput> batchRequests = new LinkedList<>();
private ResultQueue resultQueue = new ResultQueue();
static public boolean isAvailable() {
@ -184,6 +188,55 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
return results;
}
/**
 * {@inheritDoc}
 *
 * Batch (FPGA) implementation: all read/haplotype pairs are enqueued first, then the
 * per-read results are drained from the device's result queue in the same order.
 */
@Override
public PerReadAlleleLikelihoodMap computeLikelihoods(final List<GATKSAMRecord> reads, final Map<Allele, Haplotype> alleleHaplotypeMap, final Map<GATKSAMRecord, byte[]> GCPArrayMap){
    // initialize the pairHMM if necessary
    // NOTE(review): unlike PairHMM.computeLikelihoods, this never re-initializes when a later
    // batch contains longer reads/haplotypes than the first batch -- confirm the FPGA path is
    // insensitive to the initial max-length hint.
    if (! initialized) {
        int readMaxLength = findMaxReadLength(reads);
        int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap);
        initialize(readMaxLength, haplotypeMaxLength);
    }

    // Pass the read bases/quals, and the haplotypes as a list into the HMM
    performBatchAdditions(reads, alleleHaplotypeMap, GCPArrayMap);

    // Get the log10-likelihoods for each read/haplotype and pack into the results map
    final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
    collectLikelihoodResults(reads, alleleHaplotypeMap, likelihoodMap);
    return likelihoodMap;
}
// Drain one batch result per read from the result queue and record the per-allele
// log10 likelihoods in the supplied map. Results arrive in the same allele order
// as alleleHaplotypeMap's iteration order.
private void collectLikelihoodResults(List<GATKSAMRecord> reads, Map<Allele, Haplotype> alleleHaplotypeMap, PerReadAlleleLikelihoodMap likelihoodMap) {
    for (final GATKSAMRecord read : reads) {
        final double[] batchLikelihoods = batchGetResult();
        int alleleIndex = 0;
        for (final Allele allele : alleleHaplotypeMap.keySet()) {
            likelihoodMap.add(read, allele, batchLikelihoods[alleleIndex]);
            alleleIndex++;
        }
    }
}
// Enqueue one HMM batch request per read: each request pairs the read's bases and
// quality arrays with the full haplotype list and the read's gap-continuation-penalty array.
private void performBatchAdditions(List<GATKSAMRecord> reads, Map<Allele, Haplotype> alleleHaplotypeMap, Map<GATKSAMRecord, byte[]> GCPArrayMap) {
    final List<Haplotype> haplotypes = getHaplotypeList(alleleHaplotypeMap);
    for (final GATKSAMRecord read : reads) {
        batchAdd(haplotypes,
                 read.getReadBases(),
                 read.getBaseQualities(),
                 read.getBaseInsertionQualities(),
                 read.getBaseDeletionQualities(),
                 GCPArrayMap.get(read));
    }
}
protected double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
@ -196,6 +249,14 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
return 0.0;
}
/**
 * Collects the haplotypes of the given allele->haplotype map into a list,
 * preserving the map's iteration order.
 *
 * @param alleleHaplotypeMap map from alleles to their haplotypes
 * @return a new list containing the map's haplotype values, in iteration order
 */
private List<Haplotype> getHaplotypeList(Map<Allele, Haplotype> alleleHaplotypeMap){
    // Copy values() directly instead of iterating keySet() + get(): same order, one lookup fewer per entry.
    return new LinkedList<>(alleleHaplotypeMap.values());
}
private void enqueuePrepare(byte[] haplotypeBases, byte[] readBases) {
double[] results = null;
int n = dequeueRequirement(haplotypeBases.length, readBases.length);

View File

@ -70,17 +70,6 @@ public final class LoglessPairHMM extends N2MemoryPairHMM {
private static final int deletionToDeletion = 5;
/**
* {@inheritDoc}
*/
@Override
public void initialize(final int readMaxLength, final int haplotypeMaxLength ) {
super.initialize(readMaxLength, haplotypeMaxLength);
transition = new double[paddedMaxReadLength][6];
prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
}
/**
* {@inheritDoc}
*/

View File

@ -110,7 +110,7 @@ public class PerReadAlleleLikelihoodMap {
* @return a map from each allele to a list of reads that 'support' the allele
*/
protected Map<Allele,List<GATKSAMRecord>> getAlleleStratifiedReadMap() {
final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<Allele, List<GATKSAMRecord>>(alleles.size());
final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<>(alleles.size());
for ( final Allele allele : alleles )
alleleReadMap.put(allele, new ArrayList<GATKSAMRecord>());
@ -152,7 +152,7 @@ public class PerReadAlleleLikelihoodMap {
/**
* Does the current map contain the key associated with a particular SAM record in pileup?
* @param p Pileup element
* @return
* @return true if the map contains pileup element, else false
*/
public boolean containsPileupElement(final PileupElement p) {
return likelihoodReadMap.containsKey(p.getRead());
@ -176,9 +176,9 @@ public class PerReadAlleleLikelihoodMap {
return likelihoodReadMap.keySet();
}
public Collection<Map<Allele,Double>> getLikelihoodMapValues() {
return likelihoodReadMap.values();
}
// public Collection<Map<Allele,Double>> getLikelihoodMapValues() {
// return likelihoodReadMap.values();
// }
public int getNumberOfStoredElements() {
return likelihoodReadMap.size();
@ -191,6 +191,21 @@ public class PerReadAlleleLikelihoodMap {
return likelihoodReadMap.get(p.getRead());
}
/**
 * Get the log10 likelihood associated with an individual read/allele pair.
 *
 * @param read the read whose likelihood we want
 * @param allele the allele whose likelihood we want
 * @return the log10 likelihood that this read matches this allele, or 0.0 when the
 *         allele or read is unknown to this map.
 *         NOTE(review): 0.0 is log10(1.0), i.e. a "perfect" likelihood -- callers must not
 *         confuse the absent-entry sentinel with a genuine likelihood; confirm intent.
 */
public double getLikelihoodAssociatedWithReadAndAllele(final GATKSAMRecord read, final Allele allele){
    if (!allelesSet.contains(allele) || !likelihoodReadMap.containsKey(read))
        return 0.0;  // sentinel for "not tracked here" -- see NOTE above
    return likelihoodReadMap.get(read).get(allele);
}
/**
* Get the most likely alleles estimated across all reads in this object
*

View File

@ -85,9 +85,6 @@ public final class Log10PairHMM extends N2MemoryPairHMM {
Arrays.fill(insertionMatrix[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(deletionMatrix[iii], Double.NEGATIVE_INFINITY);
}
transition = new double[paddedMaxReadLength][6];
prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
}
/**

View File

@ -26,10 +26,6 @@
package org.broadinstitute.sting.utils.pairhmm;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import java.util.Arrays;
/**
* Superclass for PairHMM that want to use a full read x haplotype matrix for their match, insertion, and deletion matrix
@ -58,6 +54,9 @@ abstract class N2MemoryPairHMM extends PairHMM {
matchMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
insertionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
deletionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
transition = new double[paddedMaxReadLength][6];
prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
}
/**

View File

@ -28,9 +28,14 @@ package org.broadinstitute.sting.utils.pairhmm;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.variant.variantcontext.Allele;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
/**
* Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book.
*
@ -59,7 +64,7 @@ public abstract class PairHMM {
protected int maxHaplotypeLength, maxReadLength;
protected int paddedMaxReadLength, paddedMaxHaplotypeLength;
protected int paddedReadLength, paddedHaplotypeLength;
private boolean initialized = false;
protected boolean initialized = false;
// only used for debugging purposes
protected boolean doNotUseTristateCorrection = false;
@ -73,7 +78,7 @@ public abstract class PairHMM {
* @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM
* @param readMaxLength the max length of reads we want to use with this PairHMM
*/
public void initialize( final int readMaxLength, final int haplotypeMaxLength ) {
protected void initialize( final int readMaxLength, final int haplotypeMaxLength ) {
if ( readMaxLength <= 0 ) throw new IllegalArgumentException("READ_MAX_LENGTH must be > 0 but got " + readMaxLength);
if ( haplotypeMaxLength <= 0 ) throw new IllegalArgumentException("HAPLOTYPE_MAX_LENGTH must be > 0 but got " + haplotypeMaxLength);
@ -90,6 +95,79 @@ public abstract class PairHMM {
initialized = true;
}
/**
 * Returns the length of the longest read in the list, or 0 for an empty list.
 *
 * @param reads the reads to scan
 * @return the maximum read length seen
 */
protected int findMaxReadLength(final List<GATKSAMRecord> reads) {
    int longest = 0;
    for (final GATKSAMRecord read : reads) {
        longest = Math.max(longest, read.getReadLength());
    }
    return longest;
}
/**
 * Returns the length (in bases) of the longest haplotype in the map, or 0 for an empty map.
 *
 * @param haplotypeMap map from alleles to their haplotypes
 * @return the maximum haplotype length seen
 */
protected int findMaxHaplotypeLength(final Map<Allele, Haplotype> haplotypeMap) {
    // Iterate values() directly instead of keySet() + get(): same order, one lookup fewer per entry.
    int longest = 0;
    for (final Haplotype haplotype : haplotypeMap.values()) {
        longest = Math.max(longest, haplotype.getBases().length);
    }
    return longest;
}
/**
 * Given a list of reads and haplotypes, for every read compute the total probability of said read arising from
 * each haplotype given base substitution, insertion, and deletion probabilities.
 *
 * @param reads the list of reads
 * @param alleleHaplotypeMap map from each allele to its haplotype
 * @param GCPArrayMap Each read is associated with an array containing the gap continuation penalties for use in the model. Length of each GCP-array must match that of its read.
 * @return a PerReadAlleleLikelihoodMap containing each read, haplotype-allele, and the log10 probability of
 *         said read coming from the said haplotype under the provided error model
 */
public PerReadAlleleLikelihoodMap computeLikelihoods(final List<GATKSAMRecord> reads, final Map<Allele, Haplotype> alleleHaplotypeMap, final Map<GATKSAMRecord, byte[]> GCPArrayMap) {
    // (re)initialize the pairHMM only if necessary: first use, or inputs larger than the current matrices
    final int readMaxLength = findMaxReadLength(reads);
    final int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap);
    if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) { initialize(readMaxLength, haplotypeMaxLength); }

    final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
    for (final GATKSAMRecord read : reads) {
        final byte[] readBases = read.getReadBases();
        final byte[] readQuals = read.getBaseQualities();
        final byte[] readInsQuals = read.getBaseInsertionQualities();
        final byte[] readDelQuals = read.getBaseDeletionQualities();
        final byte[] overallGCP = GCPArrayMap.get(read);

        // Each haplotype's likelihood is computed one iteration late so we can peek at the NEXT
        // haplotype's bases (nextHaplotypeBases is required for caching in the array implementation).
        byte[] currentHaplotypeBases = null;
        boolean isFirstHaplotype = true;
        Allele currentAllele = null;
        double log10l;
        for (final Allele allele : alleleHaplotypeMap.keySet()) {
            final Haplotype haplotype = alleleHaplotypeMap.get(allele);
            final byte[] nextHaplotypeBases = haplotype.getBases();
            if (currentHaplotypeBases != null) {
                log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases,
                        readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases);
                likelihoodMap.add(read, currentAllele, log10l);
                // BUGFIX: clear the recache flag after the first computation for this read; previously it
                // stayed true for every haplotype, forcing recacheReadValues on each call and defeating
                // the read-value caching the flag exists to enable.
                isFirstHaplotype = false;
            }
            // update the current haplotype
            currentHaplotypeBases = nextHaplotypeBases;
            currentAllele = allele;
        }

        // process the final haplotype; there is no next haplotype, so pass null for nextHaplotypeBases.
        if (currentHaplotypeBases != null) {
            log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases,
                    readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, null);
            likelihoodMap.add(read, currentAllele, log10l);
        }
    }
    return likelihoodMap;
}
/**
* Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion
* probabilities.
@ -110,7 +188,7 @@ public abstract class PairHMM {
* parameters are the same, and only the haplotype bases are changing underneath us
* @return the log10 probability of read coming from the haplotype under the provided error model
*/
public final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
protected final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
final byte[] insertionGOP,
@ -118,6 +196,7 @@ public abstract class PairHMM {
final byte[] overallGCP,
final boolean recacheReadValues,
final byte[] nextHaploytpeBases) {
if ( ! initialized ) throw new IllegalStateException("Must call initialize before calling computeReadLikelihoodGivenHaplotypeLog10");
if ( haplotypeBases == null ) throw new IllegalArgumentException("haplotypeBases cannot be null");
if ( haplotypeBases.length > maxHaplotypeLength ) throw new IllegalArgumentException("Haplotype bases is too long, got " + haplotypeBases.length + " but max is " + maxHaplotypeLength);

View File

@ -139,7 +139,7 @@ public class GATKSAMRecord extends BAMRecord {
}
public static GATKSAMRecord createRandomRead(int length) {
List<CigarElement> cigarElements = new LinkedList<CigarElement>();
List<CigarElement> cigarElements = new LinkedList<>();
cigarElements.add(new CigarElement(length, CigarOperator.M));
Cigar cigar = new Cigar(cigarElements);
return ArtificialSAMUtils.createArtificialRead(cigar);
@ -536,10 +536,7 @@ public class GATKSAMRecord extends BAMRecord {
* @return True if an attribute has been set for this key.
*/
public boolean containsTemporaryAttribute(Object key) {
if(temporaryAttributes != null) {
return temporaryAttributes.containsKey(key);
}
return false;
return temporaryAttributes != null && temporaryAttributes.containsKey(key);
}
/**
@ -556,7 +553,7 @@ public class GATKSAMRecord extends BAMRecord {
*/
public Object setTemporaryAttribute(Object key, Object value) {
if(temporaryAttributes == null) {
temporaryAttributes = new HashMap<Object, Object>();
temporaryAttributes = new HashMap<>();
}
return temporaryAttributes.put(key, value);
}
@ -750,6 +747,46 @@ public class GATKSAMRecord extends BAMRecord {
return emptyRead;
}
/**
 * Creates a new GATKSAMRecord with the source read's header, read group and mate
 * information, but with the following fields set to user-supplied values:
 *  - Read Bases
 *  - Base Qualities
 *  - Base Insertion Qualities
 *  - Base Deletion Qualities
 *
 * Cigar string is empty (not-null)
 *
 * Use this method if you want to create a new GATKSAMRecord based on
 * another GATKSAMRecord, but with modified bases and qualities
 *
 * @param read a read to copy the header from
 * @param readBases an array containing the new bases you wish to use in place of the originals
 * @param baseQualities an array containing the new base qualities you wish to use in place of the originals
 * @param baseInsertionQualities an array containing the new base insertion qualities
 * @param baseDeletionQualities an array containing the new base deletion qualities
 * @return a read with modified bases and qualities, safe for the GATK
 * @throws IllegalArgumentException if any quality array's length differs from readBases.length
 */
public static GATKSAMRecord createQualityModifiedRead(final GATKSAMRecord read,
                                                      final byte[] readBases,
                                                      final byte[] baseQualities,
                                                      final byte[] baseInsertionQualities,
                                                      final byte[] baseDeletionQualities) {
    // all three quality arrays must be parallel to the read bases
    if ( baseQualities.length != readBases.length || baseInsertionQualities.length != readBases.length || baseDeletionQualities.length != readBases.length )
        throw new IllegalArgumentException("Read bases and read quality arrays aren't the same size: Bases:" + readBases.length
                + " vs Base Q's:" + baseQualities.length
                + " vs Insert Q's:" + baseInsertionQualities.length
                + " vs Delete Q's:" + baseDeletionQualities.length);

    // start from an empty clone of the source read, then install the supplied bases and quality arrays
    final GATKSAMRecord processedRead = GATKSAMRecord.emptyRead(read);
    processedRead.setReadBases(readBases);
    processedRead.setBaseQualities(baseQualities, EventType.BASE_SUBSTITUTION);
    processedRead.setBaseQualities(baseInsertionQualities, EventType.BASE_INSERTION);
    processedRead.setBaseQualities(baseDeletionQualities, EventType.BASE_DELETION);
    return processedRead;
}
/**
* Shallow copy of everything, except for the attribute list and the temporary attributes.
* A new list of the attributes is created for both, but the attributes themselves are copied by reference.
@ -762,7 +799,7 @@ public class GATKSAMRecord extends BAMRecord {
public Object clone() throws CloneNotSupportedException {
final GATKSAMRecord clone = (GATKSAMRecord) super.clone();
if (temporaryAttributes != null) {
clone.temporaryAttributes = new HashMap<Object, Object>();
clone.temporaryAttributes = new HashMap<>();
for (Object attribute : temporaryAttributes.keySet())
clone.setTemporaryAttribute(attribute, temporaryAttributes.get(attribute));
}