Merge pull request #166 from broadinstitute/md_hc_persample_haplotypes

Select the haplotypes we move forward for genotyping per sample, not poo...
2013-04-16 08:46:56 -07:00 · 2013-04-16 08:46:56 -07:00 · 936f4da1f6
parent bfbc23e99e 067d24957b
commit 936f4da1f6
8 changed files with 252 additions and 55 deletions
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@ -737,7 +737,13 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
        // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM
        Collections.sort( trimmedHaplotypes, new HaplotypeBaseComparator() );

-        if ( DEBUG ) logger.info("Trimming haplotypes reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size());
+        if ( DEBUG ) {
+            logger.info("Trimming haplotypes reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size());
+            for ( final Haplotype remaining: trimmedHaplotypes ) {
+                logger.info("  Remains: " + remaining + " cigar " + remaining.getCigar());
+            }
+        }
+

        // trim down the reads and add them to the trimmed active region
        final List<GATKSAMRecord> trimmedReads = new ArrayList<GATKSAMRecord>(originalActiveRegion.getReads().size());
@ -761,11 +767,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
     * @return the list of haplotypes to genotype
     */
    protected List<Haplotype> selectBestHaplotypesForGenotyping(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) {
-        // TODO -- skip this calculation if the list of haplotypes is of size 2 (as we'll always use 2 for genotyping)
        if ( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
            return haplotypes;
        } else {
-            return likelihoodCalculationEngine.selectBestHaplotypesFromPooledLikelihoods(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation);
+            return likelihoodCalculationEngine.selectBestHaplotypesFromEachSample(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation);
        }
    }

--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
@ -49,12 +49,14 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
 import com.google.java.contract.Ensures;
 import com.google.java.contract.Requires;
 import org.apache.log4j.Logger;
+import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
 import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
 import org.broadinstitute.sting.utils.haplotype.Haplotype;
 import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.QualityUtils;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.haplotype.HaplotypeScoreComparator;
 import org.broadinstitute.sting.utils.pairhmm.*;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 import org.broadinstitute.sting.utils.sam.ReadUtils;
@ -253,54 +255,127 @@ public class LikelihoodCalculationEngine {
        return likelihoodMatrix;
    }

-    @Requires({"haplotypes.size() > 0"})
-    @Ensures({"result.size() <= haplotypes.size()"})
-    public List<Haplotype> selectBestHaplotypesFromPooledLikelihoods(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final int maxNumHaplotypesInPopulation) {
+    // --------------------------------------------------------------------------------
+    //
+    // System to compute the best N haplotypes for genotyping
+    //
+    // --------------------------------------------------------------------------------

-        final int numHaplotypes = haplotypes.size();
-        final Set<String> sampleKeySet = stratifiedReadMap.keySet();
-        final List<Integer> bestHaplotypesIndexList = new ArrayList<Integer>();
-        bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype
-        final List<Allele> haplotypesAsAlleles = new ArrayList<Allele>();
-        for( final Haplotype h : haplotypes ) { haplotypesAsAlleles.add(Allele.create(h, true)); }
+    /**
+     * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele
+     * @param map an annoying map object that moves us between the allele and haplotype representation
+     * @param haplotypeAsAllele the allele version of the haplotype
+     * @return the haplotype version, with its score incremented by 1 if its non-reference
+     */
+    private Haplotype updateSelectHaplotype(final Map<Allele, Haplotype> map, final Allele haplotypeAsAllele) {
+        final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic
+        if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value
+        return h;
+    }

-        final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, stratifiedReadMap, haplotypesAsAlleles, true ); // all samples pooled together
+    /**
+     * Take the best N haplotypes and return them as a list
+     *
+     * Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample
+     * as it's preferred haplotype.  Takes the best N haplotypes from selectedHaplotypes in decreasing
+     * order of score (so higher score haplotypes are preferred).  The N we take is determined by
+     *
+     * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation)
+     *
+     * where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is
+     * bounded by maxNumHaplotypesInPopulation as that number can grow without bound
+     *
+     * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1
+     * @param nSamples the number of samples used to select the haplotypes
+     * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples
+     * @return a list of N or fewer haplotypes, with the reference haplotype first
+     */
+    private List<Haplotype> selectBestHaplotypesAccordingToScore(final Set<Haplotype> selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) {
+        final List<Haplotype> selectedHaplotypesList = new ArrayList<Haplotype>(selectedHaplotypes);
+        Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator());
+        final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1;
+        final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation);
+        final List<Haplotype> bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep);
+        if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list");
+        return bestHaplotypes;
+    }

-        int hap1 = 0;
-        int hap2 = 0;
-        //double bestElement = Double.NEGATIVE_INFINITY;
-        final int maxChosenHaplotypes = Math.min( maxNumHaplotypesInPopulation, sampleKeySet.size() * 2 + 1 );
-        while( bestHaplotypesIndexList.size() < maxChosenHaplotypes ) {
-            double maxElement = Double.NEGATIVE_INFINITY;
-            for( int iii = 0; iii < numHaplotypes; iii++ ) {
-                for( int jjj = 0; jjj <= iii; jjj++ ) {
-                    if( haplotypeLikelihoodMatrix[iii][jjj] > maxElement ) {
-                        maxElement = haplotypeLikelihoodMatrix[iii][jjj];
-                        hap1 = iii;
-                        hap2 = jjj;
-                    }
-                }
-            }
-            if( maxElement == Double.NEGATIVE_INFINITY ) { break; }
-            if( DEBUG ) { logger.info("Chose haplotypes " + hap1 + " and " + hap2 + " with diploid likelihood = " + haplotypeLikelihoodMatrix[hap1][hap2]); }
-            haplotypeLikelihoodMatrix[hap1][hap2] = Double.NEGATIVE_INFINITY;
+    /**
+     * Select the best haplotypes for genotyping the samples in stratifiedReadMap
+     *
+     * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely
+     * haplotypes per sample.  What this means is that each sample computes the diploid genotype likelihoods for
+     * all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get
+     * one extra count for each haplotype (so hom-var haplotypes get two counts).  After performing this calculation
+     * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the
+     * haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference.
+     *
+     * @param haplotypes a list of all haplotypes we're considering
+     * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype
+     * @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes
+     * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation
+     */
+    public List<Haplotype> selectBestHaplotypesFromEachSample(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final int maxNumHaplotypesInPopulation) {
+        if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes);

-            if( !bestHaplotypesIndexList.contains(hap1) ) { bestHaplotypesIndexList.add(hap1); }
-            if( !bestHaplotypesIndexList.contains(hap2) ) { bestHaplotypesIndexList.add(hap2); }
+        if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes
+
+        // all of the haplotypes that at least one sample called as one of the most likely
+        final Set<Haplotype> selectedHaplotypes = new HashSet<Haplotype>();
+        selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected
+
+        // our annoying map from allele -> haplotype
+        final Map<Allele, Haplotype> allele2Haplotype = new HashMap<Allele, Haplotype>();
+        for ( final Haplotype h : haplotypes ) {
+            h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes
+            allele2Haplotype.put(Allele.create(h, h.isReference()), h);
        }

-        if( DEBUG ) { logger.info("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); }
+        // for each sample, compute the most likely pair of haplotypes
+        for ( final Map.Entry<String, PerReadAlleleLikelihoodMap> entry : stratifiedReadMap.entrySet() ) {
+            // get the two most likely haplotypes under a diploid model for this sample
+            final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles();

-        final List<Haplotype> bestHaplotypes = new ArrayList<Haplotype>();
-        for( final int hIndex : bestHaplotypesIndexList ) {
-            bestHaplotypes.add( haplotypes.get(hIndex) );
+            if ( mla != null ) { // there was something to evaluate in this sample
+                // note that there must be at least 2 haplotypes
+                final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele());
+                final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele());
+
+//            if ( DEBUG ) {
+//                logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey());
+//            }
+
+                // add these two haplotypes to the set of haplotypes that have been selected
+                selectedHaplotypes.add(best);
+                selectedHaplotypes.add(second);
+
+                // we've already selected all of our haplotypes, and we don't need to prune them down
+                if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation )
+                    break;
+            }
+        }
+
+        // take the best N haplotypes forward, in order of the number of samples that choose them
+        final int nSamples = stratifiedReadMap.size();
+        final List<Haplotype> bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation);
+
+        if ( DEBUG ) {
+            logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples.");
+            for ( final Haplotype h : bestHaplotypes ) {
+                logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype"));
+            }
        }
        return bestHaplotypes;
    }

-    public static int findReferenceIndex( final List<Haplotype> haplotypes ) {
+    /**
+     * Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found
+     * @param haplotypes non-null list of haplotypes
+     * @return the reference haplotype
+     */
+    private static Haplotype findReferenceHaplotype( final List<Haplotype> haplotypes ) {
        for( final Haplotype h : haplotypes ) {
-            if( h.isReference() ) { return haplotypes.indexOf(h); }
+            if( h.isReference() ) return h;
        }
        throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
    }
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java
@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa

    @Test
    public void testHaplotypeCallerMultiSampleComplex1() {
-        HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "d9c176fe6de26bb8b289d55a840d7b8b");
+        HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a2a5ae267cc061b0f9148280c8f1e236");
    }

    private void HCTestSymbolicVariants(String bam, String args, String md5) {
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
@ -80,12 +80,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {

    @Test
    public void testHaplotypeCallerMultiSample() {
-        HCTest(CEUTRIO_BAM, "", "c8598545d1c76b470a7784e6b5c2ad4a");
+        HCTest(CEUTRIO_BAM, "", "3d5e59b3a74da999ceb967cef73086b4");
    }

    @Test
    public void testHaplotypeCallerSingleSample() {
-        HCTest(NA12878_BAM, "", "0b2ca4482e92b9606be904cc25ba0988");
+        HCTest(NA12878_BAM, "", "4afef5e9cb99c0292e471d0c95243c1a");
    }

    @Test(enabled = false) // can't annotate the rsID's yet
@ -112,7 +112,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {

    @Test
    public void testHaplotypeCallerSingleSampleIndelQualityScores() {
-        HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "1eab0eb7a184d981b021a249c3bd0401");
+        HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "4581b4c2b55b2290a4b1092d2da5e642");
    }

    private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
@ -149,7 +149,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {

    @Test
    public void testHaplotypeCallerNearbySmallIntervals() {
-        HCTestNearbySmallIntervals(NA12878_BAM, "", "6ab938dede6838c983f84225d4103852");
+        HCTestNearbySmallIntervals(NA12878_BAM, "", "b20aaef138ef21a5031c06434f17f685");
    }

    // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper
@ -166,7 +166,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
    @Test
    public void HCTestStructuralIndels() {
        final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
-        final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("e8466846ca420bcbcd52b97f7a661aa3"));
+        final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("9bea757f6aa75e43585e26b246fd8897"));
        executeTest("HCTestStructuralIndels: ", spec);
    }

@ -188,7 +188,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
    public void HCTestReducedBam() {
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
-                Arrays.asList("8a62597f2c005f373efbe398ab51a2f1"));
+                Arrays.asList("a360a9e14c4e9c9b7435ca5d9dfadb8d"));
        executeTest("HC calling on a ReducedRead BAM", spec);
    }

@ -196,7 +196,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
    public void testReducedBamWithReadsNotFullySpanningDeletion() {
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
-                Arrays.asList("a913849c7ebdefb23ef9fa5ec05960fd"));
+                Arrays.asList("8adfa8a27a312760dab50787da595c57"));
        executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
    }
 }
--- a/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java
@ -53,12 +53,14 @@ import org.testng.annotations.Test;

 public class MostLikelyAlleleUnitTest extends BaseTest {
    final Allele a = Allele.create("A");
+    final Allele b = Allele.create("C");

    @Test
    public void testBasicCreation() {
        final double second = -1 - MostLikelyAllele.INFORMATIVE_LIKELIHOOD_THRESHOLD - 1;
-        MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, second);
+        MostLikelyAllele mla = new MostLikelyAllele(a, b, -1.0, second);
        Assert.assertEquals(mla.getMostLikelyAllele(), a);
+        Assert.assertEquals(mla.getSecondMostLikelyAllele(), b);
        Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), -1.0);
        Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), second);

@ -73,7 +75,7 @@ public class MostLikelyAlleleUnitTest extends BaseTest {
    @Test
    public void testNotDefaultInformative() {
        final double second = -1.0 - (MostLikelyAllele.INFORMATIVE_LIKELIHOOD_THRESHOLD - 1e-2);
-        MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, second);
+        MostLikelyAllele mla = new MostLikelyAllele(a, b, -1.0, second);
        Assert.assertEquals(mla.isInformative(), false);
        Assert.assertEquals(mla.isInformative(10), false);
        Assert.assertEquals(mla.isInformative(0), true);
@ -84,8 +86,9 @@ public class MostLikelyAlleleUnitTest extends BaseTest {

    @Test
    public void testCreationNoGoodSecond() {
-        MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, Double.NEGATIVE_INFINITY);
+        MostLikelyAllele mla = new MostLikelyAllele(a, null, -1.0, Double.NEGATIVE_INFINITY);
        Assert.assertEquals(mla.getMostLikelyAllele(), a);
+        Assert.assertEquals(mla.getSecondMostLikelyAllele(), null);
        Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), -1.0);
        Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), Double.NEGATIVE_INFINITY);

@ -99,7 +102,7 @@ public class MostLikelyAlleleUnitTest extends BaseTest {

    @Test
    public void testCreationNoAllele() {
-        MostLikelyAllele mla = new MostLikelyAllele(Allele.NO_CALL, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY);
+        MostLikelyAllele mla = new MostLikelyAllele(Allele.NO_CALL, null, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY);
        Assert.assertEquals(mla.getMostLikelyAllele(), Allele.NO_CALL);
        Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), Double.NEGATIVE_INFINITY);
        Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), Double.NEGATIVE_INFINITY);
--- a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java
@ -47,11 +47,9 @@
 package org.broadinstitute.sting.utils.genotyper;

 import net.sf.samtools.*;
-import org.apache.commons.lang.ArrayUtils;
 import org.broadinstitute.sting.BaseTest;
 import org.broadinstitute.variant.variantcontext.Allele;
 import org.broadinstitute.sting.utils.BaseUtils;
-import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
 import org.broadinstitute.sting.utils.pileup.PileupElement;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
@ -298,4 +296,55 @@ public class PerReadAlleleLikelihoodMapUnitTest extends BaseTest {
        Assert.assertTrue(map.getStoredElements().containsAll(goodReads), "nBad " + nBad + " nGood " + nGood);
        Assert.assertEquals(map.getStoredElements().size(), nGood, "nBad " + nBad + " nGood " + nGood);
    }
+
+    @DataProvider(name = "MostLikelyAlleleData")
+    public Object[][] makeMostLikelyAlleleData() {
+        List<Object[]> tests = new ArrayList<Object[]>();
+
+        final Allele a = Allele.create("A");
+        final Allele c = Allele.create("C");
+        final Allele g = Allele.create("G");
+
+        tests.add(new Object[]{Arrays.asList(a), Arrays.asList(Arrays.asList(0.0)), a, a});
+        tests.add(new Object[]{Arrays.asList(a, c), Arrays.asList(Arrays.asList(0.0, -1.0)), a, a});
+        tests.add(new Object[]{Arrays.asList(a, c), Arrays.asList(Arrays.asList(-1.0, 0.0)), c, c});
+        tests.add(new Object[]{Arrays.asList(a, c, g), Arrays.asList(Arrays.asList(0.0, 0.0, -10.0)), a, a});
+        tests.add(new Object[]{Arrays.asList(a, c, g), Arrays.asList(Arrays.asList(0.0, 0.0, -10.0)), a, a});
+        tests.add(new Object[]{Arrays.asList(a, c, g),
+                Arrays.asList(
+                        Arrays.asList(0.0, -10.0, -10.0),
+                        Arrays.asList(-100.0, 0.0, -10.0)),
+                c, a});
+        tests.add(new Object[]{Arrays.asList(a, c, g),
+                Arrays.asList(
+                        Arrays.asList(0.0, -10.0, -10.0),
+                        Arrays.asList(-20.0, 0.0, -100.0)),
+                c, a});
+
+        return tests.toArray(new Object[][]{});
+    }
+
+    @Test(dataProvider = "MostLikelyAlleleData")
+    public void testMostLikelyAllele(final List<Allele> alleles, final List<List<Double>> perReadlikelihoods, final Allele best, final Allele second) {
+        final PerReadAlleleLikelihoodMap map = new PerReadAlleleLikelihoodMap();
+
+        for ( int readI = 0; readI < perReadlikelihoods.size(); readI++ ) {
+            final List<Double> likelihoods = perReadlikelihoods.get(readI);
+
+            final byte[] bases = Utils.dupBytes((byte)'A', 10);
+            final byte[] quals = Utils.dupBytes((byte) 30, 10);
+            final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, "10M");
+            read.setReadName("readName" + readI);
+
+            for ( int i = 0; i < alleles.size(); i++ ) {
+                final Allele allele = alleles.get(i);
+                final double likelihood = likelihoods.get(i);
+                map.add(read, allele, likelihood);
+            }
+        }
+
+        final MostLikelyAllele mla = map.getMostLikelyDiploidAlleles();
+        Assert.assertEquals(mla.getMostLikelyAllele(), best);
+        Assert.assertEquals(mla.getSecondMostLikelyAllele(), second);
+    }
 }
--- a/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java
+++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java
@ -50,6 +50,7 @@ public final class MostLikelyAllele {
    public static final double INFORMATIVE_LIKELIHOOD_THRESHOLD = 0.2;

    final Allele mostLikely;
+    final Allele secondLikely;
    final double log10LikelihoodOfMostLikely;
    final double log10LikelihoodOfSecondBest;

@ -60,10 +61,11 @@ public final class MostLikelyAllele {
     * mostLikely should be a NO_CALL allele.
     *
     * @param mostLikely the most likely allele
+     * @param secondMostLikely the most likely allele after mostLikely
     * @param log10LikelihoodOfMostLikely the log10 likelihood of the most likely allele
     * @param log10LikelihoodOfSecondBest the log10 likelihood of the next most likely allele (should be NEGATIVE_INFINITY if none is available)
     */
-    public MostLikelyAllele(Allele mostLikely, double log10LikelihoodOfMostLikely, double log10LikelihoodOfSecondBest) {
+    public MostLikelyAllele(Allele mostLikely, Allele secondMostLikely, double log10LikelihoodOfMostLikely, double log10LikelihoodOfSecondBest) {
        if ( mostLikely == null ) throw new IllegalArgumentException("mostLikely allele cannot be null");
        if ( log10LikelihoodOfMostLikely != Double.NEGATIVE_INFINITY && ! MathUtils.goodLog10Probability(log10LikelihoodOfMostLikely) )
            throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be either -Infinity or a good log10 prob but got " + log10LikelihoodOfMostLikely);
@ -73,6 +75,7 @@ public final class MostLikelyAllele {
            throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be <= log10LikelihoodOfSecondBest but got " + log10LikelihoodOfMostLikely + " vs 2nd " + log10LikelihoodOfSecondBest);

        this.mostLikely = mostLikely;
+        this.secondLikely = secondMostLikely;
        this.log10LikelihoodOfMostLikely = log10LikelihoodOfMostLikely;
        this.log10LikelihoodOfSecondBest = log10LikelihoodOfSecondBest;
    }
@ -81,6 +84,10 @@ public final class MostLikelyAllele {
        return mostLikely;
    }

+    public Allele getSecondMostLikelyAllele() {
+        return secondLikely;
+    }
+
    public double getLog10LikelihoodOfMostLikely() {
        return log10LikelihoodOfMostLikely;
    }
--- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java
+++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java
@ -27,10 +27,13 @@ package org.broadinstitute.sting.utils.genotyper;


 import com.google.java.contract.Ensures;
+import org.apache.log4j.Logger;
 import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils;
+import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.pileup.PileupElement;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.sam.ReadUtils;
 import org.broadinstitute.variant.variantcontext.Allele;

 import java.io.PrintStream;
@ -41,6 +44,7 @@ import java.util.*;
 *   For each read, this holds underlying alleles represented by an aligned read, and corresponding relative likelihood.
 */
 public class PerReadAlleleLikelihoodMap {
+    private final static Logger logger = Logger.getLogger(PerReadAlleleLikelihoodMap.class);
    protected List<Allele> alleles;
    protected Map<GATKSAMRecord, Map<Allele, Double>> likelihoodReadMap;

@ -187,6 +191,57 @@ public class PerReadAlleleLikelihoodMap {
        return likelihoodReadMap.get(p.getRead());
    }

+    /**
+     * Get the most likely alleles estimated across all reads in this object
+     *
+     * Takes the most likely two alleles according to their diploid genotype likelihoods.  That is, for
+     * each allele i and j we compute p(D | i,j) where D is the read likelihoods.  We track the maximum
+     * i,j likelihood and return an object that contains the alleles i and j as well as the max likelihood.
+     *
+     * Note that the second most likely diploid genotype is not tracked so the resulting MostLikelyAllele
+     * doesn't have a meaningful get best likelihood.
+     *
+     * @return a MostLikelyAllele object, or null if this map is empty
+     */
+    public MostLikelyAllele getMostLikelyDiploidAlleles() {
+        if ( isEmpty() ) return null;
+
+        int hap1 = 0;
+        int hap2 = 0;
+        double maxElement = Double.NEGATIVE_INFINITY;
+        for( int iii = 0; iii < alleles.size(); iii++ ) {
+            final Allele iii_allele = alleles.get(iii);
+            for( int jjj = 0; jjj <= iii; jjj++ ) {
+                final Allele jjj_allele = alleles.get(jjj);
+
+                double haplotypeLikelihood = 0.0;
+                for( final Map.Entry<GATKSAMRecord, Map<Allele,Double>> entry : likelihoodReadMap.entrySet() ) {
+                    // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
+                    final GATKSAMRecord read = entry.getKey();
+                    final int count = ReadUtils.getMeanRepresentativeReadCount(read);
+                    final double likelihood_iii = entry.getValue().get(iii_allele);
+                    final double likelihood_jjj = entry.getValue().get(jjj_allele);
+                    haplotypeLikelihood += count * (MathUtils.approximateLog10SumLog10(likelihood_iii, likelihood_jjj) + LOG_ONE_HALF);
+
+                    // fast exit.  If this diploid pair is already worse than the max, just stop and look at the next pair
+                    if ( haplotypeLikelihood < maxElement ) break;
+                }
+
+                // keep track of the max element and associated indices
+                if ( haplotypeLikelihood > maxElement ) {
+                    hap1 = iii;
+                    hap2 = jjj;
+                    maxElement = haplotypeLikelihood;
+                }
+            }
+        }
+
+        if ( maxElement == Double.NEGATIVE_INFINITY )
+            throw new IllegalStateException("max likelihood is " + maxElement + " indicating something has gone wrong");
+
+        return new MostLikelyAllele(alleles.get(hap1), alleles.get(hap2), maxElement, maxElement);
+    }
+    private static final double LOG_ONE_HALF = -Math.log10(2.0);

    /**
     * Given a map from alleles to likelihoods, find the allele with the largest likelihood.
@ -213,6 +268,7 @@ public class PerReadAlleleLikelihoodMap {
        double maxLike = Double.NEGATIVE_INFINITY;
        double prevMaxLike = Double.NEGATIVE_INFINITY;
        Allele mostLikelyAllele = Allele.NO_CALL;
+        Allele secondMostLikely = null;

        for (final Map.Entry<Allele,Double> el : alleleMap.entrySet()) {
            if ( onlyConsiderTheseAlleles != null && ! onlyConsiderTheseAlleles.contains(el.getKey()) )
@ -221,13 +277,15 @@ public class PerReadAlleleLikelihoodMap {
            if (el.getValue() > maxLike) {
                prevMaxLike = maxLike;
                maxLike = el.getValue();
+                secondMostLikely = mostLikelyAllele;
                mostLikelyAllele = el.getKey();
            } else if( el.getValue() > prevMaxLike ) {
+                secondMostLikely = el.getKey();
                prevMaxLike = el.getValue();
            }
        }

-        return new MostLikelyAllele(mostLikelyAllele, maxLike, prevMaxLike);
+        return new MostLikelyAllele(mostLikelyAllele, secondMostLikely, maxLike, prevMaxLike);
    }

    /**