diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 55490a1cb..cd3847b81 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -737,7 +737,13 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM Collections.sort( trimmedHaplotypes, new HaplotypeBaseComparator() ); - if ( DEBUG ) logger.info("Trimming haplotypes reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size()); + if ( DEBUG ) { + logger.info("Trimming haplotypes reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size()); + for ( final Haplotype remaining: trimmedHaplotypes ) { + logger.info(" Remains: " + remaining + " cigar " + remaining.getCigar()); + } + } + // trim down the reads and add them to the trimmed active region final List trimmedReads = new ArrayList(originalActiveRegion.getReads().size()); @@ -761,11 +767,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem * @return the list of haplotypes to genotype */ protected List selectBestHaplotypesForGenotyping(final List haplotypes, final Map stratifiedReadMap) { - // TODO -- skip this calculation if the list of haplotypes is of size 2 (as we'll always use 2 for genotyping) if ( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { return haplotypes; } else { - return likelihoodCalculationEngine.selectBestHaplotypesFromPooledLikelihoods(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation); + return likelihoodCalculationEngine.selectBestHaplotypesFromEachSample(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 1fb873e81..8697833a6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -49,12 +49,14 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.haplotype.HaplotypeScoreComparator; import org.broadinstitute.sting.utils.pairhmm.*; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -253,54 +255,127 @@ public class LikelihoodCalculationEngine { return likelihoodMatrix; } - @Requires({"haplotypes.size() > 0"}) - @Ensures({"result.size() <= haplotypes.size()"}) - public List selectBestHaplotypesFromPooledLikelihoods(final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation) { + // -------------------------------------------------------------------------------- + // + // System to compute the best N haplotypes for genotyping + // + // -------------------------------------------------------------------------------- - final int numHaplotypes = haplotypes.size(); - final Set sampleKeySet = stratifiedReadMap.keySet(); - final List bestHaplotypesIndexList = new ArrayList(); - bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype - final List haplotypesAsAlleles = new ArrayList(); - for( final Haplotype h : haplotypes ) { haplotypesAsAlleles.add(Allele.create(h, true)); } + /** + * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele + * @param map an annoying map object that moves us between the allele and haplotype representation + * @param haplotypeAsAllele the allele version of the haplotype + * @return the haplotype version, with its score incremented by 1 if its non-reference + */ + private Haplotype updateSelectHaplotype(final Map map, final Allele haplotypeAsAllele) { + final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic + if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value + return h; + } - final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, stratifiedReadMap, haplotypesAsAlleles, true ); // all samples pooled together + /** + * Take the best N haplotypes and return them as a list + * + * Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample + * as it's preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing + * order of score (so higher score haplotypes are preferred). The N we take is determined by + * + * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation) + * + * where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is + * bounded by maxNumHaplotypesInPopulation as that number can grow without bound + * + * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1 + * @param nSamples the number of samples used to select the haplotypes + * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples + * @return a list of N or fewer haplotypes, with the reference haplotype first + */ + private List selectBestHaplotypesAccordingToScore(final Set selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) { + final List selectedHaplotypesList = new ArrayList(selectedHaplotypes); + Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator()); + final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1; + final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation); + final List bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep); + if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list"); + return bestHaplotypes; + } - int hap1 = 0; - int hap2 = 0; - //double bestElement = Double.NEGATIVE_INFINITY; - final int maxChosenHaplotypes = Math.min( maxNumHaplotypesInPopulation, sampleKeySet.size() * 2 + 1 ); - while( bestHaplotypesIndexList.size() < maxChosenHaplotypes ) { - double maxElement = Double.NEGATIVE_INFINITY; - for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - if( haplotypeLikelihoodMatrix[iii][jjj] > maxElement ) { - maxElement = haplotypeLikelihoodMatrix[iii][jjj]; - hap1 = iii; - hap2 = jjj; - } - } - } - if( maxElement == Double.NEGATIVE_INFINITY ) { break; } - if( DEBUG ) { logger.info("Chose haplotypes " + hap1 + " and " + hap2 + " with diploid likelihood = " + haplotypeLikelihoodMatrix[hap1][hap2]); } - haplotypeLikelihoodMatrix[hap1][hap2] = Double.NEGATIVE_INFINITY; + /** + * Select the best haplotypes for genotyping the samples in stratifiedReadMap + * + * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely + * haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for + * all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get + * one extra count for each haplotype (so hom-var haplotypes get two counts). After performing this calculation + * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the + * haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference. + * + * @param haplotypes a list of all haplotypes we're considering + * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype + * @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes + * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation + */ + public List selectBestHaplotypesFromEachSample(final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation) { + if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes); - if( !bestHaplotypesIndexList.contains(hap1) ) { bestHaplotypesIndexList.add(hap1); } - if( !bestHaplotypesIndexList.contains(hap2) ) { bestHaplotypesIndexList.add(hap2); } + if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes + + // all of the haplotypes that at least one sample called as one of the most likely + final Set selectedHaplotypes = new HashSet(); + selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected + + // our annoying map from allele -> haplotype + final Map allele2Haplotype = new HashMap(); + for ( final Haplotype h : haplotypes ) { + h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes + allele2Haplotype.put(Allele.create(h, h.isReference()), h); } - if( DEBUG ) { logger.info("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); } + // for each sample, compute the most likely pair of haplotypes + for ( final Map.Entry entry : stratifiedReadMap.entrySet() ) { + // get the two most likely haplotypes under a diploid model for this sample + final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles(); - final List bestHaplotypes = new ArrayList(); - for( final int hIndex : bestHaplotypesIndexList ) { - bestHaplotypes.add( haplotypes.get(hIndex) ); + if ( mla != null ) { // there was something to evaluate in this sample + // note that there must be at least 2 haplotypes + final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele()); + final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele()); + +// if ( DEBUG ) { +// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey()); +// } + + // add these two haplotypes to the set of haplotypes that have been selected + selectedHaplotypes.add(best); + selectedHaplotypes.add(second); + + // we've already selected all of our haplotypes, and we don't need to prune them down + if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation ) + break; + } + } + + // take the best N haplotypes forward, in order of the number of samples that choose them + final int nSamples = stratifiedReadMap.size(); + final List bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation); + + if ( DEBUG ) { + logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples."); + for ( final Haplotype h : bestHaplotypes ) { + logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype")); + } } return bestHaplotypes; } - public static int findReferenceIndex( final List haplotypes ) { + /** + * Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found + * @param haplotypes non-null list of haplotypes + * @return the reference haplotype + */ + private static Haplotype findReferenceHaplotype( final List haplotypes ) { for( final Haplotype h : haplotypes ) { - if( h.isReference() ) { return haplotypes.indexOf(h); } + if( h.isReference() ) return h; } throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" ); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index f09711094..92cf45652 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "d9c176fe6de26bb8b289d55a840d7b8b"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a2a5ae267cc061b0f9148280c8f1e236"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 9cd225df3..a308b4893 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -80,12 +80,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "c8598545d1c76b470a7784e6b5c2ad4a"); + HCTest(CEUTRIO_BAM, "", "3d5e59b3a74da999ceb967cef73086b4"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "0b2ca4482e92b9606be904cc25ba0988"); + HCTest(NA12878_BAM, "", "4afef5e9cb99c0292e471d0c95243c1a"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -112,7 +112,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "1eab0eb7a184d981b021a249c3bd0401"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "4581b4c2b55b2290a4b1092d2da5e642"); } private void HCTestNearbySmallIntervals(String bam, String args, String md5) { @@ -149,7 +149,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerNearbySmallIntervals() { - HCTestNearbySmallIntervals(NA12878_BAM, "", "6ab938dede6838c983f84225d4103852"); + HCTestNearbySmallIntervals(NA12878_BAM, "", "b20aaef138ef21a5031c06434f17f685"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -166,7 +166,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("e8466846ca420bcbcd52b97f7a661aa3")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("9bea757f6aa75e43585e26b246fd8897")); executeTest("HCTestStructuralIndels: ", spec); } @@ -188,7 +188,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("8a62597f2c005f373efbe398ab51a2f1")); + Arrays.asList("a360a9e14c4e9c9b7435ca5d9dfadb8d")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -196,7 +196,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("a913849c7ebdefb23ef9fa5ec05960fd")); + Arrays.asList("8adfa8a27a312760dab50787da595c57")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } diff --git a/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java index cf077392b..08d82281e 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java @@ -53,12 +53,14 @@ import org.testng.annotations.Test; public class MostLikelyAlleleUnitTest extends BaseTest { final Allele a = Allele.create("A"); + final Allele b = Allele.create("C"); @Test public void testBasicCreation() { final double second = -1 - MostLikelyAllele.INFORMATIVE_LIKELIHOOD_THRESHOLD - 1; - MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, second); + MostLikelyAllele mla = new MostLikelyAllele(a, b, -1.0, second); Assert.assertEquals(mla.getMostLikelyAllele(), a); + Assert.assertEquals(mla.getSecondMostLikelyAllele(), b); Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), -1.0); Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), second); @@ -73,7 +75,7 @@ public class MostLikelyAlleleUnitTest extends BaseTest { @Test public void testNotDefaultInformative() { final double second = -1.0 - (MostLikelyAllele.INFORMATIVE_LIKELIHOOD_THRESHOLD - 1e-2); - MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, second); + MostLikelyAllele mla = new MostLikelyAllele(a, b, -1.0, second); Assert.assertEquals(mla.isInformative(), false); Assert.assertEquals(mla.isInformative(10), false); Assert.assertEquals(mla.isInformative(0), true); @@ -84,8 +86,9 @@ public class MostLikelyAlleleUnitTest extends BaseTest { @Test public void testCreationNoGoodSecond() { - MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, Double.NEGATIVE_INFINITY); + MostLikelyAllele mla = new MostLikelyAllele(a, null, -1.0, Double.NEGATIVE_INFINITY); Assert.assertEquals(mla.getMostLikelyAllele(), a); + Assert.assertEquals(mla.getSecondMostLikelyAllele(), null); Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), -1.0); Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), Double.NEGATIVE_INFINITY); @@ -99,7 +102,7 @@ public class MostLikelyAlleleUnitTest extends BaseTest { @Test public void testCreationNoAllele() { - MostLikelyAllele mla = new MostLikelyAllele(Allele.NO_CALL, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY); + MostLikelyAllele mla = new MostLikelyAllele(Allele.NO_CALL, null, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY); Assert.assertEquals(mla.getMostLikelyAllele(), Allele.NO_CALL); Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), Double.NEGATIVE_INFINITY); Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), Double.NEGATIVE_INFINITY); diff --git a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java index c50849a54..6ca49d3e5 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java @@ -47,11 +47,9 @@ package org.broadinstitute.sting.utils.genotyper; import net.sf.samtools.*; -import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; @@ -298,4 +296,55 @@ public class PerReadAlleleLikelihoodMapUnitTest extends BaseTest { Assert.assertTrue(map.getStoredElements().containsAll(goodReads), "nBad " + nBad + " nGood " + nGood); Assert.assertEquals(map.getStoredElements().size(), nGood, "nBad " + nBad + " nGood " + nGood); } + + @DataProvider(name = "MostLikelyAlleleData") + public Object[][] makeMostLikelyAlleleData() { + List tests = new ArrayList(); + + final Allele a = Allele.create("A"); + final Allele c = Allele.create("C"); + final Allele g = Allele.create("G"); + + tests.add(new Object[]{Arrays.asList(a), Arrays.asList(Arrays.asList(0.0)), a, a}); + tests.add(new Object[]{Arrays.asList(a, c), Arrays.asList(Arrays.asList(0.0, -1.0)), a, a}); + tests.add(new Object[]{Arrays.asList(a, c), Arrays.asList(Arrays.asList(-1.0, 0.0)), c, c}); + tests.add(new Object[]{Arrays.asList(a, c, g), Arrays.asList(Arrays.asList(0.0, 0.0, -10.0)), a, a}); + tests.add(new Object[]{Arrays.asList(a, c, g), Arrays.asList(Arrays.asList(0.0, 0.0, -10.0)), a, a}); + tests.add(new Object[]{Arrays.asList(a, c, g), + Arrays.asList( + Arrays.asList(0.0, -10.0, -10.0), + Arrays.asList(-100.0, 0.0, -10.0)), + c, a}); + tests.add(new Object[]{Arrays.asList(a, c, g), + Arrays.asList( + Arrays.asList(0.0, -10.0, -10.0), + Arrays.asList(-20.0, 0.0, -100.0)), + c, a}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MostLikelyAlleleData") + public void testMostLikelyAllele(final List alleles, final List> perReadlikelihoods, final Allele best, final Allele second) { + final PerReadAlleleLikelihoodMap map = new PerReadAlleleLikelihoodMap(); + + for ( int readI = 0; readI < perReadlikelihoods.size(); readI++ ) { + final List likelihoods = perReadlikelihoods.get(readI); + + final byte[] bases = Utils.dupBytes((byte)'A', 10); + final byte[] quals = Utils.dupBytes((byte) 30, 10); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, "10M"); + read.setReadName("readName" + readI); + + for ( int i = 0; i < alleles.size(); i++ ) { + final Allele allele = alleles.get(i); + final double likelihood = likelihoods.get(i); + map.add(read, allele, likelihood); + } + } + + final MostLikelyAllele mla = map.getMostLikelyDiploidAlleles(); + Assert.assertEquals(mla.getMostLikelyAllele(), best); + Assert.assertEquals(mla.getSecondMostLikelyAllele(), second); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java index e12fb546c..03a2b8077 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java @@ -50,6 +50,7 @@ public final class MostLikelyAllele { public static final double INFORMATIVE_LIKELIHOOD_THRESHOLD = 0.2; final Allele mostLikely; + final Allele secondLikely; final double log10LikelihoodOfMostLikely; final double log10LikelihoodOfSecondBest; @@ -60,10 +61,11 @@ public final class MostLikelyAllele { * mostLikely should be a NO_CALL allele. * * @param mostLikely the most likely allele + * @param secondMostLikely the most likely allele after mostLikely * @param log10LikelihoodOfMostLikely the log10 likelihood of the most likely allele * @param log10LikelihoodOfSecondBest the log10 likelihood of the next most likely allele (should be NEGATIVE_INFINITY if none is available) */ - public MostLikelyAllele(Allele mostLikely, double log10LikelihoodOfMostLikely, double log10LikelihoodOfSecondBest) { + public MostLikelyAllele(Allele mostLikely, Allele secondMostLikely, double log10LikelihoodOfMostLikely, double log10LikelihoodOfSecondBest) { if ( mostLikely == null ) throw new IllegalArgumentException("mostLikely allele cannot be null"); if ( log10LikelihoodOfMostLikely != Double.NEGATIVE_INFINITY && ! MathUtils.goodLog10Probability(log10LikelihoodOfMostLikely) ) throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be either -Infinity or a good log10 prob but got " + log10LikelihoodOfMostLikely); @@ -73,6 +75,7 @@ public final class MostLikelyAllele { throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be <= log10LikelihoodOfSecondBest but got " + log10LikelihoodOfMostLikely + " vs 2nd " + log10LikelihoodOfSecondBest); this.mostLikely = mostLikely; + this.secondLikely = secondMostLikely; this.log10LikelihoodOfMostLikely = log10LikelihoodOfMostLikely; this.log10LikelihoodOfSecondBest = log10LikelihoodOfSecondBest; } @@ -81,6 +84,10 @@ public final class MostLikelyAllele { return mostLikely; } + public Allele getSecondMostLikelyAllele() { + return secondLikely; + } + public double getLog10LikelihoodOfMostLikely() { return log10LikelihoodOfMostLikely; } diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index 47be30871..8134b1257 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -27,10 +27,13 @@ package org.broadinstitute.sting.utils.genotyper; import com.google.java.contract.Ensures; +import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; import java.io.PrintStream; @@ -41,6 +44,7 @@ import java.util.*; * For each read, this holds underlying alleles represented by an aligned read, and corresponding relative likelihood. */ public class PerReadAlleleLikelihoodMap { + private final static Logger logger = Logger.getLogger(PerReadAlleleLikelihoodMap.class); protected List alleles; protected Map> likelihoodReadMap; @@ -187,6 +191,57 @@ public class PerReadAlleleLikelihoodMap { return likelihoodReadMap.get(p.getRead()); } + /** + * Get the most likely alleles estimated across all reads in this object + * + * Takes the most likely two alleles according to their diploid genotype likelihoods. That is, for + * each allele i and j we compute p(D | i,j) where D is the read likelihoods. We track the maximum + * i,j likelihood and return an object that contains the alleles i and j as well as the max likelihood. + * + * Note that the second most likely diploid genotype is not tracked so the resulting MostLikelyAllele + * doesn't have a meaningful get best likelihood. + * + * @return a MostLikelyAllele object, or null if this map is empty + */ + public MostLikelyAllele getMostLikelyDiploidAlleles() { + if ( isEmpty() ) return null; + + int hap1 = 0; + int hap2 = 0; + double maxElement = Double.NEGATIVE_INFINITY; + for( int iii = 0; iii < alleles.size(); iii++ ) { + final Allele iii_allele = alleles.get(iii); + for( int jjj = 0; jjj <= iii; jjj++ ) { + final Allele jjj_allele = alleles.get(jjj); + + double haplotypeLikelihood = 0.0; + for( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { + // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) + final GATKSAMRecord read = entry.getKey(); + final int count = ReadUtils.getMeanRepresentativeReadCount(read); + final double likelihood_iii = entry.getValue().get(iii_allele); + final double likelihood_jjj = entry.getValue().get(jjj_allele); + haplotypeLikelihood += count * (MathUtils.approximateLog10SumLog10(likelihood_iii, likelihood_jjj) + LOG_ONE_HALF); + + // fast exit. If this diploid pair is already worse than the max, just stop and look at the next pair + if ( haplotypeLikelihood < maxElement ) break; + } + + // keep track of the max element and associated indices + if ( haplotypeLikelihood > maxElement ) { + hap1 = iii; + hap2 = jjj; + maxElement = haplotypeLikelihood; + } + } + } + + if ( maxElement == Double.NEGATIVE_INFINITY ) + throw new IllegalStateException("max likelihood is " + maxElement + " indicating something has gone wrong"); + + return new MostLikelyAllele(alleles.get(hap1), alleles.get(hap2), maxElement, maxElement); + } + private static final double LOG_ONE_HALF = -Math.log10(2.0); /** * Given a map from alleles to likelihoods, find the allele with the largest likelihood. @@ -213,6 +268,7 @@ public class PerReadAlleleLikelihoodMap { double maxLike = Double.NEGATIVE_INFINITY; double prevMaxLike = Double.NEGATIVE_INFINITY; Allele mostLikelyAllele = Allele.NO_CALL; + Allele secondMostLikely = null; for (final Map.Entry el : alleleMap.entrySet()) { if ( onlyConsiderTheseAlleles != null && ! onlyConsiderTheseAlleles.contains(el.getKey()) ) @@ -221,13 +277,15 @@ public class PerReadAlleleLikelihoodMap { if (el.getValue() > maxLike) { prevMaxLike = maxLike; maxLike = el.getValue(); + secondMostLikely = mostLikelyAllele; mostLikelyAllele = el.getKey(); } else if( el.getValue() > prevMaxLike ) { + secondMostLikely = el.getKey(); prevMaxLike = el.getValue(); } } - return new MostLikelyAllele(mostLikelyAllele, maxLike, prevMaxLike); + return new MostLikelyAllele(mostLikelyAllele, secondMostLikely, maxLike, prevMaxLike); } /**