From 067d24957bdc8b283ea39b7d8693dd9ebb37188e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 16 Apr 2013 09:30:06 -0400 Subject: [PATCH] Select the haplotypes we move forward for genotyping per sample, not pooled -- The previous algorithm would compute the likelihood of each haplotype pooled across samples. This has a tendency to select "consensus" haplotypes that are reasonably good across all samples, while missing the true haplotypes that each sample likes. The new algorithm computes instead the most likely pair of haplotypes among all haplotypes for each sample independently, contributing 1 vote to each haplotype it selects. After all N samples have been run, we sort the haplotypes by their counts, and take 2 * nSamples + 1 haplotypes or maxHaplotypesInPopulation, whichever is smaller. -- After discussing with Mauricio our view is that the algorithmic complexity of this approach is no worse than the previous approach, so it should be equivalently fast. -- One potential improvement is to use soft (likelihood-weighted) counts rather than hard counts for the haplotypes, but this would radically complicate the current algorithm so it wasn't selected. -- For an example of a specific problem caused by this, see https://jira.broadinstitute.org/browse/GSA-871. -- Remove old pooled likelihood model. 
It's worse than the current version in both single and multiple samples: 1000G EUR samples: 10Kb per sample: 7.17 minutes pooled: 7.36 minutes Name VariantType TRUE_POSITIVE FALSE_POSITIVE FALSE_NEGATIVE TRUE_NEGATIVE CALLED_NOT_IN_DB_AT_ALL per_sample SNPS 50 0 5 8 1 per_sample INDELS 6 0 7 2 1 pooled SNPS 49 0 6 8 1 pooled INDELS 5 0 8 2 1 100 kb per sample: 140.00 minutes pooled: 145.27 minutes Name VariantType TRUE_POSITIVE FALSE_POSITIVE FALSE_NEGATIVE TRUE_NEGATIVE CALLED_NOT_IN_DB_AT_ALL per_sample SNPS 144 0 22 28 1 per_sample INDELS 28 1 16 9 11 pooled SNPS 143 0 23 28 1 pooled INDELS 27 1 17 9 11 java -Xmx2g -jar dist/GenomeAnalysisTK.jar -T HaplotypeCaller -I private/testdata/AFR.structural.indels.bam -L 20:8187565-8187800 -L 20:18670537-18670730 -R ~/Desktop/broadLocal/localData/human_g1k_v37.fasta -o /dev/null -debug haplotypes from samples: 8 seconds haplotypes from pools: 8 seconds java -Xmx2g -jar dist/GenomeAnalysisTK.jar -T HaplotypeCaller -I /Users/depristo/Desktop/broadLocal/localData/phaseIII.4x.100kb.bam -L 20:10,000,000-10,001,000 -R ~/Desktop/broadLocal/localData/human_g1k_v37.fasta -o /dev/null -debug haplotypes from samples: 173.32 seconds haplotypes from pools: 167.12 seconds --- .../haplotypecaller/HaplotypeCaller.java | 11 +- .../LikelihoodCalculationEngine.java | 147 +++++++++++++----- ...lexAndSymbolicVariantsIntegrationTest.java | 2 +- .../HaplotypeCallerIntegrationTest.java | 14 +- .../genotyper/MostLikelyAlleleUnitTest.java | 11 +- .../PerReadAlleleLikelihoodMapUnitTest.java | 53 ++++++- .../utils/genotyper/MostLikelyAllele.java | 9 +- .../genotyper/PerReadAlleleLikelihoodMap.java | 60 ++++++- 8 files changed, 252 insertions(+), 55 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 55490a1cb..cd3847b81 100644 --- 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -737,7 +737,13 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM Collections.sort( trimmedHaplotypes, new HaplotypeBaseComparator() ); - if ( DEBUG ) logger.info("Trimming haplotypes reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size()); + if ( DEBUG ) { + logger.info("Trimming haplotypes reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size()); + for ( final Haplotype remaining: trimmedHaplotypes ) { + logger.info(" Remains: " + remaining + " cigar " + remaining.getCigar()); + } + } + // trim down the reads and add them to the trimmed active region final List trimmedReads = new ArrayList(originalActiveRegion.getReads().size()); @@ -761,11 +767,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem * @return the list of haplotypes to genotype */ protected List selectBestHaplotypesForGenotyping(final List haplotypes, final Map stratifiedReadMap) { - // TODO -- skip this calculation if the list of haplotypes is of size 2 (as we'll always use 2 for genotyping) if ( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { return haplotypes; } else { - return likelihoodCalculationEngine.selectBestHaplotypesFromPooledLikelihoods(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation); + return likelihoodCalculationEngine.selectBestHaplotypesFromEachSample(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 1fb873e81..8697833a6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -49,12 +49,14 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.haplotype.HaplotypeScoreComparator; import org.broadinstitute.sting.utils.pairhmm.*; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -253,54 +255,127 @@ public class LikelihoodCalculationEngine { return likelihoodMatrix; } - @Requires({"haplotypes.size() > 0"}) - @Ensures({"result.size() <= haplotypes.size()"}) - public List selectBestHaplotypesFromPooledLikelihoods(final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation) { + // -------------------------------------------------------------------------------- + // + // System to compute the best N haplotypes for genotyping + // + // -------------------------------------------------------------------------------- - final int numHaplotypes = haplotypes.size(); - final Set sampleKeySet = stratifiedReadMap.keySet(); - final List bestHaplotypesIndexList = new ArrayList(); - 
bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype - final List haplotypesAsAlleles = new ArrayList(); - for( final Haplotype h : haplotypes ) { haplotypesAsAlleles.add(Allele.create(h, true)); } + /** + * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele + * @param map an annoying map object that moves us between the allele and haplotype representation + * @param haplotypeAsAllele the allele version of the haplotype + * @return the haplotype version, with its score incremented by 1 if its non-reference + */ + private Haplotype updateSelectHaplotype(final Map map, final Allele haplotypeAsAllele) { + final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic + if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value + return h; + } - final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, stratifiedReadMap, haplotypesAsAlleles, true ); // all samples pooled together + /** + * Take the best N haplotypes and return them as a list + * + * Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample + * as it's preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing + * order of score (so higher score haplotypes are preferred). 
The N we take is determined by + * + * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation) + * + * where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is + * bounded by maxNumHaplotypesInPopulation as that number can grow without bound + * + * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1 + * @param nSamples the number of samples used to select the haplotypes + * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples + * @return a list of N or fewer haplotypes, with the reference haplotype first + */ + private List selectBestHaplotypesAccordingToScore(final Set selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) { + final List selectedHaplotypesList = new ArrayList(selectedHaplotypes); + Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator()); + final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1; + final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation); + final List bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? 
selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep); + if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list"); + return bestHaplotypes; + } - int hap1 = 0; - int hap2 = 0; - //double bestElement = Double.NEGATIVE_INFINITY; - final int maxChosenHaplotypes = Math.min( maxNumHaplotypesInPopulation, sampleKeySet.size() * 2 + 1 ); - while( bestHaplotypesIndexList.size() < maxChosenHaplotypes ) { - double maxElement = Double.NEGATIVE_INFINITY; - for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - if( haplotypeLikelihoodMatrix[iii][jjj] > maxElement ) { - maxElement = haplotypeLikelihoodMatrix[iii][jjj]; - hap1 = iii; - hap2 = jjj; - } - } - } - if( maxElement == Double.NEGATIVE_INFINITY ) { break; } - if( DEBUG ) { logger.info("Chose haplotypes " + hap1 + " and " + hap2 + " with diploid likelihood = " + haplotypeLikelihoodMatrix[hap1][hap2]); } - haplotypeLikelihoodMatrix[hap1][hap2] = Double.NEGATIVE_INFINITY; + /** + * Select the best haplotypes for genotyping the samples in stratifiedReadMap + * + * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely + * haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for + * all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get + * one extra count for each haplotype (so hom-var haplotypes get two counts). After performing this calculation + * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the + * haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference. + * + * @param haplotypes a list of all haplotypes we're considering + * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype + * @param maxNumHaplotypesInPopulation the max. 
number of haplotypes we can select from haplotypes + * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation + */ + public List selectBestHaplotypesFromEachSample(final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation) { + if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes); - if( !bestHaplotypesIndexList.contains(hap1) ) { bestHaplotypesIndexList.add(hap1); } - if( !bestHaplotypesIndexList.contains(hap2) ) { bestHaplotypesIndexList.add(hap2); } + if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes + + // all of the haplotypes that at least one sample called as one of the most likely + final Set selectedHaplotypes = new HashSet(); + selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected + + // our annoying map from allele -> haplotype + final Map allele2Haplotype = new HashMap(); + for ( final Haplotype h : haplotypes ) { + h.setScore(h.isReference() ? 
Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes + allele2Haplotype.put(Allele.create(h, h.isReference()), h); } - if( DEBUG ) { logger.info("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); } + // for each sample, compute the most likely pair of haplotypes + for ( final Map.Entry entry : stratifiedReadMap.entrySet() ) { + // get the two most likely haplotypes under a diploid model for this sample + final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles(); - final List bestHaplotypes = new ArrayList(); - for( final int hIndex : bestHaplotypesIndexList ) { - bestHaplotypes.add( haplotypes.get(hIndex) ); + if ( mla != null ) { // there was something to evaluate in this sample + // note that there must be at least 2 haplotypes + final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele()); + final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele()); + +// if ( DEBUG ) { +// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey()); +// } + + // add these two haplotypes to the set of haplotypes that have been selected + selectedHaplotypes.add(best); + selectedHaplotypes.add(second); + + // we've already selected all of our haplotypes, and we don't need to prune them down + if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation ) + break; + } + } + + // take the best N haplotypes forward, in order of the number of samples that choose them + final int nSamples = stratifiedReadMap.size(); + final List bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation); + + if ( DEBUG ) { + logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples."); + for ( final 
Haplotype h : bestHaplotypes ) { + logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype")); + } } return bestHaplotypes; } - public static int findReferenceIndex( final List haplotypes ) { + /** + * Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found + * @param haplotypes non-null list of haplotypes + * @return the reference haplotype + */ + private static Haplotype findReferenceHaplotype( final List haplotypes ) { for( final Haplotype h : haplotypes ) { - if( h.isReference() ) { return haplotypes.indexOf(h); } + if( h.isReference() ) return h; } throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" ); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index f09711094..92cf45652 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "d9c176fe6de26bb8b289d55a840d7b8b"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a2a5ae267cc061b0f9148280c8f1e236"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 9cd225df3..a308b4893 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -80,12 +80,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "c8598545d1c76b470a7784e6b5c2ad4a"); + HCTest(CEUTRIO_BAM, "", "3d5e59b3a74da999ceb967cef73086b4"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "0b2ca4482e92b9606be904cc25ba0988"); + HCTest(NA12878_BAM, "", "4afef5e9cb99c0292e471d0c95243c1a"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -112,7 +112,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "1eab0eb7a184d981b021a249c3bd0401"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "4581b4c2b55b2290a4b1092d2da5e642"); } private void HCTestNearbySmallIntervals(String bam, String args, String md5) { @@ -149,7 +149,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerNearbySmallIntervals() { - HCTestNearbySmallIntervals(NA12878_BAM, "", "6ab938dede6838c983f84225d4103852"); + HCTestNearbySmallIntervals(NA12878_BAM, "", "b20aaef138ef21a5031c06434f17f685"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -166,7 +166,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o 
%s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("e8466846ca420bcbcd52b97f7a661aa3")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("9bea757f6aa75e43585e26b246fd8897")); executeTest("HCTestStructuralIndels: ", spec); } @@ -188,7 +188,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("8a62597f2c005f373efbe398ab51a2f1")); + Arrays.asList("a360a9e14c4e9c9b7435ca5d9dfadb8d")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -196,7 +196,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("a913849c7ebdefb23ef9fa5ec05960fd")); + Arrays.asList("8adfa8a27a312760dab50787da595c57")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } diff --git a/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java index cf077392b..08d82281e 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/genotyper/MostLikelyAlleleUnitTest.java @@ -53,12 +53,14 @@ import org.testng.annotations.Test; public class MostLikelyAlleleUnitTest extends BaseTest { final Allele a = 
Allele.create("A"); + final Allele b = Allele.create("C"); @Test public void testBasicCreation() { final double second = -1 - MostLikelyAllele.INFORMATIVE_LIKELIHOOD_THRESHOLD - 1; - MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, second); + MostLikelyAllele mla = new MostLikelyAllele(a, b, -1.0, second); Assert.assertEquals(mla.getMostLikelyAllele(), a); + Assert.assertEquals(mla.getSecondMostLikelyAllele(), b); Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), -1.0); Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), second); @@ -73,7 +75,7 @@ public class MostLikelyAlleleUnitTest extends BaseTest { @Test public void testNotDefaultInformative() { final double second = -1.0 - (MostLikelyAllele.INFORMATIVE_LIKELIHOOD_THRESHOLD - 1e-2); - MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, second); + MostLikelyAllele mla = new MostLikelyAllele(a, b, -1.0, second); Assert.assertEquals(mla.isInformative(), false); Assert.assertEquals(mla.isInformative(10), false); Assert.assertEquals(mla.isInformative(0), true); @@ -84,8 +86,9 @@ public class MostLikelyAlleleUnitTest extends BaseTest { @Test public void testCreationNoGoodSecond() { - MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, Double.NEGATIVE_INFINITY); + MostLikelyAllele mla = new MostLikelyAllele(a, null, -1.0, Double.NEGATIVE_INFINITY); Assert.assertEquals(mla.getMostLikelyAllele(), a); + Assert.assertEquals(mla.getSecondMostLikelyAllele(), null); Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), -1.0); Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), Double.NEGATIVE_INFINITY); @@ -99,7 +102,7 @@ public class MostLikelyAlleleUnitTest extends BaseTest { @Test public void testCreationNoAllele() { - MostLikelyAllele mla = new MostLikelyAllele(Allele.NO_CALL, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY); + MostLikelyAllele mla = new MostLikelyAllele(Allele.NO_CALL, null, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY); 
Assert.assertEquals(mla.getMostLikelyAllele(), Allele.NO_CALL); Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), Double.NEGATIVE_INFINITY); Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), Double.NEGATIVE_INFINITY); diff --git a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java index c50849a54..6ca49d3e5 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMapUnitTest.java @@ -47,11 +47,9 @@ package org.broadinstitute.sting.utils.genotyper; import net.sf.samtools.*; -import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; @@ -298,4 +296,55 @@ public class PerReadAlleleLikelihoodMapUnitTest extends BaseTest { Assert.assertTrue(map.getStoredElements().containsAll(goodReads), "nBad " + nBad + " nGood " + nGood); Assert.assertEquals(map.getStoredElements().size(), nGood, "nBad " + nBad + " nGood " + nGood); } + + @DataProvider(name = "MostLikelyAlleleData") + public Object[][] makeMostLikelyAlleleData() { + List tests = new ArrayList(); + + final Allele a = Allele.create("A"); + final Allele c = Allele.create("C"); + final Allele g = Allele.create("G"); + + tests.add(new Object[]{Arrays.asList(a), Arrays.asList(Arrays.asList(0.0)), a, a}); + tests.add(new Object[]{Arrays.asList(a, c), Arrays.asList(Arrays.asList(0.0, -1.0)), a, a}); + tests.add(new 
Object[]{Arrays.asList(a, c), Arrays.asList(Arrays.asList(-1.0, 0.0)), c, c}); + tests.add(new Object[]{Arrays.asList(a, c, g), Arrays.asList(Arrays.asList(0.0, 0.0, -10.0)), a, a}); + tests.add(new Object[]{Arrays.asList(a, c, g), Arrays.asList(Arrays.asList(0.0, 0.0, -10.0)), a, a}); + tests.add(new Object[]{Arrays.asList(a, c, g), + Arrays.asList( + Arrays.asList(0.0, -10.0, -10.0), + Arrays.asList(-100.0, 0.0, -10.0)), + c, a}); + tests.add(new Object[]{Arrays.asList(a, c, g), + Arrays.asList( + Arrays.asList(0.0, -10.0, -10.0), + Arrays.asList(-20.0, 0.0, -100.0)), + c, a}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MostLikelyAlleleData") + public void testMostLikelyAllele(final List alleles, final List> perReadlikelihoods, final Allele best, final Allele second) { + final PerReadAlleleLikelihoodMap map = new PerReadAlleleLikelihoodMap(); + + for ( int readI = 0; readI < perReadlikelihoods.size(); readI++ ) { + final List likelihoods = perReadlikelihoods.get(readI); + + final byte[] bases = Utils.dupBytes((byte)'A', 10); + final byte[] quals = Utils.dupBytes((byte) 30, 10); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, "10M"); + read.setReadName("readName" + readI); + + for ( int i = 0; i < alleles.size(); i++ ) { + final Allele allele = alleles.get(i); + final double likelihood = likelihoods.get(i); + map.add(read, allele, likelihood); + } + } + + final MostLikelyAllele mla = map.getMostLikelyDiploidAlleles(); + Assert.assertEquals(mla.getMostLikelyAllele(), best); + Assert.assertEquals(mla.getSecondMostLikelyAllele(), second); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java index e12fb546c..03a2b8077 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java @@ -50,6 +50,7 @@ public final class MostLikelyAllele { public static final double INFORMATIVE_LIKELIHOOD_THRESHOLD = 0.2; final Allele mostLikely; + final Allele secondLikely; final double log10LikelihoodOfMostLikely; final double log10LikelihoodOfSecondBest; @@ -60,10 +61,11 @@ public final class MostLikelyAllele { * mostLikely should be a NO_CALL allele. * * @param mostLikely the most likely allele + * @param secondMostLikely the most likely allele after mostLikely * @param log10LikelihoodOfMostLikely the log10 likelihood of the most likely allele * @param log10LikelihoodOfSecondBest the log10 likelihood of the next most likely allele (should be NEGATIVE_INFINITY if none is available) */ - public MostLikelyAllele(Allele mostLikely, double log10LikelihoodOfMostLikely, double log10LikelihoodOfSecondBest) { + public MostLikelyAllele(Allele mostLikely, Allele secondMostLikely, double log10LikelihoodOfMostLikely, double log10LikelihoodOfSecondBest) { if ( mostLikely == null ) throw new IllegalArgumentException("mostLikely allele cannot be null"); if ( log10LikelihoodOfMostLikely != Double.NEGATIVE_INFINITY && ! 
MathUtils.goodLog10Probability(log10LikelihoodOfMostLikely) ) throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be either -Infinity or a good log10 prob but got " + log10LikelihoodOfMostLikely); @@ -73,6 +75,7 @@ public final class MostLikelyAllele { throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be <= log10LikelihoodOfSecondBest but got " + log10LikelihoodOfMostLikely + " vs 2nd " + log10LikelihoodOfSecondBest); this.mostLikely = mostLikely; + this.secondLikely = secondMostLikely; this.log10LikelihoodOfMostLikely = log10LikelihoodOfMostLikely; this.log10LikelihoodOfSecondBest = log10LikelihoodOfSecondBest; } @@ -81,6 +84,10 @@ public final class MostLikelyAllele { return mostLikely; } + public Allele getSecondMostLikelyAllele() { + return secondLikely; + } + public double getLog10LikelihoodOfMostLikely() { return log10LikelihoodOfMostLikely; } diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index 47be30871..8134b1257 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -27,10 +27,13 @@ package org.broadinstitute.sting.utils.genotyper; import com.google.java.contract.Ensures; +import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; import java.io.PrintStream; @@ -41,6 +44,7 @@ import java.util.*; * For each read, this holds underlying alleles 
represented by an aligned read, and corresponding relative likelihood. */ public class PerReadAlleleLikelihoodMap { + private final static Logger logger = Logger.getLogger(PerReadAlleleLikelihoodMap.class); protected List alleles; protected Map> likelihoodReadMap; @@ -187,6 +191,57 @@ public class PerReadAlleleLikelihoodMap { return likelihoodReadMap.get(p.getRead()); } + /** + * Get the most likely alleles estimated across all reads in this object + * + * Takes the most likely two alleles according to their diploid genotype likelihoods. That is, for + * each allele i and j we compute p(D | i,j) where D is the read likelihoods. We track the maximum + * i,j likelihood and return an object that contains the alleles i and j as well as the max likelihood. + * + * Note that the second most likely diploid genotype is not tracked so the resulting MostLikelyAllele + * doesn't have a meaningful get best likelihood. + * + * @return a MostLikelyAllele object, or null if this map is empty + */ + public MostLikelyAllele getMostLikelyDiploidAlleles() { + if ( isEmpty() ) return null; + + int hap1 = 0; + int hap2 = 0; + double maxElement = Double.NEGATIVE_INFINITY; + for( int iii = 0; iii < alleles.size(); iii++ ) { + final Allele iii_allele = alleles.get(iii); + for( int jjj = 0; jjj <= iii; jjj++ ) { + final Allele jjj_allele = alleles.get(jjj); + + double haplotypeLikelihood = 0.0; + for( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { + // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) + final GATKSAMRecord read = entry.getKey(); + final int count = ReadUtils.getMeanRepresentativeReadCount(read); + final double likelihood_iii = entry.getValue().get(iii_allele); + final double likelihood_jjj = entry.getValue().get(jjj_allele); + haplotypeLikelihood += count * (MathUtils.approximateLog10SumLog10(likelihood_iii, likelihood_jjj) + LOG_ONE_HALF); + + // fast exit. 
If this diploid pair is already worse than the max, just stop and look at the next pair + if ( haplotypeLikelihood < maxElement ) break; + } + + // keep track of the max element and associated indices + if ( haplotypeLikelihood > maxElement ) { + hap1 = iii; + hap2 = jjj; + maxElement = haplotypeLikelihood; + } + } + } + + if ( maxElement == Double.NEGATIVE_INFINITY ) + throw new IllegalStateException("max likelihood is " + maxElement + " indicating something has gone wrong"); + + return new MostLikelyAllele(alleles.get(hap1), alleles.get(hap2), maxElement, maxElement); + } + private static final double LOG_ONE_HALF = -Math.log10(2.0); /** * Given a map from alleles to likelihoods, find the allele with the largest likelihood. @@ -213,6 +268,7 @@ public class PerReadAlleleLikelihoodMap { double maxLike = Double.NEGATIVE_INFINITY; double prevMaxLike = Double.NEGATIVE_INFINITY; Allele mostLikelyAllele = Allele.NO_CALL; + Allele secondMostLikely = null; for (final Map.Entry el : alleleMap.entrySet()) { if ( onlyConsiderTheseAlleles != null && ! onlyConsiderTheseAlleles.contains(el.getKey()) ) @@ -221,13 +277,15 @@ public class PerReadAlleleLikelihoodMap { if (el.getValue() > maxLike) { prevMaxLike = maxLike; maxLike = el.getValue(); + secondMostLikely = mostLikelyAllele; mostLikelyAllele = el.getKey(); } else if( el.getValue() > prevMaxLike ) { + secondMostLikely = el.getKey(); prevMaxLike = el.getValue(); } } - return new MostLikelyAllele(mostLikelyAllele, maxLike, prevMaxLike); + return new MostLikelyAllele(mostLikelyAllele, secondMostLikely, maxLike, prevMaxLike); } /**