Merge pull request #166 from broadinstitute/md_hc_persample_haplotypes

Select the haplotypes we move forward for genotyping per sample, not pooled across all samples
This commit is contained in:
Ryan Poplin 2013-04-16 08:46:56 -07:00
commit 936f4da1f6
8 changed files with 252 additions and 55 deletions

View File

@ -737,7 +737,13 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
// sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM
Collections.sort( trimmedHaplotypes, new HaplotypeBaseComparator() );
if ( DEBUG ) logger.info("Trimming haplotypes reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size());
if ( DEBUG ) {
logger.info("Trimming haplotypes reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size());
for ( final Haplotype remaining: trimmedHaplotypes ) {
logger.info(" Remains: " + remaining + " cigar " + remaining.getCigar());
}
}
// trim down the reads and add them to the trimmed active region
final List<GATKSAMRecord> trimmedReads = new ArrayList<GATKSAMRecord>(originalActiveRegion.getReads().size());
@ -761,11 +767,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
* @return the list of haplotypes to genotype
*/
/**
 * Choose the subset of haplotypes that will actually be genotyped.
 *
 * In GENOTYPE_GIVEN_ALLELES mode every haplotype must be kept, since the alleles to
 * genotype are fixed by the user; otherwise selection is delegated to the per-sample
 * selection in the likelihood calculation engine.
 *
 * @param haplotypes all candidate haplotypes
 * @param stratifiedReadMap map from sample name -> per-read allele likelihoods
 * @return the list of haplotypes to genotype
 */
protected List<Haplotype> selectBestHaplotypesForGenotyping(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) {
    // TODO -- skip this calculation if the list of haplotypes is of size 2 (as we'll always use 2 for genotyping)
    if ( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
        return haplotypes;
    } else {
        // select per sample, not pooled across all samples
        return likelihoodCalculationEngine.selectBestHaplotypesFromEachSample(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation);
    }
}

View File

@ -49,12 +49,14 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.haplotype.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.haplotype.HaplotypeScoreComparator;
import org.broadinstitute.sting.utils.pairhmm.*;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
@ -253,54 +255,127 @@ public class LikelihoodCalculationEngine {
return likelihoodMatrix;
}
@Requires({"haplotypes.size() > 0"})
@Ensures({"result.size() <= haplotypes.size()"})
public List<Haplotype> selectBestHaplotypesFromPooledLikelihoods(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final int maxNumHaplotypesInPopulation) {
// --------------------------------------------------------------------------------
//
// System to compute the best N haplotypes for genotyping
//
// --------------------------------------------------------------------------------
final int numHaplotypes = haplotypes.size();
final Set<String> sampleKeySet = stratifiedReadMap.keySet();
final List<Integer> bestHaplotypesIndexList = new ArrayList<Integer>();
bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype
final List<Allele> haplotypesAsAlleles = new ArrayList<Allele>();
for( final Haplotype h : haplotypes ) { haplotypesAsAlleles.add(Allele.create(h, true)); }
/**
 * Helper for selectBestHaplotypesFromEachSample: bump the selection score of the
 * haplotype corresponding to haplotypeAsAllele.
 *
 * @param map lookup table translating the allele representation back to its haplotype
 * @param haplotypeAsAllele the allele form of the haplotype whose score should be updated
 * @return the haplotype form, with its score incremented by 1 when it is non-reference
 */
private Haplotype updateSelectHaplotype(final Map<Allele, Haplotype> map, final Allele haplotypeAsAllele) {
    final Haplotype selected = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic
    if ( selected.isNonReference() ) {
        // the reference haplotype's score is already at max value, so only bump non-ref scores
        selected.setScore(selected.getScore() + 1);
    }
    return selected;
}
final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, stratifiedReadMap, haplotypesAsAlleles, true ); // all samples pooled together
/**
 * Take the best N haplotypes and return them as a list
 *
 * Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample
 * as its preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing
 * order of score (so higher score haplotypes are preferred). The N we take is determined by
 *
 * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation)
 *
 * where 2 * nSamples is the number of chromosomes in nSamples diploid samples (plus 1 for the
 * reference), and our workload is bounded by maxNumHaplotypesInPopulation as that number can
 * grow without bound
 *
 * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1
 * @param nSamples the number of samples used to select the haplotypes
 * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples
 * @return a list of N or fewer haplotypes, with the reference haplotype first
 */
private List<Haplotype> selectBestHaplotypesAccordingToScore(final Set<Haplotype> selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) {
    // copy into a list so it can be sorted by score; the reference haplotype's score is
    // Double.MAX_VALUE (set by the caller) so it is expected to sort first
    final List<Haplotype> selectedHaplotypesList = new ArrayList<Haplotype>(selectedHaplotypes);
    Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator());
    final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1;
    final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation);
    // NOTE(review): subList returns a view backed by selectedHaplotypesList (a local), which is fine
    // as long as callers only read the result
    final List<Haplotype> bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep);
    // sanity check the sort invariant: reference must be the first element
    if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list");
    return bestHaplotypes;
}
int hap1 = 0;
int hap2 = 0;
//double bestElement = Double.NEGATIVE_INFINITY;
final int maxChosenHaplotypes = Math.min( maxNumHaplotypesInPopulation, sampleKeySet.size() * 2 + 1 );
while( bestHaplotypesIndexList.size() < maxChosenHaplotypes ) {
double maxElement = Double.NEGATIVE_INFINITY;
for( int iii = 0; iii < numHaplotypes; iii++ ) {
for( int jjj = 0; jjj <= iii; jjj++ ) {
if( haplotypeLikelihoodMatrix[iii][jjj] > maxElement ) {
maxElement = haplotypeLikelihoodMatrix[iii][jjj];
hap1 = iii;
hap2 = jjj;
}
}
}
if( maxElement == Double.NEGATIVE_INFINITY ) { break; }
if( DEBUG ) { logger.info("Chose haplotypes " + hap1 + " and " + hap2 + " with diploid likelihood = " + haplotypeLikelihoodMatrix[hap1][hap2]); }
haplotypeLikelihoodMatrix[hap1][hap2] = Double.NEGATIVE_INFINITY;
/**
* Select the best haplotypes for genotyping the samples in stratifiedReadMap
*
* Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely
* haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for
* all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get
* one extra count for each haplotype (so hom-var haplotypes get two counts). After performing this calculation
* the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the
* haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference.
*
* @param haplotypes a list of all haplotypes we're considering
* @param stratifiedReadMap a map from sample -> read likelihoods per haplotype
* @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes
* @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation
*/
public List<Haplotype> selectBestHaplotypesFromEachSample(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final int maxNumHaplotypesInPopulation) {
if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes);
if( !bestHaplotypesIndexList.contains(hap1) ) { bestHaplotypesIndexList.add(hap1); }
if( !bestHaplotypesIndexList.contains(hap2) ) { bestHaplotypesIndexList.add(hap2); }
if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes
// all of the haplotypes that at least one sample called as one of the most likely
final Set<Haplotype> selectedHaplotypes = new HashSet<Haplotype>();
selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected
// our annoying map from allele -> haplotype
final Map<Allele, Haplotype> allele2Haplotype = new HashMap<Allele, Haplotype>();
for ( final Haplotype h : haplotypes ) {
h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes
allele2Haplotype.put(Allele.create(h, h.isReference()), h);
}
if( DEBUG ) { logger.info("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); }
// for each sample, compute the most likely pair of haplotypes
for ( final Map.Entry<String, PerReadAlleleLikelihoodMap> entry : stratifiedReadMap.entrySet() ) {
// get the two most likely haplotypes under a diploid model for this sample
final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles();
final List<Haplotype> bestHaplotypes = new ArrayList<Haplotype>();
for( final int hIndex : bestHaplotypesIndexList ) {
bestHaplotypes.add( haplotypes.get(hIndex) );
if ( mla != null ) { // there was something to evaluate in this sample
// note that there must be at least 2 haplotypes
final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele());
final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele());
// if ( DEBUG ) {
// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey());
// }
// add these two haplotypes to the set of haplotypes that have been selected
selectedHaplotypes.add(best);
selectedHaplotypes.add(second);
// we've already selected all of our haplotypes, and we don't need to prune them down
if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation )
break;
}
}
// take the best N haplotypes forward, in order of the number of samples that choose them
final int nSamples = stratifiedReadMap.size();
final List<Haplotype> bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation);
if ( DEBUG ) {
logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples.");
for ( final Haplotype h : bestHaplotypes ) {
logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype"));
}
}
return bestHaplotypes;
}
public static int findReferenceIndex( final List<Haplotype> haplotypes ) {
/**
* Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found
* @param haplotypes non-null list of haplotypes
* @return the reference haplotype
*/
private static Haplotype findReferenceHaplotype( final List<Haplotype> haplotypes ) {
for( final Haplotype h : haplotypes ) {
if( h.isReference() ) { return haplotypes.indexOf(h); }
if( h.isReference() ) return h;
}
throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
}

View File

@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleComplex1() {
    // md5 updated for per-sample haplotype selection
    HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a2a5ae267cc061b0f9148280c8f1e236");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {

View File

@ -80,12 +80,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
    // md5 updated for per-sample haplotype selection
    HCTest(CEUTRIO_BAM, "", "3d5e59b3a74da999ceb967cef73086b4");
}
@Test
public void testHaplotypeCallerSingleSample() {
    // md5 updated for per-sample haplotype selection
    HCTest(NA12878_BAM, "", "4afef5e9cb99c0292e471d0c95243c1a");
}
@Test(enabled = false) // can't annotate the rsID's yet
@ -112,7 +112,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
    // md5 updated for per-sample haplotype selection
    HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "4581b4c2b55b2290a4b1092d2da5e642");
}
private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
@ -149,7 +149,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerNearbySmallIntervals() {
    // md5 updated for per-sample haplotype selection
    HCTestNearbySmallIntervals(NA12878_BAM, "", "b20aaef138ef21a5031c06434f17f685");
}
// This problem bam came from a user on the forum and it spotted a problem where the ReadClipper
@ -166,7 +166,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestStructuralIndels() {
    final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
    // md5 updated for per-sample haplotype selection
    final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("9bea757f6aa75e43585e26b246fd8897"));
    executeTest("HCTestStructuralIndels: ", spec);
}
@ -188,7 +188,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("8a62597f2c005f373efbe398ab51a2f1"));
Arrays.asList("a360a9e14c4e9c9b7435ca5d9dfadb8d"));
executeTest("HC calling on a ReducedRead BAM", spec);
}
@ -196,7 +196,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void testReducedBamWithReadsNotFullySpanningDeletion() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
Arrays.asList("a913849c7ebdefb23ef9fa5ec05960fd"));
Arrays.asList("8adfa8a27a312760dab50787da595c57"));
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
}
}

View File

@ -53,12 +53,14 @@ import org.testng.annotations.Test;
public class MostLikelyAlleleUnitTest extends BaseTest {
final Allele a = Allele.create("A");
final Allele b = Allele.create("C");
@Test
public void testBasicCreation() {
final double second = -1 - MostLikelyAllele.INFORMATIVE_LIKELIHOOD_THRESHOLD - 1;
MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, second);
MostLikelyAllele mla = new MostLikelyAllele(a, b, -1.0, second);
Assert.assertEquals(mla.getMostLikelyAllele(), a);
Assert.assertEquals(mla.getSecondMostLikelyAllele(), b);
Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), -1.0);
Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), second);
@ -73,7 +75,7 @@ public class MostLikelyAlleleUnitTest extends BaseTest {
@Test
public void testNotDefaultInformative() {
final double second = -1.0 - (MostLikelyAllele.INFORMATIVE_LIKELIHOOD_THRESHOLD - 1e-2);
MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, second);
MostLikelyAllele mla = new MostLikelyAllele(a, b, -1.0, second);
Assert.assertEquals(mla.isInformative(), false);
Assert.assertEquals(mla.isInformative(10), false);
Assert.assertEquals(mla.isInformative(0), true);
@ -84,8 +86,9 @@ public class MostLikelyAlleleUnitTest extends BaseTest {
@Test
public void testCreationNoGoodSecond() {
MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, Double.NEGATIVE_INFINITY);
MostLikelyAllele mla = new MostLikelyAllele(a, null, -1.0, Double.NEGATIVE_INFINITY);
Assert.assertEquals(mla.getMostLikelyAllele(), a);
Assert.assertEquals(mla.getSecondMostLikelyAllele(), null);
Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), -1.0);
Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), Double.NEGATIVE_INFINITY);
@ -99,7 +102,7 @@ public class MostLikelyAlleleUnitTest extends BaseTest {
@Test
public void testCreationNoAllele() {
MostLikelyAllele mla = new MostLikelyAllele(Allele.NO_CALL, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY);
MostLikelyAllele mla = new MostLikelyAllele(Allele.NO_CALL, null, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY);
Assert.assertEquals(mla.getMostLikelyAllele(), Allele.NO_CALL);
Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), Double.NEGATIVE_INFINITY);
Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), Double.NEGATIVE_INFINITY);

View File

@ -47,11 +47,9 @@
package org.broadinstitute.sting.utils.genotyper;
import net.sf.samtools.*;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
@ -298,4 +296,55 @@ public class PerReadAlleleLikelihoodMapUnitTest extends BaseTest {
Assert.assertTrue(map.getStoredElements().containsAll(goodReads), "nBad " + nBad + " nGood " + nGood);
Assert.assertEquals(map.getStoredElements().size(), nGood, "nBad " + nBad + " nGood " + nGood);
}
/**
 * Test cases for getMostLikelyDiploidAlleles: each row is
 * { alleles, per-read likelihood vectors (parallel to alleles), expected best, expected second }.
 */
@DataProvider(name = "MostLikelyAlleleData")
public Object[][] makeMostLikelyAlleleData() {
    List<Object[]> tests = new ArrayList<Object[]>();

    final Allele a = Allele.create("A");
    final Allele c = Allele.create("C");
    final Allele g = Allele.create("G");

    // single allele: it is both best and second best
    tests.add(new Object[]{Arrays.asList(a), Arrays.asList(Arrays.asList(0.0)), a, a});
    // clear winner among two alleles
    tests.add(new Object[]{Arrays.asList(a, c), Arrays.asList(Arrays.asList(0.0, -1.0)), a, a});
    tests.add(new Object[]{Arrays.asList(a, c), Arrays.asList(Arrays.asList(-1.0, 0.0)), c, c});
    // a and c tied, g clearly worse; expected (a, a)
    // NOTE(review): the original listed this row twice verbatim; the duplicate was dropped
    tests.add(new Object[]{Arrays.asList(a, c, g), Arrays.asList(Arrays.asList(0.0, 0.0, -10.0)), a, a});
    // two reads each favoring a different allele: the het pair (c, a) is expected
    tests.add(new Object[]{Arrays.asList(a, c, g),
            Arrays.asList(
                    Arrays.asList(0.0, -10.0, -10.0),
                    Arrays.asList(-100.0, 0.0, -10.0)),
            c, a});
    tests.add(new Object[]{Arrays.asList(a, c, g),
            Arrays.asList(
                    Arrays.asList(0.0, -10.0, -10.0),
                    Arrays.asList(-20.0, 0.0, -100.0)),
            c, a});

    return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "MostLikelyAlleleData")
public void testMostLikelyAllele(final List<Allele> alleles, final List<List<Double>> perReadlikelihoods, final Allele best, final Allele second) {
    // populate a fresh map with one artificial read per likelihood vector
    final PerReadAlleleLikelihoodMap map = new PerReadAlleleLikelihoodMap();

    int readCounter = 0;
    for ( final List<Double> readLikelihoods : perReadlikelihoods ) {
        final GATKSAMRecord artificialRead = ArtificialSAMUtils.createArtificialRead(
                Utils.dupBytes((byte)'A', 10),
                Utils.dupBytes((byte) 30, 10),
                "10M");
        artificialRead.setReadName("readName" + readCounter++);

        // attach this read's likelihood for every allele, in parallel order
        for ( int alleleIndex = 0; alleleIndex < alleles.size(); alleleIndex++ ) {
            map.add(artificialRead, alleles.get(alleleIndex), readLikelihoods.get(alleleIndex));
        }
    }

    // the diploid pair computed from the map must match the expected best/second alleles
    final MostLikelyAllele mla = map.getMostLikelyDiploidAlleles();
    Assert.assertEquals(mla.getMostLikelyAllele(), best);
    Assert.assertEquals(mla.getSecondMostLikelyAllele(), second);
}
}

View File

@ -50,6 +50,7 @@ public final class MostLikelyAllele {
public static final double INFORMATIVE_LIKELIHOOD_THRESHOLD = 0.2;
final Allele mostLikely;
final Allele secondLikely;
final double log10LikelihoodOfMostLikely;
final double log10LikelihoodOfSecondBest;
@ -60,10 +61,11 @@ public final class MostLikelyAllele {
* mostLikely should be a NO_CALL allele.
*
* @param mostLikely the most likely allele
* @param secondMostLikely the most likely allele after mostLikely
* @param log10LikelihoodOfMostLikely the log10 likelihood of the most likely allele
* @param log10LikelihoodOfSecondBest the log10 likelihood of the next most likely allele (should be NEGATIVE_INFINITY if none is available)
*/
public MostLikelyAllele(Allele mostLikely, double log10LikelihoodOfMostLikely, double log10LikelihoodOfSecondBest) {
public MostLikelyAllele(Allele mostLikely, Allele secondMostLikely, double log10LikelihoodOfMostLikely, double log10LikelihoodOfSecondBest) {
if ( mostLikely == null ) throw new IllegalArgumentException("mostLikely allele cannot be null");
if ( log10LikelihoodOfMostLikely != Double.NEGATIVE_INFINITY && ! MathUtils.goodLog10Probability(log10LikelihoodOfMostLikely) )
throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be either -Infinity or a good log10 prob but got " + log10LikelihoodOfMostLikely);
@ -73,6 +75,7 @@ public final class MostLikelyAllele {
throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be <= log10LikelihoodOfSecondBest but got " + log10LikelihoodOfMostLikely + " vs 2nd " + log10LikelihoodOfSecondBest);
this.mostLikely = mostLikely;
this.secondLikely = secondMostLikely;
this.log10LikelihoodOfMostLikely = log10LikelihoodOfMostLikely;
this.log10LikelihoodOfSecondBest = log10LikelihoodOfSecondBest;
}
@ -81,6 +84,10 @@ public final class MostLikelyAllele {
return mostLikely;
}
/** @return the second most likely allele, or null if none was provided at construction */
public Allele getSecondMostLikelyAllele() {
    return secondLikely;
}
/** @return the log10 likelihood of the most likely allele */
public double getLog10LikelihoodOfMostLikely() {
    return log10LikelihoodOfMostLikely;
}

View File

@ -27,10 +27,13 @@ package org.broadinstitute.sting.utils.genotyper;
import com.google.java.contract.Ensures;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.variant.variantcontext.Allele;
import java.io.PrintStream;
@ -41,6 +44,7 @@ import java.util.*;
* For each read, this holds underlying alleles represented by an aligned read, and corresponding relative likelihood.
*/
public class PerReadAlleleLikelihoodMap {
private final static Logger logger = Logger.getLogger(PerReadAlleleLikelihoodMap.class);
protected List<Allele> alleles;
protected Map<GATKSAMRecord, Map<Allele, Double>> likelihoodReadMap;
@ -187,6 +191,57 @@ public class PerReadAlleleLikelihoodMap {
return likelihoodReadMap.get(p.getRead());
}
/**
 * Get the most likely alleles estimated across all reads in this object
 *
 * Takes the most likely two alleles according to their diploid genotype likelihoods. That is, for
 * each allele i and j we compute p(D | i,j) where D is the read likelihoods. We track the maximum
 * i,j likelihood and return an object that contains the alleles i and j as well as the max likelihood.
 *
 * Note that the second most likely diploid genotype is not tracked so the resulting MostLikelyAllele
 * doesn't have a meaningful get best likelihood.
 *
 * @return a MostLikelyAllele object, or null if this map is empty
 */
public MostLikelyAllele getMostLikelyDiploidAlleles() {
    if ( isEmpty() ) return null;

    // indices into alleles of the best diploid pair found so far
    int hap1 = 0;
    int hap2 = 0;
    double maxElement = Double.NEGATIVE_INFINITY;
    // enumerate unordered allele pairs (iii, jjj) with jjj <= iii, including homozygous pairs
    for( int iii = 0; iii < alleles.size(); iii++ ) {
        final Allele iii_allele = alleles.get(iii);
        for( int jjj = 0; jjj <= iii; jjj++ ) {
            final Allele jjj_allele = alleles.get(jjj);
            double haplotypeLikelihood = 0.0;
            for( final Map.Entry<GATKSAMRecord, Map<Allele,Double>> entry : likelihoodReadMap.entrySet() ) {
                // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
                final GATKSAMRecord read = entry.getKey();
                // presumably weights representative (reduced) reads by how many reads they
                // stand for -- TODO confirm against ReadUtils.getMeanRepresentativeReadCount
                final int count = ReadUtils.getMeanRepresentativeReadCount(read);
                final double likelihood_iii = entry.getValue().get(iii_allele);
                final double likelihood_jjj = entry.getValue().get(jjj_allele);
                haplotypeLikelihood += count * (MathUtils.approximateLog10SumLog10(likelihood_iii, likelihood_jjj) + LOG_ONE_HALF);

                // fast exit. If this diploid pair is already worse than the max, just stop and look at the next pair
                // (each read's contribution is <= 0 in log10 space, so the running sum only decreases)
                if ( haplotypeLikelihood < maxElement ) break;
            }

            // keep track of the max element and associated indices
            if ( haplotypeLikelihood > maxElement ) {
                hap1 = iii;
                hap2 = jjj;
                maxElement = haplotypeLikelihood;
            }
        }
    }

    if ( maxElement == Double.NEGATIVE_INFINITY )
        throw new IllegalStateException("max likelihood is " + maxElement + " indicating something has gone wrong");

    // second-best likelihood is not tracked here, so maxElement is used for both slots
    return new MostLikelyAllele(alleles.get(hap1), alleles.get(hap2), maxElement, maxElement);
}
private static final double LOG_ONE_HALF = -Math.log10(2.0);
/**
* Given a map from alleles to likelihoods, find the allele with the largest likelihood.
@ -213,6 +268,7 @@ public class PerReadAlleleLikelihoodMap {
double maxLike = Double.NEGATIVE_INFINITY;
double prevMaxLike = Double.NEGATIVE_INFINITY;
Allele mostLikelyAllele = Allele.NO_CALL;
Allele secondMostLikely = null;
for (final Map.Entry<Allele,Double> el : alleleMap.entrySet()) {
if ( onlyConsiderTheseAlleles != null && ! onlyConsiderTheseAlleles.contains(el.getKey()) )
@ -221,13 +277,15 @@ public class PerReadAlleleLikelihoodMap {
if (el.getValue() > maxLike) {
prevMaxLike = maxLike;
maxLike = el.getValue();
secondMostLikely = mostLikelyAllele;
mostLikelyAllele = el.getKey();
} else if( el.getValue() > prevMaxLike ) {
secondMostLikely = el.getKey();
prevMaxLike = el.getValue();
}
}
return new MostLikelyAllele(mostLikelyAllele, maxLike, prevMaxLike);
return new MostLikelyAllele(mostLikelyAllele, secondMostLikely, maxLike, prevMaxLike);
}
/**