Merge pull request #166 from broadinstitute/md_hc_persample_haplotypes
Select the haplotypes we move forward for genotyping per sample, not poo...
This commit is contained in:
commit
936f4da1f6
|
|
@ -737,7 +737,13 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
// sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM
|
||||
Collections.sort( trimmedHaplotypes, new HaplotypeBaseComparator() );
|
||||
|
||||
if ( DEBUG ) logger.info("Trimming haplotypes reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size());
|
||||
if ( DEBUG ) {
|
||||
logger.info("Trimming haplotypes reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size());
|
||||
for ( final Haplotype remaining: trimmedHaplotypes ) {
|
||||
logger.info(" Remains: " + remaining + " cigar " + remaining.getCigar());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// trim down the reads and add them to the trimmed active region
|
||||
final List<GATKSAMRecord> trimmedReads = new ArrayList<GATKSAMRecord>(originalActiveRegion.getReads().size());
|
||||
|
|
@ -761,11 +767,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
* @return the list of haplotypes to genotype
|
||||
*/
|
||||
protected List<Haplotype> selectBestHaplotypesForGenotyping(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) {
|
||||
// TODO -- skip this calculation if the list of haplotypes is of size 2 (as we'll always use 2 for genotyping)
|
||||
if ( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
return haplotypes;
|
||||
} else {
|
||||
return likelihoodCalculationEngine.selectBestHaplotypesFromPooledLikelihoods(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation);
|
||||
return likelihoodCalculationEngine.selectBestHaplotypesFromEachSample(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -49,12 +49,14 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
|||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.haplotype.HaplotypeScoreComparator;
|
||||
import org.broadinstitute.sting.utils.pairhmm.*;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
|
@ -253,54 +255,127 @@ public class LikelihoodCalculationEngine {
|
|||
return likelihoodMatrix;
|
||||
}
|
||||
|
||||
@Requires({"haplotypes.size() > 0"})
|
||||
@Ensures({"result.size() <= haplotypes.size()"})
|
||||
public List<Haplotype> selectBestHaplotypesFromPooledLikelihoods(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final int maxNumHaplotypesInPopulation) {
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// System to compute the best N haplotypes for genotyping
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
final Set<String> sampleKeySet = stratifiedReadMap.keySet();
|
||||
final List<Integer> bestHaplotypesIndexList = new ArrayList<Integer>();
|
||||
bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype
|
||||
final List<Allele> haplotypesAsAlleles = new ArrayList<Allele>();
|
||||
for( final Haplotype h : haplotypes ) { haplotypesAsAlleles.add(Allele.create(h, true)); }
|
||||
/**
|
||||
* Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele
|
||||
* @param map an annoying map object that moves us between the allele and haplotype representation
|
||||
* @param haplotypeAsAllele the allele version of the haplotype
|
||||
* @return the haplotype version, with its score incremented by 1 if its non-reference
|
||||
*/
|
||||
private Haplotype updateSelectHaplotype(final Map<Allele, Haplotype> map, final Allele haplotypeAsAllele) {
|
||||
final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic
|
||||
if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value
|
||||
return h;
|
||||
}
|
||||
|
||||
final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, stratifiedReadMap, haplotypesAsAlleles, true ); // all samples pooled together
|
||||
/**
|
||||
* Take the best N haplotypes and return them as a list
|
||||
*
|
||||
* Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample
|
||||
* as it's preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing
|
||||
* order of score (so higher score haplotypes are preferred). The N we take is determined by
|
||||
*
|
||||
* N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation)
|
||||
*
|
||||
* where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is
|
||||
* bounded by maxNumHaplotypesInPopulation as that number can grow without bound
|
||||
*
|
||||
* @param selectedHaplotypes a non-null set of haplotypes with scores >= 1
|
||||
* @param nSamples the number of samples used to select the haplotypes
|
||||
* @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples
|
||||
* @return a list of N or fewer haplotypes, with the reference haplotype first
|
||||
*/
|
||||
private List<Haplotype> selectBestHaplotypesAccordingToScore(final Set<Haplotype> selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) {
|
||||
final List<Haplotype> selectedHaplotypesList = new ArrayList<Haplotype>(selectedHaplotypes);
|
||||
Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator());
|
||||
final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1;
|
||||
final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation);
|
||||
final List<Haplotype> bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep);
|
||||
if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list");
|
||||
return bestHaplotypes;
|
||||
}
|
||||
|
||||
int hap1 = 0;
|
||||
int hap2 = 0;
|
||||
//double bestElement = Double.NEGATIVE_INFINITY;
|
||||
final int maxChosenHaplotypes = Math.min( maxNumHaplotypesInPopulation, sampleKeySet.size() * 2 + 1 );
|
||||
while( bestHaplotypesIndexList.size() < maxChosenHaplotypes ) {
|
||||
double maxElement = Double.NEGATIVE_INFINITY;
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
if( haplotypeLikelihoodMatrix[iii][jjj] > maxElement ) {
|
||||
maxElement = haplotypeLikelihoodMatrix[iii][jjj];
|
||||
hap1 = iii;
|
||||
hap2 = jjj;
|
||||
}
|
||||
}
|
||||
}
|
||||
if( maxElement == Double.NEGATIVE_INFINITY ) { break; }
|
||||
if( DEBUG ) { logger.info("Chose haplotypes " + hap1 + " and " + hap2 + " with diploid likelihood = " + haplotypeLikelihoodMatrix[hap1][hap2]); }
|
||||
haplotypeLikelihoodMatrix[hap1][hap2] = Double.NEGATIVE_INFINITY;
|
||||
/**
|
||||
* Select the best haplotypes for genotyping the samples in stratifiedReadMap
|
||||
*
|
||||
* Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely
|
||||
* haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for
|
||||
* all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get
|
||||
* one extra count for each haplotype (so hom-var haplotypes get two counts). After performing this calculation
|
||||
* the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the
|
||||
* haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference.
|
||||
*
|
||||
* @param haplotypes a list of all haplotypes we're considering
|
||||
* @param stratifiedReadMap a map from sample -> read likelihoods per haplotype
|
||||
* @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes
|
||||
* @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation
|
||||
*/
|
||||
public List<Haplotype> selectBestHaplotypesFromEachSample(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final int maxNumHaplotypesInPopulation) {
|
||||
if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes);
|
||||
|
||||
if( !bestHaplotypesIndexList.contains(hap1) ) { bestHaplotypesIndexList.add(hap1); }
|
||||
if( !bestHaplotypesIndexList.contains(hap2) ) { bestHaplotypesIndexList.add(hap2); }
|
||||
if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes
|
||||
|
||||
// all of the haplotypes that at least one sample called as one of the most likely
|
||||
final Set<Haplotype> selectedHaplotypes = new HashSet<Haplotype>();
|
||||
selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected
|
||||
|
||||
// our annoying map from allele -> haplotype
|
||||
final Map<Allele, Haplotype> allele2Haplotype = new HashMap<Allele, Haplotype>();
|
||||
for ( final Haplotype h : haplotypes ) {
|
||||
h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes
|
||||
allele2Haplotype.put(Allele.create(h, h.isReference()), h);
|
||||
}
|
||||
|
||||
if( DEBUG ) { logger.info("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); }
|
||||
// for each sample, compute the most likely pair of haplotypes
|
||||
for ( final Map.Entry<String, PerReadAlleleLikelihoodMap> entry : stratifiedReadMap.entrySet() ) {
|
||||
// get the two most likely haplotypes under a diploid model for this sample
|
||||
final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles();
|
||||
|
||||
final List<Haplotype> bestHaplotypes = new ArrayList<Haplotype>();
|
||||
for( final int hIndex : bestHaplotypesIndexList ) {
|
||||
bestHaplotypes.add( haplotypes.get(hIndex) );
|
||||
if ( mla != null ) { // there was something to evaluate in this sample
|
||||
// note that there must be at least 2 haplotypes
|
||||
final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele());
|
||||
final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele());
|
||||
|
||||
// if ( DEBUG ) {
|
||||
// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey());
|
||||
// }
|
||||
|
||||
// add these two haplotypes to the set of haplotypes that have been selected
|
||||
selectedHaplotypes.add(best);
|
||||
selectedHaplotypes.add(second);
|
||||
|
||||
// we've already selected all of our haplotypes, and we don't need to prune them down
|
||||
if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation )
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// take the best N haplotypes forward, in order of the number of samples that choose them
|
||||
final int nSamples = stratifiedReadMap.size();
|
||||
final List<Haplotype> bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation);
|
||||
|
||||
if ( DEBUG ) {
|
||||
logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples.");
|
||||
for ( final Haplotype h : bestHaplotypes ) {
|
||||
logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype"));
|
||||
}
|
||||
}
|
||||
return bestHaplotypes;
|
||||
}
|
||||
|
||||
public static int findReferenceIndex( final List<Haplotype> haplotypes ) {
|
||||
/**
|
||||
* Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found
|
||||
* @param haplotypes non-null list of haplotypes
|
||||
* @return the reference haplotype
|
||||
*/
|
||||
private static Haplotype findReferenceHaplotype( final List<Haplotype> haplotypes ) {
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
if( h.isReference() ) { return haplotypes.indexOf(h); }
|
||||
if( h.isReference() ) return h;
|
||||
}
|
||||
throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
|
||||
}
|
||||
|
|
|
|||
|
|
@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleComplex1() {
|
||||
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "d9c176fe6de26bb8b289d55a840d7b8b");
|
||||
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a2a5ae267cc061b0f9148280c8f1e236");
|
||||
}
|
||||
|
||||
private void HCTestSymbolicVariants(String bam, String args, String md5) {
|
||||
|
|
|
|||
|
|
@ -80,12 +80,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSample() {
|
||||
HCTest(CEUTRIO_BAM, "", "c8598545d1c76b470a7784e6b5c2ad4a");
|
||||
HCTest(CEUTRIO_BAM, "", "3d5e59b3a74da999ceb967cef73086b4");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSample() {
|
||||
HCTest(NA12878_BAM, "", "0b2ca4482e92b9606be904cc25ba0988");
|
||||
HCTest(NA12878_BAM, "", "4afef5e9cb99c0292e471d0c95243c1a");
|
||||
}
|
||||
|
||||
@Test(enabled = false) // can't annotate the rsID's yet
|
||||
|
|
@ -112,7 +112,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "1eab0eb7a184d981b021a249c3bd0401");
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "4581b4c2b55b2290a4b1092d2da5e642");
|
||||
}
|
||||
|
||||
private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
|
||||
|
|
@ -149,7 +149,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerNearbySmallIntervals() {
|
||||
HCTestNearbySmallIntervals(NA12878_BAM, "", "6ab938dede6838c983f84225d4103852");
|
||||
HCTestNearbySmallIntervals(NA12878_BAM, "", "b20aaef138ef21a5031c06434f17f685");
|
||||
}
|
||||
|
||||
// This problem bam came from a user on the forum and it spotted a problem where the ReadClipper
|
||||
|
|
@ -166,7 +166,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void HCTestStructuralIndels() {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("e8466846ca420bcbcd52b97f7a661aa3"));
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("9bea757f6aa75e43585e26b246fd8897"));
|
||||
executeTest("HCTestStructuralIndels: ", spec);
|
||||
}
|
||||
|
||||
|
|
@ -188,7 +188,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void HCTestReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("8a62597f2c005f373efbe398ab51a2f1"));
|
||||
Arrays.asList("a360a9e14c4e9c9b7435ca5d9dfadb8d"));
|
||||
executeTest("HC calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
|
||||
|
|
@ -196,7 +196,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void testReducedBamWithReadsNotFullySpanningDeletion() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
|
||||
Arrays.asList("a913849c7ebdefb23ef9fa5ec05960fd"));
|
||||
Arrays.asList("8adfa8a27a312760dab50787da595c57"));
|
||||
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -53,12 +53,14 @@ import org.testng.annotations.Test;
|
|||
|
||||
public class MostLikelyAlleleUnitTest extends BaseTest {
|
||||
final Allele a = Allele.create("A");
|
||||
final Allele b = Allele.create("C");
|
||||
|
||||
@Test
|
||||
public void testBasicCreation() {
|
||||
final double second = -1 - MostLikelyAllele.INFORMATIVE_LIKELIHOOD_THRESHOLD - 1;
|
||||
MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, second);
|
||||
MostLikelyAllele mla = new MostLikelyAllele(a, b, -1.0, second);
|
||||
Assert.assertEquals(mla.getMostLikelyAllele(), a);
|
||||
Assert.assertEquals(mla.getSecondMostLikelyAllele(), b);
|
||||
Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), -1.0);
|
||||
Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), second);
|
||||
|
||||
|
|
@ -73,7 +75,7 @@ public class MostLikelyAlleleUnitTest extends BaseTest {
|
|||
@Test
|
||||
public void testNotDefaultInformative() {
|
||||
final double second = -1.0 - (MostLikelyAllele.INFORMATIVE_LIKELIHOOD_THRESHOLD - 1e-2);
|
||||
MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, second);
|
||||
MostLikelyAllele mla = new MostLikelyAllele(a, b, -1.0, second);
|
||||
Assert.assertEquals(mla.isInformative(), false);
|
||||
Assert.assertEquals(mla.isInformative(10), false);
|
||||
Assert.assertEquals(mla.isInformative(0), true);
|
||||
|
|
@ -84,8 +86,9 @@ public class MostLikelyAlleleUnitTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void testCreationNoGoodSecond() {
|
||||
MostLikelyAllele mla = new MostLikelyAllele(a, -1.0, Double.NEGATIVE_INFINITY);
|
||||
MostLikelyAllele mla = new MostLikelyAllele(a, null, -1.0, Double.NEGATIVE_INFINITY);
|
||||
Assert.assertEquals(mla.getMostLikelyAllele(), a);
|
||||
Assert.assertEquals(mla.getSecondMostLikelyAllele(), null);
|
||||
Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), -1.0);
|
||||
Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), Double.NEGATIVE_INFINITY);
|
||||
|
||||
|
|
@ -99,7 +102,7 @@ public class MostLikelyAlleleUnitTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void testCreationNoAllele() {
|
||||
MostLikelyAllele mla = new MostLikelyAllele(Allele.NO_CALL, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY);
|
||||
MostLikelyAllele mla = new MostLikelyAllele(Allele.NO_CALL, null, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY);
|
||||
Assert.assertEquals(mla.getMostLikelyAllele(), Allele.NO_CALL);
|
||||
Assert.assertEquals(mla.getLog10LikelihoodOfMostLikely(), Double.NEGATIVE_INFINITY);
|
||||
Assert.assertEquals(mla.getLog10LikelihoodOfSecondBest(), Double.NEGATIVE_INFINITY);
|
||||
|
|
|
|||
|
|
@ -47,11 +47,9 @@
|
|||
package org.broadinstitute.sting.utils.genotyper;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
|
|
@ -298,4 +296,55 @@ public class PerReadAlleleLikelihoodMapUnitTest extends BaseTest {
|
|||
Assert.assertTrue(map.getStoredElements().containsAll(goodReads), "nBad " + nBad + " nGood " + nGood);
|
||||
Assert.assertEquals(map.getStoredElements().size(), nGood, "nBad " + nBad + " nGood " + nGood);
|
||||
}
|
||||
|
||||
@DataProvider(name = "MostLikelyAlleleData")
|
||||
public Object[][] makeMostLikelyAlleleData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final Allele a = Allele.create("A");
|
||||
final Allele c = Allele.create("C");
|
||||
final Allele g = Allele.create("G");
|
||||
|
||||
tests.add(new Object[]{Arrays.asList(a), Arrays.asList(Arrays.asList(0.0)), a, a});
|
||||
tests.add(new Object[]{Arrays.asList(a, c), Arrays.asList(Arrays.asList(0.0, -1.0)), a, a});
|
||||
tests.add(new Object[]{Arrays.asList(a, c), Arrays.asList(Arrays.asList(-1.0, 0.0)), c, c});
|
||||
tests.add(new Object[]{Arrays.asList(a, c, g), Arrays.asList(Arrays.asList(0.0, 0.0, -10.0)), a, a});
|
||||
tests.add(new Object[]{Arrays.asList(a, c, g), Arrays.asList(Arrays.asList(0.0, 0.0, -10.0)), a, a});
|
||||
tests.add(new Object[]{Arrays.asList(a, c, g),
|
||||
Arrays.asList(
|
||||
Arrays.asList(0.0, -10.0, -10.0),
|
||||
Arrays.asList(-100.0, 0.0, -10.0)),
|
||||
c, a});
|
||||
tests.add(new Object[]{Arrays.asList(a, c, g),
|
||||
Arrays.asList(
|
||||
Arrays.asList(0.0, -10.0, -10.0),
|
||||
Arrays.asList(-20.0, 0.0, -100.0)),
|
||||
c, a});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "MostLikelyAlleleData")
|
||||
public void testMostLikelyAllele(final List<Allele> alleles, final List<List<Double>> perReadlikelihoods, final Allele best, final Allele second) {
|
||||
final PerReadAlleleLikelihoodMap map = new PerReadAlleleLikelihoodMap();
|
||||
|
||||
for ( int readI = 0; readI < perReadlikelihoods.size(); readI++ ) {
|
||||
final List<Double> likelihoods = perReadlikelihoods.get(readI);
|
||||
|
||||
final byte[] bases = Utils.dupBytes((byte)'A', 10);
|
||||
final byte[] quals = Utils.dupBytes((byte) 30, 10);
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, "10M");
|
||||
read.setReadName("readName" + readI);
|
||||
|
||||
for ( int i = 0; i < alleles.size(); i++ ) {
|
||||
final Allele allele = alleles.get(i);
|
||||
final double likelihood = likelihoods.get(i);
|
||||
map.add(read, allele, likelihood);
|
||||
}
|
||||
}
|
||||
|
||||
final MostLikelyAllele mla = map.getMostLikelyDiploidAlleles();
|
||||
Assert.assertEquals(mla.getMostLikelyAllele(), best);
|
||||
Assert.assertEquals(mla.getSecondMostLikelyAllele(), second);
|
||||
}
|
||||
}
|
||||
|
|
@ -50,6 +50,7 @@ public final class MostLikelyAllele {
|
|||
public static final double INFORMATIVE_LIKELIHOOD_THRESHOLD = 0.2;
|
||||
|
||||
final Allele mostLikely;
|
||||
final Allele secondLikely;
|
||||
final double log10LikelihoodOfMostLikely;
|
||||
final double log10LikelihoodOfSecondBest;
|
||||
|
||||
|
|
@ -60,10 +61,11 @@ public final class MostLikelyAllele {
|
|||
* mostLikely should be a NO_CALL allele.
|
||||
*
|
||||
* @param mostLikely the most likely allele
|
||||
* @param secondMostLikely the most likely allele after mostLikely
|
||||
* @param log10LikelihoodOfMostLikely the log10 likelihood of the most likely allele
|
||||
* @param log10LikelihoodOfSecondBest the log10 likelihood of the next most likely allele (should be NEGATIVE_INFINITY if none is available)
|
||||
*/
|
||||
public MostLikelyAllele(Allele mostLikely, double log10LikelihoodOfMostLikely, double log10LikelihoodOfSecondBest) {
|
||||
public MostLikelyAllele(Allele mostLikely, Allele secondMostLikely, double log10LikelihoodOfMostLikely, double log10LikelihoodOfSecondBest) {
|
||||
if ( mostLikely == null ) throw new IllegalArgumentException("mostLikely allele cannot be null");
|
||||
if ( log10LikelihoodOfMostLikely != Double.NEGATIVE_INFINITY && ! MathUtils.goodLog10Probability(log10LikelihoodOfMostLikely) )
|
||||
throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be either -Infinity or a good log10 prob but got " + log10LikelihoodOfMostLikely);
|
||||
|
|
@ -73,6 +75,7 @@ public final class MostLikelyAllele {
|
|||
throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be <= log10LikelihoodOfSecondBest but got " + log10LikelihoodOfMostLikely + " vs 2nd " + log10LikelihoodOfSecondBest);
|
||||
|
||||
this.mostLikely = mostLikely;
|
||||
this.secondLikely = secondMostLikely;
|
||||
this.log10LikelihoodOfMostLikely = log10LikelihoodOfMostLikely;
|
||||
this.log10LikelihoodOfSecondBest = log10LikelihoodOfSecondBest;
|
||||
}
|
||||
|
|
@ -81,6 +84,10 @@ public final class MostLikelyAllele {
|
|||
return mostLikely;
|
||||
}
|
||||
|
||||
public Allele getSecondMostLikelyAllele() {
|
||||
return secondLikely;
|
||||
}
|
||||
|
||||
public double getLog10LikelihoodOfMostLikely() {
|
||||
return log10LikelihoodOfMostLikely;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -27,10 +27,13 @@ package org.broadinstitute.sting.utils.genotyper;
|
|||
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
|
||||
import java.io.PrintStream;
|
||||
|
|
@ -41,6 +44,7 @@ import java.util.*;
|
|||
* For each read, this holds underlying alleles represented by an aligned read, and corresponding relative likelihood.
|
||||
*/
|
||||
public class PerReadAlleleLikelihoodMap {
|
||||
private final static Logger logger = Logger.getLogger(PerReadAlleleLikelihoodMap.class);
|
||||
protected List<Allele> alleles;
|
||||
protected Map<GATKSAMRecord, Map<Allele, Double>> likelihoodReadMap;
|
||||
|
||||
|
|
@ -187,6 +191,57 @@ public class PerReadAlleleLikelihoodMap {
|
|||
return likelihoodReadMap.get(p.getRead());
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the most likely alleles estimated across all reads in this object
|
||||
*
|
||||
* Takes the most likely two alleles according to their diploid genotype likelihoods. That is, for
|
||||
* each allele i and j we compute p(D | i,j) where D is the read likelihoods. We track the maximum
|
||||
* i,j likelihood and return an object that contains the alleles i and j as well as the max likelihood.
|
||||
*
|
||||
* Note that the second most likely diploid genotype is not tracked so the resulting MostLikelyAllele
|
||||
* doesn't have a meaningful get best likelihood.
|
||||
*
|
||||
* @return a MostLikelyAllele object, or null if this map is empty
|
||||
*/
|
||||
public MostLikelyAllele getMostLikelyDiploidAlleles() {
|
||||
if ( isEmpty() ) return null;
|
||||
|
||||
int hap1 = 0;
|
||||
int hap2 = 0;
|
||||
double maxElement = Double.NEGATIVE_INFINITY;
|
||||
for( int iii = 0; iii < alleles.size(); iii++ ) {
|
||||
final Allele iii_allele = alleles.get(iii);
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
final Allele jjj_allele = alleles.get(jjj);
|
||||
|
||||
double haplotypeLikelihood = 0.0;
|
||||
for( final Map.Entry<GATKSAMRecord, Map<Allele,Double>> entry : likelihoodReadMap.entrySet() ) {
|
||||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||
final GATKSAMRecord read = entry.getKey();
|
||||
final int count = ReadUtils.getMeanRepresentativeReadCount(read);
|
||||
final double likelihood_iii = entry.getValue().get(iii_allele);
|
||||
final double likelihood_jjj = entry.getValue().get(jjj_allele);
|
||||
haplotypeLikelihood += count * (MathUtils.approximateLog10SumLog10(likelihood_iii, likelihood_jjj) + LOG_ONE_HALF);
|
||||
|
||||
// fast exit. If this diploid pair is already worse than the max, just stop and look at the next pair
|
||||
if ( haplotypeLikelihood < maxElement ) break;
|
||||
}
|
||||
|
||||
// keep track of the max element and associated indices
|
||||
if ( haplotypeLikelihood > maxElement ) {
|
||||
hap1 = iii;
|
||||
hap2 = jjj;
|
||||
maxElement = haplotypeLikelihood;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( maxElement == Double.NEGATIVE_INFINITY )
|
||||
throw new IllegalStateException("max likelihood is " + maxElement + " indicating something has gone wrong");
|
||||
|
||||
return new MostLikelyAllele(alleles.get(hap1), alleles.get(hap2), maxElement, maxElement);
|
||||
}
|
||||
private static final double LOG_ONE_HALF = -Math.log10(2.0);
|
||||
|
||||
/**
|
||||
* Given a map from alleles to likelihoods, find the allele with the largest likelihood.
|
||||
|
|
@ -213,6 +268,7 @@ public class PerReadAlleleLikelihoodMap {
|
|||
double maxLike = Double.NEGATIVE_INFINITY;
|
||||
double prevMaxLike = Double.NEGATIVE_INFINITY;
|
||||
Allele mostLikelyAllele = Allele.NO_CALL;
|
||||
Allele secondMostLikely = null;
|
||||
|
||||
for (final Map.Entry<Allele,Double> el : alleleMap.entrySet()) {
|
||||
if ( onlyConsiderTheseAlleles != null && ! onlyConsiderTheseAlleles.contains(el.getKey()) )
|
||||
|
|
@ -221,13 +277,15 @@ public class PerReadAlleleLikelihoodMap {
|
|||
if (el.getValue() > maxLike) {
|
||||
prevMaxLike = maxLike;
|
||||
maxLike = el.getValue();
|
||||
secondMostLikely = mostLikelyAllele;
|
||||
mostLikelyAllele = el.getKey();
|
||||
} else if( el.getValue() > prevMaxLike ) {
|
||||
secondMostLikely = el.getKey();
|
||||
prevMaxLike = el.getValue();
|
||||
}
|
||||
}
|
||||
|
||||
return new MostLikelyAllele(mostLikelyAllele, maxLike, prevMaxLike);
|
||||
return new MostLikelyAllele(mostLikelyAllele, secondMostLikely, maxLike, prevMaxLike);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
Loading…
Reference in New Issue