PairHMM rework

The current implementation of the PairHMM had issues with the probabilities and the state machines. Probabilities were not adding up to one because:
   # Initial conditions were not being set properly
   # Emission probabilities in the last row were not adding up to 1

The following commit fixes both by
   # averaging all potential start locations (giving an equal prior to the state machine in its first iteration -- allowing the read to start its alignment anywhere in the haplotype with equal probability)
   # discounting all paths that end in deletions by not adding the last row of the deletion matrix and summing over all paths ending in matches and insertions (this saves us from a fourth matrix to represent the end state)

Summarized changes:
   * Fix LoglessCachingPairHMM and Log10PairHMM according to the new algorithm
   * Refactor probabilities check to throw exception if we ever encounter probabilities greater than 1.
   * Rename LoglessCachingPairHMM to LoglessPairHMM (this is the default implementation in the HC now)
   * Rename matrices to matchMatrix, insertionMatrix and deletionMatrix for clarity
   * Rename metric lengths to read and haplotype lengths for clarity
   * Rename private methods to initializePriors (distance) and initializeProbabilities (constants) for clarity
   * Eliminate first row constants (because they're not used anyway!) and directly assign initial conditions in the deletionMatrix
   * Remove unnecessary parameters from updateCell()
   * Fix the expected probabilities coming from the exact model in PairHMMUnitTest
   * Neatify PairHMM class (removed unused methods) and PairHMMUnitTest (removed unused variables)
   * Update MD5s: Probabilities have changed according to the new PairHMM model and as expected HC and UG integration tests have new MD5s.

[fix 47164949]
This commit is contained in:
Mauricio Carneiro 2013-03-26 15:42:41 -04:00
parent f42bdaee3c
commit 0de6f55660
13 changed files with 130 additions and 172 deletions

View File

@ -80,7 +80,7 @@ public class LikelihoodCalculationEngine {
pairHMM = new Log10PairHMM(false);
break;
case LOGLESS_CACHING:
pairHMM = new LoglessCachingPairHMM();
pairHMM = new LoglessPairHMM();
break;
default:
throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING.");

View File

@ -55,25 +55,14 @@ import org.broadinstitute.sting.utils.QualityUtils;
* User: rpoplin, carneiro
* Date: 10/16/12
*/
public class LoglessCachingPairHMM extends PairHMM {
public class LoglessPairHMM extends PairHMM {
protected static final double SCALE_FACTOR_LOG10 = 300.0;
protected static final double INITIAL_CONDITION = Math.pow(10, SCALE_FACTOR_LOG10);
double[][] constantMatrix = null; // The cache
double[][] distanceMatrix = null; // The cache
double[][] transition = null; // The cache
double[][] prior = null; // The cache
boolean constantsAreInitialized = false;
/**
* Cached data structure that describes the first row's edge condition in the HMM
*/
protected static final double [] firstRowConstantMatrix = {
QualityUtils.qualToProb((byte) (DEFAULT_GOP + DEFAULT_GOP)),
QualityUtils.qualToProb(DEFAULT_GCP),
QualityUtils.qualToErrorProb(DEFAULT_GOP),
QualityUtils.qualToErrorProb(DEFAULT_GCP),
1.0,
1.0
};
/**
* {@inheritDoc}
*/
@ -81,8 +70,8 @@ public class LoglessCachingPairHMM extends PairHMM {
public void initialize( final int haplotypeMaxLength, final int readMaxLength) {
super.initialize(haplotypeMaxLength, readMaxLength);
constantMatrix = new double[X_METRIC_MAX_LENGTH][6];
distanceMatrix = new double[X_METRIC_MAX_LENGTH][Y_METRIC_MAX_LENGTH];
transition = new double[paddedMaxReadLength][6];
prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
}
/**
@ -98,8 +87,8 @@ public class LoglessCachingPairHMM extends PairHMM {
final int hapStartIndex,
final boolean recacheReadValues ) {
if ( ! constantsAreInitialized || recacheReadValues )
initializeConstants( haplotypeBases.length, readBases.length, insertionGOP, deletionGOP, overallGCP );
initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex );
initializeProbabilities(haplotypeBases.length, insertionGOP, deletionGOP, overallGCP);
initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex);
// NOTE NOTE NOTE -- because of caching we need to only operate over X and Y according to this
// read and haplotype lengths, not the max lengths
@ -109,14 +98,19 @@ public class LoglessCachingPairHMM extends PairHMM {
for (int i = 2; i < readXMetricLength; i++) {
// +1 here is because hapStartIndex is 0-based, but our matrices are 1 based
for (int j = hapStartIndex+1; j < hapYMetricLength; j++) {
updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray);
updateCell(i, j, prior[i][j], transition[i]);
}
}
// final probability is the log10 sum of the last element in all three state arrays
// final probability is the log10 sum of the last element in the Match and Insertion state arrays
// this way we ignore all paths that ended in deletions! (huge)
// but we have to sum all the paths ending in the M and I matrices, because they're no longer extended.
final int endI = readXMetricLength - 1;
final int endJ = hapYMetricLength - 1;
return Math.log10( matchMetricArray[endI][endJ] + XMetricArray[endI][endJ] + YMetricArray[endI][endJ] ) - SCALE_FACTOR_LOG10;
double finalSumProbabilities = 0.0;
for (int j = 0; j < hapYMetricLength; j++) {
finalSumProbabilities += matchMatrix[endI][j] + insertionMatrix[endI][j];
}
return Math.log10(finalSumProbabilities) - SCALE_FACTOR_LOG10;
}
/**
@ -128,10 +122,7 @@ public class LoglessCachingPairHMM extends PairHMM {
* @param readQuals the base quality scores of the read
* @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read)
*/
public void initializeDistanceMatrix( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
final int startIndex ) {
public void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) {
// initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases
// Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2.
@ -141,7 +132,7 @@ public class LoglessCachingPairHMM extends PairHMM {
final byte qual = readQuals[i];
for (int j = startIndex; j < haplotypeBases.length; j++) {
final byte y = haplotypeBases[j];
distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
prior[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) );
}
}
@ -150,46 +141,36 @@ public class LoglessCachingPairHMM extends PairHMM {
/**
* Initializes the matrix that holds all the constants related to quality scores.
*
* @param haplotypeSize the number of bases in the haplotype we are testing
* @param readSize the number of bases in the read we are testing
* @param insertionGOP insertion quality scores of the read
* @param deletionGOP deletion quality scores of the read
* @param overallGCP overall gap continuation penalty
*/
@Requires({
"haplotypeSize > 0",
"readSize > 0",
"insertionGOP != null && insertionGOP.length == readSize",
"deletionGOP != null && deletionGOP.length == readSize",
"overallGCP != null && overallGCP.length == readSize"
"insertionGOP != null",
"deletionGOP != null",
"overallGCP != null"
})
@Ensures("constantsAreInitialized")
private void initializeConstants( final int haplotypeSize,
final int readSize,
final byte[] insertionGOP,
final byte[] deletionGOP,
final byte[] overallGCP ) {
private void initializeProbabilities(final int haplotypeLength, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) {
// the initial condition -- must be here because it needs that actual read and haplotypes, not the maximum in init
matchMetricArray[1][1] = Math.pow(10.0, SCALE_FACTOR_LOG10) / getNPotentialXStarts(haplotypeSize, readSize);
final double initialValue = INITIAL_CONDITION / haplotypeLength;
matchMatrix[1][1] = initialValue;
// fill in the first row
for( int jjj = 2; jjj < Y_METRIC_MAX_LENGTH; jjj++ ) {
updateCell(1, jjj, 1.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray);
for( int jjj = 2; jjj < paddedMaxHaplotypeLength; jjj++ ) {
deletionMatrix[1][jjj] = initialValue;
}
final int l = insertionGOP.length;
constantMatrix[1] = firstRowConstantMatrix;
for (int i = 0; i < l; i++) {
final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE);
constantMatrix[i+2][0] = QualityUtils.qualToProb((byte) qualIndexGOP);
constantMatrix[i+2][1] = QualityUtils.qualToProb(overallGCP[i]);
constantMatrix[i+2][2] = QualityUtils.qualToErrorProb(insertionGOP[i]);
constantMatrix[i+2][3] = QualityUtils.qualToErrorProb(overallGCP[i]);
constantMatrix[i+2][4] = QualityUtils.qualToErrorProb(deletionGOP[i]);
constantMatrix[i+2][5] = QualityUtils.qualToErrorProb(overallGCP[i]);
transition[i+2][0] = QualityUtils.qualToProb((byte) qualIndexGOP);
transition[i+2][1] = QualityUtils.qualToProb(overallGCP[i]);
transition[i+2][2] = QualityUtils.qualToErrorProb(insertionGOP[i]);
transition[i+2][3] = QualityUtils.qualToErrorProb(overallGCP[i]);
transition[i+2][4] = QualityUtils.qualToErrorProb(deletionGOP[i]);
transition[i+2][5] = QualityUtils.qualToErrorProb(overallGCP[i]);
}
constantMatrix[l+1][4] = 1.0;
constantMatrix[l+1][5] = 1.0;
// note that we initialized the constants
constantsAreInitialized = true;
@ -204,18 +185,14 @@ public class LoglessCachingPairHMM extends PairHMM {
* @param indI row index in the matrices to update
* @param indJ column index in the matrices to update
* @param prior the likelihood editing distance matrix for the read x haplotype
* @param constants an array with the six constants relevant to this location
* @param matchMetricArray the matches likelihood matrix
* @param XMetricArray the insertions likelihood matrix
* @param YMetricArray the deletions likelihood matrix
* @param transitition an array with the six transitition relevant to this location
*/
private void updateCell( final int indI, final int indJ, final double prior, final double[] constants,
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
private void updateCell( final int indI, final int indJ, final double prior, final double[] transitition) {
matchMetricArray[indI][indJ] = prior * ( matchMetricArray[indI - 1][indJ - 1] * constants[0] +
XMetricArray[indI - 1][indJ - 1] * constants[1] +
YMetricArray[indI - 1][indJ - 1] * constants[1] );
XMetricArray[indI][indJ] = matchMetricArray[indI - 1][indJ] * constants[2] + XMetricArray[indI - 1][indJ] * constants[3];
YMetricArray[indI][indJ] = matchMetricArray[indI][indJ - 1] * constants[4] + YMetricArray[indI][indJ - 1] * constants[5];
matchMatrix[indI][indJ] = prior * ( matchMatrix[indI - 1][indJ - 1] * transitition[0] +
insertionMatrix[indI - 1][indJ - 1] * transitition[1] +
deletionMatrix[indI - 1][indJ - 1] * transitition[1] );
insertionMatrix[indI][indJ] = matchMatrix[indI - 1][indJ] * transitition[2] + insertionMatrix[indI - 1][indJ] * transitition[3];
deletionMatrix[indI][indJ] = matchMatrix[indI][indJ - 1] * transitition[4] + deletionMatrix[indI][indJ - 1] * transitition[5];
}
}

View File

@ -49,7 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.*;
import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.LSV_ALLELES;
/**
* Created by IntelliJ IDEA.
@ -79,6 +79,6 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe
@Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() {
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "5812da66811887d834d0379a33e655c0");
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "faadc0b77a91a716dbb1191fd579d025");
}
}

View File

@ -49,7 +49,8 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.*;
import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.CEUTRIO_BAM;
import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.NA12891_CALLS;
public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTest {
@ -57,7 +58,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe
@Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","3a321896c4b8b6457973c76c486da4d4");
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","fe715b715526a7c1ebd575ff66bba716");
}
@Test(enabled = true)

View File

@ -72,7 +72,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("1cb469b9cc8e6c70430021540bf1af8b"));
Arrays.asList("51e022d07ead45a4e154f949b6642e84"));
executeTest(String.format("test indel caller in SLX"), spec);
}
@ -87,7 +87,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -minIndelCnt 1" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("c7e59f9ab718df4c604626a0f51af606"));
Arrays.asList("1d9c6fda344eeee76cbe4221251dc341"));
executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
}
@ -100,7 +100,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("b6ad80cef63cab4f75fa4b1fb2517d1d"));
Arrays.asList("2ec7262f0a3d04534ce1fe15cc79f52e"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
@ -110,7 +110,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("86880ec78755ae91cb5bb34a0631a32c"));
Arrays.asList("3131cd7c49b623983a106db5228754b3"));
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
}
@ -120,7 +120,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("2584d5e3ade1b548f1fe9cdcafbe1b28"));
Arrays.asList("273f5daa936e93da98efd6ceb37d7533"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
}
@ -135,7 +135,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
Arrays.asList("939da0bb73b706badd8a0def7446b384"));
Arrays.asList("00a003a0908281384e981294434a9f3e"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}
@ -175,7 +175,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
public void testMinIndelFraction0() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.0", 1,
Arrays.asList("556c214366e82e4682e753ce93307a4e"));
Arrays.asList("87521a1bde124c7c5908ed067060fe45"));
executeTest("test minIndelFraction 0.0", spec);
}
@ -183,7 +183,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
public void testMinIndelFraction25() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.25", 1,
Arrays.asList("1df02b805d9dfbd532fa3632875a989d"));
Arrays.asList("8a880b8b1662e31e0b5c65733eac6b74"));
executeTest("test minIndelFraction 0.25", spec);
}

View File

@ -64,7 +64,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("2f15ef1ead56d875a3f1d53772f52b3a"));
Arrays.asList("3f8ee598c9b85aa1d2b85746ad46c1af"));
executeTest("test MultiSample Pilot1", spec);
}
@ -96,7 +96,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
Arrays.asList("39ec0b48cd51d797af7ed09cb9ba607e"));
Arrays.asList("31c0f0074b3306b54170056e93b69e11"));
executeTest("test Multiple SNP alleles", spec);
}
@ -112,7 +112,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testReverseTrim() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
Arrays.asList("eb9604b77a7d6baab60c81ac3db5e47b"));
Arrays.asList("753d6358b1634107de76900200116805"));
executeTest("test reverse trim", spec);
}
@ -120,7 +120,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMismatchedPLs() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
Arrays.asList("6b77b8f1002ec577bf0482fbe03222a4"));
Arrays.asList("274eadae8a630a3fda9281d6d6253dea"));
executeTest("test mismatched PLs", spec);
}
}

View File

@ -74,7 +74,7 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest {
@Test
public void testReducedBamINDELs() {
testReducedCalling("INDEL", "9a702e7a85465f6c42d6c1828aee6c38");
testReducedCalling("INDEL", "c5939a7f5f85ea2fe994ce912732e180");
}

View File

@ -49,10 +49,11 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCallerIntegrationTest.*;
import java.util.Arrays;
import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCallerIntegrationTest.NA12878_CHR20_BAM;
import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCallerIntegrationTest.REF;
public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest {
private void HCTestComplexVariants(String bam, String args, String md5) {
@ -63,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleComplex() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "f9fa4d3c88fd9c0f23c7a3ddd3d24a8c");
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a898b551f78c71befee4d12070d3a788");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
@ -87,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleGGAComplex() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
"e8ffbfae3c1af5be02631a31f386a431");
"8a110549543412fa682419e9a8f0dd1d");
}
@Test
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
"c3a98b19efa7cb36fe5f5f2ab893ef56");
"5429c234d471434adc09d9e60b87de24");
}
}

View File

@ -69,12 +69,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "45856ad67bfe8d8bea45808d8258bcf1");
HCTest(CEUTRIO_BAM, "", "008958c211a8a439a7213a96f3dd7f6c");
}
@Test
public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "b6c93325f851ac358ea49260fb11b75c");
HCTest(NA12878_BAM, "", "3b60c6133eeadfea028dffea93b88478");
}
@Test(enabled = false) // can't annotate the rsID's yet
@ -85,7 +85,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
"4ca6b560d0569cdca400d3e50915e211");
"70bd5d0805bf6f51e5f61b377526c979");
}
private void HCTestIndelQualityScores(String bam, String args, String md5) {
@ -96,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "5d06ec5502d3f157964bd7b275d6a0cb");
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "4141b4c24a136a3fe4c0b0a4c231cdfa");
}
@Test
@ -111,14 +111,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("53a50dae68f0175ca3088dea1d3bb881"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("35a8edeca7518835d67a10de21493eca"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}
@Test
public void HCTestStructuralIndels() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("d3bc6adde8cd9514ae5c49cd366d5de4"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c81d7e69dd4116890f06a71b19870300"));
executeTest("HCTestStructuralIndels: ", spec);
}
@ -140,7 +140,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("4adb833ed8af20224b76bba61e2b0d93"));
Arrays.asList("f0a215faed194dc160f19e26293e85f8"));
executeTest("HC calling on a ReducedRead BAM", spec);
}
@ -148,7 +148,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void testReducedBamWithReadsNotFullySpanningDeletion() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
Arrays.asList("1704b0901c86f8f597d931222d5c8dd8"));
Arrays.asList("bea274584344fa6b4b0f98eee327bad8"));
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
}
}

View File

@ -67,7 +67,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest {
for ( final int nct : Arrays.asList(1, 2) ) {
// tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct });
//// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct });
tests.add(new Object[]{ "BOTH", "85fc5d6dfeb60ed89763470f4b4c981e", nt, nct });
tests.add(new Object[]{ "BOTH", "9a1202d849653f0480932f450ec507b4", nt, nct });
}
return tests.toArray(new Object[][]{});

View File

@ -70,7 +70,7 @@ public class PairHMMUnitTest extends BaseTest {
final static boolean EXTENSIVE_TESTING = true;
final PairHMM exactHMM = new Log10PairHMM(true); // the log truth implementation
final PairHMM originalHMM = new Log10PairHMM(false); // the reference implementation
final PairHMM loglessHMM = new LoglessCachingPairHMM();
final PairHMM loglessHMM = new LoglessPairHMM();
private List<PairHMM> getHMMs() {
return Arrays.asList(exactHMM, originalHMM, loglessHMM);
@ -116,13 +116,12 @@ public class PairHMMUnitTest extends BaseTest {
return String.format("ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual);
}
public double expectedLogL(final PairHMM hmm) {
return (expectedQual / -10.0) + 0.03 +
hmm.getNPotentialXStartsLikelihoodPenaltyLog10(refBasesWithContext.length, readBasesWithContext.length);
public double expectedLogL() {
return (expectedQual / -10.0) + 0.03 + Math.log10(1.0/refBasesWithContext.length);
}
public double getTolerance(final PairHMM hmm) {
if ( hmm instanceof LoglessCachingPairHMM )
if ( hmm instanceof LoglessPairHMM)
return toleranceFromExact();
if ( hmm instanceof Log10PairHMM ) {
return ((Log10PairHMM)hmm).isDoingExactLog10Calculations() ? toleranceFromExact() : toleranceFromReference();
@ -150,7 +149,7 @@ public class PairHMMUnitTest extends BaseTest {
qualAsBytes(gcp, false, anchorIndel), 0, true);
}
private final byte[] asBytes(final String bases, final boolean left, final boolean right) {
private byte[] asBytes(final String bases, final boolean left, final boolean right) {
return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes();
}
@ -163,7 +162,7 @@ public class PairHMMUnitTest extends BaseTest {
// update just the bases corresponding to the provided micro read with the quality scores
if( doGOP ) {
phredQuals[0 + CONTEXT.length()] = (byte)phredQual;
phredQuals[CONTEXT.length()] = (byte)phredQual;
} else {
for ( int i = 0; i < read.length(); i++)
phredQuals[i + CONTEXT.length()] = (byte)phredQual;
@ -270,7 +269,7 @@ public class PairHMMUnitTest extends BaseTest {
final double exactLogL = cfg.calcLogL( exactHMM, true );
for ( final PairHMM hmm : getHMMs() ) {
double actualLogL = cfg.calcLogL( hmm, true );
double expectedLogL = cfg.expectedLogL(hmm);
double expectedLogL = cfg.expectedLogL();
// compare to our theoretical expectation with appropriate tolerance
Assert.assertEquals(actualLogL, expectedLogL, cfg.toleranceFromTheoretical(), "Failed with hmm " + hmm);
@ -322,8 +321,8 @@ public class PairHMMUnitTest extends BaseTest {
System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);
// - log10 is because of number of start positions
Assert.assertEquals(res1, -2.0 - Math.log10(originalHMM.getNPotentialXStarts(haplotype1.length, mread.length)), 1e-2);
final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(90), mread.length-1) * QualityUtils.qualToErrorProb(20));
Assert.assertEquals(res1, expected, 1e-2);
}
}
@ -354,8 +353,8 @@ public class PairHMMUnitTest extends BaseTest {
System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);
// - log10 is because of number of start positions
Assert.assertEquals(res1, -2.0 - Math.log10(originalHMM.getNPotentialXStarts(haplotype1.length, mread.length)), 1e-2);
final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(90), mread.length-1) * QualityUtils.qualToErrorProb(20));
Assert.assertEquals(res1, expected, 1e-2);
}
}
@ -406,8 +405,14 @@ public class PairHMMUnitTest extends BaseTest {
Utils.dupBytes(insQual, readBases.length),
Utils.dupBytes(delQual, readBases.length),
Utils.dupBytes(gcp, readBases.length), 0, true);
final double expected = Math.log10(Math.pow(1.0 - QualityUtils.qualToErrorProb(baseQual), readBases.length));
Assert.assertEquals(d, expected, 1e-3, "Likelihoods should sum to just the error prob of the read");
double expected = 0;
final double initialCondition = ((double) Math.abs(refBases.length-readBases.length+1))/refBases.length;
if (readBases.length < refBases.length) {
expected = Math.log10(initialCondition * Math.pow(QualityUtils.qualToProb(baseQual), readBases.length));
} else if (readBases.length > refBases.length) {
expected = Math.log10(initialCondition * Math.pow(QualityUtils.qualToProb(baseQual), refBases.length) * Math.pow(QualityUtils.qualToErrorProb(insQual), readBases.length - refBases.length));
}
Assert.assertEquals(d, expected, 1e-3, "Likelihoods should sum to just the error prob of the read " + String.format("readSize=%d refSize=%d", readSize, refSize));
}
@DataProvider(name = "HMMProviderWithBigReads")
@ -472,7 +477,7 @@ public class PairHMMUnitTest extends BaseTest {
Utils.dupBytes(insQual, readBases.length),
Utils.dupBytes(delQual, readBases.length),
Utils.dupBytes(gcp, readBases.length), 0, true);
loglessHMM.dumpMatrices();
// loglessHMM.dumpMatrices();
}
@DataProvider(name = "JustHMMProvider")
@ -610,7 +615,7 @@ public class PairHMMUnitTest extends BaseTest {
public Object[][] makeUninitializedHMMs() {
List<Object[]> tests = new ArrayList<Object[]>();
tests.add(new Object[]{new LoglessCachingPairHMM()});
tests.add(new Object[]{new LoglessPairHMM()});
tests.add(new Object[]{new Log10PairHMM(true)});
return tests.toArray(new Object[][]{});

View File

@ -67,10 +67,10 @@ public class Log10PairHMM extends PairHMM {
public void initialize( final int haplotypeMaxLength, final int readMaxLength) {
super.initialize(haplotypeMaxLength, readMaxLength);
for( int iii=0; iii < X_METRIC_MAX_LENGTH; iii++ ) {
Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY);
for( int iii=0; iii < paddedMaxReadLength; iii++ ) {
Arrays.fill(matchMatrix[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(insertionMatrix[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(deletionMatrix[iii], Double.NEGATIVE_INFINITY);
}
}
@ -88,7 +88,8 @@ public class Log10PairHMM extends PairHMM {
final boolean recacheReadValues ) {
// the initial condition -- must be in subComputeReadLikelihoodGivenHaplotypeLog10 because it needs that actual
// read and haplotypes, not the maximum
matchMetricArray[1][1] = getNPotentialXStartsLikelihoodPenaltyLog10(haplotypeBases.length, readBases.length);
final double initialValue = Math.log10((double) 1/haplotypeBases.length);
matchMatrix[1][1] = initialValue;
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = readBases.length + 2;
@ -104,14 +105,17 @@ public class Log10PairHMM extends PairHMM {
for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) {
if( (iii == 1 && jjj == 1) ) { continue; }
updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP,
matchMetricArray, XMetricArray, YMetricArray);
matchMatrix, insertionMatrix, deletionMatrix);
}
}
// final log10 probability is the sum over the entire last row of the match and insertion matrices (paths ending in a deletion are deliberately excluded)
final int endI = X_METRIC_LENGTH - 1;
final int endJ = Y_METRIC_LENGTH - 1;
return myLog10SumLog10(new double[]{matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]});
double result = myLog10SumLog10(new double[]{matchMatrix[endI][1], insertionMatrix[endI][1]});
for (int j = 2; j < Y_METRIC_LENGTH; j++)
result = myLog10SumLog10(new double[]{result, matchMatrix[endI][j], insertionMatrix[endI][j]});
return result;
}
/**
@ -134,7 +138,7 @@ public class Log10PairHMM extends PairHMM {
private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases,
final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP,
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
final double[][] matchMatrix, final double[][] insertionMatrix, final double[][] deletionMatrix ) {
// the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions
final int im1 = indI - 1;
@ -151,18 +155,18 @@ public class Log10PairHMM extends PairHMM {
final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) );
final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP);
final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) );
matchMetricArray[indI][indJ] = pBaseReadLog10 + myLog10SumLog10(new double[]{matchMetricArray[indI - 1][indJ - 1] + d0, XMetricArray[indI - 1][indJ - 1] + e0, YMetricArray[indI - 1][indJ - 1] + e0});
matchMatrix[indI][indJ] = pBaseReadLog10 + myLog10SumLog10(new double[]{matchMatrix[indI - 1][indJ - 1] + d0, insertionMatrix[indI - 1][indJ - 1] + e0, deletionMatrix[indI - 1][indJ - 1] + e0});
// update the X (insertion) array
final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) );
final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
XMetricArray[indI][indJ] = qBaseReadLog10 + myLog10SumLog10(new double[]{matchMetricArray[indI - 1][indJ] + d1, XMetricArray[indI - 1][indJ] + e1});
insertionMatrix[indI][indJ] = qBaseReadLog10 + myLog10SumLog10(new double[]{matchMatrix[indI - 1][indJ] + d1, insertionMatrix[indI - 1][indJ] + e1});
// update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype
final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) );
final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
final double d2 = ( im1 == 0 ) ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]);
final double e2 = ( im1 == 0 ) ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]);
final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
YMetricArray[indI][indJ] = qBaseRefLog10 + myLog10SumLog10(new double[]{matchMetricArray[indI][indJ - 1] + d2, YMetricArray[indI][indJ - 1] + e2});
deletionMatrix[indI][indJ] = qBaseRefLog10 + myLog10SumLog10(new double[]{matchMatrix[indI][indJ - 1] + d2, deletionMatrix[indI][indJ - 1] + e2});
}
}

View File

@ -25,10 +25,12 @@
package org.broadinstitute.sting.utils.pairhmm;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Arrays;
/**
* Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book.
@ -52,11 +54,11 @@ public abstract class PairHMM {
LOGLESS_CACHING
}
protected double[][] matchMetricArray = null;
protected double[][] XMetricArray = null;
protected double[][] YMetricArray = null;
protected double[][] matchMatrix = null;
protected double[][] insertionMatrix = null;
protected double[][] deletionMatrix = null;
protected int maxHaplotypeLength, maxReadLength;
protected int X_METRIC_MAX_LENGTH, Y_METRIC_MAX_LENGTH;
protected int paddedMaxReadLength, paddedMaxHaplotypeLength;
private boolean initialized = false;
/**
@ -72,12 +74,12 @@ public abstract class PairHMM {
maxReadLength = readMaxLength;
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
X_METRIC_MAX_LENGTH = readMaxLength + 2;
Y_METRIC_MAX_LENGTH = haplotypeMaxLength + 2;
paddedMaxReadLength = readMaxLength + 2;
paddedMaxHaplotypeLength = haplotypeMaxLength + 2;
matchMetricArray = new double[X_METRIC_MAX_LENGTH][Y_METRIC_MAX_LENGTH];
XMetricArray = new double[X_METRIC_MAX_LENGTH][Y_METRIC_MAX_LENGTH];
YMetricArray = new double[X_METRIC_MAX_LENGTH][Y_METRIC_MAX_LENGTH];
matchMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
insertionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
deletionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
initialized = true;
}
@ -124,21 +126,17 @@ public abstract class PairHMM {
double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, hapStartIndex, recacheReadValues);
// TODO -- remove this clamp (Math.min with 0.0) when PairHMM no longer returns likelihoods >= 0
result = Math.min(result, 0.0);
if ( MathUtils.goodLog10Probability(result) )
return result;
else
throw new IllegalStateException("Bad likelihoods detected: " + result);
// return result;
throw new ReviewedStingException("PairHMM Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f", Arrays.toString(haplotypeBases), Arrays.toString(readBases), result));
}
/**
* To be overloaded by subclasses to actually do calculation for #computeReadLikelihoodGivenHaplotypeLog10
*/
@Requires({"readBases.length == readQuals.length", "readBases.length == insertionGOP.length", "readBases.length == deletionGOP.length",
"readBases.length == overallGCP.length", "matchMetricArray!=null", "XMetricArray!=null", "YMetricArray!=null"})
"readBases.length == overallGCP.length", "matchMatrix!=null", "insertionMatrix!=null", "deletionMatrix!=null"})
protected abstract double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
@ -148,41 +146,13 @@ public abstract class PairHMM {
final int hapStartIndex,
final boolean recacheReadValues );
/**
* How many potential starting locations are a read with readSize bases against a haplotype with haplotypeSize bases?
*
* for example, a 3 bp read against a 5 bp haplotype could potentially start at 1, 2, 3 = 5 - 3 + 1 = 3
* the max value is necessary in the case where the read is longer than the haplotype, in which case
* there's a single unique start site by assumption
*
* @param haplotypeSize the number of bases in the haplotype we are testing
* @param readSize the number of bases in the read we are testing
* @return a positive integer >= 1
*/
@Ensures("result >= 1")
protected int getNPotentialXStarts(final int haplotypeSize, final int readSize) {
return Math.max(haplotypeSize - readSize + 1, 1);
}
/**
* The the log10 probability penalty for the number of potential start sites of the read aginst the haplotype
*
* @param haplotypeSize the number of bases in the haplotype we are testing
* @param readSize the number of bases in the read we are testing
* @return a log10 probability
*/
@Ensures("MathUtils.goodLog10Probability(result)")
protected double getNPotentialXStartsLikelihoodPenaltyLog10(final int haplotypeSize, final int readSize) {
return - Math.log10(getNPotentialXStarts(haplotypeSize, readSize));
}
/**
* Print out the core hmm matrices for debugging
*/
protected void dumpMatrices() {
dumpMatrix("matchMetricArray", matchMetricArray);
dumpMatrix("XMetricArray", XMetricArray);
dumpMatrix("YMetricArray", YMetricArray);
dumpMatrix("matchMatrix", matchMatrix);
dumpMatrix("insertionMatrix", insertionMatrix);
dumpMatrix("deletionMatrix", deletionMatrix);
}
/**