PairHMM rework

The current implementation of the PairHMM had issues with the probabilities and the state machines. Probabilities were not adding up to one because:
   # Initial conditions were not being set properly
   # Emission probabilities in the last row were not adding up to 1

The following commit fixes both by
   # averaging all potential start locations (giving an equal prior to the state machine in it's first iteration -- allowing the read to start it's alignment anywhere in the haplotype with equal probability)
   # discounting all paths that end in deletions by not adding the last row of the deletion matrix and summing over all paths ending in matches and insertions (this saves us from a fourth matrix to represent the end state)

Summarized changes:
   * Fix LoglessCachingPairHMM and Log10PairHMM according to the new algorithm
   * Refactor probabilities check to throw exception if we ever encounter probabilities greater than 1.
   * Rename LoglessCachingPairHMM to LoglessPairHMM (this is the default implementation in the HC now)
   * Rename matrices to matchMatrix, insertionMatrix and deletionMatrix for clarity
   * Rename metric lengths to read and haplotype lengths for clarity
   * Rename private methods to initializePriors (distance) and initializeProbabilities (constants) for clarity
   * Eliminate first row constants (because they're not used anyway!) and directly assign initial conditions in the deletionMatrix
   * Remove unnecessary parameters from updateCell()
   * Fix the expected probabilities coming from the exact model in PairHMMUnitTest
   * Neatify PairHMM class (removed unused methods) and PairHMMUnitTest (removed unused variables)
   * Update MD5s: Probabilities have changed according to the new PairHMM model and as expected HC and UG integration tests have new MD5s.

[fix 47164949]
This commit is contained in:
Mauricio Carneiro 2013-03-26 15:42:41 -04:00
parent f42bdaee3c
commit 0de6f55660
13 changed files with 130 additions and 172 deletions

View File

@ -80,7 +80,7 @@ public class LikelihoodCalculationEngine {
pairHMM = new Log10PairHMM(false); pairHMM = new Log10PairHMM(false);
break; break;
case LOGLESS_CACHING: case LOGLESS_CACHING:
pairHMM = new LoglessCachingPairHMM(); pairHMM = new LoglessPairHMM();
break; break;
default: default:
throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING."); throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING.");

View File

@ -55,25 +55,14 @@ import org.broadinstitute.sting.utils.QualityUtils;
* User: rpoplin, carneiro * User: rpoplin, carneiro
* Date: 10/16/12 * Date: 10/16/12
*/ */
public class LoglessCachingPairHMM extends PairHMM { public class LoglessPairHMM extends PairHMM {
protected static final double SCALE_FACTOR_LOG10 = 300.0; protected static final double SCALE_FACTOR_LOG10 = 300.0;
protected static final double INITIAL_CONDITION = Math.pow(10, SCALE_FACTOR_LOG10);
double[][] constantMatrix = null; // The cache double[][] transition = null; // The cache
double[][] distanceMatrix = null; // The cache double[][] prior = null; // The cache
boolean constantsAreInitialized = false; boolean constantsAreInitialized = false;
/**
* Cached data structure that describes the first row's edge condition in the HMM
*/
protected static final double [] firstRowConstantMatrix = {
QualityUtils.qualToProb((byte) (DEFAULT_GOP + DEFAULT_GOP)),
QualityUtils.qualToProb(DEFAULT_GCP),
QualityUtils.qualToErrorProb(DEFAULT_GOP),
QualityUtils.qualToErrorProb(DEFAULT_GCP),
1.0,
1.0
};
/** /**
* {@inheritDoc} * {@inheritDoc}
*/ */
@ -81,8 +70,8 @@ public class LoglessCachingPairHMM extends PairHMM {
public void initialize( final int haplotypeMaxLength, final int readMaxLength) { public void initialize( final int haplotypeMaxLength, final int readMaxLength) {
super.initialize(haplotypeMaxLength, readMaxLength); super.initialize(haplotypeMaxLength, readMaxLength);
constantMatrix = new double[X_METRIC_MAX_LENGTH][6]; transition = new double[paddedMaxReadLength][6];
distanceMatrix = new double[X_METRIC_MAX_LENGTH][Y_METRIC_MAX_LENGTH]; prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
} }
/** /**
@ -98,8 +87,8 @@ public class LoglessCachingPairHMM extends PairHMM {
final int hapStartIndex, final int hapStartIndex,
final boolean recacheReadValues ) { final boolean recacheReadValues ) {
if ( ! constantsAreInitialized || recacheReadValues ) if ( ! constantsAreInitialized || recacheReadValues )
initializeConstants( haplotypeBases.length, readBases.length, insertionGOP, deletionGOP, overallGCP ); initializeProbabilities(haplotypeBases.length, insertionGOP, deletionGOP, overallGCP);
initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex ); initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex);
// NOTE NOTE NOTE -- because of caching we need to only operate over X and Y according to this // NOTE NOTE NOTE -- because of caching we need to only operate over X and Y according to this
// read and haplotype lengths, not the max lengths // read and haplotype lengths, not the max lengths
@ -109,14 +98,19 @@ public class LoglessCachingPairHMM extends PairHMM {
for (int i = 2; i < readXMetricLength; i++) { for (int i = 2; i < readXMetricLength; i++) {
// +1 here is because hapStartIndex is 0-based, but our matrices are 1 based // +1 here is because hapStartIndex is 0-based, but our matrices are 1 based
for (int j = hapStartIndex+1; j < hapYMetricLength; j++) { for (int j = hapStartIndex+1; j < hapYMetricLength; j++) {
updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray); updateCell(i, j, prior[i][j], transition[i]);
} }
} }
// final probability is the log10 sum of the last element in all three state arrays // final probability is the log10 sum of the last element in the Match and Insertion state arrays
// this way we ignore all paths that ended in deletions! (huge)
// but we have to sum all the paths ending in the M and I matrices, because they're no longer extended.
final int endI = readXMetricLength - 1; final int endI = readXMetricLength - 1;
final int endJ = hapYMetricLength - 1; double finalSumProbabilities = 0.0;
return Math.log10( matchMetricArray[endI][endJ] + XMetricArray[endI][endJ] + YMetricArray[endI][endJ] ) - SCALE_FACTOR_LOG10; for (int j = 0; j < hapYMetricLength; j++) {
finalSumProbabilities += matchMatrix[endI][j] + insertionMatrix[endI][j];
}
return Math.log10(finalSumProbabilities) - SCALE_FACTOR_LOG10;
} }
/** /**
@ -128,10 +122,7 @@ public class LoglessCachingPairHMM extends PairHMM {
* @param readQuals the base quality scores of the read * @param readQuals the base quality scores of the read
* @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read)
*/ */
public void initializeDistanceMatrix( final byte[] haplotypeBases, public void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) {
final byte[] readBases,
final byte[] readQuals,
final int startIndex ) {
// initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases
// Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2.
@ -141,7 +132,7 @@ public class LoglessCachingPairHMM extends PairHMM {
final byte qual = readQuals[i]; final byte qual = readQuals[i];
for (int j = startIndex; j < haplotypeBases.length; j++) { for (int j = startIndex; j < haplotypeBases.length; j++) {
final byte y = haplotypeBases[j]; final byte y = haplotypeBases[j];
distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? prior[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) ); QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) );
} }
} }
@ -150,46 +141,36 @@ public class LoglessCachingPairHMM extends PairHMM {
/** /**
* Initializes the matrix that holds all the constants related to quality scores. * Initializes the matrix that holds all the constants related to quality scores.
* *
* @param haplotypeSize the number of bases in the haplotype we are testing
* @param readSize the number of bases in the read we are testing
* @param insertionGOP insertion quality scores of the read * @param insertionGOP insertion quality scores of the read
* @param deletionGOP deletion quality scores of the read * @param deletionGOP deletion quality scores of the read
* @param overallGCP overall gap continuation penalty * @param overallGCP overall gap continuation penalty
*/ */
@Requires({ @Requires({
"haplotypeSize > 0", "insertionGOP != null",
"readSize > 0", "deletionGOP != null",
"insertionGOP != null && insertionGOP.length == readSize", "overallGCP != null"
"deletionGOP != null && deletionGOP.length == readSize",
"overallGCP != null && overallGCP.length == readSize"
}) })
@Ensures("constantsAreInitialized") @Ensures("constantsAreInitialized")
private void initializeConstants( final int haplotypeSize, private void initializeProbabilities(final int haplotypeLength, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) {
final int readSize,
final byte[] insertionGOP,
final byte[] deletionGOP,
final byte[] overallGCP ) {
// the initial condition -- must be here because it needs that actual read and haplotypes, not the maximum in init // the initial condition -- must be here because it needs that actual read and haplotypes, not the maximum in init
matchMetricArray[1][1] = Math.pow(10.0, SCALE_FACTOR_LOG10) / getNPotentialXStarts(haplotypeSize, readSize); final double initialValue = INITIAL_CONDITION / haplotypeLength;
matchMatrix[1][1] = initialValue;
// fill in the first row // fill in the first row
for( int jjj = 2; jjj < Y_METRIC_MAX_LENGTH; jjj++ ) { for( int jjj = 2; jjj < paddedMaxHaplotypeLength; jjj++ ) {
updateCell(1, jjj, 1.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray); deletionMatrix[1][jjj] = initialValue;
} }
final int l = insertionGOP.length; final int l = insertionGOP.length;
constantMatrix[1] = firstRowConstantMatrix;
for (int i = 0; i < l; i++) { for (int i = 0; i < l; i++) {
final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE);
constantMatrix[i+2][0] = QualityUtils.qualToProb((byte) qualIndexGOP); transition[i+2][0] = QualityUtils.qualToProb((byte) qualIndexGOP);
constantMatrix[i+2][1] = QualityUtils.qualToProb(overallGCP[i]); transition[i+2][1] = QualityUtils.qualToProb(overallGCP[i]);
constantMatrix[i+2][2] = QualityUtils.qualToErrorProb(insertionGOP[i]); transition[i+2][2] = QualityUtils.qualToErrorProb(insertionGOP[i]);
constantMatrix[i+2][3] = QualityUtils.qualToErrorProb(overallGCP[i]); transition[i+2][3] = QualityUtils.qualToErrorProb(overallGCP[i]);
constantMatrix[i+2][4] = QualityUtils.qualToErrorProb(deletionGOP[i]); transition[i+2][4] = QualityUtils.qualToErrorProb(deletionGOP[i]);
constantMatrix[i+2][5] = QualityUtils.qualToErrorProb(overallGCP[i]); transition[i+2][5] = QualityUtils.qualToErrorProb(overallGCP[i]);
} }
constantMatrix[l+1][4] = 1.0;
constantMatrix[l+1][5] = 1.0;
// note that we initialized the constants // note that we initialized the constants
constantsAreInitialized = true; constantsAreInitialized = true;
@ -204,18 +185,14 @@ public class LoglessCachingPairHMM extends PairHMM {
* @param indI row index in the matrices to update * @param indI row index in the matrices to update
* @param indJ column index in the matrices to update * @param indJ column index in the matrices to update
* @param prior the likelihood editing distance matrix for the read x haplotype * @param prior the likelihood editing distance matrix for the read x haplotype
* @param constants an array with the six constants relevant to this location * @param transitition an array with the six transitition relevant to this location
* @param matchMetricArray the matches likelihood matrix
* @param XMetricArray the insertions likelihood matrix
* @param YMetricArray the deletions likelihood matrix
*/ */
private void updateCell( final int indI, final int indJ, final double prior, final double[] constants, private void updateCell( final int indI, final int indJ, final double prior, final double[] transitition) {
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
matchMetricArray[indI][indJ] = prior * ( matchMetricArray[indI - 1][indJ - 1] * constants[0] + matchMatrix[indI][indJ] = prior * ( matchMatrix[indI - 1][indJ - 1] * transitition[0] +
XMetricArray[indI - 1][indJ - 1] * constants[1] + insertionMatrix[indI - 1][indJ - 1] * transitition[1] +
YMetricArray[indI - 1][indJ - 1] * constants[1] ); deletionMatrix[indI - 1][indJ - 1] * transitition[1] );
XMetricArray[indI][indJ] = matchMetricArray[indI - 1][indJ] * constants[2] + XMetricArray[indI - 1][indJ] * constants[3]; insertionMatrix[indI][indJ] = matchMatrix[indI - 1][indJ] * transitition[2] + insertionMatrix[indI - 1][indJ] * transitition[3];
YMetricArray[indI][indJ] = matchMetricArray[indI][indJ - 1] * constants[4] + YMetricArray[indI][indJ - 1] * constants[5]; deletionMatrix[indI][indJ] = matchMatrix[indI][indJ - 1] * transitition[4] + deletionMatrix[indI][indJ - 1] * transitition[5];
} }
} }

View File

@ -49,7 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.*; import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.LSV_ALLELES;
/** /**
* Created by IntelliJ IDEA. * Created by IntelliJ IDEA.
@ -79,6 +79,6 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe
@Test(enabled = true) @Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() {
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "5812da66811887d834d0379a33e655c0"); executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "faadc0b77a91a716dbb1191fd579d025");
} }
} }

View File

@ -49,7 +49,8 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.*; import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.CEUTRIO_BAM;
import static org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperGeneralPloidyTestExecutor.NA12891_CALLS;
public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTest { public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTest {
@ -57,7 +58,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe
@Test(enabled = true) @Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","3a321896c4b8b6457973c76c486da4d4"); executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","fe715b715526a7c1ebd575ff66bba716");
} }
@Test(enabled = true) @Test(enabled = true)

View File

@ -72,7 +72,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" + " -o %s" +
" -L 1:10,000,000-10,500,000", " -L 1:10,000,000-10,500,000",
1, 1,
Arrays.asList("1cb469b9cc8e6c70430021540bf1af8b")); Arrays.asList("51e022d07ead45a4e154f949b6642e84"));
executeTest(String.format("test indel caller in SLX"), spec); executeTest(String.format("test indel caller in SLX"), spec);
} }
@ -87,7 +87,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -minIndelCnt 1" + " -minIndelCnt 1" +
" -L 1:10,000,000-10,100,000", " -L 1:10,000,000-10,100,000",
1, 1,
Arrays.asList("c7e59f9ab718df4c604626a0f51af606")); Arrays.asList("1d9c6fda344eeee76cbe4221251dc341"));
executeTest(String.format("test indel caller in SLX with low min allele count"), spec); executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
} }
@ -100,7 +100,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" + " -o %s" +
" -L 1:10,000,000-10,500,000", " -L 1:10,000,000-10,500,000",
1, 1,
Arrays.asList("b6ad80cef63cab4f75fa4b1fb2517d1d")); Arrays.asList("2ec7262f0a3d04534ce1fe15cc79f52e"));
executeTest(String.format("test indel calling, multiple technologies"), spec); executeTest(String.format("test indel calling, multiple technologies"), spec);
} }
@ -110,7 +110,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("86880ec78755ae91cb5bb34a0631a32c")); Arrays.asList("3131cd7c49b623983a106db5228754b3"));
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
} }
@ -120,7 +120,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("2584d5e3ade1b548f1fe9cdcafbe1b28")); Arrays.asList("273f5daa936e93da98efd6ceb37d7533"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
} }
@ -135,7 +135,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
Arrays.asList("939da0bb73b706badd8a0def7446b384")); Arrays.asList("00a003a0908281384e981294434a9f3e"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
} }
@ -175,7 +175,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
public void testMinIndelFraction0() { public void testMinIndelFraction0() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.0", 1, assessMinIndelFraction + " -minIndelFrac 0.0", 1,
Arrays.asList("556c214366e82e4682e753ce93307a4e")); Arrays.asList("87521a1bde124c7c5908ed067060fe45"));
executeTest("test minIndelFraction 0.0", spec); executeTest("test minIndelFraction 0.0", spec);
} }
@ -183,7 +183,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
public void testMinIndelFraction25() { public void testMinIndelFraction25() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.25", 1, assessMinIndelFraction + " -minIndelFrac 0.25", 1,
Arrays.asList("1df02b805d9dfbd532fa3632875a989d")); Arrays.asList("8a880b8b1662e31e0b5c65733eac6b74"));
executeTest("test minIndelFraction 0.25", spec); executeTest("test minIndelFraction 0.25", spec);
} }

View File

@ -64,7 +64,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMultiSamplePilot1() { public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("2f15ef1ead56d875a3f1d53772f52b3a")); Arrays.asList("3f8ee598c9b85aa1d2b85746ad46c1af"));
executeTest("test MultiSample Pilot1", spec); executeTest("test MultiSample Pilot1", spec);
} }
@ -96,7 +96,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMultipleSNPAlleles() { public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
Arrays.asList("39ec0b48cd51d797af7ed09cb9ba607e")); Arrays.asList("31c0f0074b3306b54170056e93b69e11"));
executeTest("test Multiple SNP alleles", spec); executeTest("test Multiple SNP alleles", spec);
} }
@ -112,7 +112,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testReverseTrim() { public void testReverseTrim() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
Arrays.asList("eb9604b77a7d6baab60c81ac3db5e47b")); Arrays.asList("753d6358b1634107de76900200116805"));
executeTest("test reverse trim", spec); executeTest("test reverse trim", spec);
} }
@ -120,7 +120,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMismatchedPLs() { public void testMismatchedPLs() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
Arrays.asList("6b77b8f1002ec577bf0482fbe03222a4")); Arrays.asList("274eadae8a630a3fda9281d6d6253dea"));
executeTest("test mismatched PLs", spec); executeTest("test mismatched PLs", spec);
} }
} }

View File

@ -74,7 +74,7 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest {
@Test @Test
public void testReducedBamINDELs() { public void testReducedBamINDELs() {
testReducedCalling("INDEL", "9a702e7a85465f6c42d6c1828aee6c38"); testReducedCalling("INDEL", "c5939a7f5f85ea2fe994ce912732e180");
} }

View File

@ -49,10 +49,11 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCallerIntegrationTest.*;
import java.util.Arrays; import java.util.Arrays;
import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCallerIntegrationTest.NA12878_CHR20_BAM;
import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCallerIntegrationTest.REF;
public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest { public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest {
private void HCTestComplexVariants(String bam, String args, String md5) { private void HCTestComplexVariants(String bam, String args, String md5) {
@ -63,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test @Test
public void testHaplotypeCallerMultiSampleComplex() { public void testHaplotypeCallerMultiSampleComplex() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "f9fa4d3c88fd9c0f23c7a3ddd3d24a8c"); HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a898b551f78c71befee4d12070d3a788");
} }
private void HCTestSymbolicVariants(String bam, String args, String md5) { private void HCTestSymbolicVariants(String bam, String args, String md5) {
@ -87,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test @Test
public void testHaplotypeCallerMultiSampleGGAComplex() { public void testHaplotypeCallerMultiSampleGGAComplex() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
"e8ffbfae3c1af5be02631a31f386a431"); "8a110549543412fa682419e9a8f0dd1d");
} }
@Test @Test
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
"c3a98b19efa7cb36fe5f5f2ab893ef56"); "5429c234d471434adc09d9e60b87de24");
} }
} }

View File

@ -69,12 +69,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void testHaplotypeCallerMultiSample() { public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "45856ad67bfe8d8bea45808d8258bcf1"); HCTest(CEUTRIO_BAM, "", "008958c211a8a439a7213a96f3dd7f6c");
} }
@Test @Test
public void testHaplotypeCallerSingleSample() { public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "b6c93325f851ac358ea49260fb11b75c"); HCTest(NA12878_BAM, "", "3b60c6133eeadfea028dffea93b88478");
} }
@Test(enabled = false) // can't annotate the rsID's yet @Test(enabled = false) // can't annotate the rsID's yet
@ -85,7 +85,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void testHaplotypeCallerMultiSampleGGA() { public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
"4ca6b560d0569cdca400d3e50915e211"); "70bd5d0805bf6f51e5f61b377526c979");
} }
private void HCTestIndelQualityScores(String bam, String args, String md5) { private void HCTestIndelQualityScores(String bam, String args, String md5) {
@ -96,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() { public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "5d06ec5502d3f157964bd7b275d6a0cb"); HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "4141b4c24a136a3fe4c0b0a4c231cdfa");
} }
@Test @Test
@ -111,14 +111,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void HCTestProblematicReadsModifiedInActiveRegions() { public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("53a50dae68f0175ca3088dea1d3bb881")); final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("35a8edeca7518835d67a10de21493eca"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
} }
@Test @Test
public void HCTestStructuralIndels() { public void HCTestStructuralIndels() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("d3bc6adde8cd9514ae5c49cd366d5de4")); final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c81d7e69dd4116890f06a71b19870300"));
executeTest("HCTestStructuralIndels: ", spec); executeTest("HCTestStructuralIndels: ", spec);
} }
@ -140,7 +140,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestReducedBam() { public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("4adb833ed8af20224b76bba61e2b0d93")); Arrays.asList("f0a215faed194dc160f19e26293e85f8"));
executeTest("HC calling on a ReducedRead BAM", spec); executeTest("HC calling on a ReducedRead BAM", spec);
} }
@ -148,7 +148,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void testReducedBamWithReadsNotFullySpanningDeletion() { public void testReducedBamWithReadsNotFullySpanningDeletion() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
Arrays.asList("1704b0901c86f8f597d931222d5c8dd8")); Arrays.asList("bea274584344fa6b4b0f98eee327bad8"));
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
} }
} }

View File

@ -67,7 +67,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest {
for ( final int nct : Arrays.asList(1, 2) ) { for ( final int nct : Arrays.asList(1, 2) ) {
// tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct }); // tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct });
//// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct }); //// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct });
tests.add(new Object[]{ "BOTH", "85fc5d6dfeb60ed89763470f4b4c981e", nt, nct }); tests.add(new Object[]{ "BOTH", "9a1202d849653f0480932f450ec507b4", nt, nct });
} }
return tests.toArray(new Object[][]{}); return tests.toArray(new Object[][]{});

View File

@ -70,7 +70,7 @@ public class PairHMMUnitTest extends BaseTest {
final static boolean EXTENSIVE_TESTING = true; final static boolean EXTENSIVE_TESTING = true;
final PairHMM exactHMM = new Log10PairHMM(true); // the log truth implementation final PairHMM exactHMM = new Log10PairHMM(true); // the log truth implementation
final PairHMM originalHMM = new Log10PairHMM(false); // the reference implementation final PairHMM originalHMM = new Log10PairHMM(false); // the reference implementation
final PairHMM loglessHMM = new LoglessCachingPairHMM(); final PairHMM loglessHMM = new LoglessPairHMM();
private List<PairHMM> getHMMs() { private List<PairHMM> getHMMs() {
return Arrays.asList(exactHMM, originalHMM, loglessHMM); return Arrays.asList(exactHMM, originalHMM, loglessHMM);
@ -116,13 +116,12 @@ public class PairHMMUnitTest extends BaseTest {
return String.format("ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual); return String.format("ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual);
} }
public double expectedLogL(final PairHMM hmm) { public double expectedLogL() {
return (expectedQual / -10.0) + 0.03 + return (expectedQual / -10.0) + 0.03 + Math.log10(1.0/refBasesWithContext.length);
hmm.getNPotentialXStartsLikelihoodPenaltyLog10(refBasesWithContext.length, readBasesWithContext.length);
} }
public double getTolerance(final PairHMM hmm) { public double getTolerance(final PairHMM hmm) {
if ( hmm instanceof LoglessCachingPairHMM ) if ( hmm instanceof LoglessPairHMM)
return toleranceFromExact(); return toleranceFromExact();
if ( hmm instanceof Log10PairHMM ) { if ( hmm instanceof Log10PairHMM ) {
return ((Log10PairHMM)hmm).isDoingExactLog10Calculations() ? toleranceFromExact() : toleranceFromReference(); return ((Log10PairHMM)hmm).isDoingExactLog10Calculations() ? toleranceFromExact() : toleranceFromReference();
@ -150,7 +149,7 @@ public class PairHMMUnitTest extends BaseTest {
qualAsBytes(gcp, false, anchorIndel), 0, true); qualAsBytes(gcp, false, anchorIndel), 0, true);
} }
private final byte[] asBytes(final String bases, final boolean left, final boolean right) { private byte[] asBytes(final String bases, final boolean left, final boolean right) {
return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes(); return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes();
} }
@ -163,7 +162,7 @@ public class PairHMMUnitTest extends BaseTest {
// update just the bases corresponding to the provided micro read with the quality scores // update just the bases corresponding to the provided micro read with the quality scores
if( doGOP ) { if( doGOP ) {
phredQuals[0 + CONTEXT.length()] = (byte)phredQual; phredQuals[CONTEXT.length()] = (byte)phredQual;
} else { } else {
for ( int i = 0; i < read.length(); i++) for ( int i = 0; i < read.length(); i++)
phredQuals[i + CONTEXT.length()] = (byte)phredQual; phredQuals[i + CONTEXT.length()] = (byte)phredQual;
@ -270,7 +269,7 @@ public class PairHMMUnitTest extends BaseTest {
final double exactLogL = cfg.calcLogL( exactHMM, true ); final double exactLogL = cfg.calcLogL( exactHMM, true );
for ( final PairHMM hmm : getHMMs() ) { for ( final PairHMM hmm : getHMMs() ) {
double actualLogL = cfg.calcLogL( hmm, true ); double actualLogL = cfg.calcLogL( hmm, true );
double expectedLogL = cfg.expectedLogL(hmm); double expectedLogL = cfg.expectedLogL();
// compare to our theoretical expectation with appropriate tolerance // compare to our theoretical expectation with appropriate tolerance
Assert.assertEquals(actualLogL, expectedLogL, cfg.toleranceFromTheoretical(), "Failed with hmm " + hmm); Assert.assertEquals(actualLogL, expectedLogL, cfg.toleranceFromTheoretical(), "Failed with hmm " + hmm);
@ -322,8 +321,8 @@ public class PairHMMUnitTest extends BaseTest {
System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);
// - log10 is because of number of start positions final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(90), mread.length-1) * QualityUtils.qualToErrorProb(20));
Assert.assertEquals(res1, -2.0 - Math.log10(originalHMM.getNPotentialXStarts(haplotype1.length, mread.length)), 1e-2); Assert.assertEquals(res1, expected, 1e-2);
} }
} }
@ -354,8 +353,8 @@ public class PairHMMUnitTest extends BaseTest {
System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);
// - log10 is because of number of start positions final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(90), mread.length-1) * QualityUtils.qualToErrorProb(20));
Assert.assertEquals(res1, -2.0 - Math.log10(originalHMM.getNPotentialXStarts(haplotype1.length, mread.length)), 1e-2); Assert.assertEquals(res1, expected, 1e-2);
} }
} }
@ -406,8 +405,14 @@ public class PairHMMUnitTest extends BaseTest {
Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(insQual, readBases.length),
Utils.dupBytes(delQual, readBases.length), Utils.dupBytes(delQual, readBases.length),
Utils.dupBytes(gcp, readBases.length), 0, true); Utils.dupBytes(gcp, readBases.length), 0, true);
final double expected = Math.log10(Math.pow(1.0 - QualityUtils.qualToErrorProb(baseQual), readBases.length)); double expected = 0;
Assert.assertEquals(d, expected, 1e-3, "Likelihoods should sum to just the error prob of the read"); final double initialCondition = ((double) Math.abs(refBases.length-readBases.length+1))/refBases.length;
if (readBases.length < refBases.length) {
expected = Math.log10(initialCondition * Math.pow(QualityUtils.qualToProb(baseQual), readBases.length));
} else if (readBases.length > refBases.length) {
expected = Math.log10(initialCondition * Math.pow(QualityUtils.qualToProb(baseQual), refBases.length) * Math.pow(QualityUtils.qualToErrorProb(insQual), readBases.length - refBases.length));
}
Assert.assertEquals(d, expected, 1e-3, "Likelihoods should sum to just the error prob of the read " + String.format("readSize=%d refSize=%d", readSize, refSize));
} }
@DataProvider(name = "HMMProviderWithBigReads") @DataProvider(name = "HMMProviderWithBigReads")
@ -472,7 +477,7 @@ public class PairHMMUnitTest extends BaseTest {
Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(insQual, readBases.length),
Utils.dupBytes(delQual, readBases.length), Utils.dupBytes(delQual, readBases.length),
Utils.dupBytes(gcp, readBases.length), 0, true); Utils.dupBytes(gcp, readBases.length), 0, true);
loglessHMM.dumpMatrices(); // loglessHMM.dumpMatrices();
} }
@DataProvider(name = "JustHMMProvider") @DataProvider(name = "JustHMMProvider")
@ -610,7 +615,7 @@ public class PairHMMUnitTest extends BaseTest {
public Object[][] makeUninitializedHMMs() { public Object[][] makeUninitializedHMMs() {
List<Object[]> tests = new ArrayList<Object[]>(); List<Object[]> tests = new ArrayList<Object[]>();
tests.add(new Object[]{new LoglessCachingPairHMM()}); tests.add(new Object[]{new LoglessPairHMM()});
tests.add(new Object[]{new Log10PairHMM(true)}); tests.add(new Object[]{new Log10PairHMM(true)});
return tests.toArray(new Object[][]{}); return tests.toArray(new Object[][]{});

View File

@ -67,10 +67,10 @@ public class Log10PairHMM extends PairHMM {
public void initialize( final int haplotypeMaxLength, final int readMaxLength) { public void initialize( final int haplotypeMaxLength, final int readMaxLength) {
super.initialize(haplotypeMaxLength, readMaxLength); super.initialize(haplotypeMaxLength, readMaxLength);
for( int iii=0; iii < X_METRIC_MAX_LENGTH; iii++ ) { for( int iii=0; iii < paddedMaxReadLength; iii++ ) {
Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY); Arrays.fill(matchMatrix[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY); Arrays.fill(insertionMatrix[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY); Arrays.fill(deletionMatrix[iii], Double.NEGATIVE_INFINITY);
} }
} }
@ -88,7 +88,8 @@ public class Log10PairHMM extends PairHMM {
final boolean recacheReadValues ) { final boolean recacheReadValues ) {
// the initial condition -- must be in subComputeReadLikelihoodGivenHaplotypeLog10 because it needs that actual // the initial condition -- must be in subComputeReadLikelihoodGivenHaplotypeLog10 because it needs that actual
// read and haplotypes, not the maximum // read and haplotypes, not the maximum
matchMetricArray[1][1] = getNPotentialXStartsLikelihoodPenaltyLog10(haplotypeBases.length, readBases.length); final double initialValue = Math.log10((double) 1/haplotypeBases.length);
matchMatrix[1][1] = initialValue;
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = readBases.length + 2; final int X_METRIC_LENGTH = readBases.length + 2;
@ -104,14 +105,17 @@ public class Log10PairHMM extends PairHMM {
for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) {
if( (iii == 1 && jjj == 1) ) { continue; } if( (iii == 1 && jjj == 1) ) { continue; }
updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP,
matchMetricArray, XMetricArray, YMetricArray); matchMatrix, insertionMatrix, deletionMatrix);
} }
} }
// final probability is the log10 sum of the last element in all three state arrays // final probability is the log10 sum of the last element in all three state arrays
final int endI = X_METRIC_LENGTH - 1; final int endI = X_METRIC_LENGTH - 1;
final int endJ = Y_METRIC_LENGTH - 1; double result = myLog10SumLog10(new double[]{matchMatrix[endI][1], insertionMatrix[endI][1]});
return myLog10SumLog10(new double[]{matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]}); for (int j = 2; j < Y_METRIC_LENGTH; j++)
result = myLog10SumLog10(new double[]{result, matchMatrix[endI][j], insertionMatrix[endI][j]});
return result;
} }
/** /**
@ -134,7 +138,7 @@ public class Log10PairHMM extends PairHMM {
private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases,
final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP,
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { final double[][] matchMatrix, final double[][] insertionMatrix, final double[][] deletionMatrix ) {
// the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions
final int im1 = indI - 1; final int im1 = indI - 1;
@ -151,18 +155,18 @@ public class Log10PairHMM extends PairHMM {
final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) );
final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP);
final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) );
matchMetricArray[indI][indJ] = pBaseReadLog10 + myLog10SumLog10(new double[]{matchMetricArray[indI - 1][indJ - 1] + d0, XMetricArray[indI - 1][indJ - 1] + e0, YMetricArray[indI - 1][indJ - 1] + e0}); matchMatrix[indI][indJ] = pBaseReadLog10 + myLog10SumLog10(new double[]{matchMatrix[indI - 1][indJ - 1] + d0, insertionMatrix[indI - 1][indJ - 1] + e0, deletionMatrix[indI - 1][indJ - 1] + e0});
// update the X (insertion) array // update the X (insertion) array
final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) );
final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
XMetricArray[indI][indJ] = qBaseReadLog10 + myLog10SumLog10(new double[]{matchMetricArray[indI - 1][indJ] + d1, XMetricArray[indI - 1][indJ] + e1}); insertionMatrix[indI][indJ] = qBaseReadLog10 + myLog10SumLog10(new double[]{matchMatrix[indI - 1][indJ] + d1, insertionMatrix[indI - 1][indJ] + e1});
// update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype
final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); final double d2 = ( im1 == 0 ) ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]);
final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); final double e2 = ( im1 == 0 ) ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]);
final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
YMetricArray[indI][indJ] = qBaseRefLog10 + myLog10SumLog10(new double[]{matchMetricArray[indI][indJ - 1] + d2, YMetricArray[indI][indJ - 1] + e2}); deletionMatrix[indI][indJ] = qBaseRefLog10 + myLog10SumLog10(new double[]{matchMatrix[indI][indJ - 1] + d2, deletionMatrix[indI][indJ - 1] + e2});
} }
} }

View File

@ -25,10 +25,12 @@
package org.broadinstitute.sting.utils.pairhmm; package org.broadinstitute.sting.utils.pairhmm;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires; import com.google.java.contract.Requires;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Arrays;
/** /**
* Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book.
@ -52,11 +54,11 @@ public abstract class PairHMM {
LOGLESS_CACHING LOGLESS_CACHING
} }
protected double[][] matchMetricArray = null; protected double[][] matchMatrix = null;
protected double[][] XMetricArray = null; protected double[][] insertionMatrix = null;
protected double[][] YMetricArray = null; protected double[][] deletionMatrix = null;
protected int maxHaplotypeLength, maxReadLength; protected int maxHaplotypeLength, maxReadLength;
protected int X_METRIC_MAX_LENGTH, Y_METRIC_MAX_LENGTH; protected int paddedMaxReadLength, paddedMaxHaplotypeLength;
private boolean initialized = false; private boolean initialized = false;
/** /**
@ -72,12 +74,12 @@ public abstract class PairHMM {
maxReadLength = readMaxLength; maxReadLength = readMaxLength;
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
X_METRIC_MAX_LENGTH = readMaxLength + 2; paddedMaxReadLength = readMaxLength + 2;
Y_METRIC_MAX_LENGTH = haplotypeMaxLength + 2; paddedMaxHaplotypeLength = haplotypeMaxLength + 2;
matchMetricArray = new double[X_METRIC_MAX_LENGTH][Y_METRIC_MAX_LENGTH]; matchMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
XMetricArray = new double[X_METRIC_MAX_LENGTH][Y_METRIC_MAX_LENGTH]; insertionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
YMetricArray = new double[X_METRIC_MAX_LENGTH][Y_METRIC_MAX_LENGTH]; deletionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
initialized = true; initialized = true;
} }
@ -124,21 +126,17 @@ public abstract class PairHMM {
double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, hapStartIndex, recacheReadValues); double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, hapStartIndex, recacheReadValues);
// TODO -- remove max when PairHMM no longer returns likelihoods >= 0
result = Math.min(result, 0.0);
if ( MathUtils.goodLog10Probability(result) ) if ( MathUtils.goodLog10Probability(result) )
return result; return result;
else else
throw new IllegalStateException("Bad likelihoods detected: " + result); throw new ReviewedStingException("PairHMM Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f", Arrays.toString(haplotypeBases), Arrays.toString(readBases), result));
// return result;
} }
/** /**
* To be overloaded by subclasses to actually do calculation for #computeReadLikelihoodGivenHaplotypeLog10 * To be overloaded by subclasses to actually do calculation for #computeReadLikelihoodGivenHaplotypeLog10
*/ */
@Requires({"readBases.length == readQuals.length", "readBases.length == insertionGOP.length", "readBases.length == deletionGOP.length", @Requires({"readBases.length == readQuals.length", "readBases.length == insertionGOP.length", "readBases.length == deletionGOP.length",
"readBases.length == overallGCP.length", "matchMetricArray!=null", "XMetricArray!=null", "YMetricArray!=null"}) "readBases.length == overallGCP.length", "matchMatrix!=null", "insertionMatrix!=null", "deletionMatrix!=null"})
protected abstract double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, protected abstract double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
final byte[] readBases, final byte[] readBases,
final byte[] readQuals, final byte[] readQuals,
@ -148,41 +146,13 @@ public abstract class PairHMM {
final int hapStartIndex, final int hapStartIndex,
final boolean recacheReadValues ); final boolean recacheReadValues );
/**
* How many potential starting locations are a read with readSize bases against a haplotype with haplotypeSize bases?
*
* for example, a 3 bp read against a 5 bp haplotype could potentially start at 1, 2, 3 = 5 - 3 + 1 = 3
* the max value is necessary in the case where the read is longer than the haplotype, in which case
* there's a single unique start site by assumption
*
* @param haplotypeSize the number of bases in the haplotype we are testing
* @param readSize the number of bases in the read we are testing
* @return a positive integer >= 1
*/
@Ensures("result >= 1")
protected int getNPotentialXStarts(final int haplotypeSize, final int readSize) {
return Math.max(haplotypeSize - readSize + 1, 1);
}
/**
* The the log10 probability penalty for the number of potential start sites of the read aginst the haplotype
*
* @param haplotypeSize the number of bases in the haplotype we are testing
* @param readSize the number of bases in the read we are testing
* @return a log10 probability
*/
@Ensures("MathUtils.goodLog10Probability(result)")
protected double getNPotentialXStartsLikelihoodPenaltyLog10(final int haplotypeSize, final int readSize) {
return - Math.log10(getNPotentialXStarts(haplotypeSize, readSize));
}
/** /**
* Print out the core hmm matrices for debugging * Print out the core hmm matrices for debugging
*/ */
protected void dumpMatrices() { protected void dumpMatrices() {
dumpMatrix("matchMetricArray", matchMetricArray); dumpMatrix("matchMetricArray", matchMatrix);
dumpMatrix("XMetricArray", XMetricArray); dumpMatrix("insertionMatrix", insertionMatrix);
dumpMatrix("YMetricArray", YMetricArray); dumpMatrix("deletionMatrix", deletionMatrix);
} }
/** /**