Merge pull request #148 from broadinstitute/mc_fix_hmm_caching

Fix caching indices in the PairHMM
This commit is contained in:
delangel 2013-04-08 08:09:13 -07:00
commit 4cc4bb36aa
15 changed files with 306 additions and 286 deletions

View File

@ -106,12 +106,8 @@ public class LikelihoodCalculationEngine {
if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; } if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; }
} }
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
X_METRIC_LENGTH += 2;
Y_METRIC_LENGTH += 2;
// initialize arrays to hold the probabilities of being in the match, insertion and deletion cases // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
pairHMM.initialize(Y_METRIC_LENGTH, X_METRIC_LENGTH); pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
// for each sample's reads // for each sample's reads
for( final Map.Entry<String, List<GATKSAMRecord>> sampleEntry : perSampleReadList.entrySet() ) { for( final Map.Entry<String, List<GATKSAMRecord>> sampleEntry : perSampleReadList.entrySet() ) {
@ -134,7 +130,6 @@ public class LikelihoodCalculationEngine {
for( final GATKSAMRecord read : reads ) { for( final GATKSAMRecord read : reads ) {
final byte[] overallGCP = new byte[read.getReadLength()]; final byte[] overallGCP = new byte[read.getReadLength()];
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
Haplotype previousHaplotypeSeen = null;
// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
final byte[] readQuals = read.getBaseQualities().clone(); final byte[] readQuals = read.getBaseQualities().clone();
final byte[] readInsQuals = read.getBaseInsertionQualities(); final byte[] readInsQuals = read.getBaseInsertionQualities();
@ -149,14 +144,9 @@ public class LikelihoodCalculationEngine {
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
final Haplotype haplotype = haplotypes.get(jjj); final Haplotype haplotype = haplotypes.get(jjj);
final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : PairHMM.findFirstPositionWhereHaplotypesDiffer(haplotype.getBases(), previousHaplotypeSeen.getBases()) );
previousHaplotypeSeen = haplotype;
final boolean isFirstHaplotype = jjj == 0; final boolean isFirstHaplotype = jjj == 0;
final double log10l = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), final double log10l = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(),
read.getReadBases(), readQuals, readInsQuals, readDelQuals, read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype);
overallGCP, haplotypeStart, isFirstHaplotype);
perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l); perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l);
} }

View File

@ -349,7 +349,6 @@ public class PairHMMIndelErrorModel {
int j=0; int j=0;
byte[] previousHaplotypeSeen = null;
final byte[] contextLogGapOpenProbabilities = new byte[readBases.length]; final byte[] contextLogGapOpenProbabilities = new byte[readBases.length];
final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length]; final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length];
@ -392,34 +391,25 @@ public class PairHMMIndelErrorModel {
final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(),
(int)indStart, (int)indStop); (int)indStart, (int)indStop);
final int X_METRIC_LENGTH = readBases.length+2; if (firstHap) {
final int Y_METRIC_LENGTH = haplotypeBases.length+2;
if (previousHaplotypeSeen == null) {
//no need to reallocate arrays for each new haplotype, as length won't change //no need to reallocate arrays for each new haplotype, as length won't change
pairHMM.initialize(Y_METRIC_LENGTH, X_METRIC_LENGTH); pairHMM.initialize(readBases.length, haplotypeBases.length);
firstHap = false;
} }
int startIndexInHaplotype = 0;
if (previousHaplotypeSeen != null)
startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen);
previousHaplotypeSeen = haplotypeBases.clone();
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals,
baseInsertionQualities, baseDeletionQualities, baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap);
contextLogGapContinuationProbabilities, startIndexInHaplotype, firstHap);
if (DEBUG) { if (DEBUG) {
System.out.println("H:"+new String(haplotypeBases)); System.out.println("H:"+new String(haplotypeBases));
System.out.println("R:"+new String(readBases)); System.out.println("R:"+new String(readBases));
System.out.format("L:%4.2f\n",readLikelihood); System.out.format("L:%4.2f\n",readLikelihood);
System.out.format("StPos:%d\n", startIndexInHaplotype);
} }
perReadAlleleLikelihoodMap.add(p, a, readLikelihood); perReadAlleleLikelihoodMap.add(p, a, readLikelihood);
readLikelihoods[readIdx][j++] = readLikelihood; readLikelihoods[readIdx][j++] = readLikelihood;
firstHap = false;
} }
} }
} }

View File

@ -59,16 +59,20 @@ public final class LoglessPairHMM extends PairHMM {
protected static final double SCALE_FACTOR_LOG10 = 300.0; protected static final double SCALE_FACTOR_LOG10 = 300.0;
protected static final double INITIAL_CONDITION = Math.pow(10, SCALE_FACTOR_LOG10); protected static final double INITIAL_CONDITION = Math.pow(10, SCALE_FACTOR_LOG10);
double[][] transition = null; // The cache private static final int matchToMatch = 0;
double[][] prior = null; // The cache private static final int indelToMatch = 1;
boolean constantsAreInitialized = false; private static final int matchToInsertion = 2;
private static final int insertionToInsertion = 3;
private static final int matchToDeletion = 4;
private static final int deletionToDeletion = 5;
/** /**
* {@inheritDoc} * {@inheritDoc}
*/ */
@Override @Override
public void initialize( final int haplotypeMaxLength, final int readMaxLength) { public void initialize(final int readMaxLength, final int haplotypeMaxLength ) {
super.initialize(haplotypeMaxLength, readMaxLength); super.initialize(readMaxLength, haplotypeMaxLength);
transition = new double[paddedMaxReadLength][6]; transition = new double[paddedMaxReadLength][6];
prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
@ -86,18 +90,22 @@ public final class LoglessPairHMM extends PairHMM {
final byte[] overallGCP, final byte[] overallGCP,
final int hapStartIndex, final int hapStartIndex,
final boolean recacheReadValues ) { final boolean recacheReadValues ) {
if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) {
final double initialValue = INITIAL_CONDITION / haplotypeBases.length;
// set the initial value (free deletions in the beginning) for the first row in the deletion matrix
for( int j = 0; j < paddedHaplotypeLength; j++ ) {
deletionMatrix[0][j] = initialValue;
}
}
if ( ! constantsAreInitialized || recacheReadValues ) if ( ! constantsAreInitialized || recacheReadValues )
initializeProbabilities(haplotypeBases.length, insertionGOP, deletionGOP, overallGCP); initializeProbabilities(insertionGOP, deletionGOP, overallGCP);
initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex); initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex);
// NOTE NOTE NOTE -- because of caching we need to only operate over X and Y according to this for (int i = 1; i < paddedReadLength; i++) {
// read and haplotype lengths, not the max lengths
final int readXMetricLength = readBases.length + 2;
final int hapYMetricLength = haplotypeBases.length + 2;
for (int i = 2; i < readXMetricLength; i++) {
// +1 here is because hapStartIndex is 0-based, but our matrices are 1 based // +1 here is because hapStartIndex is 0-based, but our matrices are 1 based
for (int j = hapStartIndex+1; j < hapYMetricLength; j++) { for (int j = hapStartIndex+1; j < paddedHaplotypeLength; j++) {
updateCell(i, j, prior[i][j], transition[i]); updateCell(i, j, prior[i][j], transition[i]);
} }
} }
@ -105,9 +113,9 @@ public final class LoglessPairHMM extends PairHMM {
// final probability is the log10 sum of the last element in the Match and Insertion state arrays // final probability is the log10 sum of the last element in the Match and Insertion state arrays
// this way we ignore all paths that ended in deletions! (huge) // this way we ignore all paths that ended in deletions! (huge)
// but we have to sum all the paths ending in the M and I matrices, because they're no longer extended. // but we have to sum all the paths ending in the M and I matrices, because they're no longer extended.
final int endI = readXMetricLength - 1; final int endI = paddedReadLength - 1;
double finalSumProbabilities = 0.0; double finalSumProbabilities = 0.0;
for (int j = 0; j < hapYMetricLength; j++) { for (int j = 1; j < paddedHaplotypeLength; j++) {
finalSumProbabilities += matchMatrix[endI][j] + insertionMatrix[endI][j]; finalSumProbabilities += matchMatrix[endI][j] + insertionMatrix[endI][j];
} }
return Math.log10(finalSumProbabilities) - SCALE_FACTOR_LOG10; return Math.log10(finalSumProbabilities) - SCALE_FACTOR_LOG10;
@ -132,7 +140,7 @@ public final class LoglessPairHMM extends PairHMM {
final byte qual = readQuals[i]; final byte qual = readQuals[i];
for (int j = startIndex; j < haplotypeBases.length; j++) { for (int j = startIndex; j < haplotypeBases.length; j++) {
final byte y = haplotypeBases[j]; final byte y = haplotypeBases[j];
prior[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) ); QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) );
} }
} }
@ -151,25 +159,15 @@ public final class LoglessPairHMM extends PairHMM {
"overallGCP != null" "overallGCP != null"
}) })
@Ensures("constantsAreInitialized") @Ensures("constantsAreInitialized")
private void initializeProbabilities(final int haplotypeLength, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { private void initializeProbabilities(final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) {
// the initial condition -- must be here because it needs that actual read and haplotypes, not the maximum in init for (int i = 0; i < insertionGOP.length; i++) {
final double initialValue = INITIAL_CONDITION / haplotypeLength;
matchMatrix[1][1] = initialValue;
// fill in the first row
for( int jjj = 2; jjj < paddedMaxHaplotypeLength; jjj++ ) {
deletionMatrix[1][jjj] = initialValue;
}
final int l = insertionGOP.length;
for (int i = 0; i < l; i++) {
final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE);
transition[i+2][0] = QualityUtils.qualToProb((byte) qualIndexGOP); transition[i+1][matchToMatch] = QualityUtils.qualToProb((byte) qualIndexGOP);
transition[i+2][1] = QualityUtils.qualToProb(overallGCP[i]); transition[i+1][indelToMatch] = QualityUtils.qualToProb(overallGCP[i]);
transition[i+2][2] = QualityUtils.qualToErrorProb(insertionGOP[i]); transition[i+1][matchToInsertion] = QualityUtils.qualToErrorProb(insertionGOP[i]);
transition[i+2][3] = QualityUtils.qualToErrorProb(overallGCP[i]); transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProb(overallGCP[i]);
transition[i+2][4] = QualityUtils.qualToErrorProb(deletionGOP[i]); transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProb(deletionGOP[i]);
transition[i+2][5] = QualityUtils.qualToErrorProb(overallGCP[i]); transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProb(overallGCP[i]);
} }
// note that we initialized the constants // note that we initialized the constants
@ -185,14 +183,14 @@ public final class LoglessPairHMM extends PairHMM {
* @param indI row index in the matrices to update * @param indI row index in the matrices to update
* @param indJ column index in the matrices to update * @param indJ column index in the matrices to update
* @param prior the likelihood editing distance matrix for the read x haplotype * @param prior the likelihood editing distance matrix for the read x haplotype
* @param transitition an array with the six transitition relevant to this location * @param transition an array with the six transition relevant to this location
*/ */
private void updateCell( final int indI, final int indJ, final double prior, final double[] transitition) { private void updateCell( final int indI, final int indJ, final double prior, final double[] transition) {
matchMatrix[indI][indJ] = prior * ( matchMatrix[indI - 1][indJ - 1] * transitition[0] + matchMatrix[indI][indJ] = prior * ( matchMatrix[indI - 1][indJ - 1] * transition[matchToMatch] +
insertionMatrix[indI - 1][indJ - 1] * transitition[1] + insertionMatrix[indI - 1][indJ - 1] * transition[indelToMatch] +
deletionMatrix[indI - 1][indJ - 1] * transitition[1] ); deletionMatrix[indI - 1][indJ - 1] * transition[indelToMatch] );
insertionMatrix[indI][indJ] = matchMatrix[indI - 1][indJ] * transitition[2] + insertionMatrix[indI - 1][indJ] * transitition[3]; insertionMatrix[indI][indJ] = matchMatrix[indI - 1][indJ] * transition[matchToInsertion] + insertionMatrix[indI - 1][indJ] * transition[insertionToInsertion];
deletionMatrix[indI][indJ] = matchMatrix[indI][indJ - 1] * transitition[4] + deletionMatrix[indI][indJ - 1] * transitition[5]; deletionMatrix[indI][indJ] = matchMatrix[indI][indJ - 1] * transition[matchToDeletion] + deletionMatrix[indI][indJ - 1] * transition[deletionToDeletion];
} }
} }

View File

@ -79,6 +79,6 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe
@Test(enabled = true) @Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() {
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "faadc0b77a91a716dbb1191fd579d025"); executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "603416111f34e2a735163fa97e1a8272");
} }
} }

View File

@ -58,7 +58,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe
@Test(enabled = true) @Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","fe715b715526a7c1ebd575ff66bba716"); executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","13de8558acaa0b9082f2df477b45de9b");
} }
@Test(enabled = true) @Test(enabled = true)

View File

@ -72,7 +72,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" + " -o %s" +
" -L 1:10,000,000-10,500,000", " -L 1:10,000,000-10,500,000",
1, 1,
Arrays.asList("51e022d07ead45a4e154f949b6642e84")); Arrays.asList("118ed5b54fc9ce1cde89f06a20afebef"));
executeTest(String.format("test indel caller in SLX"), spec); executeTest(String.format("test indel caller in SLX"), spec);
} }
@ -87,7 +87,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -minIndelCnt 1" + " -minIndelCnt 1" +
" -L 1:10,000,000-10,100,000", " -L 1:10,000,000-10,100,000",
1, 1,
Arrays.asList("1d9c6fda344eeee76cbe4221251dc341")); Arrays.asList("6ef59013331bc031ea37807b325d7d2c"));
executeTest(String.format("test indel caller in SLX with low min allele count"), spec); executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
} }
@ -100,7 +100,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" + " -o %s" +
" -L 1:10,000,000-10,500,000", " -L 1:10,000,000-10,500,000",
1, 1,
Arrays.asList("2ec7262f0a3d04534ce1fe15cc79f52e")); Arrays.asList("dd3ee4675377191e34aaf67335e0219a"));
executeTest(String.format("test indel calling, multiple technologies"), spec); executeTest(String.format("test indel calling, multiple technologies"), spec);
} }
@ -110,7 +110,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("3131cd7c49b623983a106db5228754b3")); Arrays.asList("bb06ef8262f91664b7d2fe7e1e5df195"));
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
} }
@ -120,7 +120,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("273f5daa936e93da98efd6ceb37d7533")); Arrays.asList("0a2a8cc2d1a79e84624836a31de5491c"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
} }
@ -135,7 +135,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
Arrays.asList("00a003a0908281384e981294434a9f3e")); Arrays.asList("939f80c6d2dfb592956aed3bdeaf319d"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
} }
@ -175,7 +175,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
public void testMinIndelFraction0() { public void testMinIndelFraction0() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.0", 1, assessMinIndelFraction + " -minIndelFrac 0.0", 1,
Arrays.asList("87521a1bde124c7c5908ed067060fe45")); Arrays.asList("fc937f92e59dfe07b894411b5dfc166a"));
executeTest("test minIndelFraction 0.0", spec); executeTest("test minIndelFraction 0.0", spec);
} }
@ -183,7 +183,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
public void testMinIndelFraction25() { public void testMinIndelFraction25() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.25", 1, assessMinIndelFraction + " -minIndelFrac 0.25", 1,
Arrays.asList("8a880b8b1662e31e0b5c65733eac6b74")); Arrays.asList("41ad9e0edca4b9987390ba5c07f39e4a"));
executeTest("test minIndelFraction 0.25", spec); executeTest("test minIndelFraction 0.25", spec);
} }

View File

@ -232,7 +232,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" + " -o %s" +
" -L 1:10,000,000-10,100,000", " -L 1:10,000,000-10,100,000",
1, 1,
Arrays.asList("3a805f5b823ccac19aaec01a3016100e")); Arrays.asList("0a4a78da876bfa3d42170249a94357b4"));
executeTest(String.format("test multiple technologies"), spec); executeTest(String.format("test multiple technologies"), spec);
} }
@ -251,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -L 1:10,000,000-10,100,000" + " -L 1:10,000,000-10,100,000" +
" -baq CALCULATE_AS_NECESSARY", " -baq CALCULATE_AS_NECESSARY",
1, 1,
Arrays.asList("25aa0259876692dc3c848a37369bac6a")); Arrays.asList("89182fd4d9532ab4b2a0a84bfb557089"));
executeTest(String.format("test calling with BAQ"), spec); executeTest(String.format("test calling with BAQ"), spec);
} }

View File

@ -64,7 +64,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMultiSamplePilot1() { public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("3f8ee598c9b85aa1d2b85746ad46c1af")); Arrays.asList("52b6086f4597da5b35ab902bea4066fc"));
executeTest("test MultiSample Pilot1", spec); executeTest("test MultiSample Pilot1", spec);
} }
@ -96,7 +96,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMultipleSNPAlleles() { public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
Arrays.asList("31c0f0074b3306b54170056e93b69e11")); Arrays.asList("28bfbff3da3af43d6a1eff673e5cb0f8"));
executeTest("test Multiple SNP alleles", spec); executeTest("test Multiple SNP alleles", spec);
} }
@ -112,7 +112,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testReverseTrim() { public void testReverseTrim() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
Arrays.asList("753d6358b1634107de76900200116805")); Arrays.asList("a9edd04374ee9c42970291f39a50c191"));
executeTest("test reverse trim", spec); executeTest("test reverse trim", spec);
} }
@ -120,7 +120,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMismatchedPLs() { public void testMismatchedPLs() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
Arrays.asList("274eadae8a630a3fda9281d6d6253dea")); Arrays.asList("6fc32ca9de769060f3c2a3d94f8f2f91"));
executeTest("test mismatched PLs", spec); executeTest("test mismatched PLs", spec);
} }
} }

View File

@ -74,7 +74,7 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest {
@Test @Test
public void testReducedBamINDELs() { public void testReducedBamINDELs() {
testReducedCalling("INDEL", "c5939a7f5f85ea2fe994ce912732e180"); testReducedCalling("INDEL", "38c3d14cb9086f7355788d3db9b8ff16");
} }

View File

@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test @Test
public void testHaplotypeCallerMultiSampleComplex() { public void testHaplotypeCallerMultiSampleComplex() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "73817d9173b8d9d05dac1f3092871f33"); HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "7b67ac6213b7a6f759057fb9d7148fdc");
} }
private void HCTestSymbolicVariants(String bam, String args, String md5) { private void HCTestSymbolicVariants(String bam, String args, String md5) {
@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test @Test
public void testHaplotypeCallerMultiSampleGGAComplex() { public void testHaplotypeCallerMultiSampleGGAComplex() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
"8a110549543412fa682419e9a8f0dd1d"); "eb41ed6f1d692368a0f67311d139a38a");
} }
@Test @Test
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
"5429c234d471434adc09d9e60b87de24"); "c4c33c962aca12c51def9b8cde35b7d2");
} }
} }

View File

@ -47,15 +47,12 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller; package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broad.tribble.TribbleIndexedFeatureReader;
import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
import org.broadinstitute.variant.variantcontext.VariantContext; import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.vcf.VCFCodec;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import java.io.File; import java.io.File;
@ -80,12 +77,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void testHaplotypeCallerMultiSample() { public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "008958c211a8a439a7213a96f3dd7f6c"); HCTest(CEUTRIO_BAM, "", "f132843e3c8e065a783cc4fdf9ee5df3");
} }
@Test @Test
public void testHaplotypeCallerSingleSample() { public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "3b60c6133eeadfea028dffea93b88478"); HCTest(NA12878_BAM, "", "15e0201f5c478310d278d2d03483c152");
} }
@Test(enabled = false) // can't annotate the rsID's yet @Test(enabled = false) // can't annotate the rsID's yet
@ -96,7 +93,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void testHaplotypeCallerMultiSampleGGA() { public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
"70bd5d0805bf6f51e5f61b377526c979"); "48d309aed0cdc40cc983eeb5a8d12f53");
} }
@Test @Test
@ -112,7 +109,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() { public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "4141b4c24a136a3fe4c0b0a4c231cdfa"); HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "34c7fcfe17a1d835e2dc403df9eb3591");
} }
private void HCTestNearbySmallIntervals(String bam, String args, String md5) { private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
@ -149,7 +146,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void testHaplotypeCallerNearbySmallIntervals() { public void testHaplotypeCallerNearbySmallIntervals() {
HCTestNearbySmallIntervals(NA12878_BAM, "", "b9d614efdaf38b87b459df421aab93a7"); HCTestNearbySmallIntervals(NA12878_BAM, "", "eae65d20836d6c6ebca9e25e33566f74");
} }
// This problem bam came from a user on the forum and it spotted a problem where the ReadClipper // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper
@ -159,14 +156,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void HCTestProblematicReadsModifiedInActiveRegions() { public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("35a8edeca7518835d67a10de21493eca")); final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("a3d74040a4966bf7a04cbd4924970685"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
} }
@Test @Test
public void HCTestStructuralIndels() { public void HCTestStructuralIndels() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c81d7e69dd4116890f06a71b19870300")); final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("40da88ed3722c512264b72db37f18720"));
executeTest("HCTestStructuralIndels: ", spec); executeTest("HCTestStructuralIndels: ", spec);
} }
@ -188,7 +185,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestReducedBam() { public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("f0a215faed194dc160f19e26293e85f8")); Arrays.asList("69b83d578c14ed32d08ce4e7ff8a8a18"));
executeTest("HC calling on a ReducedRead BAM", spec); executeTest("HC calling on a ReducedRead BAM", spec);
} }
@ -196,7 +193,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void testReducedBamWithReadsNotFullySpanningDeletion() { public void testReducedBamWithReadsNotFullySpanningDeletion() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
Arrays.asList("bea274584344fa6b4b0f98eee327bad8")); Arrays.asList("0cae60d86a3f86854699217a30ece3e3"));
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
} }
} }

View File

@ -67,7 +67,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest {
for ( final int nct : Arrays.asList(1, 2) ) { for ( final int nct : Arrays.asList(1, 2) ) {
// tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct }); // tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct });
//// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct }); //// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct });
tests.add(new Object[]{ "BOTH", "9a1202d849653f0480932f450ec507b4", nt, nct }); tests.add(new Object[]{ "BOTH", "aad3a398273ec795e363268997247bd8", nt, nct });
} }
return tests.toArray(new Object[][]{}); return tests.toArray(new Object[][]{});

View File

@ -142,11 +142,11 @@ public class PairHMMUnitTest extends BaseTest {
} }
public double calcLogL( final PairHMM pairHMM, boolean anchorIndel ) { public double calcLogL( final PairHMM pairHMM, boolean anchorIndel ) {
pairHMM.initialize(refBasesWithContext.length, readBasesWithContext.length); pairHMM.initialize(readBasesWithContext.length, refBasesWithContext.length);
return pairHMM.computeReadLikelihoodGivenHaplotypeLog10( return pairHMM.computeReadLikelihoodGivenHaplotypeLog10(
refBasesWithContext, readBasesWithContext, refBasesWithContext, readBasesWithContext,
qualAsBytes(baseQual, false, anchorIndel), qualAsBytes(insQual, true, anchorIndel), qualAsBytes(delQual, true, anchorIndel), qualAsBytes(baseQual, false, anchorIndel), qualAsBytes(insQual, true, anchorIndel), qualAsBytes(delQual, true, anchorIndel),
qualAsBytes(gcp, false, anchorIndel), 0, true); qualAsBytes(gcp, false, anchorIndel), true);
} }
private byte[] asBytes(final String bases, final boolean left, final boolean right) { private byte[] asBytes(final String bases, final boolean left, final boolean right) {
@ -268,8 +268,8 @@ public class PairHMMUnitTest extends BaseTest {
if ( ALLOW_READS_LONGER_THAN_HAPLOTYPE || cfg.read.length() <= cfg.ref.length() ) { if ( ALLOW_READS_LONGER_THAN_HAPLOTYPE || cfg.read.length() <= cfg.ref.length() ) {
final double exactLogL = cfg.calcLogL( exactHMM, true ); final double exactLogL = cfg.calcLogL( exactHMM, true );
for ( final PairHMM hmm : getHMMs() ) { for ( final PairHMM hmm : getHMMs() ) {
double actualLogL = cfg.calcLogL( hmm, true ); final double actualLogL = cfg.calcLogL( hmm, true );
double expectedLogL = cfg.expectedLogL(); final double expectedLogL = cfg.expectedLogL();
// compare to our theoretical expectation with appropriate tolerance // compare to our theoretical expectation with appropriate tolerance
Assert.assertEquals(actualLogL, expectedLogL, cfg.toleranceFromTheoretical(), "Failed with hmm " + hmm); Assert.assertEquals(actualLogL, expectedLogL, cfg.toleranceFromTheoretical(), "Failed with hmm " + hmm);
@ -283,10 +283,10 @@ public class PairHMMUnitTest extends BaseTest {
@Test(enabled = !DEBUG, dataProvider = "OptimizedLikelihoodTestProvider") @Test(enabled = !DEBUG, dataProvider = "OptimizedLikelihoodTestProvider")
public void testOptimizedLikelihoods(BasicLikelihoodTestProvider cfg) { public void testOptimizedLikelihoods(BasicLikelihoodTestProvider cfg) {
if ( ALLOW_READS_LONGER_THAN_HAPLOTYPE || cfg.read.length() <= cfg.ref.length() ) { if ( ALLOW_READS_LONGER_THAN_HAPLOTYPE || cfg.read.length() <= cfg.ref.length() ) {
double exactLogL = cfg.calcLogL( exactHMM, false ); final double exactLogL = cfg.calcLogL( exactHMM, false );
for ( final PairHMM hmm : getHMMs() ) { for ( final PairHMM hmm : getHMMs() ) {
double calculatedLogL = cfg.calcLogL( hmm, false ); final double calculatedLogL = cfg.calcLogL( hmm, false );
// compare to the exact reference implementation with appropriate tolerance // compare to the exact reference implementation with appropriate tolerance
Assert.assertEquals(calculatedLogL, exactLogL, cfg.getTolerance(hmm), String.format("Test: logL calc=%.2f expected=%.2f for %s with hmm %s", calculatedLogL, exactLogL, cfg.toString(), hmm)); Assert.assertEquals(calculatedLogL, exactLogL, cfg.getTolerance(hmm), String.format("Test: logL calc=%.2f expected=%.2f for %s with hmm %s", calculatedLogL, exactLogL, cfg.toString(), hmm));
Assert.assertTrue(MathUtils.goodLog10Probability(calculatedLogL), "Bad log10 likelihood " + calculatedLogL); Assert.assertTrue(MathUtils.goodLog10Probability(calculatedLogL), "Bad log10 likelihood " + calculatedLogL);
@ -296,64 +296,55 @@ public class PairHMMUnitTest extends BaseTest {
@Test(enabled = !DEBUG) @Test(enabled = !DEBUG)
public void testMismatchInEveryPositionInTheReadWithCenteredHaplotype() { public void testMismatchInEveryPositionInTheReadWithCenteredHaplotype() {
byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes(); final byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes();
final byte matchQual = 90;
final byte mismatchQual = 20;
final byte indelQual = 80;
final int offset = 2; final int offset = 2;
byte[] gop = new byte[haplotype1.length - 2 * offset]; final byte[] gop = new byte[haplotype1.length - 2 * offset];
Arrays.fill(gop, (byte) 80); Arrays.fill(gop, indelQual);
byte[] gcp = new byte[haplotype1.length - 2 * offset]; final byte[] gcp = new byte[haplotype1.length - 2 * offset];
Arrays.fill(gcp, (byte) 80); Arrays.fill(gcp, indelQual);
loglessHMM.initialize(gop.length, haplotype1.length);
for( int k = 0; k < haplotype1.length - 2 * offset; k++ ) { for( int k = 0; k < haplotype1.length - 2 * offset; k++ ) {
byte[] quals = new byte[haplotype1.length - 2 * offset]; final byte[] quals = new byte[haplotype1.length - 2 * offset];
Arrays.fill(quals, (byte) 90); Arrays.fill(quals, matchQual);
// one read mismatches the haplotype // one base mismatches the haplotype
quals[k] = 20; quals[k] = mismatchQual;
final byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset);
byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset);
// change single base at position k to C. If it's a C, change to T // change single base at position k to C. If it's a C, change to T
mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C'); mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C');
originalHMM.initialize(haplotype1.length, mread.length); final double res1 = loglessHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype1, mread, quals, gop, gop, gcp, false);
double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10( final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(matchQual), mread.length-1) * QualityUtils.qualToErrorProb(mismatchQual));
haplotype1, mread,
quals, gop, gop,
gcp, 0, false);
System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);
final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(90), mread.length-1) * QualityUtils.qualToErrorProb(20));
Assert.assertEquals(res1, expected, 1e-2); Assert.assertEquals(res1, expected, 1e-2);
} }
} }
@Test(enabled = ! DEBUG) @Test(enabled = ! DEBUG)
public void testMismatchInEveryPositionInTheRead() { public void testMismatchInEveryPositionInTheRead() {
byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes(); final byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes();
final byte matchQual = 90;
final byte mismatchQual = 20;
final byte indelQual = 80;
final int offset = 2; final int offset = 2;
byte[] gop = new byte[haplotype1.length - offset]; final byte[] gop = new byte[haplotype1.length - offset];
Arrays.fill(gop, (byte) 80); Arrays.fill(gop, indelQual);
byte[] gcp = new byte[haplotype1.length - offset]; final byte[] gcp = new byte[haplotype1.length - offset];
Arrays.fill(gcp, (byte) 80); Arrays.fill(gcp, indelQual);
loglessHMM.initialize(gop.length, haplotype1.length);
for( int k = 0; k < haplotype1.length - offset; k++ ) { for( int k = 0; k < haplotype1.length - offset; k++ ) {
byte[] quals = new byte[haplotype1.length - offset]; final byte[] quals = new byte[haplotype1.length - offset];
Arrays.fill(quals, (byte) 90); Arrays.fill(quals, matchQual);
// one read mismatches the haplotype // one base mismatches the haplotype with low qual
quals[k] = 20; quals[k] = mismatchQual;
final byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length);
byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length);
// change single base at position k to C. If it's a C, change to T // change single base at position k to C. If it's a C, change to T
mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C'); mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C');
originalHMM.initialize(haplotype1.length, mread.length); final double res1 = loglessHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype1, mread, quals, gop, gop, gcp, false);
double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10( final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(matchQual), mread.length-1) * QualityUtils.qualToErrorProb(mismatchQual));
haplotype1, mread,
quals, gop, gop,
gcp, 0, false);
System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);
final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(90), mread.length-1) * QualityUtils.qualToErrorProb(20));
Assert.assertEquals(res1, expected, 1e-2); Assert.assertEquals(res1, expected, 1e-2);
} }
} }
@ -376,35 +367,35 @@ public class PairHMMUnitTest extends BaseTest {
@Test(enabled = !DEBUG, dataProvider = "HMMProvider") @Test(enabled = !DEBUG, dataProvider = "HMMProvider")
void testMultipleReadMatchesInHaplotype(final PairHMM hmm, final int readSize, final int refSize) { void testMultipleReadMatchesInHaplotype(final PairHMM hmm, final int readSize, final int refSize) {
byte[] readBases = Utils.dupBytes((byte)'A', readSize); final byte[] readBases = Utils.dupBytes((byte)'A', readSize);
byte[] refBases = ("CC" + new String(Utils.dupBytes((byte)'A', refSize)) + "GGA").getBytes(); final byte[] refBases = ("CC" + new String(Utils.dupBytes((byte)'A', refSize)) + "GGA").getBytes();
byte baseQual = 20; final byte baseQual = 20;
byte insQual = 37; final byte insQual = 37;
byte delQual = 37; final byte delQual = 37;
byte gcp = 10; final byte gcp = 10;
hmm.initialize(refBases.length, readBases.length); hmm.initialize(readBases.length, refBases.length);
double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, final double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(baseQual, readBases.length),
Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(insQual, readBases.length),
Utils.dupBytes(delQual, readBases.length), Utils.dupBytes(delQual, readBases.length),
Utils.dupBytes(gcp, readBases.length), 0, true); Utils.dupBytes(gcp, readBases.length), true);
Assert.assertTrue(d <= 0.0, "Likelihoods should be <= 0 but got "+ d); Assert.assertTrue(d <= 0.0, "Likelihoods should be <= 0 but got "+ d);
} }
@Test(enabled = !DEBUG, dataProvider = "HMMProvider") @Test(enabled = !DEBUG, dataProvider = "HMMProvider")
void testAllMatchingRead(final PairHMM hmm, final int readSize, final int refSize) { void testAllMatchingRead(final PairHMM hmm, final int readSize, final int refSize) {
byte[] readBases = Utils.dupBytes((byte)'A', readSize); final byte[] readBases = Utils.dupBytes((byte)'A', readSize);
byte[] refBases = Utils.dupBytes((byte)'A', refSize); final byte[] refBases = Utils.dupBytes((byte)'A', refSize);
byte baseQual = 20; final byte baseQual = 20;
byte insQual = 100; final byte insQual = 100;
byte delQual = 100; final byte delQual = 100;
byte gcp = 100; final byte gcp = 100;
hmm.initialize(refBases.length, readBases.length); hmm.initialize(readBases.length, refBases.length);
double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(baseQual, readBases.length),
Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(insQual, readBases.length),
Utils.dupBytes(delQual, readBases.length), Utils.dupBytes(delQual, readBases.length),
Utils.dupBytes(gcp, readBases.length), 0, true); Utils.dupBytes(gcp, readBases.length), true);
double expected = 0; double expected = 0;
final double initialCondition = ((double) Math.abs(refBases.length-readBases.length+1))/refBases.length; final double initialCondition = ((double) Math.abs(refBases.length-readBases.length+1))/refBases.length;
if (readBases.length < refBases.length) { if (readBases.length < refBases.length) {
@ -439,45 +430,42 @@ public class PairHMMUnitTest extends BaseTest {
@Test(enabled = !DEBUG, dataProvider = "HMMProviderWithBigReads") @Test(enabled = !DEBUG, dataProvider = "HMMProviderWithBigReads")
void testReallyBigReads(final PairHMM hmm, final String read, final String ref) { void testReallyBigReads(final PairHMM hmm, final String read, final String ref) {
byte[] readBases = read.getBytes(); final byte[] readBases = read.getBytes();
byte[] refBases = ref.getBytes(); final byte[] refBases = ref.getBytes();
byte baseQual = 30; final byte baseQual = 30;
byte insQual = 40; final byte insQual = 40;
byte delQual = 40; final byte delQual = 40;
byte gcp = 10; final byte gcp = 10;
hmm.initialize(refBases.length, readBases.length); hmm.initialize(readBases.length, refBases.length);
double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(baseQual, readBases.length),
Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(insQual, readBases.length),
Utils.dupBytes(delQual, readBases.length), Utils.dupBytes(delQual, readBases.length),
Utils.dupBytes(gcp, readBases.length), 0, true); Utils.dupBytes(gcp, readBases.length), true);
Assert.assertTrue(MathUtils.goodLog10Probability(d), "Likelihoods = " + d +" was bad for a read with " + read.length() + " bases and ref with " + ref.length() + " bases");
} }
@Test(enabled = !DEBUG) @Test(enabled = !DEBUG)
void testPreviousBadValue() { void testPreviousBadValue() {
byte[] readBases = "A".getBytes(); final byte[] readBases = "A".getBytes();
byte[] refBases = "AT".getBytes(); final byte[] refBases = "AT".getBytes();
byte baseQual = 30; final byte baseQual = 30;
byte insQual = 40; final byte insQual = 40;
byte delQual = 40; final byte delQual = 40;
byte gcp = 10; final byte gcp = 10;
exactHMM.initialize(refBases.length, readBases.length); exactHMM.initialize(readBases.length, refBases.length);
double d = exactHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, exactHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(baseQual, readBases.length),
Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(insQual, readBases.length),
Utils.dupBytes(delQual, readBases.length), Utils.dupBytes(delQual, readBases.length),
Utils.dupBytes(gcp, readBases.length), 0, true); Utils.dupBytes(gcp, readBases.length), true);
//exactHMM.dumpMatrices();
loglessHMM.initialize(refBases.length, readBases.length); loglessHMM.initialize(readBases.length, refBases.length);
double logless = loglessHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, loglessHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(baseQual, readBases.length),
Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(insQual, readBases.length),
Utils.dupBytes(delQual, readBases.length), Utils.dupBytes(delQual, readBases.length),
Utils.dupBytes(gcp, readBases.length), 0, true); Utils.dupBytes(gcp, readBases.length), true);
// loglessHMM.dumpMatrices();
} }
@DataProvider(name = "JustHMMProvider") @DataProvider(name = "JustHMMProvider")
@ -493,25 +481,16 @@ public class PairHMMUnitTest extends BaseTest {
@Test(enabled = !DEBUG, dataProvider = "JustHMMProvider") @Test(enabled = !DEBUG, dataProvider = "JustHMMProvider")
void testMaxLengthsBiggerThanProvidedRead(final PairHMM hmm) { void testMaxLengthsBiggerThanProvidedRead(final PairHMM hmm) {
final byte[] readBases = "CTATCTTAGTAAGCCCCCATACCTGCAAATTTCAGGATGTCTCCTCCAAAAATCAACA".getBytes();
final byte[] refBases = "CTATCTTAGTAAGCCCCCATACCTGCAAATTTCAGGATGTCTCCTCCAAAAATCAAAACTTCTGAGAAAAAAAAAAAAAATTAAATCAAACCCTGATTCCTTAAAGGTAGTAAAAAAACATCATTCTTTCTTAGTGGAATAGAAACTAGGTCAAAAGAACAGTGATTC".getBytes();
final byte[] quals = new byte[]{35,34,31,32,35,34,32,31,36,30,31,32,36,34,33,32,32,32,33,32,30,35,33,35,36,36,33,33,33,32,32,32,37,33,36,35,33,32,34,31,36,35,35,35,35,33,34,31,31,30,28,27,26,29,26,25,29,29};
final byte[] insQual = new byte[]{46,46,46,46,46,47,45,46,45,48,47,44,45,48,46,43,43,42,48,48,45,47,47,48,48,47,48,45,38,47,45,39,47,48,47,47,48,46,49,48,49,48,46,47,48,44,44,43,39,32,34,36,46,48,46,44,45,45};
final byte[] delQual = new byte[]{44,44,44,43,45,44,43,42,45,46,45,43,44,47,45,40,40,40,45,46,43,45,45,44,46,46,46,43,35,44,43,36,44,45,46,46,44,44,47,43,47,45,45,45,46,45,45,46,44,35,35,35,45,47,45,44,44,43};
final byte[] gcp = Utils.dupBytes((byte) 10, delQual.length);
hmm.initialize(readBases.length + 100, refBases.length + 100);
for ( int nExtraMaxSize = 0; nExtraMaxSize < 100; nExtraMaxSize++ ) { for ( int nExtraMaxSize = 0; nExtraMaxSize < 100; nExtraMaxSize++ ) {
byte[] readBases = "CTATCTTAGTAAGCCCCCATACCTGCAAATTTCAGGATGTCTCCTCCAAAAATCAACA".getBytes(); hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, quals, insQual, delQual, gcp, true);
byte[] refBases = "CTATCTTAGTAAGCCCCCATACCTGCAAATTTCAGGATGTCTCCTCCAAAAATCAAAACTTCTGAGAAAAAAAAAAAAAATTAAATCAAACCCTGATTCCTTAAAGGTAGTAAAAAAACATCATTCTTTCTTAGTGGAATAGAAACTAGGTCAAAAGAACAGTGATTC".getBytes();
byte gcp = 10;
byte[] quals = new byte[]{35,34,31,32,35,34,32,31,36,30,31,32,36,34,33,32,32,32,33,32,30,35,33,35,36,36,33,33,33,32,32,32,37,33,36,35,33,32,34,31,36,35,35,35,35,33,34,31,31,30,28,27,26,29,26,25,29,29};
byte[] insQual = new byte[]{46,46,46,46,46,47,45,46,45,48,47,44,45,48,46,43,43,42,48,48,45,47,47,48,48,47,48,45,38,47,45,39,47,48,47,47,48,46,49,48,49,48,46,47,48,44,44,43,39,32,34,36,46,48,46,44,45,45};
byte[] delQual = new byte[]{44,44,44,43,45,44,43,42,45,46,45,43,44,47,45,40,40,40,45,46,43,45,45,44,46,46,46,43,35,44,43,36,44,45,46,46,44,44,47,43,47,45,45,45,46,45,45,46,44,35,35,35,45,47,45,44,44,43};
final int maxHaplotypeLength = refBases.length + nExtraMaxSize;
final int maxReadLength = readBases.length + nExtraMaxSize;
hmm.initialize(maxHaplotypeLength, maxReadLength);
double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
quals,
insQual,
delQual,
Utils.dupBytes(gcp, readBases.length), 0, true);
Assert.assertTrue(MathUtils.goodLog10Probability(d), "Likelihoods = " + d +" was bad for a read with " + readBases.length + " bases and ref with " + refBases.length + " bases");
} }
} }
@ -551,7 +530,7 @@ public class PairHMMUnitTest extends BaseTest {
final int maxHaplotypeLength = prefix.length() + root1.length(); final int maxHaplotypeLength = prefix.length() + root1.length();
// the initialization occurs once, at the start of the evalution of reads // the initialization occurs once, at the start of the evalution of reads
hmm.initialize(maxHaplotypeLength, maxReadLength); hmm.initialize(maxReadLength, maxHaplotypeLength);
for ( int prefixStart = prefix.length(); prefixStart >= 0; prefixStart-- ) { for ( int prefixStart = prefix.length(); prefixStart >= 0; prefixStart-- ) {
final String myPrefix = prefix.substring(prefixStart, prefix.length()); final String myPrefix = prefix.substring(prefixStart, prefix.length());
@ -574,9 +553,7 @@ public class PairHMMUnitTest extends BaseTest {
final byte[] insQuals = Utils.dupBytes((byte)45, readBases.length); final byte[] insQuals = Utils.dupBytes((byte)45, readBases.length);
final byte[] delQuals = Utils.dupBytes((byte)40, readBases.length); final byte[] delQuals = Utils.dupBytes((byte)40, readBases.length);
final byte[] gcp = Utils.dupBytes((byte)10, readBases.length); final byte[] gcp = Utils.dupBytes((byte)10, readBases.length);
double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( double d = hmm.computeReadLikelihoodGivenHaplotypeLog10(hap.getBytes(), readBases, baseQuals, insQuals, delQuals, gcp, recache);
hap.getBytes(), readBases, baseQuals, insQuals, delQuals, gcp,
hapStart, recache);
Assert.assertTrue(MathUtils.goodLog10Probability(d), "Likelihoods = " + d + " was bad for read " + read + " and ref " + hap + " with hapStart " + hapStart); Assert.assertTrue(MathUtils.goodLog10Probability(d), "Likelihoods = " + d + " was bad for read " + read + " and ref " + hap + " with hapStart " + hapStart);
return d; return d;
} }
@ -629,7 +606,7 @@ public class PairHMMUnitTest extends BaseTest {
// didn't call initialize => should exception out // didn't call initialize => should exception out
double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
baseQuals, baseQuals, baseQuals, baseQuals, 0, true); baseQuals, baseQuals, baseQuals, baseQuals, true);
} }
@Test(enabled = true, expectedExceptions = IllegalArgumentException.class, dataProvider = "JustHMMProvider") @Test(enabled = true, expectedExceptions = IllegalArgumentException.class, dataProvider = "JustHMMProvider")
@ -640,7 +617,7 @@ public class PairHMMUnitTest extends BaseTest {
hmm.initialize(3, 3); hmm.initialize(3, 3);
double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
baseQuals, baseQuals, baseQuals, baseQuals, 0, true); baseQuals, baseQuals, baseQuals, baseQuals, true);
} }
@Test(enabled = true, expectedExceptions = IllegalArgumentException.class, dataProvider = "JustHMMProvider") @Test(enabled = true, expectedExceptions = IllegalArgumentException.class, dataProvider = "JustHMMProvider")
@ -649,8 +626,8 @@ public class PairHMMUnitTest extends BaseTest {
byte[] refBases = "AAAT".getBytes(); byte[] refBases = "AAAT".getBytes();
byte[] baseQuals = Utils.dupBytes((byte)30, readBases.length); byte[] baseQuals = Utils.dupBytes((byte)30, readBases.length);
hmm.initialize(3, 2); hmm.initialize(2, 3);
double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
baseQuals, baseQuals, baseQuals, baseQuals, 0, true); baseQuals, baseQuals, baseQuals, baseQuals, true);
} }
} }

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.utils.pairhmm; package org.broadinstitute.sting.utils.pairhmm;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires; import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.QualityUtils;
@ -34,15 +35,22 @@ import java.util.Arrays;
/** /**
* Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book.
* *
* User: rpoplin * User: rpoplin, carneiro
* Date: 3/1/12 * Date: 3/1/12
*/ */
public class Log10PairHMM extends PairHMM { public final class Log10PairHMM extends PairHMM {
/** /**
* Should we use exact log10 calculation (true), or an approximation (false)? * Should we use exact log10 calculation (true), or an approximation (false)?
*/ */
private final boolean doExactLog10; private final boolean doExactLog10;
private static final int matchToMatch = 0;
private static final int indelToMatch = 1;
private static final int matchToInsertion = 2;
private static final int insertionToInsertion = 3;
private static final int matchToDeletion = 4;
private static final int deletionToDeletion = 5;
/** /**
* Create an uninitialized PairHMM * Create an uninitialized PairHMM
* *
@ -64,14 +72,17 @@ public class Log10PairHMM extends PairHMM {
* {@inheritDoc} * {@inheritDoc}
*/ */
@Override @Override
public void initialize( final int haplotypeMaxLength, final int readMaxLength) { public void initialize(final int readMaxLength, final int haplotypeMaxLength ) {
super.initialize(haplotypeMaxLength, readMaxLength); super.initialize(readMaxLength, haplotypeMaxLength);
for( int iii=0; iii < paddedMaxReadLength; iii++ ) { for( int iii=0; iii < paddedMaxReadLength; iii++ ) {
Arrays.fill(matchMatrix[iii], Double.NEGATIVE_INFINITY); Arrays.fill(matchMatrix[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(insertionMatrix[iii], Double.NEGATIVE_INFINITY); Arrays.fill(insertionMatrix[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(deletionMatrix[iii], Double.NEGATIVE_INFINITY); Arrays.fill(deletionMatrix[iii], Double.NEGATIVE_INFINITY);
} }
transition = new double[paddedMaxReadLength][6];
prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
} }
/** /**
@ -86,37 +97,90 @@ public class Log10PairHMM extends PairHMM {
final byte[] overallGCP, final byte[] overallGCP,
final int hapStartIndex, final int hapStartIndex,
final boolean recacheReadValues ) { final boolean recacheReadValues ) {
// the initial condition -- must be in subComputeReadLikelihoodGivenHaplotypeLog10 because it needs that actual
// read and haplotypes, not the maximum
final double initialValue = Math.log10((double) 1/haplotypeBases.length);
matchMatrix[1][1] = initialValue;
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) {
final int X_METRIC_LENGTH = readBases.length + 2; // set the initial value (free deletions in the beginning) for the first row in the deletion matrix
final int Y_METRIC_LENGTH = haplotypeBases.length + 2; final double initialValue = Math.log10(1.0 / haplotypeBases.length);
for( int j = 0; j < paddedHaplotypeLength; j++ ) {
// ensure that all the qual scores have valid values deletionMatrix[0][j] = initialValue;
for( int iii = 0; iii < readQuals.length; iii++ ) {
readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) );
}
// simple rectangular version of update loop, slow
for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) {
for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) {
if( (iii == 1 && jjj == 1) ) { continue; }
updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP,
matchMatrix, insertionMatrix, deletionMatrix);
} }
} }
// final probability is the log10 sum of the last element in all three state arrays if ( ! constantsAreInitialized || recacheReadValues )
final int endI = X_METRIC_LENGTH - 1; initializeProbabilities(insertionGOP, deletionGOP, overallGCP);
double result = myLog10SumLog10(new double[]{matchMatrix[endI][1], insertionMatrix[endI][1]}); initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex);
for (int j = 2; j < Y_METRIC_LENGTH; j++)
result = myLog10SumLog10(new double[]{result, matchMatrix[endI][j], insertionMatrix[endI][j]});
return result; for (int i = 1; i < paddedReadLength; i++) {
// +1 here is because hapStartIndex is 0-based, but our matrices are 1 based
for (int j = hapStartIndex+1; j < paddedHaplotypeLength; j++) {
updateCell(i, j, prior[i][j], transition[i]);
} }
}
// final probability is the log10 sum of the last element in the Match and Insertion state arrays
// this way we ignore all paths that ended in deletions! (huge)
// but we have to sum all the paths ending in the M and I matrices, because they're no longer extended.
final int endI = paddedReadLength - 1;
double finalSumProbabilities = myLog10SumLog10(new double[]{matchMatrix[endI][1], insertionMatrix[endI][1]});
for (int j = 2; j < paddedHaplotypeLength; j++)
finalSumProbabilities = myLog10SumLog10(new double[]{finalSumProbabilities, matchMatrix[endI][j], insertionMatrix[endI][j]});
return finalSumProbabilities;
}
/**
* Initializes the matrix that holds all the constants related to the editing
* distance between the read and the haplotype.
*
* @param haplotypeBases the bases of the haplotype
* @param readBases the bases of the read
* @param readQuals the base quality scores of the read
* @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read)
*/
public void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) {
// initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases
// Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2.
for (int i = 0; i < readBases.length; i++) {
final byte x = readBases[i];
final byte qual = readQuals[i];
for (int j = startIndex; j < haplotypeBases.length; j++) {
final byte y = haplotypeBases[j];
prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
}
}
}
/**
* Initializes the matrix that holds all the constants related to quality scores.
*
* @param insertionGOP insertion quality scores of the read
* @param deletionGOP deletion quality scores of the read
* @param overallGCP overall gap continuation penalty
*/
@Requires({
"insertionGOP != null",
"deletionGOP != null",
"overallGCP != null"
})
@Ensures("constantsAreInitialized")
private void initializeProbabilities(final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) {
for (int i = 0; i < insertionGOP.length; i++) {
final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE);
transition[i+1][matchToMatch] = QualityUtils.qualToProbLog10((byte) qualIndexGOP);
transition[i+1][indelToMatch] = QualityUtils.qualToProbLog10(overallGCP[i]);
transition[i+1][matchToInsertion] = QualityUtils.qualToErrorProbLog10(insertionGOP[i]);
transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProbLog10(overallGCP[i]);
transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProbLog10(deletionGOP[i]);
transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProbLog10(overallGCP[i]);
}
// note that we initialized the constants
constantsAreInitialized = true;
}
/** /**
* Compute the log10SumLog10 of the values * Compute the log10SumLog10 of the values
@ -136,37 +200,24 @@ public class Log10PairHMM extends PairHMM {
return doExactLog10 ? MathUtils.log10sumLog10(values) : MathUtils.approximateLog10SumLog10(values); return doExactLog10 ? MathUtils.log10sumLog10(values) : MathUtils.approximateLog10SumLog10(values);
} }
private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, /**
final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, * Updates a cell in the HMM matrix
final double[][] matchMatrix, final double[][] insertionMatrix, final double[][] deletionMatrix ) { *
* The read and haplotype indices are offset by one because the state arrays have an extra column to hold the
* initial conditions
// the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions * @param indI row index in the matrices to update
final int im1 = indI - 1; * @param indJ column index in the matrices to update
final int jm1 = indJ - 1; * @param prior the likelihood editing distance matrix for the read x haplotype
* @param transition an array with the six transition relevant to this location
*/
private void updateCell( final int indI, final int indJ, final double prior, final double[] transition) {
// update the match array matchMatrix[indI][indJ] = prior +
double pBaseReadLog10 = 0.0; // Math.log10(1.0); myLog10SumLog10(new double[]{matchMatrix[indI - 1][indJ - 1] + transition[matchToMatch],
if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state insertionMatrix[indI - 1][indJ - 1] + transition[indelToMatch],
final byte x = readBases[im1-1]; deletionMatrix[indI - 1][indJ - 1] + transition[indelToMatch]});
final byte y = haplotypeBases[jm1-1]; insertionMatrix[indI][indJ] = myLog10SumLog10(new double[] {matchMatrix[indI - 1][indJ] + transition[matchToInsertion], insertionMatrix[indI - 1][indJ] + transition[insertionToInsertion]});
final byte qual = readQuals[im1-1]; deletionMatrix[indI][indJ] = myLog10SumLog10(new double[] {matchMatrix[indI][indJ - 1] + transition[matchToDeletion], deletionMatrix[indI][indJ - 1] + transition[deletionToDeletion]});
pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
}
final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) );
final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP);
final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) );
matchMatrix[indI][indJ] = pBaseReadLog10 + myLog10SumLog10(new double[]{matchMatrix[indI - 1][indJ - 1] + d0, insertionMatrix[indI - 1][indJ - 1] + e0, deletionMatrix[indI - 1][indJ - 1] + e0});
// update the X (insertion) array
final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) );
final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
insertionMatrix[indI][indJ] = qBaseReadLog10 + myLog10SumLog10(new double[]{matchMatrix[indI - 1][indJ] + d1, insertionMatrix[indI - 1][indJ] + e1});
// update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype
final double d2 = ( im1 == 0 ) ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]);
final double e2 = ( im1 == 0 ) ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]);
final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
deletionMatrix[indI][indJ] = qBaseRefLog10 + myLog10SumLog10(new double[]{matchMatrix[indI][indJ - 1] + d2, deletionMatrix[indI][indJ - 1] + e2});
} }
} }

View File

@ -40,9 +40,11 @@ import java.util.Arrays;
public abstract class PairHMM { public abstract class PairHMM {
protected final static Logger logger = Logger.getLogger(PairHMM.class); protected final static Logger logger = Logger.getLogger(PairHMM.class);
protected static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE; protected double[][] transition = null; // The transition probabilities cache
protected static final byte DEFAULT_GOP = (byte) 45; protected double[][] prior = null; // The prior probabilities cache
protected static final byte DEFAULT_GCP = (byte) 10; protected boolean constantsAreInitialized = false;
protected byte[] previousHaplotypeBases;
public enum HMM_IMPLEMENTATION { public enum HMM_IMPLEMENTATION {
/* Very slow implementation which uses very accurate log10 sum functions. Only meant to be used as a reference test implementation */ /* Very slow implementation which uses very accurate log10 sum functions. Only meant to be used as a reference test implementation */
@ -58,14 +60,18 @@ public abstract class PairHMM {
protected double[][] deletionMatrix = null; protected double[][] deletionMatrix = null;
protected int maxHaplotypeLength, maxReadLength; protected int maxHaplotypeLength, maxReadLength;
protected int paddedMaxReadLength, paddedMaxHaplotypeLength; protected int paddedMaxReadLength, paddedMaxHaplotypeLength;
protected int paddedReadLength, paddedHaplotypeLength;
private boolean initialized = false; private boolean initialized = false;
/** /**
* Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths
*
* Note: Do not worry about padding, just provide the true max length of the read and haplotype. The HMM will take care of the padding.
*
* @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM
* @param readMaxLength the max length of reads we want to use with this PairHMM * @param readMaxLength the max length of reads we want to use with this PairHMM
*/ */
public void initialize( final int haplotypeMaxLength, final int readMaxLength ) { public void initialize( final int readMaxLength, final int haplotypeMaxLength ) {
if ( readMaxLength <= 0 ) throw new IllegalArgumentException("READ_MAX_LENGTH must be > 0 but got " + readMaxLength); if ( readMaxLength <= 0 ) throw new IllegalArgumentException("READ_MAX_LENGTH must be > 0 but got " + readMaxLength);
if ( haplotypeMaxLength <= 0 ) throw new IllegalArgumentException("HAPLOTYPE_MAX_LENGTH must be > 0 but got " + haplotypeMaxLength); if ( haplotypeMaxLength <= 0 ) throw new IllegalArgumentException("HAPLOTYPE_MAX_LENGTH must be > 0 but got " + haplotypeMaxLength);
@ -73,15 +79,21 @@ public abstract class PairHMM {
maxReadLength = readMaxLength; maxReadLength = readMaxLength;
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
paddedMaxReadLength = readMaxLength + 2; paddedMaxReadLength = readMaxLength + 1;
paddedMaxHaplotypeLength = haplotypeMaxLength + 2; paddedMaxHaplotypeLength = haplotypeMaxLength + 1;
matchMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; matchMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
insertionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; insertionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
deletionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; deletionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
previousHaplotypeBases = null;
constantsAreInitialized = false;
initialized = true; initialized = true;
} }
/** /**
* Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion * Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion
* probabilities. * probabilities.
@ -98,8 +110,6 @@ public abstract class PairHMM {
* @param insertionGOP the phred-scaled per base insertion quality scores of read. Must be the same length as readBases * @param insertionGOP the phred-scaled per base insertion quality scores of read. Must be the same length as readBases
* @param deletionGOP the phred-scaled per base deletion quality scores of read. Must be the same length as readBases * @param deletionGOP the phred-scaled per base deletion quality scores of read. Must be the same length as readBases
* @param overallGCP the phred-scaled gap continuation penalties scores of read. Must be the same length as readBases * @param overallGCP the phred-scaled gap continuation penalties scores of read. Must be the same length as readBases
* @param hapStartIndex start the hmm calculation at this offset in haplotype bases. Used in the caching calculation
* where multiple haplotypes are used, and they only diff starting at hapStartIndex
* @param recacheReadValues if false, we don't recalculate any cached results, assuming that readBases and its associated * @param recacheReadValues if false, we don't recalculate any cached results, assuming that readBases and its associated
* parameters are the same, and only the haplotype bases are changing underneath us * parameters are the same, and only the haplotype bases are changing underneath us
* @return the log10 probability of read coming from the haplotype under the provided error model * @return the log10 probability of read coming from the haplotype under the provided error model
@ -110,7 +120,6 @@ public abstract class PairHMM {
final byte[] insertionGOP, final byte[] insertionGOP,
final byte[] deletionGOP, final byte[] deletionGOP,
final byte[] overallGCP, final byte[] overallGCP,
final int hapStartIndex,
final boolean recacheReadValues ) { final boolean recacheReadValues ) {
if ( ! initialized ) throw new IllegalStateException("Must call initialize before calling computeReadLikelihoodGivenHaplotypeLog10"); if ( ! initialized ) throw new IllegalStateException("Must call initialize before calling computeReadLikelihoodGivenHaplotypeLog10");
if ( haplotypeBases == null ) throw new IllegalArgumentException("haplotypeBases cannot be null"); if ( haplotypeBases == null ) throw new IllegalArgumentException("haplotypeBases cannot be null");
@ -121,14 +130,22 @@ public abstract class PairHMM {
if ( insertionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read insertion quals aren't the same size: " + readBases.length + " vs " + insertionGOP.length); if ( insertionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read insertion quals aren't the same size: " + readBases.length + " vs " + insertionGOP.length);
if ( deletionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read deletion quals aren't the same size: " + readBases.length + " vs " + deletionGOP.length); if ( deletionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read deletion quals aren't the same size: " + readBases.length + " vs " + deletionGOP.length);
if ( overallGCP.length != readBases.length ) throw new IllegalArgumentException("Read bases and overall GCP aren't the same size: " + readBases.length + " vs " + overallGCP.length); if ( overallGCP.length != readBases.length ) throw new IllegalArgumentException("Read bases and overall GCP aren't the same size: " + readBases.length + " vs " + overallGCP.length);
if ( hapStartIndex < 0 || hapStartIndex > haplotypeBases.length ) throw new IllegalArgumentException("hapStartIndex is bad, must be between 0 and haplotype length " + haplotypeBases.length + " but got " + hapStartIndex);
paddedReadLength = readBases.length + 1;
paddedHaplotypeLength = haplotypeBases.length + 1;
final int hapStartIndex = (previousHaplotypeBases == null || haplotypeBases.length != previousHaplotypeBases.length ) ? 0 : findFirstPositionWhereHaplotypesDiffer(haplotypeBases, previousHaplotypeBases);
double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, hapStartIndex, recacheReadValues); double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, hapStartIndex, recacheReadValues);
if ( MathUtils.goodLog10Probability(result) ) if ( ! MathUtils.goodLog10Probability(result) )
return result;
else
throw new IllegalStateException("PairHMM Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f", Arrays.toString(haplotypeBases), Arrays.toString(readBases), result)); throw new IllegalStateException("PairHMM Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f", Arrays.toString(haplotypeBases), Arrays.toString(readBases), result));
// Warning: Careful if using the PairHMM in parallel! (this update has to be taken care of).
// Warning: This assumes no downstream modification of the haplotype bases (saves us from copying the array). It is okay for the haplotype caller and the Unified Genotyper.
previousHaplotypeBases = haplotypeBases;
return result;
} }
/** /**