Fix caching indices in the PairHMM
Problem: -------- PairHMM was generating positive likelihoods (even after the re-work of the model) Solution: --------- The caching idices were never re-initializing the initial conditions in the first position of the deletion matrix. Also the match matrix was being wrongly initialized (there is not necessarily a match in the first position). This commit fixes both issues on both the Logless and the Log10 versions of the PairHMM. Summarized Changes: ------------------ * Redesign the matrices to have only 1 col/row of padding instead of 2. * PairHMM class now owns the caching of the haplotype (keeps track of last haplotypes, and decides where the caching should start) * Initial condition (in the deletionMatrix) is now updated every time the haplotypes differ in length (this was wrong in the previous version) * Adjust the prior and probability matrices to be one based (logless) * Update Log10PairHMM to work with prior and probability matrices as well * Move prior and probability matrices to parent class * Move and rename padded lengths to parent class to simplify interface and prevent off by one errors in new implementations * Simple cleanup of PairHMMUnitTest class for a little speedup * Updated HC and UG integration test MD5's because of the new initialization (without enforcing match on first base). * Create static indices for the transition probabilities (for better readability) [fixes #47399227]
This commit is contained in:
parent
56f4529ef3
commit
ebe2edbef3
|
|
@ -106,12 +106,8 @@ public class LikelihoodCalculationEngine {
|
|||
if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; }
|
||||
}
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
X_METRIC_LENGTH += 2;
|
||||
Y_METRIC_LENGTH += 2;
|
||||
|
||||
// initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
|
||||
pairHMM.initialize(Y_METRIC_LENGTH, X_METRIC_LENGTH);
|
||||
pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
|
||||
|
||||
// for each sample's reads
|
||||
for( final Map.Entry<String, List<GATKSAMRecord>> sampleEntry : perSampleReadList.entrySet() ) {
|
||||
|
|
@ -134,7 +130,6 @@ public class LikelihoodCalculationEngine {
|
|||
for( final GATKSAMRecord read : reads ) {
|
||||
final byte[] overallGCP = new byte[read.getReadLength()];
|
||||
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
|
||||
Haplotype previousHaplotypeSeen = null;
|
||||
// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
|
||||
final byte[] readQuals = read.getBaseQualities().clone();
|
||||
final byte[] readInsQuals = read.getBaseInsertionQualities();
|
||||
|
|
@ -149,14 +144,9 @@ public class LikelihoodCalculationEngine {
|
|||
|
||||
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
|
||||
final Haplotype haplotype = haplotypes.get(jjj);
|
||||
|
||||
final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : PairHMM.findFirstPositionWhereHaplotypesDiffer(haplotype.getBases(), previousHaplotypeSeen.getBases()) );
|
||||
previousHaplotypeSeen = haplotype;
|
||||
|
||||
final boolean isFirstHaplotype = jjj == 0;
|
||||
final double log10l = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(),
|
||||
read.getReadBases(), readQuals, readInsQuals, readDelQuals,
|
||||
overallGCP, haplotypeStart, isFirstHaplotype);
|
||||
read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype);
|
||||
|
||||
perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -349,7 +349,6 @@ public class PairHMMIndelErrorModel {
|
|||
|
||||
int j=0;
|
||||
|
||||
byte[] previousHaplotypeSeen = null;
|
||||
final byte[] contextLogGapOpenProbabilities = new byte[readBases.length];
|
||||
final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length];
|
||||
|
||||
|
|
@ -392,34 +391,25 @@ public class PairHMMIndelErrorModel {
|
|||
final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(),
|
||||
(int)indStart, (int)indStop);
|
||||
|
||||
final int X_METRIC_LENGTH = readBases.length+2;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length+2;
|
||||
|
||||
if (previousHaplotypeSeen == null) {
|
||||
if (firstHap) {
|
||||
//no need to reallocate arrays for each new haplotype, as length won't change
|
||||
pairHMM.initialize(Y_METRIC_LENGTH, X_METRIC_LENGTH);
|
||||
pairHMM.initialize(readBases.length, haplotypeBases.length);
|
||||
firstHap = false;
|
||||
}
|
||||
|
||||
int startIndexInHaplotype = 0;
|
||||
if (previousHaplotypeSeen != null)
|
||||
startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen);
|
||||
previousHaplotypeSeen = haplotypeBases.clone();
|
||||
|
||||
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals,
|
||||
baseInsertionQualities, baseDeletionQualities,
|
||||
contextLogGapContinuationProbabilities, startIndexInHaplotype, firstHap);
|
||||
baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap);
|
||||
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("H:"+new String(haplotypeBases));
|
||||
System.out.println("R:"+new String(readBases));
|
||||
System.out.format("L:%4.2f\n",readLikelihood);
|
||||
System.out.format("StPos:%d\n", startIndexInHaplotype);
|
||||
}
|
||||
|
||||
perReadAlleleLikelihoodMap.add(p, a, readLikelihood);
|
||||
readLikelihoods[readIdx][j++] = readLikelihood;
|
||||
firstHap = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -59,16 +59,20 @@ public final class LoglessPairHMM extends PairHMM {
|
|||
protected static final double SCALE_FACTOR_LOG10 = 300.0;
|
||||
protected static final double INITIAL_CONDITION = Math.pow(10, SCALE_FACTOR_LOG10);
|
||||
|
||||
double[][] transition = null; // The cache
|
||||
double[][] prior = null; // The cache
|
||||
boolean constantsAreInitialized = false;
|
||||
private static final int matchToMatch = 0;
|
||||
private static final int indelToMatch = 1;
|
||||
private static final int matchToInsertion = 2;
|
||||
private static final int insertionToInsertion = 3;
|
||||
private static final int matchToDeletion = 4;
|
||||
private static final int deletionToDeletion = 5;
|
||||
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public void initialize( final int haplotypeMaxLength, final int readMaxLength) {
|
||||
super.initialize(haplotypeMaxLength, readMaxLength);
|
||||
public void initialize(final int readMaxLength, final int haplotypeMaxLength ) {
|
||||
super.initialize(readMaxLength, haplotypeMaxLength);
|
||||
|
||||
transition = new double[paddedMaxReadLength][6];
|
||||
prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
|
||||
|
|
@ -86,18 +90,22 @@ public final class LoglessPairHMM extends PairHMM {
|
|||
final byte[] overallGCP,
|
||||
final int hapStartIndex,
|
||||
final boolean recacheReadValues ) {
|
||||
|
||||
if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) {
|
||||
final double initialValue = INITIAL_CONDITION / haplotypeBases.length;
|
||||
// set the initial value (free deletions in the beginning) for the first row in the deletion matrix
|
||||
for( int j = 0; j < paddedHaplotypeLength; j++ ) {
|
||||
deletionMatrix[0][j] = initialValue;
|
||||
}
|
||||
}
|
||||
|
||||
if ( ! constantsAreInitialized || recacheReadValues )
|
||||
initializeProbabilities(haplotypeBases.length, insertionGOP, deletionGOP, overallGCP);
|
||||
initializeProbabilities(insertionGOP, deletionGOP, overallGCP);
|
||||
initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex);
|
||||
|
||||
// NOTE NOTE NOTE -- because of caching we need to only operate over X and Y according to this
|
||||
// read and haplotype lengths, not the max lengths
|
||||
final int readXMetricLength = readBases.length + 2;
|
||||
final int hapYMetricLength = haplotypeBases.length + 2;
|
||||
|
||||
for (int i = 2; i < readXMetricLength; i++) {
|
||||
for (int i = 1; i < paddedReadLength; i++) {
|
||||
// +1 here is because hapStartIndex is 0-based, but our matrices are 1 based
|
||||
for (int j = hapStartIndex+1; j < hapYMetricLength; j++) {
|
||||
for (int j = hapStartIndex+1; j < paddedHaplotypeLength; j++) {
|
||||
updateCell(i, j, prior[i][j], transition[i]);
|
||||
}
|
||||
}
|
||||
|
|
@ -105,9 +113,9 @@ public final class LoglessPairHMM extends PairHMM {
|
|||
// final probability is the log10 sum of the last element in the Match and Insertion state arrays
|
||||
// this way we ignore all paths that ended in deletions! (huge)
|
||||
// but we have to sum all the paths ending in the M and I matrices, because they're no longer extended.
|
||||
final int endI = readXMetricLength - 1;
|
||||
final int endI = paddedReadLength - 1;
|
||||
double finalSumProbabilities = 0.0;
|
||||
for (int j = 0; j < hapYMetricLength; j++) {
|
||||
for (int j = 1; j < paddedHaplotypeLength; j++) {
|
||||
finalSumProbabilities += matchMatrix[endI][j] + insertionMatrix[endI][j];
|
||||
}
|
||||
return Math.log10(finalSumProbabilities) - SCALE_FACTOR_LOG10;
|
||||
|
|
@ -132,7 +140,7 @@ public final class LoglessPairHMM extends PairHMM {
|
|||
final byte qual = readQuals[i];
|
||||
for (int j = startIndex; j < haplotypeBases.length; j++) {
|
||||
final byte y = haplotypeBases[j];
|
||||
prior[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
|
||||
prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
|
||||
QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) );
|
||||
}
|
||||
}
|
||||
|
|
@ -151,25 +159,15 @@ public final class LoglessPairHMM extends PairHMM {
|
|||
"overallGCP != null"
|
||||
})
|
||||
@Ensures("constantsAreInitialized")
|
||||
private void initializeProbabilities(final int haplotypeLength, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) {
|
||||
// the initial condition -- must be here because it needs that actual read and haplotypes, not the maximum in init
|
||||
final double initialValue = INITIAL_CONDITION / haplotypeLength;
|
||||
matchMatrix[1][1] = initialValue;
|
||||
|
||||
// fill in the first row
|
||||
for( int jjj = 2; jjj < paddedMaxHaplotypeLength; jjj++ ) {
|
||||
deletionMatrix[1][jjj] = initialValue;
|
||||
}
|
||||
|
||||
final int l = insertionGOP.length;
|
||||
for (int i = 0; i < l; i++) {
|
||||
private void initializeProbabilities(final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) {
|
||||
for (int i = 0; i < insertionGOP.length; i++) {
|
||||
final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE);
|
||||
transition[i+2][0] = QualityUtils.qualToProb((byte) qualIndexGOP);
|
||||
transition[i+2][1] = QualityUtils.qualToProb(overallGCP[i]);
|
||||
transition[i+2][2] = QualityUtils.qualToErrorProb(insertionGOP[i]);
|
||||
transition[i+2][3] = QualityUtils.qualToErrorProb(overallGCP[i]);
|
||||
transition[i+2][4] = QualityUtils.qualToErrorProb(deletionGOP[i]);
|
||||
transition[i+2][5] = QualityUtils.qualToErrorProb(overallGCP[i]);
|
||||
transition[i+1][matchToMatch] = QualityUtils.qualToProb((byte) qualIndexGOP);
|
||||
transition[i+1][indelToMatch] = QualityUtils.qualToProb(overallGCP[i]);
|
||||
transition[i+1][matchToInsertion] = QualityUtils.qualToErrorProb(insertionGOP[i]);
|
||||
transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProb(overallGCP[i]);
|
||||
transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProb(deletionGOP[i]);
|
||||
transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProb(overallGCP[i]);
|
||||
}
|
||||
|
||||
// note that we initialized the constants
|
||||
|
|
@ -185,14 +183,14 @@ public final class LoglessPairHMM extends PairHMM {
|
|||
* @param indI row index in the matrices to update
|
||||
* @param indJ column index in the matrices to update
|
||||
* @param prior the likelihood editing distance matrix for the read x haplotype
|
||||
* @param transitition an array with the six transitition relevant to this location
|
||||
* @param transition an array with the six transition relevant to this location
|
||||
*/
|
||||
private void updateCell( final int indI, final int indJ, final double prior, final double[] transitition) {
|
||||
private void updateCell( final int indI, final int indJ, final double prior, final double[] transition) {
|
||||
|
||||
matchMatrix[indI][indJ] = prior * ( matchMatrix[indI - 1][indJ - 1] * transitition[0] +
|
||||
insertionMatrix[indI - 1][indJ - 1] * transitition[1] +
|
||||
deletionMatrix[indI - 1][indJ - 1] * transitition[1] );
|
||||
insertionMatrix[indI][indJ] = matchMatrix[indI - 1][indJ] * transitition[2] + insertionMatrix[indI - 1][indJ] * transitition[3];
|
||||
deletionMatrix[indI][indJ] = matchMatrix[indI][indJ - 1] * transitition[4] + deletionMatrix[indI][indJ - 1] * transitition[5];
|
||||
matchMatrix[indI][indJ] = prior * ( matchMatrix[indI - 1][indJ - 1] * transition[matchToMatch] +
|
||||
insertionMatrix[indI - 1][indJ - 1] * transition[indelToMatch] +
|
||||
deletionMatrix[indI - 1][indJ - 1] * transition[indelToMatch] );
|
||||
insertionMatrix[indI][indJ] = matchMatrix[indI - 1][indJ] * transition[matchToInsertion] + insertionMatrix[indI - 1][indJ] * transition[insertionToInsertion];
|
||||
deletionMatrix[indI][indJ] = matchMatrix[indI][indJ - 1] * transition[matchToDeletion] + deletionMatrix[indI][indJ - 1] * transition[deletionToDeletion];
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -79,6 +79,6 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe
|
|||
|
||||
@Test(enabled = true)
|
||||
public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() {
|
||||
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "faadc0b77a91a716dbb1191fd579d025");
|
||||
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "603416111f34e2a735163fa97e1a8272");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe
|
|||
|
||||
@Test(enabled = true)
|
||||
public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
|
||||
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","fe715b715526a7c1ebd575ff66bba716");
|
||||
executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","13de8558acaa0b9082f2df477b45de9b");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
|
|
|
|||
|
|
@ -72,7 +72,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,500,000",
|
||||
1,
|
||||
Arrays.asList("51e022d07ead45a4e154f949b6642e84"));
|
||||
Arrays.asList("118ed5b54fc9ce1cde89f06a20afebef"));
|
||||
|
||||
executeTest(String.format("test indel caller in SLX"), spec);
|
||||
}
|
||||
|
|
@ -87,7 +87,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
|
|||
" -minIndelCnt 1" +
|
||||
" -L 1:10,000,000-10,100,000",
|
||||
1,
|
||||
Arrays.asList("1d9c6fda344eeee76cbe4221251dc341"));
|
||||
Arrays.asList("6ef59013331bc031ea37807b325d7d2c"));
|
||||
|
||||
executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
|
||||
}
|
||||
|
|
@ -100,7 +100,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,500,000",
|
||||
1,
|
||||
Arrays.asList("2ec7262f0a3d04534ce1fe15cc79f52e"));
|
||||
Arrays.asList("dd3ee4675377191e34aaf67335e0219a"));
|
||||
|
||||
executeTest(String.format("test indel calling, multiple technologies"), spec);
|
||||
}
|
||||
|
|
@ -110,7 +110,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
|
|||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
|
||||
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
|
||||
Arrays.asList("3131cd7c49b623983a106db5228754b3"));
|
||||
Arrays.asList("bb06ef8262f91664b7d2fe7e1e5df195"));
|
||||
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
|
||||
}
|
||||
|
||||
|
|
@ -120,7 +120,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
|
|||
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
|
||||
+ privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
|
||||
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
|
||||
Arrays.asList("273f5daa936e93da98efd6ceb37d7533"));
|
||||
Arrays.asList("0a2a8cc2d1a79e84624836a31de5491c"));
|
||||
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
|
||||
}
|
||||
|
||||
|
|
@ -135,7 +135,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
|
|||
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
|
||||
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
|
||||
Arrays.asList("00a003a0908281384e981294434a9f3e"));
|
||||
Arrays.asList("939f80c6d2dfb592956aed3bdeaf319d"));
|
||||
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
|
||||
}
|
||||
|
||||
|
|
@ -175,7 +175,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
|
|||
public void testMinIndelFraction0() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
assessMinIndelFraction + " -minIndelFrac 0.0", 1,
|
||||
Arrays.asList("87521a1bde124c7c5908ed067060fe45"));
|
||||
Arrays.asList("fc937f92e59dfe07b894411b5dfc166a"));
|
||||
executeTest("test minIndelFraction 0.0", spec);
|
||||
}
|
||||
|
||||
|
|
@ -183,7 +183,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
|
|||
public void testMinIndelFraction25() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
assessMinIndelFraction + " -minIndelFrac 0.25", 1,
|
||||
Arrays.asList("8a880b8b1662e31e0b5c65733eac6b74"));
|
||||
Arrays.asList("41ad9e0edca4b9987390ba5c07f39e4a"));
|
||||
executeTest("test minIndelFraction 0.25", spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -232,7 +232,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,100,000",
|
||||
1,
|
||||
Arrays.asList("3a805f5b823ccac19aaec01a3016100e"));
|
||||
Arrays.asList("0a4a78da876bfa3d42170249a94357b4"));
|
||||
|
||||
executeTest(String.format("test multiple technologies"), spec);
|
||||
}
|
||||
|
|
@ -251,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -L 1:10,000,000-10,100,000" +
|
||||
" -baq CALCULATE_AS_NECESSARY",
|
||||
1,
|
||||
Arrays.asList("25aa0259876692dc3c848a37369bac6a"));
|
||||
Arrays.asList("89182fd4d9532ab4b2a0a84bfb557089"));
|
||||
|
||||
executeTest(String.format("test calling with BAQ"), spec);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -64,7 +64,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
|
|||
public void testMultiSamplePilot1() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
|
||||
Arrays.asList("3f8ee598c9b85aa1d2b85746ad46c1af"));
|
||||
Arrays.asList("52b6086f4597da5b35ab902bea4066fc"));
|
||||
executeTest("test MultiSample Pilot1", spec);
|
||||
}
|
||||
|
||||
|
|
@ -96,7 +96,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
|
|||
public void testMultipleSNPAlleles() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
|
||||
Arrays.asList("31c0f0074b3306b54170056e93b69e11"));
|
||||
Arrays.asList("28bfbff3da3af43d6a1eff673e5cb0f8"));
|
||||
executeTest("test Multiple SNP alleles", spec);
|
||||
}
|
||||
|
||||
|
|
@ -112,7 +112,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
|
|||
public void testReverseTrim() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
|
||||
Arrays.asList("753d6358b1634107de76900200116805"));
|
||||
Arrays.asList("a9edd04374ee9c42970291f39a50c191"));
|
||||
executeTest("test reverse trim", spec);
|
||||
}
|
||||
|
||||
|
|
@ -120,7 +120,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
|
|||
public void testMismatchedPLs() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
|
||||
Arrays.asList("274eadae8a630a3fda9281d6d6253dea"));
|
||||
Arrays.asList("6fc32ca9de769060f3c2a3d94f8f2f91"));
|
||||
executeTest("test mismatched PLs", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -74,7 +74,7 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testReducedBamINDELs() {
|
||||
testReducedCalling("INDEL", "c5939a7f5f85ea2fe994ce912732e180");
|
||||
testReducedCalling("INDEL", "38c3d14cb9086f7355788d3db9b8ff16");
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleComplex() {
|
||||
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "73817d9173b8d9d05dac1f3092871f33");
|
||||
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "7b67ac6213b7a6f759057fb9d7148fdc");
|
||||
}
|
||||
|
||||
private void HCTestSymbolicVariants(String bam, String args, String md5) {
|
||||
|
|
@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
|
|||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGAComplex() {
|
||||
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538",
|
||||
"8a110549543412fa682419e9a8f0dd1d");
|
||||
"eb41ed6f1d692368a0f67311d139a38a");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
|
||||
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
|
||||
"5429c234d471434adc09d9e60b87de24");
|
||||
"c4c33c962aca12c51def9b8cde35b7d2");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,15 +47,12 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broad.tribble.TribbleIndexedFeatureReader;
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.vcf.VCFCodec;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -80,12 +77,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSample() {
|
||||
HCTest(CEUTRIO_BAM, "", "008958c211a8a439a7213a96f3dd7f6c");
|
||||
HCTest(CEUTRIO_BAM, "", "f132843e3c8e065a783cc4fdf9ee5df3");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSample() {
|
||||
HCTest(NA12878_BAM, "", "3b60c6133eeadfea028dffea93b88478");
|
||||
HCTest(NA12878_BAM, "", "15e0201f5c478310d278d2d03483c152");
|
||||
}
|
||||
|
||||
@Test(enabled = false) // can't annotate the rsID's yet
|
||||
|
|
@ -96,7 +93,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGA() {
|
||||
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
|
||||
"70bd5d0805bf6f51e5f61b377526c979");
|
||||
"48d309aed0cdc40cc983eeb5a8d12f53");
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
@ -112,7 +109,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "4141b4c24a136a3fe4c0b0a4c231cdfa");
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "34c7fcfe17a1d835e2dc403df9eb3591");
|
||||
}
|
||||
|
||||
private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
|
||||
|
|
@ -149,7 +146,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerNearbySmallIntervals() {
|
||||
HCTestNearbySmallIntervals(NA12878_BAM, "", "b9d614efdaf38b87b459df421aab93a7");
|
||||
HCTestNearbySmallIntervals(NA12878_BAM, "", "eae65d20836d6c6ebca9e25e33566f74");
|
||||
}
|
||||
|
||||
// This problem bam came from a user on the forum and it spotted a problem where the ReadClipper
|
||||
|
|
@ -159,14 +156,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void HCTestProblematicReadsModifiedInActiveRegions() {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("35a8edeca7518835d67a10de21493eca"));
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("a3d74040a4966bf7a04cbd4924970685"));
|
||||
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void HCTestStructuralIndels() {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c81d7e69dd4116890f06a71b19870300"));
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("40da88ed3722c512264b72db37f18720"));
|
||||
executeTest("HCTestStructuralIndels: ", spec);
|
||||
}
|
||||
|
||||
|
|
@ -188,7 +185,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void HCTestReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("f0a215faed194dc160f19e26293e85f8"));
|
||||
Arrays.asList("69b83d578c14ed32d08ce4e7ff8a8a18"));
|
||||
executeTest("HC calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
|
||||
|
|
@ -196,7 +193,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void testReducedBamWithReadsNotFullySpanningDeletion() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
|
||||
Arrays.asList("bea274584344fa6b4b0f98eee327bad8"));
|
||||
Arrays.asList("0cae60d86a3f86854699217a30ece3e3"));
|
||||
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -67,7 +67,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest {
|
|||
for ( final int nct : Arrays.asList(1, 2) ) {
|
||||
// tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct });
|
||||
//// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct });
|
||||
tests.add(new Object[]{ "BOTH", "9a1202d849653f0480932f450ec507b4", nt, nct });
|
||||
tests.add(new Object[]{ "BOTH", "aad3a398273ec795e363268997247bd8", nt, nct });
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
|
|
|
|||
|
|
@ -142,11 +142,11 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
public double calcLogL( final PairHMM pairHMM, boolean anchorIndel ) {
|
||||
pairHMM.initialize(refBasesWithContext.length, readBasesWithContext.length);
|
||||
pairHMM.initialize(readBasesWithContext.length, refBasesWithContext.length);
|
||||
return pairHMM.computeReadLikelihoodGivenHaplotypeLog10(
|
||||
refBasesWithContext, readBasesWithContext,
|
||||
qualAsBytes(baseQual, false, anchorIndel), qualAsBytes(insQual, true, anchorIndel), qualAsBytes(delQual, true, anchorIndel),
|
||||
qualAsBytes(gcp, false, anchorIndel), 0, true);
|
||||
qualAsBytes(gcp, false, anchorIndel), true);
|
||||
}
|
||||
|
||||
private byte[] asBytes(final String bases, final boolean left, final boolean right) {
|
||||
|
|
@ -268,8 +268,8 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
if ( ALLOW_READS_LONGER_THAN_HAPLOTYPE || cfg.read.length() <= cfg.ref.length() ) {
|
||||
final double exactLogL = cfg.calcLogL( exactHMM, true );
|
||||
for ( final PairHMM hmm : getHMMs() ) {
|
||||
double actualLogL = cfg.calcLogL( hmm, true );
|
||||
double expectedLogL = cfg.expectedLogL();
|
||||
final double actualLogL = cfg.calcLogL( hmm, true );
|
||||
final double expectedLogL = cfg.expectedLogL();
|
||||
|
||||
// compare to our theoretical expectation with appropriate tolerance
|
||||
Assert.assertEquals(actualLogL, expectedLogL, cfg.toleranceFromTheoretical(), "Failed with hmm " + hmm);
|
||||
|
|
@ -283,10 +283,10 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
@Test(enabled = !DEBUG, dataProvider = "OptimizedLikelihoodTestProvider")
|
||||
public void testOptimizedLikelihoods(BasicLikelihoodTestProvider cfg) {
|
||||
if ( ALLOW_READS_LONGER_THAN_HAPLOTYPE || cfg.read.length() <= cfg.ref.length() ) {
|
||||
double exactLogL = cfg.calcLogL( exactHMM, false );
|
||||
final double exactLogL = cfg.calcLogL( exactHMM, false );
|
||||
|
||||
for ( final PairHMM hmm : getHMMs() ) {
|
||||
double calculatedLogL = cfg.calcLogL( hmm, false );
|
||||
final double calculatedLogL = cfg.calcLogL( hmm, false );
|
||||
// compare to the exact reference implementation with appropriate tolerance
|
||||
Assert.assertEquals(calculatedLogL, exactLogL, cfg.getTolerance(hmm), String.format("Test: logL calc=%.2f expected=%.2f for %s with hmm %s", calculatedLogL, exactLogL, cfg.toString(), hmm));
|
||||
Assert.assertTrue(MathUtils.goodLog10Probability(calculatedLogL), "Bad log10 likelihood " + calculatedLogL);
|
||||
|
|
@ -296,64 +296,55 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
|
||||
@Test(enabled = !DEBUG)
|
||||
public void testMismatchInEveryPositionInTheReadWithCenteredHaplotype() {
|
||||
byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes();
|
||||
|
||||
final byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes();
|
||||
final byte matchQual = 90;
|
||||
final byte mismatchQual = 20;
|
||||
final byte indelQual = 80;
|
||||
final int offset = 2;
|
||||
byte[] gop = new byte[haplotype1.length - 2 * offset];
|
||||
Arrays.fill(gop, (byte) 80);
|
||||
byte[] gcp = new byte[haplotype1.length - 2 * offset];
|
||||
Arrays.fill(gcp, (byte) 80);
|
||||
final byte[] gop = new byte[haplotype1.length - 2 * offset];
|
||||
Arrays.fill(gop, indelQual);
|
||||
final byte[] gcp = new byte[haplotype1.length - 2 * offset];
|
||||
Arrays.fill(gcp, indelQual);
|
||||
loglessHMM.initialize(gop.length, haplotype1.length);
|
||||
|
||||
for( int k = 0; k < haplotype1.length - 2 * offset; k++ ) {
|
||||
byte[] quals = new byte[haplotype1.length - 2 * offset];
|
||||
Arrays.fill(quals, (byte) 90);
|
||||
// one read mismatches the haplotype
|
||||
quals[k] = 20;
|
||||
|
||||
byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset);
|
||||
final byte[] quals = new byte[haplotype1.length - 2 * offset];
|
||||
Arrays.fill(quals, matchQual);
|
||||
// one base mismatches the haplotype
|
||||
quals[k] = mismatchQual;
|
||||
final byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset);
|
||||
// change single base at position k to C. If it's a C, change to T
|
||||
mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C');
|
||||
originalHMM.initialize(haplotype1.length, mread.length);
|
||||
double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10(
|
||||
haplotype1, mread,
|
||||
quals, gop, gop,
|
||||
gcp, 0, false);
|
||||
|
||||
System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);
|
||||
|
||||
final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(90), mread.length-1) * QualityUtils.qualToErrorProb(20));
|
||||
final double res1 = loglessHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype1, mread, quals, gop, gop, gcp, false);
|
||||
final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(matchQual), mread.length-1) * QualityUtils.qualToErrorProb(mismatchQual));
|
||||
Assert.assertEquals(res1, expected, 1e-2);
|
||||
}
|
||||
}
|
||||
|
||||
@Test(enabled = ! DEBUG)
|
||||
public void testMismatchInEveryPositionInTheRead() {
|
||||
byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes();
|
||||
final byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes();
|
||||
final byte matchQual = 90;
|
||||
final byte mismatchQual = 20;
|
||||
final byte indelQual = 80;
|
||||
|
||||
final int offset = 2;
|
||||
byte[] gop = new byte[haplotype1.length - offset];
|
||||
Arrays.fill(gop, (byte) 80);
|
||||
byte[] gcp = new byte[haplotype1.length - offset];
|
||||
Arrays.fill(gcp, (byte) 80);
|
||||
final byte[] gop = new byte[haplotype1.length - offset];
|
||||
Arrays.fill(gop, indelQual);
|
||||
final byte[] gcp = new byte[haplotype1.length - offset];
|
||||
Arrays.fill(gcp, indelQual);
|
||||
loglessHMM.initialize(gop.length, haplotype1.length);
|
||||
|
||||
for( int k = 0; k < haplotype1.length - offset; k++ ) {
|
||||
byte[] quals = new byte[haplotype1.length - offset];
|
||||
Arrays.fill(quals, (byte) 90);
|
||||
// one read mismatches the haplotype
|
||||
quals[k] = 20;
|
||||
|
||||
byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length);
|
||||
final byte[] quals = new byte[haplotype1.length - offset];
|
||||
Arrays.fill(quals, matchQual);
|
||||
// one base mismatches the haplotype with low qual
|
||||
quals[k] = mismatchQual;
|
||||
final byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length);
|
||||
// change single base at position k to C. If it's a C, change to T
|
||||
mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C');
|
||||
originalHMM.initialize(haplotype1.length, mread.length);
|
||||
double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10(
|
||||
haplotype1, mread,
|
||||
quals, gop, gop,
|
||||
gcp, 0, false);
|
||||
|
||||
System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);
|
||||
|
||||
final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(90), mread.length-1) * QualityUtils.qualToErrorProb(20));
|
||||
final double res1 = loglessHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype1, mread, quals, gop, gop, gcp, false);
|
||||
final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(matchQual), mread.length-1) * QualityUtils.qualToErrorProb(mismatchQual));
|
||||
Assert.assertEquals(res1, expected, 1e-2);
|
||||
}
|
||||
}
|
||||
|
|
@ -376,35 +367,35 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
|
||||
@Test(enabled = !DEBUG, dataProvider = "HMMProvider")
|
||||
void testMultipleReadMatchesInHaplotype(final PairHMM hmm, final int readSize, final int refSize) {
|
||||
byte[] readBases = Utils.dupBytes((byte)'A', readSize);
|
||||
byte[] refBases = ("CC" + new String(Utils.dupBytes((byte)'A', refSize)) + "GGA").getBytes();
|
||||
byte baseQual = 20;
|
||||
byte insQual = 37;
|
||||
byte delQual = 37;
|
||||
byte gcp = 10;
|
||||
hmm.initialize(refBases.length, readBases.length);
|
||||
double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
|
||||
final byte[] readBases = Utils.dupBytes((byte)'A', readSize);
|
||||
final byte[] refBases = ("CC" + new String(Utils.dupBytes((byte)'A', refSize)) + "GGA").getBytes();
|
||||
final byte baseQual = 20;
|
||||
final byte insQual = 37;
|
||||
final byte delQual = 37;
|
||||
final byte gcp = 10;
|
||||
hmm.initialize(readBases.length, refBases.length);
|
||||
final double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
|
||||
Utils.dupBytes(baseQual, readBases.length),
|
||||
Utils.dupBytes(insQual, readBases.length),
|
||||
Utils.dupBytes(delQual, readBases.length),
|
||||
Utils.dupBytes(gcp, readBases.length), 0, true);
|
||||
Utils.dupBytes(gcp, readBases.length), true);
|
||||
Assert.assertTrue(d <= 0.0, "Likelihoods should be <= 0 but got "+ d);
|
||||
}
|
||||
|
||||
@Test(enabled = !DEBUG, dataProvider = "HMMProvider")
|
||||
void testAllMatchingRead(final PairHMM hmm, final int readSize, final int refSize) {
|
||||
byte[] readBases = Utils.dupBytes((byte)'A', readSize);
|
||||
byte[] refBases = Utils.dupBytes((byte)'A', refSize);
|
||||
byte baseQual = 20;
|
||||
byte insQual = 100;
|
||||
byte delQual = 100;
|
||||
byte gcp = 100;
|
||||
hmm.initialize(refBases.length, readBases.length);
|
||||
final byte[] readBases = Utils.dupBytes((byte)'A', readSize);
|
||||
final byte[] refBases = Utils.dupBytes((byte)'A', refSize);
|
||||
final byte baseQual = 20;
|
||||
final byte insQual = 100;
|
||||
final byte delQual = 100;
|
||||
final byte gcp = 100;
|
||||
hmm.initialize(readBases.length, refBases.length);
|
||||
double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
|
||||
Utils.dupBytes(baseQual, readBases.length),
|
||||
Utils.dupBytes(insQual, readBases.length),
|
||||
Utils.dupBytes(delQual, readBases.length),
|
||||
Utils.dupBytes(gcp, readBases.length), 0, true);
|
||||
Utils.dupBytes(gcp, readBases.length), true);
|
||||
double expected = 0;
|
||||
final double initialCondition = ((double) Math.abs(refBases.length-readBases.length+1))/refBases.length;
|
||||
if (readBases.length < refBases.length) {
|
||||
|
|
@ -439,45 +430,42 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
|
||||
@Test(enabled = !DEBUG, dataProvider = "HMMProviderWithBigReads")
|
||||
void testReallyBigReads(final PairHMM hmm, final String read, final String ref) {
|
||||
byte[] readBases = read.getBytes();
|
||||
byte[] refBases = ref.getBytes();
|
||||
byte baseQual = 30;
|
||||
byte insQual = 40;
|
||||
byte delQual = 40;
|
||||
byte gcp = 10;
|
||||
hmm.initialize(refBases.length, readBases.length);
|
||||
double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
|
||||
final byte[] readBases = read.getBytes();
|
||||
final byte[] refBases = ref.getBytes();
|
||||
final byte baseQual = 30;
|
||||
final byte insQual = 40;
|
||||
final byte delQual = 40;
|
||||
final byte gcp = 10;
|
||||
hmm.initialize(readBases.length, refBases.length);
|
||||
hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
|
||||
Utils.dupBytes(baseQual, readBases.length),
|
||||
Utils.dupBytes(insQual, readBases.length),
|
||||
Utils.dupBytes(delQual, readBases.length),
|
||||
Utils.dupBytes(gcp, readBases.length), 0, true);
|
||||
Assert.assertTrue(MathUtils.goodLog10Probability(d), "Likelihoods = " + d +" was bad for a read with " + read.length() + " bases and ref with " + ref.length() + " bases");
|
||||
Utils.dupBytes(gcp, readBases.length), true);
|
||||
}
|
||||
|
||||
@Test(enabled = !DEBUG)
|
||||
void testPreviousBadValue() {
|
||||
byte[] readBases = "A".getBytes();
|
||||
byte[] refBases = "AT".getBytes();
|
||||
byte baseQual = 30;
|
||||
byte insQual = 40;
|
||||
byte delQual = 40;
|
||||
byte gcp = 10;
|
||||
final byte[] readBases = "A".getBytes();
|
||||
final byte[] refBases = "AT".getBytes();
|
||||
final byte baseQual = 30;
|
||||
final byte insQual = 40;
|
||||
final byte delQual = 40;
|
||||
final byte gcp = 10;
|
||||
|
||||
exactHMM.initialize(refBases.length, readBases.length);
|
||||
double d = exactHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
|
||||
exactHMM.initialize(readBases.length, refBases.length);
|
||||
exactHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
|
||||
Utils.dupBytes(baseQual, readBases.length),
|
||||
Utils.dupBytes(insQual, readBases.length),
|
||||
Utils.dupBytes(delQual, readBases.length),
|
||||
Utils.dupBytes(gcp, readBases.length), 0, true);
|
||||
//exactHMM.dumpMatrices();
|
||||
Utils.dupBytes(gcp, readBases.length), true);
|
||||
|
||||
loglessHMM.initialize(refBases.length, readBases.length);
|
||||
double logless = loglessHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
|
||||
loglessHMM.initialize(readBases.length, refBases.length);
|
||||
loglessHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
|
||||
Utils.dupBytes(baseQual, readBases.length),
|
||||
Utils.dupBytes(insQual, readBases.length),
|
||||
Utils.dupBytes(delQual, readBases.length),
|
||||
Utils.dupBytes(gcp, readBases.length), 0, true);
|
||||
// loglessHMM.dumpMatrices();
|
||||
Utils.dupBytes(gcp, readBases.length), true);
|
||||
}
|
||||
|
||||
@DataProvider(name = "JustHMMProvider")
|
||||
|
|
@ -493,25 +481,16 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
|
||||
@Test(enabled = !DEBUG, dataProvider = "JustHMMProvider")
|
||||
void testMaxLengthsBiggerThanProvidedRead(final PairHMM hmm) {
|
||||
final byte[] readBases = "CTATCTTAGTAAGCCCCCATACCTGCAAATTTCAGGATGTCTCCTCCAAAAATCAACA".getBytes();
|
||||
final byte[] refBases = "CTATCTTAGTAAGCCCCCATACCTGCAAATTTCAGGATGTCTCCTCCAAAAATCAAAACTTCTGAGAAAAAAAAAAAAAATTAAATCAAACCCTGATTCCTTAAAGGTAGTAAAAAAACATCATTCTTTCTTAGTGGAATAGAAACTAGGTCAAAAGAACAGTGATTC".getBytes();
|
||||
|
||||
final byte[] quals = new byte[]{35,34,31,32,35,34,32,31,36,30,31,32,36,34,33,32,32,32,33,32,30,35,33,35,36,36,33,33,33,32,32,32,37,33,36,35,33,32,34,31,36,35,35,35,35,33,34,31,31,30,28,27,26,29,26,25,29,29};
|
||||
final byte[] insQual = new byte[]{46,46,46,46,46,47,45,46,45,48,47,44,45,48,46,43,43,42,48,48,45,47,47,48,48,47,48,45,38,47,45,39,47,48,47,47,48,46,49,48,49,48,46,47,48,44,44,43,39,32,34,36,46,48,46,44,45,45};
|
||||
final byte[] delQual = new byte[]{44,44,44,43,45,44,43,42,45,46,45,43,44,47,45,40,40,40,45,46,43,45,45,44,46,46,46,43,35,44,43,36,44,45,46,46,44,44,47,43,47,45,45,45,46,45,45,46,44,35,35,35,45,47,45,44,44,43};
|
||||
final byte[] gcp = Utils.dupBytes((byte) 10, delQual.length);
|
||||
hmm.initialize(readBases.length + 100, refBases.length + 100);
|
||||
for ( int nExtraMaxSize = 0; nExtraMaxSize < 100; nExtraMaxSize++ ) {
|
||||
byte[] readBases = "CTATCTTAGTAAGCCCCCATACCTGCAAATTTCAGGATGTCTCCTCCAAAAATCAACA".getBytes();
|
||||
byte[] refBases = "CTATCTTAGTAAGCCCCCATACCTGCAAATTTCAGGATGTCTCCTCCAAAAATCAAAACTTCTGAGAAAAAAAAAAAAAATTAAATCAAACCCTGATTCCTTAAAGGTAGTAAAAAAACATCATTCTTTCTTAGTGGAATAGAAACTAGGTCAAAAGAACAGTGATTC".getBytes();
|
||||
byte gcp = 10;
|
||||
|
||||
byte[] quals = new byte[]{35,34,31,32,35,34,32,31,36,30,31,32,36,34,33,32,32,32,33,32,30,35,33,35,36,36,33,33,33,32,32,32,37,33,36,35,33,32,34,31,36,35,35,35,35,33,34,31,31,30,28,27,26,29,26,25,29,29};
|
||||
byte[] insQual = new byte[]{46,46,46,46,46,47,45,46,45,48,47,44,45,48,46,43,43,42,48,48,45,47,47,48,48,47,48,45,38,47,45,39,47,48,47,47,48,46,49,48,49,48,46,47,48,44,44,43,39,32,34,36,46,48,46,44,45,45};
|
||||
byte[] delQual = new byte[]{44,44,44,43,45,44,43,42,45,46,45,43,44,47,45,40,40,40,45,46,43,45,45,44,46,46,46,43,35,44,43,36,44,45,46,46,44,44,47,43,47,45,45,45,46,45,45,46,44,35,35,35,45,47,45,44,44,43};
|
||||
|
||||
final int maxHaplotypeLength = refBases.length + nExtraMaxSize;
|
||||
final int maxReadLength = readBases.length + nExtraMaxSize;
|
||||
|
||||
hmm.initialize(maxHaplotypeLength, maxReadLength);
|
||||
double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
|
||||
quals,
|
||||
insQual,
|
||||
delQual,
|
||||
Utils.dupBytes(gcp, readBases.length), 0, true);
|
||||
Assert.assertTrue(MathUtils.goodLog10Probability(d), "Likelihoods = " + d +" was bad for a read with " + readBases.length + " bases and ref with " + refBases.length + " bases");
|
||||
hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, quals, insQual, delQual, gcp, true);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -551,7 +530,7 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
final int maxHaplotypeLength = prefix.length() + root1.length();
|
||||
|
||||
// the initialization occurs once, at the start of the evalution of reads
|
||||
hmm.initialize(maxHaplotypeLength, maxReadLength);
|
||||
hmm.initialize(maxReadLength, maxHaplotypeLength);
|
||||
|
||||
for ( int prefixStart = prefix.length(); prefixStart >= 0; prefixStart-- ) {
|
||||
final String myPrefix = prefix.substring(prefixStart, prefix.length());
|
||||
|
|
@ -574,9 +553,7 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
final byte[] insQuals = Utils.dupBytes((byte)45, readBases.length);
|
||||
final byte[] delQuals = Utils.dupBytes((byte)40, readBases.length);
|
||||
final byte[] gcp = Utils.dupBytes((byte)10, readBases.length);
|
||||
double d = hmm.computeReadLikelihoodGivenHaplotypeLog10(
|
||||
hap.getBytes(), readBases, baseQuals, insQuals, delQuals, gcp,
|
||||
hapStart, recache);
|
||||
double d = hmm.computeReadLikelihoodGivenHaplotypeLog10(hap.getBytes(), readBases, baseQuals, insQuals, delQuals, gcp, recache);
|
||||
Assert.assertTrue(MathUtils.goodLog10Probability(d), "Likelihoods = " + d + " was bad for read " + read + " and ref " + hap + " with hapStart " + hapStart);
|
||||
return d;
|
||||
}
|
||||
|
|
@ -629,7 +606,7 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
|
||||
// didn't call initialize => should exception out
|
||||
double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
|
||||
baseQuals, baseQuals, baseQuals, baseQuals, 0, true);
|
||||
baseQuals, baseQuals, baseQuals, baseQuals, true);
|
||||
}
|
||||
|
||||
@Test(enabled = true, expectedExceptions = IllegalArgumentException.class, dataProvider = "JustHMMProvider")
|
||||
|
|
@ -640,7 +617,7 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
|
||||
hmm.initialize(3, 3);
|
||||
double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
|
||||
baseQuals, baseQuals, baseQuals, baseQuals, 0, true);
|
||||
baseQuals, baseQuals, baseQuals, baseQuals, true);
|
||||
}
|
||||
|
||||
@Test(enabled = true, expectedExceptions = IllegalArgumentException.class, dataProvider = "JustHMMProvider")
|
||||
|
|
@ -649,8 +626,8 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
byte[] refBases = "AAAT".getBytes();
|
||||
byte[] baseQuals = Utils.dupBytes((byte)30, readBases.length);
|
||||
|
||||
hmm.initialize(3, 2);
|
||||
hmm.initialize(2, 3);
|
||||
double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases,
|
||||
baseQuals, baseQuals, baseQuals, baseQuals, 0, true);
|
||||
baseQuals, baseQuals, baseQuals, baseQuals, true);
|
||||
}
|
||||
}
|
||||
|
|
@ -25,6 +25,7 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.pairhmm;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
|
@ -34,15 +35,22 @@ import java.util.Arrays;
|
|||
/**
|
||||
* Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book.
|
||||
*
|
||||
* User: rpoplin
|
||||
* User: rpoplin, carneiro
|
||||
* Date: 3/1/12
|
||||
*/
|
||||
public class Log10PairHMM extends PairHMM {
|
||||
public final class Log10PairHMM extends PairHMM {
|
||||
/**
|
||||
* Should we use exact log10 calculation (true), or an approximation (false)?
|
||||
*/
|
||||
private final boolean doExactLog10;
|
||||
|
||||
private static final int matchToMatch = 0;
|
||||
private static final int indelToMatch = 1;
|
||||
private static final int matchToInsertion = 2;
|
||||
private static final int insertionToInsertion = 3;
|
||||
private static final int matchToDeletion = 4;
|
||||
private static final int deletionToDeletion = 5;
|
||||
|
||||
/**
|
||||
* Create an uninitialized PairHMM
|
||||
*
|
||||
|
|
@ -64,14 +72,17 @@ public class Log10PairHMM extends PairHMM {
|
|||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public void initialize( final int haplotypeMaxLength, final int readMaxLength) {
|
||||
super.initialize(haplotypeMaxLength, readMaxLength);
|
||||
public void initialize(final int readMaxLength, final int haplotypeMaxLength ) {
|
||||
super.initialize(readMaxLength, haplotypeMaxLength);
|
||||
|
||||
for( int iii=0; iii < paddedMaxReadLength; iii++ ) {
|
||||
Arrays.fill(matchMatrix[iii], Double.NEGATIVE_INFINITY);
|
||||
Arrays.fill(insertionMatrix[iii], Double.NEGATIVE_INFINITY);
|
||||
Arrays.fill(deletionMatrix[iii], Double.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
transition = new double[paddedMaxReadLength][6];
|
||||
prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -86,38 +97,91 @@ public class Log10PairHMM extends PairHMM {
|
|||
final byte[] overallGCP,
|
||||
final int hapStartIndex,
|
||||
final boolean recacheReadValues ) {
|
||||
// the initial condition -- must be in subComputeReadLikelihoodGivenHaplotypeLog10 because it needs that actual
|
||||
// read and haplotypes, not the maximum
|
||||
final double initialValue = Math.log10((double) 1/haplotypeBases.length);
|
||||
matchMatrix[1][1] = initialValue;
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
final int X_METRIC_LENGTH = readBases.length + 2;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
|
||||
|
||||
// ensure that all the qual scores have valid values
|
||||
for( int iii = 0; iii < readQuals.length; iii++ ) {
|
||||
readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) );
|
||||
}
|
||||
|
||||
// simple rectangular version of update loop, slow
|
||||
for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) {
|
||||
for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) {
|
||||
if( (iii == 1 && jjj == 1) ) { continue; }
|
||||
updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP,
|
||||
matchMatrix, insertionMatrix, deletionMatrix);
|
||||
if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) {
|
||||
// set the initial value (free deletions in the beginning) for the first row in the deletion matrix
|
||||
final double initialValue = Math.log10(1.0 / haplotypeBases.length);
|
||||
for( int j = 0; j < paddedHaplotypeLength; j++ ) {
|
||||
deletionMatrix[0][j] = initialValue;
|
||||
}
|
||||
}
|
||||
|
||||
// final probability is the log10 sum of the last element in all three state arrays
|
||||
final int endI = X_METRIC_LENGTH - 1;
|
||||
double result = myLog10SumLog10(new double[]{matchMatrix[endI][1], insertionMatrix[endI][1]});
|
||||
for (int j = 2; j < Y_METRIC_LENGTH; j++)
|
||||
result = myLog10SumLog10(new double[]{result, matchMatrix[endI][j], insertionMatrix[endI][j]});
|
||||
if ( ! constantsAreInitialized || recacheReadValues )
|
||||
initializeProbabilities(insertionGOP, deletionGOP, overallGCP);
|
||||
initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex);
|
||||
|
||||
return result;
|
||||
for (int i = 1; i < paddedReadLength; i++) {
|
||||
// +1 here is because hapStartIndex is 0-based, but our matrices are 1 based
|
||||
for (int j = hapStartIndex+1; j < paddedHaplotypeLength; j++) {
|
||||
updateCell(i, j, prior[i][j], transition[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// final probability is the log10 sum of the last element in the Match and Insertion state arrays
|
||||
// this way we ignore all paths that ended in deletions! (huge)
|
||||
// but we have to sum all the paths ending in the M and I matrices, because they're no longer extended.
|
||||
final int endI = paddedReadLength - 1;
|
||||
double finalSumProbabilities = myLog10SumLog10(new double[]{matchMatrix[endI][1], insertionMatrix[endI][1]});
|
||||
for (int j = 2; j < paddedHaplotypeLength; j++)
|
||||
finalSumProbabilities = myLog10SumLog10(new double[]{finalSumProbabilities, matchMatrix[endI][j], insertionMatrix[endI][j]});
|
||||
|
||||
return finalSumProbabilities;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the matrix that holds all the constants related to the editing
|
||||
* distance between the read and the haplotype.
|
||||
*
|
||||
* @param haplotypeBases the bases of the haplotype
|
||||
* @param readBases the bases of the read
|
||||
* @param readQuals the base quality scores of the read
|
||||
* @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read)
|
||||
*/
|
||||
public void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) {
|
||||
|
||||
// initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases
|
||||
// Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2.
|
||||
|
||||
for (int i = 0; i < readBases.length; i++) {
|
||||
final byte x = readBases[i];
|
||||
final byte qual = readQuals[i];
|
||||
for (int j = startIndex; j < haplotypeBases.length; j++) {
|
||||
final byte y = haplotypeBases[j];
|
||||
prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
|
||||
QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the matrix that holds all the constants related to quality scores.
|
||||
*
|
||||
* @param insertionGOP insertion quality scores of the read
|
||||
* @param deletionGOP deletion quality scores of the read
|
||||
* @param overallGCP overall gap continuation penalty
|
||||
*/
|
||||
@Requires({
|
||||
"insertionGOP != null",
|
||||
"deletionGOP != null",
|
||||
"overallGCP != null"
|
||||
})
|
||||
@Ensures("constantsAreInitialized")
|
||||
private void initializeProbabilities(final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) {
|
||||
for (int i = 0; i < insertionGOP.length; i++) {
|
||||
final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE);
|
||||
transition[i+1][matchToMatch] = QualityUtils.qualToProbLog10((byte) qualIndexGOP);
|
||||
transition[i+1][indelToMatch] = QualityUtils.qualToProbLog10(overallGCP[i]);
|
||||
transition[i+1][matchToInsertion] = QualityUtils.qualToErrorProbLog10(insertionGOP[i]);
|
||||
transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProbLog10(overallGCP[i]);
|
||||
transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProbLog10(deletionGOP[i]);
|
||||
transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProbLog10(overallGCP[i]);
|
||||
}
|
||||
|
||||
// note that we initialized the constants
|
||||
constantsAreInitialized = true;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Compute the log10SumLog10 of the values
|
||||
*
|
||||
|
|
@ -136,37 +200,24 @@ public class Log10PairHMM extends PairHMM {
|
|||
return doExactLog10 ? MathUtils.log10sumLog10(values) : MathUtils.approximateLog10SumLog10(values);
|
||||
}
|
||||
|
||||
private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases,
|
||||
final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP,
|
||||
final double[][] matchMatrix, final double[][] insertionMatrix, final double[][] deletionMatrix ) {
|
||||
/**
|
||||
* Updates a cell in the HMM matrix
|
||||
*
|
||||
* The read and haplotype indices are offset by one because the state arrays have an extra column to hold the
|
||||
* initial conditions
|
||||
|
||||
// the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions
|
||||
final int im1 = indI - 1;
|
||||
final int jm1 = indJ - 1;
|
||||
* @param indI row index in the matrices to update
|
||||
* @param indJ column index in the matrices to update
|
||||
* @param prior the likelihood editing distance matrix for the read x haplotype
|
||||
* @param transition an array with the six transition relevant to this location
|
||||
*/
|
||||
private void updateCell( final int indI, final int indJ, final double prior, final double[] transition) {
|
||||
|
||||
// update the match array
|
||||
double pBaseReadLog10 = 0.0; // Math.log10(1.0);
|
||||
if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state
|
||||
final byte x = readBases[im1-1];
|
||||
final byte y = haplotypeBases[jm1-1];
|
||||
final byte qual = readQuals[im1-1];
|
||||
pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
|
||||
}
|
||||
final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) );
|
||||
final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP);
|
||||
final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) );
|
||||
matchMatrix[indI][indJ] = pBaseReadLog10 + myLog10SumLog10(new double[]{matchMatrix[indI - 1][indJ - 1] + d0, insertionMatrix[indI - 1][indJ - 1] + e0, deletionMatrix[indI - 1][indJ - 1] + e0});
|
||||
|
||||
// update the X (insertion) array
|
||||
final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) );
|
||||
final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
|
||||
final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
|
||||
insertionMatrix[indI][indJ] = qBaseReadLog10 + myLog10SumLog10(new double[]{matchMatrix[indI - 1][indJ] + d1, insertionMatrix[indI - 1][indJ] + e1});
|
||||
|
||||
// update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype
|
||||
final double d2 = ( im1 == 0 ) ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]);
|
||||
final double e2 = ( im1 == 0 ) ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]);
|
||||
final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
|
||||
deletionMatrix[indI][indJ] = qBaseRefLog10 + myLog10SumLog10(new double[]{matchMatrix[indI][indJ - 1] + d2, deletionMatrix[indI][indJ - 1] + e2});
|
||||
matchMatrix[indI][indJ] = prior +
|
||||
myLog10SumLog10(new double[]{matchMatrix[indI - 1][indJ - 1] + transition[matchToMatch],
|
||||
insertionMatrix[indI - 1][indJ - 1] + transition[indelToMatch],
|
||||
deletionMatrix[indI - 1][indJ - 1] + transition[indelToMatch]});
|
||||
insertionMatrix[indI][indJ] = myLog10SumLog10(new double[] {matchMatrix[indI - 1][indJ] + transition[matchToInsertion], insertionMatrix[indI - 1][indJ] + transition[insertionToInsertion]});
|
||||
deletionMatrix[indI][indJ] = myLog10SumLog10(new double[] {matchMatrix[indI][indJ - 1] + transition[matchToDeletion], deletionMatrix[indI][indJ - 1] + transition[deletionToDeletion]});
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -40,9 +40,11 @@ import java.util.Arrays;
|
|||
public abstract class PairHMM {
|
||||
protected final static Logger logger = Logger.getLogger(PairHMM.class);
|
||||
|
||||
protected static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE;
|
||||
protected static final byte DEFAULT_GOP = (byte) 45;
|
||||
protected static final byte DEFAULT_GCP = (byte) 10;
|
||||
protected double[][] transition = null; // The transition probabilities cache
|
||||
protected double[][] prior = null; // The prior probabilities cache
|
||||
protected boolean constantsAreInitialized = false;
|
||||
|
||||
protected byte[] previousHaplotypeBases;
|
||||
|
||||
public enum HMM_IMPLEMENTATION {
|
||||
/* Very slow implementation which uses very accurate log10 sum functions. Only meant to be used as a reference test implementation */
|
||||
|
|
@ -58,14 +60,18 @@ public abstract class PairHMM {
|
|||
protected double[][] deletionMatrix = null;
|
||||
protected int maxHaplotypeLength, maxReadLength;
|
||||
protected int paddedMaxReadLength, paddedMaxHaplotypeLength;
|
||||
protected int paddedReadLength, paddedHaplotypeLength;
|
||||
private boolean initialized = false;
|
||||
|
||||
/**
|
||||
* Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths
|
||||
*
|
||||
* Note: Do not worry about padding, just provide the true max length of the read and haplotype. The HMM will take care of the padding.
|
||||
*
|
||||
* @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM
|
||||
* @param readMaxLength the max length of reads we want to use with this PairHMM
|
||||
*/
|
||||
public void initialize( final int haplotypeMaxLength, final int readMaxLength ) {
|
||||
public void initialize( final int readMaxLength, final int haplotypeMaxLength ) {
|
||||
if ( readMaxLength <= 0 ) throw new IllegalArgumentException("READ_MAX_LENGTH must be > 0 but got " + readMaxLength);
|
||||
if ( haplotypeMaxLength <= 0 ) throw new IllegalArgumentException("HAPLOTYPE_MAX_LENGTH must be > 0 but got " + haplotypeMaxLength);
|
||||
|
||||
|
|
@ -73,15 +79,21 @@ public abstract class PairHMM {
|
|||
maxReadLength = readMaxLength;
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
paddedMaxReadLength = readMaxLength + 2;
|
||||
paddedMaxHaplotypeLength = haplotypeMaxLength + 2;
|
||||
paddedMaxReadLength = readMaxLength + 1;
|
||||
paddedMaxHaplotypeLength = haplotypeMaxLength + 1;
|
||||
|
||||
matchMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
|
||||
insertionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
|
||||
deletionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
|
||||
|
||||
previousHaplotypeBases = null;
|
||||
|
||||
constantsAreInitialized = false;
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion
|
||||
* probabilities.
|
||||
|
|
@ -98,8 +110,6 @@ public abstract class PairHMM {
|
|||
* @param insertionGOP the phred-scaled per base insertion quality scores of read. Must be the same length as readBases
|
||||
* @param deletionGOP the phred-scaled per base deletion quality scores of read. Must be the same length as readBases
|
||||
* @param overallGCP the phred-scaled gap continuation penalties scores of read. Must be the same length as readBases
|
||||
* @param hapStartIndex start the hmm calculation at this offset in haplotype bases. Used in the caching calculation
|
||||
* where multiple haplotypes are used, and they only diff starting at hapStartIndex
|
||||
* @param recacheReadValues if false, we don't recalculate any cached results, assuming that readBases and its associated
|
||||
* parameters are the same, and only the haplotype bases are changing underneath us
|
||||
* @return the log10 probability of read coming from the haplotype under the provided error model
|
||||
|
|
@ -110,7 +120,6 @@ public abstract class PairHMM {
|
|||
final byte[] insertionGOP,
|
||||
final byte[] deletionGOP,
|
||||
final byte[] overallGCP,
|
||||
final int hapStartIndex,
|
||||
final boolean recacheReadValues ) {
|
||||
if ( ! initialized ) throw new IllegalStateException("Must call initialize before calling computeReadLikelihoodGivenHaplotypeLog10");
|
||||
if ( haplotypeBases == null ) throw new IllegalArgumentException("haplotypeBases cannot be null");
|
||||
|
|
@ -121,14 +130,22 @@ public abstract class PairHMM {
|
|||
if ( insertionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read insertion quals aren't the same size: " + readBases.length + " vs " + insertionGOP.length);
|
||||
if ( deletionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read deletion quals aren't the same size: " + readBases.length + " vs " + deletionGOP.length);
|
||||
if ( overallGCP.length != readBases.length ) throw new IllegalArgumentException("Read bases and overall GCP aren't the same size: " + readBases.length + " vs " + overallGCP.length);
|
||||
if ( hapStartIndex < 0 || hapStartIndex > haplotypeBases.length ) throw new IllegalArgumentException("hapStartIndex is bad, must be between 0 and haplotype length " + haplotypeBases.length + " but got " + hapStartIndex);
|
||||
|
||||
paddedReadLength = readBases.length + 1;
|
||||
paddedHaplotypeLength = haplotypeBases.length + 1;
|
||||
|
||||
final int hapStartIndex = (previousHaplotypeBases == null || haplotypeBases.length != previousHaplotypeBases.length ) ? 0 : findFirstPositionWhereHaplotypesDiffer(haplotypeBases, previousHaplotypeBases);
|
||||
|
||||
double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, hapStartIndex, recacheReadValues);
|
||||
|
||||
if ( MathUtils.goodLog10Probability(result) )
|
||||
return result;
|
||||
else
|
||||
if ( ! MathUtils.goodLog10Probability(result) )
|
||||
throw new IllegalStateException("PairHMM Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f", Arrays.toString(haplotypeBases), Arrays.toString(readBases), result));
|
||||
|
||||
// Warning: Careful if using the PairHMM in parallel! (this update has to be taken care of).
|
||||
// Warning: This assumes no downstream modification of the haplotype bases (saves us from copying the array). It is okay for the haplotype caller and the Unified Genotyper.
|
||||
previousHaplotypeBases = haplotypeBases;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
Loading…
Reference in New Issue