gatk-3.8/public/java/src/org/broadinstitute/sting/utils/PairHMM.java

259 lines
15 KiB
Java

/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import java.util.*;
/**
* Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book.
* User: rpoplin
* Date: 3/1/12
*/
public class PairHMM {
private static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE;
private static final byte DEFAULT_GOP = (byte) 45;
private static final byte DEFAULT_GCP = (byte) 10;
private static final double BANDING_TOLERANCE = 22.0;
private static final int BANDING_CLUSTER_WINDOW = 12;
private final boolean noBanded;
public PairHMM() {
noBanded = false;
}
public PairHMM( final boolean noBanded ) {
this.noBanded = noBanded;
}
public static void initializeArrays(final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray,
final int X_METRIC_LENGTH) {
for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) {
Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY);
}
// the initial condition
matchMetricArray[1][1] = 0.0; // Math.log10(1.0);
}
@Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"})
@Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability
public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals,
final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP ) {
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = readBases.length + 2;
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
// initial arrays to hold the probabilities of being in the match, insertion and deletion cases
final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH);
return computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, 0, matchMetricArray, XMetricArray, YMetricArray);
}
@Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"})
@Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability
public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals,
final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex,
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = readBases.length + 2;
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
// ensure that all the qual scores have valid values
for( int iii = 0; iii < readQuals.length; iii++ ) {
readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) );
}
if( false ) {
final ArrayList<Integer> workQueue = new ArrayList<Integer>(); // holds a queue of starting work location (indices along the diagonal). Will be sorted each step
final ArrayList<Integer> workToBeAdded = new ArrayList<Integer>();
final ArrayList<Double> calculatedValues = new ArrayList<Double>();
final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH - 1;
workQueue.add( 1 ); // Always start a new thread at the baseline because of partially repeating sequences that match better in the latter half of the haplotype
for(int diag = 3; diag < numDiags; diag++) { // diag = 3 is the (1,2) element of the metric arrays. (1,1) is the initial condition and is purposefully skipped over
//Collections.sort(workQueue); // no need to sort because elements are guaranteed to be in ascending order
int el = 1;
for( int work : workQueue ) {
// choose the appropriate diagonal baseline location
int iii = 0;
int jjj = diag;
if( diag > Y_METRIC_LENGTH ) {
iii = diag - Y_METRIC_LENGTH;
jjj = Y_METRIC_LENGTH;
}
// move to the starting work location along the diagonal
iii += work;
jjj -= work;
while( iii >= X_METRIC_LENGTH || jjj <= 0 ) {
iii--;
jjj++;
work--;
}
if( !detectClusteredStartLocations(workToBeAdded, work ) ) {
workToBeAdded.add(work); // keep this thread going once it has started
}
if( work >= el - 3 ) {
// step along the diagonal in the forward direction, updating the match matrices and looking for a drop off from the maximum observed value
double maxElement = Double.NEGATIVE_INFINITY;
for( el = work; el < numDiags + 1; el++ ) {
updateCell(iii, jjj, haplotypeBases, readBases, readQuals,
insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray);
final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]);
calculatedValues.add(bestMetric);
if( bestMetric > maxElement ) {
maxElement = bestMetric;
} else if( maxElement - bestMetric > BANDING_TOLERANCE ) {
break;
}
if( ++iii >= X_METRIC_LENGTH ) { // don't walk off the edge of the matrix
break;
}
if( --jjj <= 0 ) { // don't walk off the edge of the matrix
break;
}
}
// find a local maximum to start a new band in the work queue
double localMaxElement = Double.NEGATIVE_INFINITY;
int localMaxElementIndex = 0;
for(int kkk = calculatedValues.size()-1; kkk >= 1; kkk--) {
final double bestMetric = calculatedValues.get(kkk);
if( bestMetric > localMaxElement ) {
localMaxElement = bestMetric;
localMaxElementIndex = kkk;
} else if( localMaxElement - bestMetric > BANDING_TOLERANCE * 0.5 ) { // find a local maximum
if( !detectClusteredStartLocations(workToBeAdded, work + localMaxElementIndex ) ) {
workToBeAdded.add( work + localMaxElementIndex );
}
break;
}
}
calculatedValues.clear();
// reset iii and jjj to the appropriate diagonal baseline location
iii = 0;
jjj = diag;
if( diag > Y_METRIC_LENGTH ) {
iii = diag - Y_METRIC_LENGTH;
jjj = Y_METRIC_LENGTH;
}
// move to the starting work location along the diagonal
iii += work-1;
jjj -= work-1;
// step along the diagonal in the reverse direction, updating the match matrices and looking for a drop off from the maximum observed value
for( int traceBack = work - 1; traceBack > 0 && iii > 0 && jjj < Y_METRIC_LENGTH; traceBack--,iii--,jjj++ ) {
updateCell(iii, jjj, haplotypeBases, readBases, readQuals,
insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray);
final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]);
if( bestMetric > maxElement ) {
maxElement = bestMetric;
} else if( maxElement - bestMetric > BANDING_TOLERANCE ) {
break;
}
}
}
}
workQueue.clear();
workQueue.addAll(workToBeAdded);
workToBeAdded.clear();
}
} else {
// simple rectangular version of update loop, slow
for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) {
for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) {
if( (iii == 1 && jjj == 1) ) { continue; }
updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP,
matchMetricArray, XMetricArray, YMetricArray);
}
}
}
// final probability is the log10 sum of the last element in all three state arrays
final int endI = X_METRIC_LENGTH - 1;
final int endJ = Y_METRIC_LENGTH - 1;
return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]);
}
private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases,
final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP,
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
// the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions
final int im1 = indI - 1;
final int jm1 = indJ - 1;
// update the match array
double pBaseReadLog10 = 0.0; // Math.log10(1.0);
if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state
final byte x = readBases[im1-1];
final byte y = haplotypeBases[jm1-1];
final byte qual = readQuals[im1-1];
pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
}
final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) );
final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP);
final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) );
matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0);
// update the X (insertion) array
final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) );
final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1);
// update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype
final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) );
final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2);
}
// private function used by the banded approach to ensure the proposed bands are sufficiently distinct from each other
private boolean detectClusteredStartLocations( final ArrayList<Integer> list, int loc ) {
for(int x : list) {
if( Math.abs(x-loc) <= BANDING_CLUSTER_WINDOW ) {
return true;
}
}
return false;
}
}