/* * Copyright (c) 2012, The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ package org.broadinstitute.sting.utils; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import java.util.*; /** * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. * User: rpoplin * Date: 3/1/12 */ public class PairHMM { private static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE; private static final byte DEFAULT_GOP = (byte) 45; private static final byte DEFAULT_GCP = (byte) 10; private static final double BANDING_TOLERANCE = 22.0; private static final int BANDING_CLUSTER_WINDOW = 12; private final boolean noBanded; public PairHMM() { noBanded = false; } public PairHMM( final boolean noBanded ) { this.noBanded = noBanded; } public static void initializeArrays(final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray, final int X_METRIC_LENGTH) { for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY); Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY); Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY); } // the initial condition matchMetricArray[1][1] = 0.0; // Math.log10(1.0); } @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP ) { // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment final int X_METRIC_LENGTH = readBases.length + 2; final int Y_METRIC_LENGTH = haplotypeBases.length + 2; // initial arrays to hold the probabilities of being in the match, insertion and deletion cases final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); return computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, 0, matchMetricArray, XMetricArray, YMetricArray); } @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex, final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment final int X_METRIC_LENGTH = readBases.length + 2; final int Y_METRIC_LENGTH = haplotypeBases.length + 2; // ensure that all the qual scores have valid values for( int iii = 0; iii < readQuals.length; iii++ ) { readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); } if( false ) { final ArrayList workQueue = new ArrayList(); // holds a queue of starting work location (indices along the diagonal). Will be sorted each step final ArrayList workToBeAdded = new ArrayList(); final ArrayList calculatedValues = new ArrayList(); final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH - 1; workQueue.add( 1 ); // Always start a new thread at the baseline because of partially repeating sequences that match better in the latter half of the haplotype for(int diag = 3; diag < numDiags; diag++) { // diag = 3 is the (1,2) element of the metric arrays. (1,1) is the initial condition and is purposefully skipped over //Collections.sort(workQueue); // no need to sort because elements are guaranteed to be in ascending order int el = 1; for( int work : workQueue ) { // choose the appropriate diagonal baseline location int iii = 0; int jjj = diag; if( diag > Y_METRIC_LENGTH ) { iii = diag - Y_METRIC_LENGTH; jjj = Y_METRIC_LENGTH; } // move to the starting work location along the diagonal iii += work; jjj -= work; while( iii >= X_METRIC_LENGTH || jjj <= 0 ) { iii--; jjj++; work--; } if( !detectClusteredStartLocations(workToBeAdded, work ) ) { workToBeAdded.add(work); // keep this thread going once it has started } if( work >= el - 3 ) { // step along the diagonal in the forward direction, updating the match matrices and looking for a drop off from the maximum observed value double maxElement = Double.NEGATIVE_INFINITY; for( el = work; el < numDiags + 1; el++ ) { updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); calculatedValues.add(bestMetric); if( bestMetric > maxElement ) { maxElement = bestMetric; } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { break; } if( ++iii >= X_METRIC_LENGTH ) { // don't walk off the edge of the matrix break; } if( --jjj <= 0 ) { // don't walk off the edge of the matrix break; } } // find a local maximum to start a new band in the work queue double localMaxElement = Double.NEGATIVE_INFINITY; int localMaxElementIndex = 0; for(int kkk = calculatedValues.size()-1; kkk >= 1; kkk--) { final double bestMetric = calculatedValues.get(kkk); if( bestMetric > localMaxElement ) { localMaxElement = bestMetric; localMaxElementIndex = kkk; } else if( localMaxElement - bestMetric > BANDING_TOLERANCE * 0.5 ) { // find a local maximum if( !detectClusteredStartLocations(workToBeAdded, work + localMaxElementIndex ) ) { workToBeAdded.add( work + localMaxElementIndex ); } break; } } calculatedValues.clear(); // reset iii and jjj to the appropriate diagonal baseline location iii = 0; jjj = diag; if( diag > Y_METRIC_LENGTH ) { iii = diag - Y_METRIC_LENGTH; jjj = Y_METRIC_LENGTH; } // move to the starting work location along the diagonal iii += work-1; jjj -= work-1; // step along the diagonal in the reverse direction, updating the match matrices and looking for a drop off from the maximum observed value for( int traceBack = work - 1; traceBack > 0 && iii > 0 && jjj < Y_METRIC_LENGTH; traceBack--,iii--,jjj++ ) { updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); if( bestMetric > maxElement ) { maxElement = bestMetric; } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { break; } } } } workQueue.clear(); workQueue.addAll(workToBeAdded); workToBeAdded.clear(); } } else { // simple rectangular version of update loop, slow for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { if( (iii == 1 && jjj == 1) ) { continue; } updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); } } } // final probability is the log10 sum of the last element in all three state arrays final int endI = X_METRIC_LENGTH - 1; final int endJ = Y_METRIC_LENGTH - 1; return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]); } private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions final int im1 = indI - 1; final int jm1 = indJ - 1; // update the match array double pBaseReadLog10 = 0.0; // Math.log10(1.0); if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state final byte x = readBases[im1-1]; final byte y = haplotypeBases[jm1-1]; final byte qual = readQuals[im1-1]; pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); } final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0); // update the X (insertion) array final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1); // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2); } // private function used by the banded approach to ensure the proposed bands are sufficiently distinct from each other private boolean detectClusteredStartLocations( final ArrayList list, int loc ) { for(int x : list) { if( Math.abs(x-loc) <= BANDING_CLUSTER_WINDOW ) { return true; } } return false; } }