2012-04-12 01:56:51 +08:00
/ *
* Copyright ( c ) 2012 , The Broad Institute
*
* Permission is hereby granted , free of charge , to any person
* obtaining a copy of this software and associated documentation
* files ( the "Software" ) , to deal in the Software without
* restriction , including without limitation the rights to use ,
* copy , modify , merge , publish , distribute , sublicense , and / or sell
* copies of the Software , and to permit persons to whom the
* Software is furnished to do so , subject to the following
* conditions :
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software .
* THE SOFTWARE IS PROVIDED "AS IS" , WITHOUT WARRANTY OF ANY KIND ,
* EXPRESS OR IMPLIED , INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY , FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT . IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM , DAMAGES OR OTHER LIABILITY ,
* WHETHER IN AN ACTION OF CONTRACT , TORT OR OTHERWISE , ARISING
* FROM , OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE .
* /
package org.broadinstitute.sting.utils ;
import com.google.java.contract.Ensures ;
import com.google.java.contract.Requires ;
import java.util.* ;
/ * *
* Util class for performing the pair HMM for local alignment . Figure 4.3 in Durbin 1998 book .
* User : rpoplin
* Date : 3 / 1 / 12
* /
public class PairHMM {
2012-08-10 04:22:08 +08:00
private static final int MAX_CACHED_QUAL = ( int ) Byte . MAX_VALUE - 1 ;
2012-04-12 01:56:51 +08:00
private static final byte DEFAULT_GOP = ( byte ) 45 ;
private static final byte DEFAULT_GCP = ( byte ) 10 ;
private static final double BANDING_TOLERANCE = 22.0 ;
private static final int BANDING_CLUSTER_WINDOW = 12 ;
2012-04-19 04:35:09 +08:00
private final boolean noBanded ;
2012-04-12 01:56:51 +08:00
public PairHMM ( ) {
2012-04-19 04:35:09 +08:00
noBanded = false ;
2012-04-12 01:56:51 +08:00
}
2012-04-19 04:35:09 +08:00
public PairHMM ( final boolean noBanded ) {
this . noBanded = noBanded ;
2012-04-12 01:56:51 +08:00
}
2012-04-18 02:22:48 +08:00
public static void initializeArrays ( final double [ ] [ ] matchMetricArray , final double [ ] [ ] XMetricArray , final double [ ] [ ] YMetricArray ,
2012-04-12 01:56:51 +08:00
final int X_METRIC_LENGTH ) {
for ( int iii = 0 ; iii < X_METRIC_LENGTH ; iii + + ) {
Arrays . fill ( matchMetricArray [ iii ] , Double . NEGATIVE_INFINITY ) ;
Arrays . fill ( XMetricArray [ iii ] , Double . NEGATIVE_INFINITY ) ;
Arrays . fill ( YMetricArray [ iii ] , Double . NEGATIVE_INFINITY ) ;
}
// the initial condition
matchMetricArray [ 1 ] [ 1 ] = 0.0 ; // Math.log10(1.0);
}
@Requires ( { "readBases.length == readQuals.length" , "readBases.length == insertionGOP.length" , "readBases.length == deletionGOP.length" , "readBases.length == overallGCP.length" } )
@Ensures ( { "!Double.isInfinite(result)" , "!Double.isNaN(result)" } ) // Result should be a proper log10 probability
public double computeReadLikelihoodGivenHaplotype ( final byte [ ] haplotypeBases , final byte [ ] readBases , final byte [ ] readQuals ,
final byte [ ] insertionGOP , final byte [ ] deletionGOP , final byte [ ] overallGCP ) {
2012-04-19 04:25:03 +08:00
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = readBases . length + 2 ;
final int Y_METRIC_LENGTH = haplotypeBases . length + 2 ;
2012-04-12 01:56:51 +08:00
// initial arrays to hold the probabilities of being in the match, insertion and deletion cases
final double [ ] [ ] matchMetricArray = new double [ X_METRIC_LENGTH ] [ Y_METRIC_LENGTH ] ;
final double [ ] [ ] XMetricArray = new double [ X_METRIC_LENGTH ] [ Y_METRIC_LENGTH ] ;
final double [ ] [ ] YMetricArray = new double [ X_METRIC_LENGTH ] [ Y_METRIC_LENGTH ] ;
initializeArrays ( matchMetricArray , XMetricArray , YMetricArray , X_METRIC_LENGTH ) ;
return computeReadLikelihoodGivenHaplotype ( haplotypeBases , readBases , readQuals , insertionGOP , deletionGOP , overallGCP , 0 , matchMetricArray , XMetricArray , YMetricArray ) ;
}
@Requires ( { "readBases.length == readQuals.length" , "readBases.length == insertionGOP.length" , "readBases.length == deletionGOP.length" , "readBases.length == overallGCP.length" } )
@Ensures ( { "!Double.isInfinite(result)" , "!Double.isNaN(result)" } ) // Result should be a proper log10 probability
public double computeReadLikelihoodGivenHaplotype ( final byte [ ] haplotypeBases , final byte [ ] readBases , final byte [ ] readQuals ,
final byte [ ] insertionGOP , final byte [ ] deletionGOP , final byte [ ] overallGCP , final int hapStartIndex ,
final double [ ] [ ] matchMetricArray , final double [ ] [ ] XMetricArray , final double [ ] [ ] YMetricArray ) {
2012-04-19 04:25:03 +08:00
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = readBases . length + 2 ;
final int Y_METRIC_LENGTH = haplotypeBases . length + 2 ;
// ensure that all the qual scores have valid values
for ( int iii = 0 ; iii < readQuals . length ; iii + + ) {
readQuals [ iii ] = ( readQuals [ iii ] < QualityUtils . MIN_USABLE_Q_SCORE ? QualityUtils . MIN_USABLE_Q_SCORE : ( readQuals [ iii ] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals [ iii ] ) ) ;
}
2012-04-12 01:56:51 +08:00
2012-04-19 04:35:09 +08:00
if ( false ) {
2012-04-12 01:56:51 +08:00
final ArrayList < Integer > workQueue = new ArrayList < Integer > ( ) ; // holds a queue of starting work location (indices along the diagonal). Will be sorted each step
final ArrayList < Integer > workToBeAdded = new ArrayList < Integer > ( ) ;
final ArrayList < Double > calculatedValues = new ArrayList < Double > ( ) ;
final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH - 1 ;
workQueue . add ( 1 ) ; // Always start a new thread at the baseline because of partially repeating sequences that match better in the latter half of the haplotype
for ( int diag = 3 ; diag < numDiags ; diag + + ) { // diag = 3 is the (1,2) element of the metric arrays. (1,1) is the initial condition and is purposefully skipped over
//Collections.sort(workQueue); // no need to sort because elements are guaranteed to be in ascending order
int el = 1 ;
for ( int work : workQueue ) {
// choose the appropriate diagonal baseline location
int iii = 0 ;
int jjj = diag ;
if ( diag > Y_METRIC_LENGTH ) {
iii = diag - Y_METRIC_LENGTH ;
jjj = Y_METRIC_LENGTH ;
}
// move to the starting work location along the diagonal
iii + = work ;
jjj - = work ;
while ( iii > = X_METRIC_LENGTH | | jjj < = 0 ) {
iii - - ;
jjj + + ;
work - - ;
}
if ( ! detectClusteredStartLocations ( workToBeAdded , work ) ) {
workToBeAdded . add ( work ) ; // keep this thread going once it has started
}
if ( work > = el - 3 ) {
// step along the diagonal in the forward direction, updating the match matrices and looking for a drop off from the maximum observed value
double maxElement = Double . NEGATIVE_INFINITY ;
for ( el = work ; el < numDiags + 1 ; el + + ) {
updateCell ( iii , jjj , haplotypeBases , readBases , readQuals ,
insertionGOP , deletionGOP , overallGCP , matchMetricArray , XMetricArray , YMetricArray ) ;
final double bestMetric = MathUtils . max ( matchMetricArray [ iii ] [ jjj ] , XMetricArray [ iii ] [ jjj ] , YMetricArray [ iii ] [ jjj ] ) ;
calculatedValues . add ( bestMetric ) ;
if ( bestMetric > maxElement ) {
maxElement = bestMetric ;
} else if ( maxElement - bestMetric > BANDING_TOLERANCE ) {
break ;
}
if ( + + iii > = X_METRIC_LENGTH ) { // don't walk off the edge of the matrix
break ;
}
if ( - - jjj < = 0 ) { // don't walk off the edge of the matrix
break ;
}
}
// find a local maximum to start a new band in the work queue
double localMaxElement = Double . NEGATIVE_INFINITY ;
int localMaxElementIndex = 0 ;
for ( int kkk = calculatedValues . size ( ) - 1 ; kkk > = 1 ; kkk - - ) {
final double bestMetric = calculatedValues . get ( kkk ) ;
if ( bestMetric > localMaxElement ) {
localMaxElement = bestMetric ;
localMaxElementIndex = kkk ;
} else if ( localMaxElement - bestMetric > BANDING_TOLERANCE * 0.5 ) { // find a local maximum
if ( ! detectClusteredStartLocations ( workToBeAdded , work + localMaxElementIndex ) ) {
workToBeAdded . add ( work + localMaxElementIndex ) ;
}
break ;
}
}
calculatedValues . clear ( ) ;
// reset iii and jjj to the appropriate diagonal baseline location
iii = 0 ;
jjj = diag ;
if ( diag > Y_METRIC_LENGTH ) {
iii = diag - Y_METRIC_LENGTH ;
jjj = Y_METRIC_LENGTH ;
}
// move to the starting work location along the diagonal
iii + = work - 1 ;
jjj - = work - 1 ;
// step along the diagonal in the reverse direction, updating the match matrices and looking for a drop off from the maximum observed value
for ( int traceBack = work - 1 ; traceBack > 0 & & iii > 0 & & jjj < Y_METRIC_LENGTH ; traceBack - - , iii - - , jjj + + ) {
updateCell ( iii , jjj , haplotypeBases , readBases , readQuals ,
insertionGOP , deletionGOP , overallGCP , matchMetricArray , XMetricArray , YMetricArray ) ;
final double bestMetric = MathUtils . max ( matchMetricArray [ iii ] [ jjj ] , XMetricArray [ iii ] [ jjj ] , YMetricArray [ iii ] [ jjj ] ) ;
if ( bestMetric > maxElement ) {
maxElement = bestMetric ;
} else if ( maxElement - bestMetric > BANDING_TOLERANCE ) {
break ;
}
}
}
}
workQueue . clear ( ) ;
workQueue . addAll ( workToBeAdded ) ;
workToBeAdded . clear ( ) ;
}
} else {
// simple rectangular version of update loop, slow
for ( int iii = 1 ; iii < X_METRIC_LENGTH ; iii + + ) {
for ( int jjj = hapStartIndex + 1 ; jjj < Y_METRIC_LENGTH ; jjj + + ) {
if ( ( iii = = 1 & & jjj = = 1 ) ) { continue ; }
updateCell ( iii , jjj , haplotypeBases , readBases , readQuals , insertionGOP , deletionGOP , overallGCP ,
matchMetricArray , XMetricArray , YMetricArray ) ;
}
}
}
// final probability is the log10 sum of the last element in all three state arrays
final int endI = X_METRIC_LENGTH - 1 ;
final int endJ = Y_METRIC_LENGTH - 1 ;
2012-04-19 04:25:03 +08:00
return MathUtils . approximateLog10SumLog10 ( matchMetricArray [ endI ] [ endJ ] , XMetricArray [ endI ] [ endJ ] , YMetricArray [ endI ] [ endJ ] ) ;
2012-04-12 01:56:51 +08:00
}
private void updateCell ( final int indI , final int indJ , final byte [ ] haplotypeBases , final byte [ ] readBases ,
final byte [ ] readQuals , final byte [ ] insertionGOP , final byte [ ] deletionGOP , final byte [ ] overallGCP ,
final double [ ] [ ] matchMetricArray , final double [ ] [ ] XMetricArray , final double [ ] [ ] YMetricArray ) {
// the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions
final int im1 = indI - 1 ;
final int jm1 = indJ - 1 ;
// update the match array
double pBaseReadLog10 = 0.0 ; // Math.log10(1.0);
if ( im1 > 0 & & jm1 > 0 ) { // the emission probability is applied when leaving the state
final byte x = readBases [ im1 - 1 ] ;
final byte y = haplotypeBases [ jm1 - 1 ] ;
2012-04-19 04:25:03 +08:00
final byte qual = readQuals [ im1 - 1 ] ;
2012-04-12 01:56:51 +08:00
pBaseReadLog10 = ( x = = y | | x = = ( byte ) 'N' | | y = = ( byte ) 'N' ? QualityUtils . qualToProbLog10 ( qual ) : QualityUtils . qualToErrorProbLog10 ( qual ) ) ;
}
final int qualIndexGOP = ( im1 = = 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP [ im1 - 1 ] + deletionGOP [ im1 - 1 ] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP [ im1 - 1 ] + deletionGOP [ im1 - 1 ] ) ) ;
final double d0 = QualityUtils . qualToProbLog10 ( ( byte ) qualIndexGOP ) ;
final double e0 = ( im1 = = 0 ? QualityUtils . qualToProbLog10 ( DEFAULT_GCP ) : QualityUtils . qualToProbLog10 ( overallGCP [ im1 - 1 ] ) ) ;
2012-04-19 04:25:03 +08:00
matchMetricArray [ indI ] [ indJ ] = pBaseReadLog10 + MathUtils . approximateLog10SumLog10 ( matchMetricArray [ indI - 1 ] [ indJ - 1 ] + d0 , XMetricArray [ indI - 1 ] [ indJ - 1 ] + e0 , YMetricArray [ indI - 1 ] [ indJ - 1 ] + e0 ) ;
2012-04-12 01:56:51 +08:00
// update the X (insertion) array
final double d1 = ( im1 = = 0 ? QualityUtils . qualToErrorProbLog10 ( DEFAULT_GOP ) : QualityUtils . qualToErrorProbLog10 ( insertionGOP [ im1 - 1 ] ) ) ;
final double e1 = ( im1 = = 0 ? QualityUtils . qualToErrorProbLog10 ( DEFAULT_GCP ) : QualityUtils . qualToErrorProbLog10 ( overallGCP [ im1 - 1 ] ) ) ;
final double qBaseReadLog10 = 0.0 ; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
XMetricArray [ indI ] [ indJ ] = qBaseReadLog10 + MathUtils . approximateLog10SumLog10 ( matchMetricArray [ indI - 1 ] [ indJ ] + d1 , XMetricArray [ indI - 1 ] [ indJ ] + e1 ) ;
// update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype
2012-04-19 04:25:03 +08:00
final double d2 = ( im1 = = 0 | | im1 = = readBases . length ? 0.0 : QualityUtils . qualToErrorProbLog10 ( deletionGOP [ im1 - 1 ] ) ) ;
final double e2 = ( im1 = = 0 | | im1 = = readBases . length ? 0.0 : QualityUtils . qualToErrorProbLog10 ( overallGCP [ im1 - 1 ] ) ) ;
2012-04-12 01:56:51 +08:00
final double qBaseRefLog10 = 0.0 ; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
YMetricArray [ indI ] [ indJ ] = qBaseRefLog10 + MathUtils . approximateLog10SumLog10 ( matchMetricArray [ indI ] [ indJ - 1 ] + d2 , YMetricArray [ indI ] [ indJ - 1 ] + e2 ) ;
}
// private function used by the banded approach to ensure the proposed bands are sufficiently distinct from each other
private boolean detectClusteredStartLocations ( final ArrayList < Integer > list , int loc ) {
for ( int x : list ) {
if ( Math . abs ( x - loc ) < = BANDING_CLUSTER_WINDOW ) {
return true ;
}
}
return false ;
}
}