2017-06-24 01:44:45 +08:00
# include <assert.h>
2017-06-24 06:57:00 +08:00
# include <string.h>
2017-09-19 07:49:15 +08:00
# include <stdlib.h>
2017-12-11 10:52:07 +08:00
# include <math.h>
2017-06-24 01:44:45 +08:00
# include "minimap.h"
2017-06-25 10:51:31 +08:00
# include "mmpriv.h"
2017-06-24 01:44:45 +08:00
# include "ksw2.h"
2017-06-24 06:57:00 +08:00
static void ksw_gen_simple_mat ( int m , int8_t * mat , int8_t a , int8_t b )
{
int i , j ;
a = a < 0 ? - a : a ;
b = b > 0 ? - b : b ;
for ( i = 0 ; i < m - 1 ; + + i ) {
for ( j = 0 ; j < m - 1 ; + + j )
mat [ i * m + j ] = i = = j ? a : b ;
mat [ i * m + m - 1 ] = 0 ;
}
for ( j = 0 ; j < m ; + + j )
mat [ ( m - 1 ) * m + j ] = 0 ;
}
2017-06-24 06:25:47 +08:00
static inline void mm_seq_rev ( uint32_t len , uint8_t * seq )
{
uint32_t i ;
uint8_t t ;
for ( i = 0 ; i < len > > 1 ; + + i )
t = seq [ i ] , seq [ i ] = seq [ len - 1 - i ] , seq [ len - 1 - i ] = t ;
}
2017-06-27 23:25:39 +08:00
static inline int test_zdrop_aux ( int32_t score , int i , int j , int32_t * max , int * max_i , int * max_j , int e , int zdrop )
{
if ( score < * max ) {
int li = i - * max_i ;
int lj = j - * max_j ;
int diff = li > lj ? li - lj : lj - li ;
if ( * max - score > zdrop + diff * e )
return 1 ;
} else * max = score , * max_i = i , * max_j = j ;
return 0 ;
}
static int mm_check_zdrop ( const uint8_t * qseq , const uint8_t * tseq , uint32_t n_cigar , uint32_t * cigar , const int8_t * mat , int8_t q , int8_t e , int zdrop )
{
uint32_t k ;
int32_t score = 0 , max = 0 , max_i = - 1 , max_j = - 1 , i = 0 , j = 0 ;
for ( k = 0 ; k < n_cigar ; + + k ) {
uint32_t l , op = cigar [ k ] & 0xf , len = cigar [ k ] > > 4 ;
if ( op = = 0 ) {
for ( l = 0 ; l < len ; + + l ) {
score + = mat [ tseq [ i + l ] * 5 + qseq [ j + l ] ] ;
if ( test_zdrop_aux ( score , i + l , j + l , & max , & max_i , & max_j , e , zdrop ) ) return 1 ;
}
i + = len , j + = len ;
} else if ( op = = 1 ) {
score - = q + e * len , j + = len ;
if ( test_zdrop_aux ( score , i , j , & max , & max_i , & max_j , e , zdrop ) ) return 1 ;
2017-08-18 15:31:15 +08:00
} else if ( op = = 2 | | op = = 3 ) {
2017-06-27 23:25:39 +08:00
score - = q + e * len , i + = len ;
if ( test_zdrop_aux ( score , i , j , & max , & max_i , & max_j , e , zdrop ) ) return 1 ;
}
}
return 0 ;
}
2017-10-10 23:59:44 +08:00
static void mm_fix_cigar ( mm_reg1_t * r , const uint8_t * qseq , const uint8_t * tseq , int * qshift , int * tshift )
{
mm_extra_t * p = r - > p ;
int32_t k , toff = 0 , qoff = 0 , to_shrink = 0 ;
* qshift = * tshift = 0 ;
if ( p - > n_cigar < = 1 ) return ;
for ( k = 0 ; k < p - > n_cigar ; + + k ) { // indel left alignment
uint32_t op = p - > cigar [ k ] & 0xf , len = p - > cigar [ k ] > > 4 ;
if ( len = = 0 ) to_shrink = 1 ;
if ( op = = 0 ) {
toff + = len , qoff + = len ;
} else if ( op = = 1 | | op = = 2 ) { // insertion or deletion
if ( k > 0 & & k < p - > n_cigar - 1 & & ( p - > cigar [ k - 1 ] & 0xf ) = = 0 & & ( p - > cigar [ k + 1 ] & 0xf ) = = 0 ) {
int l , prev_len = p - > cigar [ k - 1 ] > > 4 ;
if ( op = = 1 ) {
for ( l = 0 ; l < prev_len ; + + l )
if ( qseq [ qoff - 1 - l ] ! = qseq [ qoff + len - 1 - l ] )
break ;
} else {
for ( l = 0 ; l < prev_len ; + + l )
if ( tseq [ toff - 1 - l ] ! = tseq [ toff + len - 1 - l ] )
break ;
}
if ( l > 0 )
p - > cigar [ k - 1 ] - = l < < 4 , p - > cigar [ k + 1 ] + = l < < 4 , qoff - = l , toff - = l ;
if ( l = = prev_len ) to_shrink = 1 ;
}
if ( op = = 1 ) qoff + = len ;
else toff + = len ;
} else if ( op = = 3 ) {
toff + = len ;
}
}
assert ( qoff = = r - > qe - r - > qs & & toff = = r - > re - r - > rs ) ;
if ( to_shrink ) { // squeeze out zero-length operations
int32_t l = 0 ;
for ( k = 0 ; k < p - > n_cigar ; + + k ) // squeeze out zero-length operations
if ( p - > cigar [ k ] > > 4 ! = 0 )
p - > cigar [ l + + ] = p - > cigar [ k ] ;
p - > n_cigar = l ;
for ( k = l = 0 ; k < p - > n_cigar ; + + k ) // merge two adjacent operations if they are the same
if ( k = = p - > n_cigar - 1 | | ( p - > cigar [ k ] & 0xf ) ! = ( p - > cigar [ k + 1 ] & 0xf ) )
p - > cigar [ l + + ] = p - > cigar [ k ] ;
2017-10-11 09:22:37 +08:00
else p - > cigar [ k + 1 ] + = p - > cigar [ k ] > > 4 < < 4 ; // add length to the next CIGAR operator
2017-10-10 23:59:44 +08:00
p - > n_cigar = l ;
}
if ( ( p - > cigar [ 0 ] & 0xf ) = = 1 | | ( p - > cigar [ 0 ] & 0xf ) = = 2 ) { // get rid of leading I or D
int32_t l = p - > cigar [ 0 ] > > 4 ;
2017-11-09 11:31:05 +08:00
if ( ( p - > cigar [ 0 ] & 0xf ) = = 1 ) {
if ( r - > rev ) r - > qe - = l ;
else r - > qs + = l ;
* qshift = l ;
} else r - > rs + = l , * tshift = l ;
2017-10-10 23:59:44 +08:00
- - p - > n_cigar ;
memmove ( p - > cigar , p - > cigar + 1 , p - > n_cigar * 4 ) ;
}
}
2017-12-11 06:54:50 +08:00
static void mm_update_extra ( mm_reg1_t * r , const uint8_t * qseq , const uint8_t * tseq , const int8_t * mat , int8_t q , int8_t e )
2017-06-24 10:42:15 +08:00
{
2017-07-01 02:21:44 +08:00
uint32_t k , l , toff = 0 , qoff = 0 ;
2017-10-10 23:59:44 +08:00
int32_t s = 0 , max = 0 , qshift , tshift ;
mm_extra_t * p = r - > p ;
2017-07-01 02:21:44 +08:00
if ( p = = 0 ) return ;
2017-10-10 23:59:44 +08:00
mm_fix_cigar ( r , qseq , tseq , & qshift , & tshift ) ;
qseq + = qshift , tseq + = tshift ; // qseq and tseq may be shifted due to the removal of leading I/D
2017-10-16 22:55:18 +08:00
r - > blen = r - > mlen = 0 ;
2017-07-01 02:21:44 +08:00
for ( k = 0 ; k < p - > n_cigar ; + + k ) {
uint32_t op = p - > cigar [ k ] & 0xf , len = p - > cigar [ k ] > > 4 ;
2017-08-18 15:31:15 +08:00
if ( op = = 0 ) { // match/mismatch
2017-10-16 22:38:22 +08:00
int n_ambi = 0 , n_diff = 0 ;
2017-06-24 10:42:15 +08:00
for ( l = 0 ; l < len ; + + l ) {
2017-07-01 02:21:44 +08:00
int cq = qseq [ qoff + l ] , ct = tseq [ toff + l ] ;
2017-10-16 22:38:22 +08:00
if ( ct > 3 | | cq > 3 ) + + n_ambi ;
2017-11-01 10:23:27 +08:00
else if ( ct ! = cq ) + + n_diff ;
2017-07-01 02:21:44 +08:00
s + = mat [ ct * 5 + cq ] ;
if ( s < 0 ) s = 0 ;
else max = max > s ? max : s ;
2017-06-24 10:42:15 +08:00
}
2017-10-16 22:55:18 +08:00
r - > blen + = len - n_ambi , r - > mlen + = len - ( n_ambi + n_diff ) , p - > n_ambi + = n_ambi ;
toff + = len , qoff + = len ;
2017-08-18 15:31:15 +08:00
} else if ( op = = 1 ) { // insertion
2017-06-24 10:42:15 +08:00
int n_ambi = 0 ;
for ( l = 0 ; l < len ; + + l )
if ( qseq [ qoff + l ] > 3 ) + + n_ambi ;
2017-10-16 22:55:18 +08:00
r - > blen + = len - n_ambi , p - > n_ambi + = n_ambi ;
2017-07-01 02:21:44 +08:00
s - = q + e * len ;
if ( s < 0 ) s = 0 ;
2017-10-16 22:55:18 +08:00
qoff + = len ;
2017-08-18 15:31:15 +08:00
} else if ( op = = 2 ) { // deletion
2017-06-24 10:42:15 +08:00
int n_ambi = 0 ;
2017-08-18 15:31:15 +08:00
for ( l = 0 ; l < len ; + + l )
if ( tseq [ toff + l ] > 3 ) + + n_ambi ;
2017-10-16 22:55:18 +08:00
r - > blen + = len - n_ambi , p - > n_ambi + = n_ambi ;
2017-08-18 15:31:15 +08:00
s - = q + e * len ;
if ( s < 0 ) s = 0 ;
2017-10-16 22:55:18 +08:00
toff + = len ;
2017-08-18 15:31:15 +08:00
} else if ( op = = 3 ) { // intron
2017-10-06 02:37:21 +08:00
toff + = len ;
2017-06-24 10:42:15 +08:00
}
}
2017-11-10 12:20:41 +08:00
p - > dp_max = max ;
2017-10-11 09:22:37 +08:00
assert ( qoff = = r - > qe - r - > qs & & toff = = r - > re - r - > rs ) ;
2017-06-24 10:42:15 +08:00
}
2017-06-24 06:57:00 +08:00
static void mm_append_cigar ( mm_reg1_t * r , uint32_t n_cigar , uint32_t * cigar ) // TODO: this calls the libc realloc()
{
2017-06-24 10:42:15 +08:00
mm_extra_t * p ;
2017-06-24 07:21:47 +08:00
if ( n_cigar = = 0 ) return ;
2017-06-24 10:42:15 +08:00
if ( r - > p = = 0 ) {
uint32_t capacity = n_cigar + sizeof ( mm_extra_t ) ;
kroundup32 ( capacity ) ;
2017-06-24 10:53:47 +08:00
r - > p = ( mm_extra_t * ) calloc ( capacity , 4 ) ;
r - > p - > capacity = capacity ;
2017-06-24 10:42:15 +08:00
} else if ( r - > p - > n_cigar + n_cigar + sizeof ( mm_extra_t ) > r - > p - > capacity ) {
r - > p - > capacity = r - > p - > n_cigar + n_cigar + sizeof ( mm_extra_t ) ;
kroundup32 ( r - > p - > capacity ) ;
r - > p = ( mm_extra_t * ) realloc ( r - > p , r - > p - > capacity * 4 ) ;
2017-06-24 06:57:00 +08:00
}
2017-06-24 10:42:15 +08:00
p = r - > p ;
if ( p - > n_cigar > 0 & & ( p - > cigar [ p - > n_cigar - 1 ] & 0xf ) = = ( cigar [ 0 ] & 0xf ) ) { // same CIGAR op at the boundary
p - > cigar [ p - > n_cigar - 1 ] + = cigar [ 0 ] > > 4 < < 4 ;
if ( n_cigar > 1 ) memcpy ( p - > cigar + p - > n_cigar , cigar + 1 , ( n_cigar - 1 ) * 4 ) ;
p - > n_cigar + = n_cigar - 1 ;
2017-06-24 07:21:47 +08:00
} else {
2017-06-24 10:42:15 +08:00
memcpy ( p - > cigar + p - > n_cigar , cigar , n_cigar * 4 ) ;
p - > n_cigar + = n_cigar ;
2017-06-24 07:21:47 +08:00
}
2017-06-24 06:57:00 +08:00
}
2017-10-11 12:14:25 +08:00
static void mm_align_pair ( void * km , const mm_mapopt_t * opt , int qlen , const uint8_t * qseq , int tlen , const uint8_t * tseq , const int8_t * mat , int w , int end_bonus , int flag , ksw_extz_t * ez )
2017-07-08 22:26:00 +08:00
{
2017-09-20 22:51:18 +08:00
int zdrop = opt - > zdrop ;
2017-07-30 11:52:30 +08:00
if ( mm_dbg_flag & MM_DBG_PRINT_ALN_SEQ ) {
int i ;
2017-08-11 03:04:59 +08:00
fprintf ( stderr , " ===> q=(%d,%d), e=(%d,%d), bw=%d, flag=%d, zdrop=%d <=== \n " , opt - > q , opt - > q2 , opt - > e , opt - > e2 , w , flag , opt - > zdrop ) ;
2017-09-01 20:20:34 +08:00
for ( i = 0 ; i < tlen ; + + i ) fputc ( " ACGTN " [ tseq [ i ] ] , stderr ) ;
fputc ( ' \n ' , stderr ) ;
for ( i = 0 ; i < qlen ; + + i ) fputc ( " ACGTN " [ qseq [ i ] ] , stderr ) ;
fputc ( ' \n ' , stderr ) ;
2017-07-30 11:52:30 +08:00
}
2017-08-13 00:26:04 +08:00
if ( opt - > flag & MM_F_SPLICE )
2017-09-20 22:51:18 +08:00
ksw_exts2_sse ( km , qlen , qseq , tlen , tseq , 5 , mat , opt - > q , opt - > e , opt - > q2 , opt - > noncan , zdrop , flag , ez ) ;
2017-08-11 03:04:59 +08:00
else if ( opt - > q = = opt - > q2 & & opt - > e = = opt - > e2 )
2017-10-11 21:39:41 +08:00
ksw_extz2_sse ( km , qlen , qseq , tlen , tseq , 5 , mat , opt - > q , opt - > e , w , zdrop , end_bonus , flag , ez ) ;
2017-07-08 22:26:00 +08:00
else
2017-10-11 12:14:25 +08:00
ksw_extd2_sse ( km , qlen , qseq , tlen , tseq , 5 , mat , opt - > q , opt - > e , opt - > q2 , opt - > e2 , w , zdrop , end_bonus , flag , ez ) ;
2018-01-08 08:42:50 +08:00
if ( mm_dbg_flag & MM_DBG_PRINT_ALN_SEQ ) {
int i ;
fprintf ( stderr , " score=%d, cigar= " , ez - > score ) ;
for ( i = 0 ; i < ez - > n_cigar ; + + i )
fprintf ( stderr , " %d%c " , ez - > cigar [ i ] > > 4 , " MIDN " [ ez - > cigar [ i ] & 0xf ] ) ;
fprintf ( stderr , " \n " ) ;
}
2017-07-08 22:26:00 +08:00
}
2017-06-25 07:27:58 +08:00
static inline int mm_get_hplen_back ( const mm_idx_t * mi , uint32_t rid , uint32_t x )
{
int64_t i , off0 = mi - > seq [ rid ] . offset , off = off0 + x ;
int c = mm_seq4_get ( mi - > S , off ) ;
for ( i = off - 1 ; i > = off0 ; - - i )
if ( mm_seq4_get ( mi - > S , i ) ! = c ) break ;
return ( int ) ( off - i ) ;
}
2017-06-25 07:33:59 +08:00
static inline void mm_adjust_minier ( const mm_idx_t * mi , uint8_t * const qseq0 [ 2 ] , mm128_t * a , int32_t * r , int32_t * q )
2017-06-24 21:26:24 +08:00
{
2017-11-12 08:54:06 +08:00
if ( mi - > flag & MM_I_HPC ) {
2017-06-25 07:33:59 +08:00
const uint8_t * qseq = qseq0 [ a - > x > > 63 ] ;
2017-06-25 07:27:58 +08:00
int i , c ;
2017-06-24 21:26:24 +08:00
* q = ( int32_t ) a - > y ;
2017-06-25 07:27:58 +08:00
for ( i = * q - 1 , c = qseq [ * q ] ; i > 0 ; - - i )
if ( qseq [ i ] ! = c ) break ;
2017-06-24 21:26:24 +08:00
* q = i + 1 ;
2017-06-25 07:27:58 +08:00
c = mm_get_hplen_back ( mi , a - > x < < 1 > > 33 , ( int32_t ) a - > x ) ;
* r = ( int32_t ) a - > x + 1 - c ;
2017-06-24 21:26:24 +08:00
} else {
2017-08-13 11:48:43 +08:00
* r = ( int32_t ) a - > x - ( mi - > k > > 1 ) ;
* q = ( int32_t ) a - > y - ( mi - > k > > 1 ) ;
2017-06-24 21:26:24 +08:00
}
}
2017-07-28 01:19:11 +08:00
static void mm_filter_bad_seeds ( void * km , int as1 , int cnt1 , mm128_t * a , int min_gap , int diff_thres , int max_ext_len , int max_ext_cnt )
{
2017-07-29 01:30:42 +08:00
int max_st , max_en , n , i , k , max , * K ;
2017-07-28 01:19:11 +08:00
for ( i = 1 , n = 0 ; i < cnt1 ; + + i ) { // count the number of gaps longer than min_gap
int gap = ( ( int32_t ) a [ as1 + i ] . y - a [ as1 + i - 1 ] . y ) - ( ( int32_t ) a [ as1 + i ] . x - a [ as1 + i - 1 ] . x ) ;
if ( gap < - min_gap | | gap > min_gap ) + + n ;
}
if ( n < = 1 ) return ;
K = ( int * ) kmalloc ( km , n * sizeof ( int ) ) ;
for ( i = 1 , n = 0 ; i < cnt1 ; + + i ) { // store the positions of long gaps
int gap = ( ( int32_t ) a [ as1 + i ] . y - a [ as1 + i - 1 ] . y ) - ( ( int32_t ) a [ as1 + i ] . x - a [ as1 + i - 1 ] . x ) ;
if ( gap < - min_gap | | gap > min_gap )
K [ n + + ] = i ;
}
max = 0 , max_st = max_en = - 1 ;
for ( k = 0 ; ; + + k ) { // traverse long gaps
int gap , l , n_ins = 0 , n_del = 0 , qs , rs , max_diff = 0 , max_diff_l = - 1 ;
if ( k = = n | | k > = max_en ) {
if ( max_en > 0 )
for ( i = K [ max_st ] ; i < K [ max_en ] ; + + i )
2017-07-28 23:54:18 +08:00
a [ as1 + i ] . y | = MM_SEED_IGNORE ;
2017-07-28 01:19:11 +08:00
max = 0 , max_st = max_en = - 1 ;
if ( k = = n ) break ;
}
i = K [ k ] ;
gap = ( ( int32_t ) a [ as1 + i ] . y - a [ as1 + i - 1 ] . y ) - ( ( int32_t ) a [ as1 + i ] . x - a [ as1 + i - 1 ] . x ) ;
if ( gap > 0 ) n_ins + = gap ;
else n_del + = - gap ;
qs = ( int32_t ) a [ as1 + i - 1 ] . y ;
rs = ( int32_t ) a [ as1 + i - 1 ] . x ;
for ( l = k + 1 ; l < n & & l < = k + max_ext_cnt ; + + l ) {
int j = K [ l ] , diff ;
if ( ( int32_t ) a [ as1 + j ] . y - qs > max_ext_len | | ( int32_t ) a [ as1 + j ] . x - rs > max_ext_len ) break ;
gap = ( ( int32_t ) a [ as1 + j ] . y - ( int32_t ) a [ as1 + j - 1 ] . y ) - ( a [ as1 + j ] . x - a [ as1 + j - 1 ] . x ) ;
if ( gap > 0 ) n_ins + = gap ;
else n_del + = - gap ;
diff = n_ins + n_del - abs ( n_ins - n_del ) ;
if ( max_diff < diff )
max_diff = diff , max_diff_l = l ;
}
if ( max_diff > diff_thres & & max_diff > max )
max = max_diff , max_st = k , max_en = max_diff_l ;
}
kfree ( km , K ) ;
}
static void mm_fix_bad_ends ( const mm_reg1_t * r , const mm128_t * a , int bw , int32_t * as , int32_t * cnt )
2017-07-26 07:53:19 +08:00
{
int32_t i , l ;
* as = r - > as , * cnt = r - > cnt ;
if ( r - > cnt < 3 ) return ;
l = a [ r - > as ] . y > > 32 & 0xff ;
for ( i = r - > as + 1 ; i < r - > as + r - > cnt - 1 ; + + i ) {
int32_t lq , lr , min , max ;
lr = ( int32_t ) a [ i ] . x - ( int32_t ) a [ i - 1 ] . x ;
lq = ( int32_t ) a [ i ] . y - ( int32_t ) a [ i - 1 ] . y ;
min = lr < lq ? lr : lq ;
max = lr > lq ? lr : lq ;
if ( max - min > l > > 1 ) * as = i ;
l + = min ;
if ( l > = bw < < 1 ) break ;
}
* cnt = r - > as + r - > cnt - * as ;
l = a [ r - > as + r - > cnt - 1 ] . y > > 32 & 0xff ;
for ( i = r - > as + r - > cnt - 2 ; i > * as ; - - i ) {
int32_t lq , lr , min , max ;
lr = ( int32_t ) a [ i + 1 ] . x - ( int32_t ) a [ i ] . x ;
lq = ( int32_t ) a [ i + 1 ] . y - ( int32_t ) a [ i ] . y ;
min = lr < lq ? lr : lq ;
max = lr > lq ? lr : lq ;
if ( max - min > l > > 1 ) * cnt = i + 1 - * as ;
l + = min ;
if ( l > = bw ) break ;
}
}
2017-10-09 09:36:34 +08:00
static void mm_max_stretch ( const mm_mapopt_t * opt , const mm_reg1_t * r , const mm128_t * a , int32_t * as , int32_t * cnt )
{
2017-10-10 09:13:34 +08:00
int32_t i , score , max_score , len , max_i , max_len ;
2017-10-09 09:36:34 +08:00
* as = r - > as , * cnt = r - > cnt ;
if ( r - > cnt < 2 ) return ;
max_score = - 1 , max_i = - 1 , max_len = 0 ;
score = a [ r - > as ] . y > > 32 & 0xff , len = 1 ;
for ( i = r - > as + 1 ; i < r - > as + r - > cnt ; + + i ) {
int32_t lq , lr , q_span ;
q_span = a [ i ] . y > > 32 & 0xff ;
lr = ( int32_t ) a [ i ] . x - ( int32_t ) a [ i - 1 ] . x ;
lq = ( int32_t ) a [ i ] . y - ( int32_t ) a [ i - 1 ] . y ;
2017-10-10 09:13:34 +08:00
if ( lq = = lr ) {
score + = lq < q_span ? lq : q_span ;
2017-10-09 09:36:34 +08:00
+ + len ;
} else {
if ( score > max_score )
max_score = score , max_len = len , max_i = i - len ;
score = q_span , len = 1 ;
}
}
if ( score > max_score )
max_score = score , max_len = len , max_i = i - len ;
* as = max_i , * cnt = max_len ;
}
2017-12-11 10:52:07 +08:00
static int mm_seed_ext_score ( void * km , const mm_mapopt_t * opt , const mm_idx_t * mi , const int8_t mat [ 25 ] , int qlen , uint8_t * qseq0 [ 2 ] , const mm128_t * a )
{
uint8_t * qseq , * tseq ;
int q_span = a - > y > > 32 & 0xff , qs , qe , rs , re , rid , score , q_off , t_off , ext_len = opt - > anchor_ext_len ;
void * qp ;
rid = a - > x < < 1 > > 33 ;
re = ( uint32_t ) a - > x + 1 , rs = re - q_span ;
qe = ( uint32_t ) a - > y + 1 , qs = qe - q_span ;
rs = rs - ext_len > 0 ? rs - ext_len : 0 ;
qs = qs - ext_len > 0 ? qs - ext_len : 0 ;
re = re + ext_len < mi - > seq [ rid ] . len ? re + ext_len : mi - > seq [ rid ] . len ;
qe = qe + ext_len < qlen ? qe + ext_len : qlen ;
tseq = ( uint8_t * ) kmalloc ( km , re - rs ) ;
mm_idx_getseq ( mi , rid , rs , re , tseq ) ;
qseq = qseq0 [ a - > x > > 63 ] + qs ;
qp = ksw_ll_qinit ( km , 2 , qe - qs , qseq , 5 , mat ) ;
score = ksw_ll_i16 ( qp , re - rs , tseq , opt - > q , opt - > e , & q_off , & t_off ) ;
kfree ( km , tseq ) ;
kfree ( km , qp ) ;
return score ;
}
static void mm_fix_bad_ends_splice ( void * km , const mm_mapopt_t * opt , const mm_idx_t * mi , const mm_reg1_t * r , const int8_t mat [ 25 ] , int qlen , uint8_t * qseq0 [ 2 ] , const mm128_t * a , int * as1 , int * cnt1 )
{ // this assumes a very crude k-mer based mode; it is not necessary to use a good model just for filtering bounary exons
int score ;
double log_gap ;
* as1 = r - > as , * cnt1 = r - > cnt ;
if ( r - > cnt < 3 ) return ;
log_gap = log ( ( int32_t ) a [ r - > as + 1 ] . x - ( int32_t ) a [ r - > as ] . x ) ;
if ( ( a [ r - > as ] . y > > 32 & 0xff ) < log_gap + opt - > anchor_ext_shift ) {
score = mm_seed_ext_score ( km , opt , mi , mat , qlen , qseq0 , & a [ r - > as ] ) ;
if ( ( double ) score / mat [ 0 ] < log_gap + opt - > anchor_ext_shift ) // a more exact format is "score < log_4(gap) + shift"
+ + ( * as1 ) , - - ( * cnt1 ) ;
}
log_gap = log ( ( int32_t ) a [ r - > as + r - > cnt - 1 ] . x - ( int32_t ) a [ r - > as + r - > cnt - 2 ] . x ) ;
if ( ( a [ r - > as + r - > cnt - 1 ] . y > > 32 & 0xff ) < log_gap + opt - > anchor_ext_shift ) {
score = mm_seed_ext_score ( km , opt , mi , mat , qlen , qseq0 , & a [ r - > as + r - > cnt - 1 ] ) ;
if ( ( double ) score / mat [ 0 ] < log_gap + opt - > anchor_ext_shift )
- - ( * cnt1 ) ;
}
}
2017-12-11 06:54:50 +08:00
static void mm_align1 ( void * km , const mm_mapopt_t * opt , const mm_idx_t * mi , int qlen , uint8_t * qseq0 [ 2 ] , mm_reg1_t * r , mm_reg1_t * r2 , int n_a , mm128_t * a , ksw_extz_t * ez , int splice_flag )
2017-06-24 03:13:53 +08:00
{
2017-10-09 09:36:34 +08:00
int is_sr = ! ! ( opt - > flag & MM_F_SR ) , is_splice = ! ! ( opt - > flag & MM_F_SPLICE ) ;
2017-10-12 09:42:11 +08:00
int32_t rid = a [ r - > as ] . x < < 1 > > 33 , rev = a [ r - > as ] . x > > 63 , as1 , cnt1 ;
2017-06-24 06:25:47 +08:00
uint8_t * tseq , * qseq ;
2017-08-14 09:37:51 +08:00
int32_t i , l , bw , dropped = 0 , extra_flag = 0 , rs0 , re0 , qs0 , qe0 ;
2017-06-24 07:21:47 +08:00
int32_t rs , re , qs , qe ;
2017-06-24 10:42:15 +08:00
int32_t rs1 , qs1 , re1 , qe1 ;
2017-06-24 06:57:00 +08:00
int8_t mat [ 25 ] ;
2017-11-12 08:54:06 +08:00
if ( is_sr ) assert ( ! ( mi - > flag & MM_I_HPC ) ) ; // HPC won't work with SR because with HPC we can't easily tell if there is a gap
2017-10-10 10:02:30 +08:00
2017-10-09 09:36:34 +08:00
r2 - > cnt = 0 ;
2017-06-30 02:54:54 +08:00
if ( r - > cnt = = 0 ) return ;
2017-06-24 06:57:00 +08:00
ksw_gen_simple_mat ( 5 , mat , opt - > a , opt - > b ) ;
2017-06-24 10:42:15 +08:00
bw = ( int ) ( opt - > bw * 1.5 + 1. ) ;
2017-06-24 03:13:53 +08:00
2017-11-12 08:54:06 +08:00
if ( is_sr & & ! ( mi - > flag & MM_I_HPC ) ) {
2017-10-09 09:36:34 +08:00
mm_max_stretch ( opt , r , a , & as1 , & cnt1 ) ;
2017-10-10 09:13:34 +08:00
rs = ( int32_t ) a [ as1 ] . x + 1 - ( int32_t ) ( a [ as1 ] . y > > 32 & 0xff ) ;
qs = ( int32_t ) a [ as1 ] . y + 1 - ( int32_t ) ( a [ as1 ] . y > > 32 & 0xff ) ;
re = ( int32_t ) a [ as1 + cnt1 - 1 ] . x + 1 ;
qe = ( int32_t ) a [ as1 + cnt1 - 1 ] . y + 1 ;
2017-10-09 09:36:34 +08:00
} else {
2017-12-11 10:52:07 +08:00
if ( is_splice ) {
mm_fix_bad_ends_splice ( km , opt , mi , r , mat , qlen , qseq0 , a , & as1 , & cnt1 ) ;
} else {
2017-10-09 09:36:34 +08:00
mm_fix_bad_ends ( r , a , opt - > bw , & as1 , & cnt1 ) ;
2017-12-11 10:52:07 +08:00
}
2017-10-09 09:36:34 +08:00
mm_filter_bad_seeds ( km , as1 , cnt1 , a , 10 , 40 , opt - > max_gap > > 1 , 10 ) ;
2017-10-10 09:13:34 +08:00
mm_adjust_minier ( mi , qseq0 , & a [ as1 ] , & rs , & qs ) ;
mm_adjust_minier ( mi , qseq0 , & a [ as1 + cnt1 - 1 ] , & re , & qe ) ;
2017-10-09 09:36:34 +08:00
}
assert ( cnt1 > 0 ) ;
if ( is_splice ) {
2017-08-17 18:02:44 +08:00
if ( splice_flag & MM_F_SPLICE_FOR ) extra_flag | = rev ? KSW_EZ_SPLICE_REV : KSW_EZ_SPLICE_FOR ;
if ( splice_flag & MM_F_SPLICE_REV ) extra_flag | = rev ? KSW_EZ_SPLICE_FOR : KSW_EZ_SPLICE_REV ;
2017-10-28 12:25:01 +08:00
if ( opt - > flag & MM_F_SPLICE_FLANK ) extra_flag | = KSW_EZ_SPLICE_FLANK ;
2017-08-14 09:37:51 +08:00
}
2017-10-12 09:42:11 +08:00
/* Look for the start and end of regions to perform DP. This sounds easy
* but is in fact tricky . Excessively small regions lead to unnecessary
* clippings and lose alignable sequences . Excessively large regions
* occasionally lead to large overlaps between two chains and may cause
* loss of alignments in corner cases . */
if ( is_sr ) {
qs0 = 0 , qe0 = qlen ;
l = qs ;
2017-10-16 22:38:22 +08:00
l + = l * opt - > a + opt - > end_bonus > opt - > q ? ( l * opt - > a + opt - > end_bonus - opt - > q ) / opt - > e : 0 ;
2017-10-12 09:42:11 +08:00
rs0 = rs - l > 0 ? rs - l : 0 ;
l = qlen - qe ;
2017-10-16 22:38:22 +08:00
l + = l * opt - > a + opt - > end_bonus > opt - > q ? ( l * opt - > a + opt - > end_bonus - opt - > q ) / opt - > e : 0 ;
2017-10-12 09:42:11 +08:00
re0 = re + l < mi - > seq [ rid ] . len ? re + l : mi - > seq [ rid ] . len ;
} else {
// compute rs0 and qs0
rs0 = ( int32_t ) a [ r - > as ] . x + 1 - ( int32_t ) ( a [ r - > as ] . y > > 32 & 0xff ) ;
qs0 = ( int32_t ) a [ r - > as ] . y + 1 - ( int32_t ) ( a [ r - > as ] . y > > 32 & 0xff ) ;
2017-10-22 07:54:04 +08:00
if ( rs0 < 0 ) rs0 = 0 ; // this may happen when HPC is in use
assert ( qs0 > = 0 ) ; // this should never happen, or it is logic error
2017-10-12 09:42:11 +08:00
rs1 = qs1 = 0 ;
2017-10-12 09:54:32 +08:00
for ( i = r - > as - 1 , l = 0 ; i > = 0 & & a [ i ] . x > > 32 = = a [ r - > as ] . x > > 32 ; - - i ) { // inspect nearby seeds
int32_t x = ( int32_t ) a [ i ] . x + 1 - ( int32_t ) ( a [ i ] . y > > 32 & 0xff ) ;
int32_t y = ( int32_t ) a [ i ] . y + 1 - ( int32_t ) ( a [ i ] . y > > 32 & 0xff ) ;
if ( x < rs0 & & y < qs0 ) {
if ( + + l > opt - > min_cnt ) {
l = rs0 - x > qs0 - y ? rs0 - x : qs0 - y ;
rs1 = rs0 - l , qs1 = qs0 - l ;
break ;
2017-10-12 09:42:11 +08:00
}
}
}
2017-10-12 09:54:32 +08:00
if ( qs > 0 & & rs > 0 ) {
2017-10-12 09:42:11 +08:00
l = qs < opt - > max_gap ? qs : opt - > max_gap ;
qs1 = qs1 > qs - l ? qs1 : qs - l ;
qs0 = qs0 < qs1 ? qs0 : qs1 ; // at least include qs0
l + = l * opt - > a > opt - > q ? ( l * opt - > a - opt - > q ) / opt - > e : 0 ;
l = l < opt - > max_gap ? l : opt - > max_gap ;
l = l < rs ? l : rs ;
rs1 = rs1 > rs - l ? rs1 : rs - l ;
rs0 = rs0 < rs1 ? rs0 : rs1 ;
} else rs0 = rs , qs0 = qs ;
// compute re0 and qe0
re0 = ( int32_t ) a [ r - > as + r - > cnt - 1 ] . x + 1 ;
qe0 = ( int32_t ) a [ r - > as + r - > cnt - 1 ] . y + 1 ;
re1 = mi - > seq [ rid ] . len , qe1 = qlen ;
2017-10-12 09:54:32 +08:00
for ( i = r - > as + r - > cnt , l = 0 ; i < n_a & & a [ i ] . x > > 32 = = a [ r - > as ] . x > > 32 ; + + i ) { // inspect nearby seeds
int32_t x = ( int32_t ) a [ i ] . x + 1 ;
int32_t y = ( int32_t ) a [ i ] . y + 1 ;
if ( x > re0 & & y > qe0 ) {
if ( + + l > opt - > min_cnt ) {
l = x - re0 > y - qe0 ? x - re0 : y - qe0 ;
re1 = re0 + l , qe1 = qe0 + l ;
break ;
2017-10-12 09:42:11 +08:00
}
}
}
if ( qe < qlen & & re < mi - > seq [ rid ] . len ) {
l = qlen - qe < opt - > max_gap ? qlen - qe : opt - > max_gap ;
qe1 = qe1 < qe + l ? qe1 : qe + l ;
qe0 = qe0 > qe1 ? qe0 : qe1 ; // at least include qe0
l + = l * opt - > a > opt - > q ? ( l * opt - > a - opt - > q ) / opt - > e : 0 ;
l = l < opt - > max_gap ? l : opt - > max_gap ;
l = l < mi - > seq [ rid ] . len - re ? l : mi - > seq [ rid ] . len - re ;
re1 = re1 < re + l ? re1 : re + l ;
re0 = re0 > re1 ? re0 : re1 ;
} else re0 = re , qe0 = qe ;
}
2018-01-31 09:05:02 +08:00
if ( a [ r - > as ] . y & MM_SEED_SELF ) {
int max_ext = r - > qs > r - > rs ? r - > qs - r - > rs : r - > rs - r - > qs ;
if ( r - > rs - rs0 > max_ext ) rs0 = r - > rs - max_ext ;
if ( r - > qs - qs0 > max_ext ) qs0 = r - > qs - max_ext ;
max_ext = r - > qe > r - > re ? r - > qe - r - > re : r - > re - r - > qe ;
if ( re0 - r - > re > max_ext ) re0 = r - > re + max_ext ;
if ( qe0 - r - > qe > max_ext ) qe0 = r - > qe + max_ext ;
}
2017-06-24 03:13:53 +08:00
2017-07-01 02:21:44 +08:00
assert ( re0 > rs0 ) ;
tseq = ( uint8_t * ) kmalloc ( km , re0 - rs0 ) ;
2017-06-24 06:25:47 +08:00
if ( qs > 0 & & rs > 0 ) { // left extension
qseq = & qseq0 [ rev ] [ qs0 ] ;
2017-06-24 07:21:47 +08:00
mm_idx_getseq ( mi , rid , rs0 , rs , tseq ) ;
2017-06-25 07:27:58 +08:00
mm_seq_rev ( qs - qs0 , qseq ) ;
mm_seq_rev ( rs - rs0 , tseq ) ;
2017-10-11 12:14:25 +08:00
mm_align_pair ( km , opt , qs - qs0 , qseq , rs - rs0 , tseq , mat , bw , opt - > end_bonus , extra_flag | KSW_EZ_EXTZ_ONLY | KSW_EZ_RIGHT | KSW_EZ_REV_CIGAR , ez ) ;
2017-06-27 01:56:25 +08:00
if ( ez - > n_cigar > 0 ) {
mm_append_cigar ( r , ez - > n_cigar , ez - > cigar ) ;
2017-07-01 02:21:44 +08:00
r - > p - > dp_score + = ez - > max ;
2017-06-27 01:56:25 +08:00
}
2017-10-11 12:14:25 +08:00
rs1 = rs - ( ez - > reach_end ? ez - > mqe_t + 1 : ez - > max_t + 1 ) ;
qs1 = qs - ( ez - > reach_end ? qs - qs0 : ez - > max_q + 1 ) ;
2017-06-25 07:27:58 +08:00
mm_seq_rev ( qs - qs0 , qseq ) ;
2017-06-24 07:21:47 +08:00
} else rs1 = rs , qs1 = qs ;
2017-06-27 02:45:23 +08:00
re1 = rs , qe1 = qs ;
2017-06-24 10:42:15 +08:00
assert ( qs1 > = 0 & & rs1 > = 0 ) ;
2017-06-24 07:21:47 +08:00
2017-10-10 10:02:30 +08:00
for ( i = is_sr ? cnt1 - 1 : 1 ; i < cnt1 ; + + i ) { // gap filling
if ( ( a [ as1 + i ] . y & ( MM_SEED_IGNORE | MM_SEED_TANDEM ) ) & & i ! = cnt1 - 1 ) continue ;
2017-11-12 08:54:06 +08:00
if ( is_sr & & ! ( mi - > flag & MM_I_HPC ) ) {
2017-10-10 10:02:30 +08:00
re = ( int32_t ) a [ as1 + i ] . x + 1 ;
qe = ( int32_t ) a [ as1 + i ] . y + 1 ;
} else mm_adjust_minier ( mi , qseq0 , & a [ as1 + i ] , & re , & qe ) ;
re1 = re , qe1 = qe ;
if ( i = = cnt1 - 1 | | ( a [ as1 + i ] . y & MM_SEED_LONG_JOIN ) | | ( qe - qs > = opt - > min_ksw_len & & re - rs > = opt - > min_ksw_len ) ) {
int j , bw1 = bw ;
if ( a [ as1 + i ] . y & MM_SEED_LONG_JOIN )
bw1 = qe - qs > re - rs ? qe - qs : re - rs ;
qseq = & qseq0 [ rev ] [ qs ] ;
mm_idx_getseq ( mi , rid , rs , re , tseq ) ;
if ( is_sr ) { // perform ungapped alignment
assert ( qe - qs = = re - rs ) ;
ksw_reset_extz ( ez ) ;
for ( j = 0 , ez - > score = 0 ; j < qe - qs ; + + j ) {
if ( qseq [ j ] > = 4 | | tseq [ j ] > = 4 ) ez - > score + = opt - > e2 ;
else ez - > score + = qseq [ j ] = = tseq [ j ] ? opt - > a : - opt - > b ;
}
ez - > cigar = ksw_push_cigar ( km , & ez - > n_cigar , & ez - > m_cigar , ez - > cigar , 0 , qe - qs ) ;
} else { // perform normal gapped alignment
2017-10-11 12:14:25 +08:00
mm_align_pair ( km , opt , qe - qs , qseq , re - rs , tseq , mat , bw1 , - 1 , extra_flag | KSW_EZ_APPROX_MAX , ez ) ; // first pass: with approximate Z-drop
2017-10-09 09:36:34 +08:00
}
2017-10-10 10:02:30 +08:00
if ( mm_check_zdrop ( qseq , tseq , ez - > n_cigar , ez - > cigar , mat , opt - > q , opt - > e , opt - > zdrop ) )
2017-10-11 12:14:25 +08:00
mm_align_pair ( km , opt , qe - qs , qseq , re - rs , tseq , mat , bw1 , - 1 , extra_flag , ez ) ; // second pass: lift approximate
2017-10-10 10:02:30 +08:00
if ( ez - > n_cigar > 0 )
mm_append_cigar ( r , ez - > n_cigar , ez - > cigar ) ;
if ( ez - > zdropped ) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle.
for ( j = i - 1 ; j > = 0 ; - - j )
2017-10-18 03:52:36 +08:00
if ( ( int32_t ) a [ as1 + j ] . x < = rs + ez - > max_t )
2017-10-10 10:02:30 +08:00
break ;
dropped = 1 ;
2017-10-18 03:52:36 +08:00
if ( j < 0 ) j = 0 ;
2017-10-10 10:02:30 +08:00
r - > p - > dp_score + = ez - > max ;
re1 = rs + ( ez - > max_t + 1 ) ;
qe1 = qs + ( ez - > max_q + 1 ) ;
if ( cnt1 - ( j + 1 ) > = opt - > min_cnt )
2017-10-18 03:52:36 +08:00
mm_split_reg ( r , r2 , as1 + j + 1 - r - > as , qlen , a ) ;
2017-10-10 10:02:30 +08:00
break ;
} else r - > p - > dp_score + = ez - > score ;
rs = re , qs = qe ;
2017-06-24 03:13:53 +08:00
}
}
2017-06-24 07:21:47 +08:00
2017-06-30 23:52:42 +08:00
if ( ! dropped & & qe < qe0 & & re < re0 ) { // right extension
2017-06-24 10:42:15 +08:00
qseq = & qseq0 [ rev ] [ qe ] ;
mm_idx_getseq ( mi , rid , re , re0 , tseq ) ;
2017-10-11 12:14:25 +08:00
mm_align_pair ( km , opt , qe0 - qe , qseq , re0 - re , tseq , mat , bw , opt - > end_bonus , extra_flag | KSW_EZ_EXTZ_ONLY , ez ) ;
2017-06-27 01:56:25 +08:00
if ( ez - > n_cigar > 0 ) {
mm_append_cigar ( r , ez - > n_cigar , ez - > cigar ) ;
2017-07-01 02:21:44 +08:00
r - > p - > dp_score + = ez - > max ;
2017-06-27 01:56:25 +08:00
}
2017-10-11 12:14:25 +08:00
re1 = re + ( ez - > reach_end ? ez - > mqe_t + 1 : ez - > max_t + 1 ) ;
qe1 = qe + ( ez - > reach_end ? qe0 - qe : ez - > max_q + 1 ) ;
2017-06-25 22:52:14 +08:00
}
2017-06-24 10:42:15 +08:00
assert ( qe1 < = qlen ) ;
r - > rs = rs1 , r - > re = re1 ;
if ( rev ) r - > qs = qlen - qe1 , r - > qe = qlen - qs1 ;
else r - > qs = qs1 , r - > qe = qe1 ;
2017-07-01 02:21:44 +08:00
assert ( re1 - rs1 < = re0 - rs0 ) ;
if ( r - > p ) {
mm_idx_getseq ( mi , rid , rs1 , re1 , tseq ) ;
2017-12-11 06:54:50 +08:00
mm_update_extra ( r , & qseq0 [ r - > rev ] [ qs1 ] , tseq , mat , opt - > q , opt - > e ) ;
2017-08-17 18:02:44 +08:00
if ( rev & & r - > p - > trans_strand )
r - > p - > trans_strand ^ = 3 ; // flip to the read strand
2017-07-01 02:21:44 +08:00
}
2017-06-24 06:25:47 +08:00
kfree ( km , tseq ) ;
2017-06-24 03:13:53 +08:00
}
2017-12-11 06:54:50 +08:00
static int mm_align1_inv ( void * km , const mm_mapopt_t * opt , const mm_idx_t * mi , int qlen , uint8_t * qseq0 [ 2 ] , const mm_reg1_t * r1 , const mm_reg1_t * r2 , mm_reg1_t * r_inv , ksw_extz_t * ez )
2017-07-29 23:01:49 +08:00
{
2017-07-30 05:40:53 +08:00
int tl , ql , score , ret = 0 , q_off , t_off ;
2017-12-11 06:54:50 +08:00
uint8_t * tseq , * qseq ;
2017-07-29 23:01:49 +08:00
int8_t mat [ 25 ] ;
2017-07-30 05:40:53 +08:00
void * qp ;
2017-07-29 23:01:49 +08:00
memset ( r_inv , 0 , sizeof ( mm_reg1_t ) ) ;
2017-07-30 01:09:10 +08:00
if ( ! ( r1 - > split & 1 ) | | ! ( r2 - > split & 2 ) ) return 0 ;
2017-07-30 02:09:35 +08:00
if ( r1 - > id ! = r1 - > parent & & r1 - > parent ! = MM_PARENT_TMP_PRI ) return 0 ;
if ( r2 - > id ! = r2 - > parent & & r2 - > parent ! = MM_PARENT_TMP_PRI ) return 0 ;
2017-07-30 01:09:10 +08:00
if ( r1 - > rid ! = r2 - > rid | | r1 - > rev ! = r2 - > rev ) return 0 ;
ql = r2 - > qs - r1 - > qe ;
tl = r2 - > rs - r1 - > re ;
2017-07-30 08:21:53 +08:00
if ( ql < opt - > min_chain_score | | ql > opt - > max_gap ) return 0 ;
if ( tl < opt - > min_chain_score | | tl > opt - > max_gap ) return 0 ;
2017-07-30 05:40:53 +08:00
2017-07-29 23:01:49 +08:00
ksw_gen_simple_mat ( 5 , mat , opt - > a , opt - > b ) ;
2017-07-30 01:09:10 +08:00
tseq = ( uint8_t * ) kmalloc ( km , tl ) ;
2017-07-29 23:01:49 +08:00
mm_idx_getseq ( mi , r1 - > rid , r1 - > re , r2 - > rs , tseq ) ;
2017-07-30 01:09:10 +08:00
qseq = & qseq0 [ ! r1 - > rev ] [ qlen - r2 - > qs ] ;
2017-07-30 05:40:53 +08:00
mm_seq_rev ( ql , qseq ) ;
mm_seq_rev ( tl , tseq ) ;
qp = ksw_ll_qinit ( km , 2 , ql , qseq , 5 , mat ) ;
score = ksw_ll_i16 ( qp , tl , tseq , opt - > q , opt - > e , & q_off , & t_off ) ;
kfree ( km , qp ) ;
mm_seq_rev ( ql , qseq ) ;
mm_seq_rev ( tl , tseq ) ;
if ( score < opt - > min_dp_max ) goto end_align1_inv ;
q_off = ql - ( q_off + 1 ) , t_off = tl - ( t_off + 1 ) ;
2017-10-11 12:14:25 +08:00
mm_align_pair ( km , opt , ql - q_off , qseq + q_off , tl - t_off , tseq + t_off , mat , ( int ) ( opt - > bw * 1.5 ) , - 1 , KSW_EZ_EXTZ_ONLY , ez ) ;
2017-07-30 05:40:53 +08:00
if ( ez - > n_cigar = = 0 ) goto end_align1_inv ; // should never be here
mm_append_cigar ( r_inv , ez - > n_cigar , ez - > cigar ) ;
r_inv - > p - > dp_score = ez - > max ;
r_inv - > id = - 1 ;
r_inv - > parent = MM_PARENT_UNSET ;
r_inv - > inv = 1 ;
r_inv - > rev = ! r1 - > rev ;
2017-10-04 23:32:06 +08:00
r_inv - > rid = r1 - > rid ;
2017-12-09 02:16:18 +08:00
r_inv - > div = - 1.0f ;
2017-07-30 05:40:53 +08:00
r_inv - > qs = r1 - > qe + q_off , r_inv - > qe = r_inv - > qs + ez - > max_q + 1 ;
r_inv - > rs = r1 - > re + t_off , r_inv - > re = r_inv - > rs + ez - > max_t + 1 ;
2017-12-11 06:54:50 +08:00
mm_update_extra ( r_inv , & qseq [ q_off ] , & tseq [ t_off ] , mat , opt - > q , opt - > e ) ;
2017-07-30 05:40:53 +08:00
ret = 1 ;
end_align1_inv :
2017-07-30 01:09:10 +08:00
kfree ( km , tseq ) ;
return ret ;
}
static inline mm_reg1_t * mm_insert_reg ( const mm_reg1_t * r , int i , int * n_regs , mm_reg1_t * regs )
{
regs = ( mm_reg1_t * ) realloc ( regs , ( * n_regs + 1 ) * sizeof ( mm_reg1_t ) ) ;
if ( i + 1 ! = * n_regs )
memmove ( & regs [ i + 2 ] , & regs [ i + 1 ] , sizeof ( mm_reg1_t ) * ( * n_regs - i - 1 ) ) ;
regs [ i + 1 ] = * r ;
+ + * n_regs ;
return regs ;
2017-07-29 23:01:49 +08:00
}
2017-12-11 06:54:50 +08:00
mm_reg1_t * mm_align_skeleton ( void * km , const mm_mapopt_t * opt , const mm_idx_t * mi , int qlen , const char * qstr , int * n_regs_ , mm_reg1_t * regs , mm128_t * a )
2017-06-24 01:44:45 +08:00
{
extern unsigned char seq_nt4_table [ 256 ] ;
2017-10-11 05:25:12 +08:00
int32_t i , n_regs = * n_regs_ , n_a ;
2017-12-11 06:54:50 +08:00
uint8_t * qseq0 [ 2 ] ;
2017-06-24 06:57:00 +08:00
ksw_extz_t ez ;
2017-06-24 01:44:45 +08:00
2017-06-28 22:35:21 +08:00
// encode the query sequence
2017-10-16 22:38:22 +08:00
qseq0 [ 0 ] = ( uint8_t * ) kmalloc ( km , qlen * 2 ) ;
qseq0 [ 1 ] = qseq0 [ 0 ] + qlen ;
2017-06-24 01:44:45 +08:00
for ( i = 0 ; i < qlen ; + + i ) {
qseq0 [ 0 ] [ i ] = seq_nt4_table [ ( uint8_t ) qstr [ i ] ] ;
qseq0 [ 1 ] [ qlen - 1 - i ] = qseq0 [ 0 ] [ i ] < 4 ? 3 - qseq0 [ 0 ] [ i ] : 4 ;
}
2017-06-28 22:35:21 +08:00
// align through seed hits
2017-10-12 09:42:11 +08:00
n_a = mm_squeeze_a ( km , n_regs , regs , a ) ;
2017-06-28 22:35:21 +08:00
memset ( & ez , 0 , sizeof ( ksw_extz_t ) ) ;
2017-07-29 22:31:46 +08:00
for ( i = 0 ; i < n_regs ; + + i ) {
2017-06-24 06:25:47 +08:00
mm_reg1_t r2 ;
2017-09-20 23:11:53 +08:00
if ( ( opt - > flag & MM_F_SPLICE ) & & ( opt - > flag & MM_F_SPLICE_FOR ) & & ( opt - > flag & MM_F_SPLICE_REV ) ) { // then do two rounds of alignments for both strands
2017-08-17 18:02:44 +08:00
mm_reg1_t s [ 2 ] , s2 [ 2 ] ;
int which , trans_strand ;
s [ 0 ] = s [ 1 ] = regs [ i ] ;
2017-12-11 06:54:50 +08:00
mm_align1 ( km , opt , mi , qlen , qseq0 , & s [ 0 ] , & s2 [ 0 ] , n_a , a , & ez , MM_F_SPLICE_FOR ) ;
mm_align1 ( km , opt , mi , qlen , qseq0 , & s [ 1 ] , & s2 [ 1 ] , n_a , a , & ez , MM_F_SPLICE_REV ) ;
2017-08-17 18:02:44 +08:00
if ( s [ 0 ] . p - > dp_score > s [ 1 ] . p - > dp_score ) which = 0 , trans_strand = 1 ;
else if ( s [ 0 ] . p - > dp_score < s [ 1 ] . p - > dp_score ) which = 1 , trans_strand = 2 ;
else trans_strand = 3 , which = ( qlen + s [ 0 ] . p - > dp_score ) & 1 ; // randomly choose a strand, effectively
if ( which = = 0 ) {
regs [ i ] = s [ 0 ] , r2 = s2 [ 0 ] ;
free ( s [ 1 ] . p ) ;
} else {
regs [ i ] = s [ 1 ] , r2 = s2 [ 1 ] ;
free ( s [ 0 ] . p ) ;
}
regs [ i ] . p - > trans_strand = trans_strand ;
2017-09-20 23:11:53 +08:00
} else { // one round of alignment
2017-12-11 06:54:50 +08:00
mm_align1 ( km , opt , mi , qlen , qseq0 , & regs [ i ] , & r2 , n_a , a , & ez , opt - > flag ) ;
2017-09-20 23:11:53 +08:00
if ( opt - > flag & MM_F_SPLICE )
2017-08-17 18:02:44 +08:00
regs [ i ] . p - > trans_strand = opt - > flag & MM_F_SPLICE_FOR ? 1 : 2 ;
}
2017-07-30 01:09:10 +08:00
if ( r2 . cnt > 0 ) regs = mm_insert_reg ( & r2 , i , & n_regs , regs ) ;
2018-01-16 23:34:30 +08:00
if ( ! ( opt - > flag & ( MM_F_SPLICE | MM_F_SR ) ) & & ! ( opt - > flag & ( MM_F_FOR_ONLY | MM_F_REV_ONLY ) ) & & i > 0 ) { // don't try inversion alignment for -xsplice or -xsr, or --for-only/rev-only
2017-12-11 06:54:50 +08:00
if ( mm_align1_inv ( km , opt , mi , qlen , qseq0 , & regs [ i - 1 ] , & regs [ i ] , & r2 , & ez ) ) {
2017-10-06 04:15:14 +08:00
regs = mm_insert_reg ( & r2 , i , & n_regs , regs ) ;
+ + i ; // skip the inserted INV alignment
}
2017-07-29 23:01:49 +08:00
}
2017-06-24 01:44:45 +08:00
}
2017-07-01 10:15:45 +08:00
* n_regs_ = n_regs ;
2017-10-16 22:38:22 +08:00
kfree ( km , qseq0 [ 0 ] ) ;
2017-06-28 22:35:21 +08:00
kfree ( km , ez . cigar ) ;
2017-07-01 10:15:45 +08:00
mm_filter_regs ( km , opt , n_regs_ , regs ) ;
2017-07-03 22:44:26 +08:00
mm_hit_sort_by_dp ( km , n_regs_ , regs ) ;
2017-06-25 10:57:43 +08:00
return regs ;
2017-06-24 01:44:45 +08:00
}