2013-02-01 02:59:48 +08:00
# include <stdlib.h>
2013-02-01 04:55:22 +08:00
# include <string.h>
# include <stdio.h>
2013-02-05 01:37:38 +08:00
# include <assert.h>
2013-02-08 06:15:45 +08:00
# include <math.h>
2013-02-08 02:29:01 +08:00
# ifdef HAVE_PTHREAD
# include <pthread.h>
# endif
2013-02-25 02:09:29 +08:00
2013-02-08 03:36:18 +08:00
# include "kstring.h"
2013-02-01 02:59:48 +08:00
# include "bwamem.h"
2013-02-02 05:39:50 +08:00
# include "bntseq.h"
2013-02-05 01:37:38 +08:00
# include "ksw.h"
2013-02-12 23:36:15 +08:00
# include "kvec.h"
2013-02-05 06:23:06 +08:00
# include "ksort.h"
2013-02-27 11:55:44 +08:00
# include "utils.h"
2013-02-05 01:37:38 +08:00
2013-05-02 22:12:01 +08:00
# ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
# endif
2013-02-20 14:11:38 +08:00
/* Theory on probability and scoring *ungapped* alignment
*
* s ' ( a , b ) = log [ P ( b | a ) / P ( b ) ] = log [ 4 P ( b | a ) ] , assuming uniform base distribution
* s ' ( a , a ) = log ( 4 ) , s ' ( a , b ) = log ( 4 e / 3 ) , where e is the error rate
*
* Scale s ' ( a , b ) to s ( a , a ) s . t . s ( a , a ) = x . Then s ( a , b ) = x * s ' ( a , b ) / log ( 4 ) , or conversely : s ' ( a , b ) = s ( a , b ) * log ( 4 ) / x
*
* If the matching score is x and mismatch penalty is - y , we can compute error rate e :
* e = .75 * exp [ - log ( 4 ) * y / x ]
*
* log P ( seq ) = \ sum_i log P ( b_i | a_i ) = \ sum_i { s ' ( a , b ) - log ( 4 ) }
* = \ sum_i { s ( a , b ) * log ( 4 ) / x - log ( 4 ) } = log ( 4 ) * ( S / x - l )
*
* where S = \ sum_i s ( a , b ) is the alignment score . Converting to the phred scale :
* Q ( seq ) = - 10 / log ( 10 ) * log P ( seq ) = 10 * log ( 4 ) / log ( 10 ) * ( l - S / x ) = 6.02 * ( l - S / x )
*
*
* Gap open ( zero gap ) : q ' = log [ P ( gap - open ) ] , r ' = log [ P ( gap - ext ) ] ( see Durbin et al . ( 1998 ) Section 4.1 )
* Then q = x * log [ P ( gap - open ) ] / log ( 4 ) , r = x * log [ P ( gap - ext ) ] / log ( 4 )
2013-02-21 08:11:44 +08:00
*
* When there are gaps , l should be the length of alignment matches ( i . e . the M operator in CIGAR )
2013-02-20 14:11:38 +08:00
*/
2013-02-02 05:39:50 +08:00
mem_opt_t * mem_opt_init ( )
2013-02-01 02:59:48 +08:00
{
2013-02-02 05:39:50 +08:00
mem_opt_t * o ;
2013-05-02 22:12:01 +08:00
o = calloc ( 1 , sizeof ( mem_opt_t ) ) ;
2013-02-19 05:33:06 +08:00
o - > flag = 0 ;
2013-02-28 10:13:39 +08:00
o - > a = 1 ; o - > b = 4 ; o - > q = 6 ; o - > r = 1 ; o - > w = 100 ;
2013-03-06 11:49:38 +08:00
o - > T = 30 ;
2013-03-05 13:34:33 +08:00
o - > zdrop = 100 ;
2013-04-18 04:50:20 +08:00
o - > pen_unpaired = 17 ;
2013-08-29 03:59:05 +08:00
o - > pen_clip5 = o - > pen_clip3 = 5 ;
2013-02-08 08:50:37 +08:00
o - > min_seed_len = 19 ;
2013-02-09 05:56:28 +08:00
o - > split_width = 10 ;
o - > max_occ = 10000 ;
2013-02-01 04:55:22 +08:00
o - > max_chain_gap = 10000 ;
2013-02-11 23:59:38 +08:00
o - > max_ins = 10000 ;
2013-02-05 13:17:20 +08:00
o - > mask_level = 0.50 ;
2013-02-05 13:41:07 +08:00
o - > chain_drop_ratio = 0.50 ;
2013-02-22 01:52:00 +08:00
o - > split_factor = 1.5 ;
2013-02-08 02:13:43 +08:00
o - > chunk_size = 10000000 ;
o - > n_threads = 1 ;
2013-02-26 00:56:02 +08:00
o - > max_matesw = 100 ;
2013-09-09 23:36:50 +08:00
o - > mask_level_redun = 0.95 ;
2013-11-03 00:25:53 +08:00
o - > mapQ_coef_len = 50 ; o - > mapQ_coef_fac = log ( o - > mapQ_coef_len ) ;
// o->mapQ_coef_len = o->mapQ_coef_fac = 0;
2013-03-05 22:38:12 +08:00
bwa_fill_scmat ( o - > a , o - > b , o - > mat ) ;
2013-02-01 02:59:48 +08:00
return o ;
}
/***************************
* SMEM iterator interface *
* * * * * * * * * * * * * * * * * * * * * * * * * * */
2013-02-02 03:38:44 +08:00
struct __smem_i {
const bwt_t * bwt ;
const uint8_t * query ;
int start , len ;
bwtintv_v * matches ; // matches; to be returned by smem_next()
bwtintv_v * sub ; // sub-matches inside the longest match; temporary
bwtintv_v * tmpvec [ 2 ] ; // temporary arrays
} ;
2013-02-01 02:59:48 +08:00
smem_i * smem_itr_init ( const bwt_t * bwt )
{
smem_i * itr ;
2013-05-02 22:12:01 +08:00
itr = calloc ( 1 , sizeof ( smem_i ) ) ;
2013-02-01 02:59:48 +08:00
itr - > bwt = bwt ;
2013-05-02 22:12:01 +08:00
itr - > tmpvec [ 0 ] = calloc ( 1 , sizeof ( bwtintv_v ) ) ;
itr - > tmpvec [ 1 ] = calloc ( 1 , sizeof ( bwtintv_v ) ) ;
itr - > matches = calloc ( 1 , sizeof ( bwtintv_v ) ) ;
itr - > sub = calloc ( 1 , sizeof ( bwtintv_v ) ) ;
2013-02-01 02:59:48 +08:00
return itr ;
}
void smem_itr_destroy ( smem_i * itr )
{
2013-02-02 04:26:34 +08:00
free ( itr - > tmpvec [ 0 ] - > a ) ; free ( itr - > tmpvec [ 0 ] ) ;
free ( itr - > tmpvec [ 1 ] - > a ) ; free ( itr - > tmpvec [ 1 ] ) ;
free ( itr - > matches - > a ) ; free ( itr - > matches ) ;
free ( itr - > sub - > a ) ; free ( itr - > sub ) ;
2013-02-01 02:59:48 +08:00
free ( itr ) ;
}
2013-02-02 03:20:38 +08:00
void smem_set_query ( smem_i * itr , int len , const uint8_t * query )
2013-02-01 02:59:48 +08:00
{
itr - > query = query ;
itr - > start = 0 ;
itr - > len = len ;
}
2013-02-09 05:56:28 +08:00
const bwtintv_v * smem_next ( smem_i * itr , int split_len , int split_width )
2013-02-01 02:59:48 +08:00
{
2013-02-09 05:56:28 +08:00
int i , max , max_i , ori_start ;
2013-02-02 03:38:44 +08:00
itr - > tmpvec [ 0 ] - > n = itr - > tmpvec [ 1 ] - > n = itr - > matches - > n = itr - > sub - > n = 0 ;
if ( itr - > start > = itr - > len | | itr - > start < 0 ) return 0 ;
2013-02-01 02:59:48 +08:00
while ( itr - > start < itr - > len & & itr - > query [ itr - > start ] > 3 ) + + itr - > start ; // skip ambiguous bases
2013-02-02 03:38:44 +08:00
if ( itr - > start = = itr - > len ) return 0 ;
2013-02-09 05:56:28 +08:00
ori_start = itr - > start ;
itr - > start = bwt_smem1 ( itr - > bwt , itr - > len , itr - > query , ori_start , 1 , itr - > matches , itr - > tmpvec ) ; // search for SMEM
2013-02-02 03:38:44 +08:00
if ( itr - > matches - > n = = 0 ) return itr - > matches ; // well, in theory, we should never come here
for ( i = max = 0 , max_i = 0 ; i < itr - > matches - > n ; + + i ) { // look for the longest match
2013-02-02 03:20:38 +08:00
bwtintv_t * p = & itr - > matches - > a [ i ] ;
int len = ( uint32_t ) p - > info - ( p - > info > > 32 ) ;
if ( max < len ) max = len , max_i = i ;
}
2013-02-09 05:56:28 +08:00
if ( split_len > 0 & & max > = split_len & & itr - > matches - > a [ max_i ] . x [ 2 ] < = split_width ) { // if the longest SMEM is unique and long
2013-02-02 03:20:38 +08:00
int j ;
2013-02-02 03:38:44 +08:00
bwtintv_v * a = itr - > tmpvec [ 0 ] ; // reuse tmpvec[0] for merging
2013-02-02 03:20:38 +08:00
bwtintv_t * p = & itr - > matches - > a [ max_i ] ;
2013-02-09 05:56:28 +08:00
bwt_smem1 ( itr - > bwt , itr - > len , itr - > query , ( ( uint32_t ) p - > info + ( p - > info > > 32 ) ) > > 1 , itr - > matches - > a [ max_i ] . x [ 2 ] + 1 , itr - > sub , itr - > tmpvec ) ; // starting from the middle of the longest MEM
2013-02-02 03:20:38 +08:00
i = j = 0 ; a - > n = 0 ;
while ( i < itr - > matches - > n & & j < itr - > sub - > n ) { // ordered merge
2013-02-05 05:48:11 +08:00
int64_t xi = itr - > matches - > a [ i ] . info > > 32 < < 32 | ( itr - > len - ( uint32_t ) itr - > matches - > a [ i ] . info ) ;
2013-02-05 13:41:07 +08:00
int64_t xj = itr - > sub - > a [ j ] . info > > 32 < < 32 | ( itr - > len - ( uint32_t ) itr - > sub - > a [ j ] . info ) ;
2013-02-05 05:48:11 +08:00
if ( xi < xj ) {
2013-02-02 03:20:38 +08:00
kv_push ( bwtintv_t , * a , itr - > matches - > a [ i ] ) ;
+ + i ;
2013-02-09 05:56:28 +08:00
} else if ( ( uint32_t ) itr - > sub - > a [ j ] . info - ( itr - > sub - > a [ j ] . info > > 32 ) > = max > > 1 & & ( uint32_t ) itr - > sub - > a [ j ] . info > ori_start ) {
2013-02-02 03:20:38 +08:00
kv_push ( bwtintv_t , * a , itr - > sub - > a [ j ] ) ;
+ + j ;
2013-02-08 08:50:37 +08:00
} else + + j ;
2013-02-02 03:20:38 +08:00
}
for ( ; i < itr - > matches - > n ; + + i ) kv_push ( bwtintv_t , * a , itr - > matches - > a [ i ] ) ;
2013-02-08 08:50:37 +08:00
for ( ; j < itr - > sub - > n ; + + j )
2013-02-09 05:56:28 +08:00
if ( ( uint32_t ) itr - > sub - > a [ j ] . info - ( itr - > sub - > a [ j ] . info > > 32 ) > = max > > 1 & & ( uint32_t ) itr - > sub - > a [ j ] . info > ori_start )
2013-02-08 08:50:37 +08:00
kv_push ( bwtintv_t , * a , itr - > sub - > a [ j ] ) ;
2013-02-02 03:20:38 +08:00
kv_copy ( bwtintv_t , * itr - > matches , * a ) ;
}
2013-02-02 03:38:44 +08:00
return itr - > matches ;
2013-02-01 02:59:48 +08:00
}
2013-02-05 06:23:06 +08:00
/********************************
* Chaining while finding SMEMs *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2013-02-25 01:17:29 +08:00
typedef struct {
int64_t rbeg ;
int32_t qbeg , len ;
} mem_seed_t ;
typedef struct {
int n , m ;
int64_t pos ;
mem_seed_t * seeds ;
} mem_chain_t ;
typedef struct { size_t n , m ; mem_chain_t * a ; } mem_chain_v ;
2013-02-01 04:55:22 +08:00
# include "kbtree.h"
2013-02-22 00:42:30 +08:00
# define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos))
2013-02-08 02:13:43 +08:00
KBTREE_INIT ( chn , mem_chain_t , chain_cmp )
2013-02-01 04:55:22 +08:00
2013-03-05 00:35:23 +08:00
static int test_and_merge ( const mem_opt_t * opt , int64_t l_pac , mem_chain_t * c , const mem_seed_t * p )
2013-02-01 04:55:22 +08:00
{
int64_t qend , rend , x , y ;
2013-02-02 05:39:50 +08:00
const mem_seed_t * last = & c - > seeds [ c - > n - 1 ] ;
2013-02-01 04:55:22 +08:00
qend = last - > qbeg + last - > len ;
rend = last - > rbeg + last - > len ;
2013-02-01 05:39:24 +08:00
if ( p - > qbeg > = c - > seeds [ 0 ] . qbeg & & p - > qbeg + p - > len < = qend & & p - > rbeg > = c - > seeds [ 0 ] . rbeg & & p - > rbeg + p - > len < = rend )
2013-02-01 04:55:22 +08:00
return 1 ; // contained seed; do nothing
2013-03-05 00:35:23 +08:00
if ( ( last - > rbeg < l_pac | | c - > seeds [ 0 ] . rbeg < l_pac ) & & p - > rbeg > = l_pac ) return 0 ; // don't chain if on different strand
2013-02-05 05:48:11 +08:00
x = p - > qbeg - last - > qbeg ; // always non-negtive
2013-02-01 04:55:22 +08:00
y = p - > rbeg - last - > rbeg ;
2013-02-05 05:48:11 +08:00
if ( y > = 0 & & x - y < = opt - > w & & y - x < = opt - > w & & x - last - > len < opt - > max_chain_gap & & y - last - > len < opt - > max_chain_gap ) { // grow the chain
2013-02-01 04:55:22 +08:00
if ( c - > n = = c - > m ) {
c - > m < < = 1 ;
2013-05-02 22:12:01 +08:00
c - > seeds = realloc ( c - > seeds , c - > m * sizeof ( mem_seed_t ) ) ;
2013-02-01 04:55:22 +08:00
}
c - > seeds [ c - > n + + ] = * p ;
return 1 ;
}
2013-02-02 03:38:44 +08:00
return 0 ; // request to add a new chain
2013-02-01 04:55:22 +08:00
}
2013-03-05 00:35:23 +08:00
static void mem_insert_seed ( const mem_opt_t * opt , int64_t l_pac , kbtree_t ( chn ) * tree , smem_i * itr )
2013-02-01 04:55:22 +08:00
{
2013-02-02 03:38:44 +08:00
const bwtintv_v * a ;
2013-02-22 01:52:00 +08:00
int split_len = ( int ) ( opt - > min_seed_len * opt - > split_factor + .499 ) ;
2013-02-26 06:29:35 +08:00
split_len = split_len < itr - > len ? split_len : itr - > len ;
2013-02-22 01:52:00 +08:00
while ( ( a = smem_next ( itr , split_len , opt - > split_width ) ) ! = 0 ) { // to find all SMEM and some internal MEM
2013-02-01 04:55:22 +08:00
int i ;
2013-02-02 03:38:44 +08:00
for ( i = 0 ; i < a - > n ; + + i ) { // go through each SMEM/MEM up to itr->start
bwtintv_t * p = & a - > a [ i ] ;
2013-02-01 04:55:22 +08:00
int slen = ( uint32_t ) p - > info - ( p - > info > > 32 ) ; // seed length
int64_t k ;
2013-02-02 03:38:44 +08:00
if ( slen < opt - > min_seed_len | | p - > x [ 2 ] > opt - > max_occ ) continue ; // ignore if too short or too repetitive
2013-02-01 04:55:22 +08:00
for ( k = 0 ; k < p - > x [ 2 ] ; + + k ) {
2013-02-08 02:13:43 +08:00
mem_chain_t tmp , * lower , * upper ;
2013-02-02 05:39:50 +08:00
mem_seed_t s ;
2013-02-01 04:55:22 +08:00
int to_add = 0 ;
2013-02-02 03:38:44 +08:00
s . rbeg = tmp . pos = bwt_sa ( itr - > bwt , p - > x [ 0 ] + k ) ; // this is the base coordinate in the forward-reverse reference
2013-02-01 05:26:05 +08:00
s . qbeg = p - > info > > 32 ;
s . len = slen ;
2013-12-31 05:05:43 +08:00
if ( bwa_verbose > = 5 ) printf ( " SEED l=%d,qb=%d,rb=%ld \n " , s . len , s . qbeg , ( long ) s . rbeg ) ;
2013-03-05 00:35:23 +08:00
if ( s . rbeg < l_pac & & l_pac < s . rbeg + s . len ) continue ; // bridging forward-reverse boundary; skip
2013-02-01 04:55:22 +08:00
if ( kb_size ( tree ) ) {
2013-02-02 03:38:44 +08:00
kb_intervalp ( chn , tree , & tmp , & lower , & upper ) ; // find the closest chain
2013-03-05 00:35:23 +08:00
if ( ! lower | | ! test_and_merge ( opt , l_pac , lower , & s ) ) to_add = 1 ;
2013-02-01 05:26:05 +08:00
} else to_add = 1 ;
2013-02-02 03:38:44 +08:00
if ( to_add ) { // add the seed as a new chain
2013-02-01 04:55:22 +08:00
tmp . n = 1 ; tmp . m = 4 ;
2013-05-02 22:12:01 +08:00
tmp . seeds = calloc ( tmp . m , sizeof ( mem_seed_t ) ) ;
2013-02-01 05:26:05 +08:00
tmp . seeds [ 0 ] = s ;
2013-02-01 04:55:22 +08:00
kb_putp ( chn , tree , & tmp ) ;
}
}
}
}
}
2013-12-31 05:18:45 +08:00
int mem_chain_weight ( const mem_chain_t * c )
{
int64_t end ;
int j , w = 0 , tmp ;
for ( j = 0 , end = 0 ; j < c - > n ; + + j ) {
const mem_seed_t * s = & c - > seeds [ j ] ;
if ( s - > qbeg > = end ) w + = s - > len ;
else if ( s - > qbeg + s - > len > end ) w + = s - > qbeg + s - > len - end ;
end = end > s - > qbeg + s - > len ? end : s - > qbeg + s - > len ;
}
tmp = w ;
for ( j = 0 , end = 0 ; j < c - > n ; + + j ) {
const mem_seed_t * s = & c - > seeds [ j ] ;
if ( s - > rbeg > = end ) w + = s - > len ;
else if ( s - > rbeg + s - > len > end ) w + = s - > rbeg + s - > len - end ;
end = end > s - > qbeg + s - > len ? end : s - > qbeg + s - > len ;
}
return w < tmp ? w : tmp ;
}
2013-02-08 05:27:11 +08:00
void mem_print_chain ( const bntseq_t * bns , mem_chain_v * chn )
{
int i , j ;
for ( i = 0 ; i < chn - > n ; + + i ) {
mem_chain_t * p = & chn - > a [ i ] ;
2013-12-31 05:18:45 +08:00
err_printf ( " CHAIN(%d) n=%d w=%d " , i , p - > n , mem_chain_weight ( p ) ) ;
2013-02-08 05:27:11 +08:00
for ( j = 0 ; j < p - > n ; + + j ) {
bwtint_t pos ;
int is_rev , ref_id ;
pos = bns_depos ( bns , p - > seeds [ j ] . rbeg , & is_rev ) ;
if ( is_rev ) pos - = p - > seeds [ j ] . len - 1 ;
bns_cnt_ambi ( bns , pos , p - > seeds [ j ] . len , & ref_id ) ;
2013-03-01 17:37:46 +08:00
err_printf ( " \t %d,%d,%ld(%s:%c%ld) " , p - > seeds [ j ] . len , p - > seeds [ j ] . qbeg , ( long ) p - > seeds [ j ] . rbeg , bns - > anns [ ref_id ] . name , " +- " [ is_rev ] , ( long ) ( pos - bns - > anns [ ref_id ] . offset ) + 1 ) ;
2013-02-08 05:27:11 +08:00
}
2013-03-01 17:37:46 +08:00
err_putchar ( ' \n ' ) ;
2013-02-08 05:27:11 +08:00
}
}
2013-03-05 00:35:23 +08:00
mem_chain_v mem_chain ( const mem_opt_t * opt , const bwt_t * bwt , int64_t l_pac , int len , const uint8_t * seq )
2013-02-01 04:55:22 +08:00
{
2013-02-08 02:13:43 +08:00
mem_chain_v chain ;
2013-02-01 04:55:22 +08:00
smem_i * itr ;
kbtree_t ( chn ) * tree ;
2013-02-08 02:13:43 +08:00
kv_init ( chain ) ;
2013-02-01 04:55:22 +08:00
if ( len < opt - > min_seed_len ) return chain ; // if the query is shorter than the seed length, no match
tree = kb_init ( chn , KB_DEFAULT_SIZE ) ;
itr = smem_itr_init ( bwt ) ;
2013-02-02 03:20:38 +08:00
smem_set_query ( itr , len , seq ) ;
2013-03-05 00:35:23 +08:00
mem_insert_seed ( opt , l_pac , tree , itr ) ;
2013-02-01 05:26:05 +08:00
2013-02-08 02:13:43 +08:00
kv_resize ( mem_chain_t , chain , kb_size ( tree ) ) ;
2013-02-01 05:26:05 +08:00
2013-02-08 02:13:43 +08:00
# define traverse_func(p_) (chain.a[chain.n++] = *(p_))
__kb_traverse ( mem_chain_t , tree , traverse_func ) ;
2013-02-01 05:26:05 +08:00
# undef traverse_func
2013-02-01 04:55:22 +08:00
smem_itr_destroy ( itr ) ;
kb_destroy ( chn , tree ) ;
return chain ;
}
2013-02-02 05:39:50 +08:00
2013-02-05 05:48:11 +08:00
/********************
* Filtering chains *
* * * * * * * * * * * * * * * * * * * */
2013-02-05 06:23:06 +08:00
typedef struct {
int beg , end , w ;
void * p , * p2 ;
} flt_aux_t ;
# define flt_lt(a, b) ((a).w > (b).w)
KSORT_INIT ( mem_flt , flt_aux_t , flt_lt )
2013-02-08 02:13:43 +08:00
int mem_chain_flt ( const mem_opt_t * opt , int n_chn , mem_chain_t * chains )
2013-02-05 06:23:06 +08:00
{
flt_aux_t * a ;
int i , j , n ;
2013-02-07 02:59:32 +08:00
if ( n_chn < = 1 ) return n_chn ; // no need to filter
2013-05-02 22:12:01 +08:00
a = malloc ( sizeof ( flt_aux_t ) * n_chn ) ;
2013-02-07 02:59:32 +08:00
for ( i = 0 ; i < n_chn ; + + i ) {
2013-02-08 02:13:43 +08:00
mem_chain_t * c = & chains [ i ] ;
2013-12-31 05:18:45 +08:00
int w ;
w = mem_chain_weight ( c ) ;
2013-02-05 06:23:06 +08:00
a [ i ] . beg = c - > seeds [ 0 ] . qbeg ;
a [ i ] . end = c - > seeds [ c - > n - 1 ] . qbeg + c - > seeds [ c - > n - 1 ] . len ;
2013-02-07 02:59:32 +08:00
a [ i ] . w = w ; a [ i ] . p = c ; a [ i ] . p2 = 0 ;
2013-02-05 06:23:06 +08:00
}
2013-02-07 02:59:32 +08:00
ks_introsort ( mem_flt , n_chn , a ) ;
2013-02-06 10:58:33 +08:00
{ // reorder chains such that the best chain appears first
2013-02-08 02:13:43 +08:00
mem_chain_t * swap ;
2013-05-02 22:12:01 +08:00
swap = malloc ( sizeof ( mem_chain_t ) * n_chn ) ;
2013-02-07 02:59:32 +08:00
for ( i = 0 ; i < n_chn ; + + i ) {
2013-02-08 02:13:43 +08:00
swap [ i ] = * ( ( mem_chain_t * ) a [ i ] . p ) ;
2013-02-07 02:59:32 +08:00
a [ i ] . p = & chains [ i ] ; // as we will memcpy() below, a[i].p is changed
2013-02-06 10:58:33 +08:00
}
2013-02-08 02:13:43 +08:00
memcpy ( chains , swap , sizeof ( mem_chain_t ) * n_chn ) ;
2013-02-06 10:58:33 +08:00
free ( swap ) ;
}
2013-02-07 02:59:32 +08:00
for ( i = 1 , n = 1 ; i < n_chn ; + + i ) {
2013-02-05 06:23:06 +08:00
for ( j = 0 ; j < n ; + + j ) {
int b_max = a [ j ] . beg > a [ i ] . beg ? a [ j ] . beg : a [ i ] . beg ;
2013-02-05 13:17:20 +08:00
int e_min = a [ j ] . end < a [ i ] . end ? a [ j ] . end : a [ i ] . end ;
2013-02-05 06:23:06 +08:00
if ( e_min > b_max ) { // have overlap
int min_l = a [ i ] . end - a [ i ] . beg < a [ j ] . end - a [ j ] . beg ? a [ i ] . end - a [ i ] . beg : a [ j ] . end - a [ j ] . beg ;
if ( e_min - b_max > = min_l * opt - > mask_level ) { // significant overlap
if ( a [ j ] . p2 = = 0 ) a [ j ] . p2 = a [ i ] . p ;
2013-02-22 01:25:20 +08:00
if ( a [ i ] . w < a [ j ] . w * opt - > chain_drop_ratio & & a [ j ] . w - a [ i ] . w > = opt - > min_seed_len < < 1 )
2013-02-05 06:23:06 +08:00
break ;
}
}
}
if ( j = = n ) a [ n + + ] = a [ i ] ; // if have no significant overlap with better chains, keep it.
}
2013-02-05 13:17:20 +08:00
for ( i = 0 ; i < n ; + + i ) { // mark chains to be kept
2013-02-08 02:13:43 +08:00
mem_chain_t * c = ( mem_chain_t * ) a [ i ] . p ;
2013-02-05 13:17:20 +08:00
if ( c - > n > 0 ) c - > n = - c - > n ;
2013-02-08 02:13:43 +08:00
c = ( mem_chain_t * ) a [ i ] . p2 ;
2013-02-05 13:17:20 +08:00
if ( c & & c - > n > 0 ) c - > n = - c - > n ;
}
2013-02-05 06:23:06 +08:00
free ( a ) ;
2013-02-07 02:59:32 +08:00
for ( i = 0 ; i < n_chn ; + + i ) { // free discarded chains
2013-02-08 02:13:43 +08:00
mem_chain_t * c = & chains [ i ] ;
2013-02-05 13:17:20 +08:00
if ( c - > n > = 0 ) {
free ( c - > seeds ) ;
c - > n = c - > m = 0 ;
} else c - > n = - c - > n ;
}
2013-02-07 02:59:32 +08:00
for ( i = n = 0 ; i < n_chn ; + + i ) { // squeeze out discarded chains
if ( chains [ i ] . n > 0 ) {
if ( n ! = i ) chains [ n + + ] = chains [ i ] ;
2013-02-05 13:17:20 +08:00
else + + n ;
}
}
2013-02-07 02:59:32 +08:00
return n ;
2013-02-05 06:23:06 +08:00
}
2013-02-11 01:24:33 +08:00
/******************************
* De - overlap single - end hits *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2013-02-07 03:38:40 +08:00
2013-09-09 23:36:50 +08:00
# define alnreg_slt2(a, b) ((a).re < (b).re)
KSORT_INIT ( mem_ars2 , mem_alnreg_t , alnreg_slt2 )
2013-02-11 01:24:33 +08:00
# define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb))))
KSORT_INIT ( mem_ars , mem_alnreg_t , alnreg_slt )
2013-11-23 22:36:26 +08:00
# define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash))
KSORT_INIT ( mem_ars_hash , mem_alnreg_t , alnreg_hlt )
2013-09-09 23:36:50 +08:00
int mem_sort_and_dedup ( int n , mem_alnreg_t * a , float mask_level_redun )
2013-02-11 01:24:33 +08:00
{
2013-09-09 23:36:50 +08:00
int m , i , j ;
2013-02-07 03:38:40 +08:00
if ( n < = 1 ) return n ;
2013-09-09 23:36:50 +08:00
ks_introsort ( mem_ars2 , n , a ) ;
for ( i = 1 ; i < n ; + + i ) {
mem_alnreg_t * p = & a [ i ] ;
if ( p - > rb > = a [ i - 1 ] . re ) continue ;
for ( j = i - 1 ; j > = 0 & & p - > rb < a [ j ] . re ; - - j ) {
mem_alnreg_t * q = & a [ j ] ;
int64_t or , oq , mr , mq ;
if ( q - > qe = = q - > qb ) continue ; // a[j] has been excluded
or = q - > re - p - > rb ; // overlap length on the reference
oq = q - > qb < p - > qb ? q - > qe - p - > qb : p - > qe - q - > qb ; // overlap length on the query
mr = q - > re - q - > rb < p - > re - p - > rb ? q - > re - q - > rb : p - > re - p - > rb ; // min ref len in alignment
mq = q - > qe - q - > qb < p - > qe - p - > qb ? q - > qe - q - > qb : p - > qe - p - > qb ; // min qry len in alignment
if ( or > mask_level_redun * mr & & oq > mask_level_redun * mq ) { // one of the hits is redundant
2013-09-10 04:57:55 +08:00
if ( p - > score < q - > score ) {
p - > qe = p - > qb ;
break ;
} else q - > qe = q - > qb ;
2013-09-09 23:36:50 +08:00
}
}
}
2013-09-10 04:57:55 +08:00
for ( i = 0 , m = 0 ; i < n ; + + i ) // exclude identical hits
if ( a [ i ] . qe > a [ i ] . qb ) {
if ( m ! = i ) a [ m + + ] = a [ i ] ;
else + + m ;
}
n = m ;
2013-02-11 01:24:33 +08:00
ks_introsort ( mem_ars , n , a ) ;
2013-02-09 03:18:39 +08:00
for ( i = 1 ; i < n ; + + i ) { // mark identical hits
if ( a [ i ] . score = = a [ i - 1 ] . score & & a [ i ] . rb = = a [ i - 1 ] . rb & & a [ i ] . qb = = a [ i - 1 ] . qb )
2013-02-09 06:55:35 +08:00
a [ i ] . qe = a [ i ] . qb ;
2013-02-09 03:18:39 +08:00
}
for ( i = 1 , m = 1 ; i < n ; + + i ) // exclude identical hits
2013-02-21 09:26:57 +08:00
if ( a [ i ] . qe > a [ i ] . qb ) {
if ( m ! = i ) a [ m + + ] = a [ i ] ;
else + + m ;
}
2013-02-11 01:24:33 +08:00
return m ;
}
2013-11-23 22:36:26 +08:00
void mem_mark_primary_se ( const mem_opt_t * opt , int n , mem_alnreg_t * a , int64_t id ) // IMPORTANT: must run mem_sort_and_dedup() before calling this function
2013-02-11 01:24:33 +08:00
{ // similar to the loop in mem_chain_flt()
2013-02-13 04:34:44 +08:00
int i , k , tmp ;
kvec_t ( int ) z ;
if ( n = = 0 ) return ;
kv_init ( z ) ;
2013-11-23 22:36:26 +08:00
for ( i = 0 ; i < n ; + + i ) a [ i ] . sub = 0 , a [ i ] . secondary = - 1 , a [ i ] . hash = hash_64 ( id + i ) ;
ks_introsort ( mem_ars_hash , n , a ) ;
2013-02-09 06:55:35 +08:00
tmp = opt - > a + opt - > b > opt - > q + opt - > r ? opt - > a + opt - > b : opt - > q + opt - > r ;
2013-02-13 04:34:44 +08:00
kv_push ( int , z , 0 ) ;
for ( i = 1 ; i < n ; + + i ) {
for ( k = 0 ; k < z . n ; + + k ) {
int j = z . a [ k ] ;
2013-02-07 03:38:40 +08:00
int b_max = a [ j ] . qb > a [ i ] . qb ? a [ j ] . qb : a [ i ] . qb ;
int e_min = a [ j ] . qe < a [ i ] . qe ? a [ j ] . qe : a [ i ] . qe ;
if ( e_min > b_max ) { // have overlap
int min_l = a [ i ] . qe - a [ i ] . qb < a [ j ] . qe - a [ j ] . qb ? a [ i ] . qe - a [ i ] . qb : a [ j ] . qe - a [ j ] . qb ;
if ( e_min - b_max > = min_l * opt - > mask_level ) { // significant overlap
if ( a [ j ] . sub = = 0 ) a [ j ] . sub = a [ i ] . score ;
2013-02-09 06:55:35 +08:00
if ( a [ j ] . score - a [ i ] . score < = tmp ) + + a [ j ] . sub_n ;
2013-02-07 03:38:40 +08:00
break ;
}
}
}
2013-02-13 04:34:44 +08:00
if ( k = = z . n ) kv_push ( int , z , i ) ;
2013-02-13 04:52:23 +08:00
else a [ i ] . secondary = z . a [ k ] ;
2013-02-07 03:38:40 +08:00
}
2013-02-13 04:34:44 +08:00
free ( z . a ) ;
2013-02-07 03:38:40 +08:00
}
2013-02-05 05:48:11 +08:00
/****************************************
* Construct the alignment from a chain *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2013-03-05 03:43:49 +08:00
/* mem_chain2aln() vs mem_chain2aln_short()
*
* mem_chain2aln ( ) covers all the functionality of mem_chain2aln_short ( ) .
* However , it may waste time on extracting the reference sequences given a
* very long query . mem_chain2aln_short ( ) is faster for very short chains in a
* long query . It may fail when the matches are long or reach the end of the
* query . In this case , mem_chain2aln ( ) will be called again .
* mem_chain2aln_short ( ) is almost never used for short - read alignment .
*/
# define MEM_SHORT_EXT 50
# define MEM_SHORT_LEN 200
2013-03-05 13:57:16 +08:00
# define MAX_BAND_TRY 2
2013-03-05 03:43:49 +08:00
int mem_chain2aln_short ( const mem_opt_t * opt , int64_t l_pac , const uint8_t * pac , int l_query , const uint8_t * query , const mem_chain_t * c , mem_alnreg_v * av )
{
int i , qb , qe , xtra ;
int64_t rb , re , rlen ;
uint8_t * rseq = 0 ;
mem_alnreg_t a ;
kswr_t x ;
if ( c - > n = = 0 ) return - 1 ;
qb = l_query ; qe = 0 ;
rb = l_pac < < 1 ; re = 0 ;
memset ( & a , 0 , sizeof ( mem_alnreg_t ) ) ;
for ( i = 0 ; i < c - > n ; + + i ) {
const mem_seed_t * s = & c - > seeds [ i ] ;
qb = qb < s - > qbeg ? qb : s - > qbeg ;
qe = qe > s - > qbeg + s - > len ? qe : s - > qbeg + s - > len ;
rb = rb < s - > rbeg ? rb : s - > rbeg ;
re = re > s - > rbeg + s - > len ? re : s - > rbeg + s - > len ;
a . seedcov + = s - > len ;
}
qb - = MEM_SHORT_EXT ; qe + = MEM_SHORT_EXT ;
if ( qb < = 10 | | qe > = l_query - 10 ) return 1 ; // because ksw_align() does not support end-to-end alignment
rb - = MEM_SHORT_EXT ; re + = MEM_SHORT_EXT ;
rb = rb > 0 ? rb : 0 ;
re = re < l_pac < < 1 ? re : l_pac < < 1 ;
if ( rb < l_pac & & l_pac < re ) {
if ( c - > seeds [ 0 ] . rbeg < l_pac ) re = l_pac ;
else rb = l_pac ;
}
if ( ( re - rb ) - ( qe - qb ) > MEM_SHORT_EXT | | ( qe - qb ) - ( re - rb ) > MEM_SHORT_EXT ) return 1 ;
if ( qe - qb > = opt - > w * 4 | | re - rb > = opt - > w * 4 ) return 1 ;
if ( qe - qb > = MEM_SHORT_LEN | | re - rb > = MEM_SHORT_LEN ) return 1 ;
rseq = bns_get_seq ( l_pac , pac , rb , re , & rlen ) ;
assert ( rlen = = re - rb ) ;
xtra = KSW_XSUBO | KSW_XSTART | ( ( qe - qb ) * opt - > a < 250 ? KSW_XBYTE : 0 ) | ( opt - > min_seed_len * opt - > a ) ;
x = ksw_align ( qe - qb , ( uint8_t * ) query + qb , re - rb , rseq , 5 , opt - > mat , opt - > q , opt - > r , xtra , 0 ) ;
free ( rseq ) ;
if ( x . tb < MEM_SHORT_EXT > > 1 | | x . te > re - rb - ( MEM_SHORT_EXT > > 1 ) ) return 1 ;
a . rb = rb + x . tb ; a . re = rb + x . te + 1 ;
a . qb = qb + x . qb ; a . qe = qb + x . qe + 1 ;
a . score = x . score ;
a . csub = x . score2 ;
kv_push ( mem_alnreg_t , * av , a ) ;
2013-09-09 23:36:50 +08:00
if ( bwa_verbose > = 4 ) printf ( " chain2aln(short): [%d,%d) <=> [%ld,%ld) \n " , a . qb , a . qe , ( long ) a . rb , ( long ) a . re ) ;
2013-03-05 03:43:49 +08:00
return 0 ;
}
2013-02-05 01:37:38 +08:00
static inline int cal_max_gap ( const mem_opt_t * opt , int qlen )
{
int l = ( int ) ( ( double ) ( qlen * opt - > a - opt - > q ) / opt - > r + 1. ) ;
2013-02-27 13:29:11 +08:00
l = l > 1 ? l : 1 ;
return l < opt - > w < < 1 ? l : opt - > w < < 1 ;
2013-02-05 01:37:38 +08:00
}
2013-02-22 03:58:51 +08:00
void mem_chain2aln ( const mem_opt_t * opt , int64_t l_pac , const uint8_t * pac , int l_query , const uint8_t * query , const mem_chain_t * c , mem_alnreg_v * av )
2013-03-05 03:43:49 +08:00
{
2013-03-05 06:29:07 +08:00
int i , k , max_off [ 2 ] , aw [ 2 ] ; // aw: actual bandwidth used in extension
2013-02-24 02:26:50 +08:00
int64_t rlen , rmax [ 2 ] , tmp , max = 0 ;
2013-02-05 05:08:00 +08:00
const mem_seed_t * s ;
2013-02-05 01:37:38 +08:00
uint8_t * rseq = 0 ;
2013-02-27 11:55:44 +08:00
uint64_t * srt ;
2013-02-05 04:02:56 +08:00
2013-02-27 11:55:44 +08:00
if ( c - > n = = 0 ) return ;
2013-02-05 01:37:38 +08:00
// get the max possible span
2013-02-05 05:08:00 +08:00
rmax [ 0 ] = l_pac < < 1 ; rmax [ 1 ] = 0 ;
for ( i = 0 ; i < c - > n ; + + i ) {
int64_t b , e ;
const mem_seed_t * t = & c - > seeds [ i ] ;
b = t - > rbeg - ( t - > qbeg + cal_max_gap ( opt , t - > qbeg ) ) ;
e = t - > rbeg + t - > len + ( ( l_query - t - > qbeg - t - > len ) + cal_max_gap ( opt , l_query - t - > qbeg - t - > len ) ) ;
rmax [ 0 ] = rmax [ 0 ] < b ? rmax [ 0 ] : b ;
rmax [ 1 ] = rmax [ 1 ] > e ? rmax [ 1 ] : e ;
2013-02-24 02:26:50 +08:00
if ( t - > len > max ) max = t - > len ;
2013-02-05 05:08:00 +08:00
}
2013-02-27 13:37:17 +08:00
rmax [ 0 ] = rmax [ 0 ] > 0 ? rmax [ 0 ] : 0 ;
rmax [ 1 ] = rmax [ 1 ] < l_pac < < 1 ? rmax [ 1 ] : l_pac < < 1 ;
if ( rmax [ 0 ] < l_pac & & l_pac < rmax [ 1 ] ) { // crossing the forward-reverse boundary; then choose one side
2013-03-05 00:35:23 +08:00
if ( c - > seeds [ 0 ] . rbeg < l_pac ) rmax [ 1 ] = l_pac ; // this works because all seeds are guaranteed to be on the same strand
2013-02-27 13:37:17 +08:00
else rmax [ 0 ] = l_pac ;
}
2013-02-05 01:37:38 +08:00
// retrieve the reference sequence
rseq = bns_get_seq ( l_pac , pac , rmax [ 0 ] , rmax [ 1 ] , & rlen ) ;
2013-03-05 00:35:23 +08:00
assert ( rlen = = rmax [ 1 ] - rmax [ 0 ] ) ;
2013-02-05 01:37:38 +08:00
2013-05-02 22:12:01 +08:00
srt = malloc ( c - > n * 8 ) ;
2013-02-27 11:55:44 +08:00
for ( i = 0 ; i < c - > n ; + + i )
srt [ i ] = ( uint64_t ) c - > seeds [ i ] . len < < 32 | i ;
ks_introsort_64 ( c - > n , srt ) ;
for ( k = c - > n - 1 ; k > = 0 ; - - k ) {
2013-02-22 03:58:51 +08:00
mem_alnreg_t * a ;
2013-02-27 11:55:44 +08:00
s = & c - > seeds [ ( uint32_t ) srt [ k ] ] ;
for ( i = 0 ; i < av - > n ; + + i ) { // test whether extension has been made before
mem_alnreg_t * p = & av - > a [ i ] ;
int64_t rd ;
2013-02-27 13:29:11 +08:00
int qd , w , max_gap ;
if ( s - > rbeg < p - > rb | | s - > rbeg + s - > len > p - > re | | s - > qbeg < p - > qb | | s - > qbeg + s - > len > p - > qe ) continue ; // not fully contained
// qd: distance ahead of the seed on query; rd: on reference
qd = s - > qbeg - p - > qb ; rd = s - > rbeg - p - > rb ;
max_gap = cal_max_gap ( opt , qd < rd ? qd : rd ) ; // the maximal gap allowed in regions ahead of the seed
w = max_gap < opt - > w ? max_gap : opt - > w ; // bounded by the band width
2013-02-27 11:55:44 +08:00
if ( qd - rd < w & & rd - qd < w ) break ; // the seed is "around" a previous hit
2013-02-27 13:29:11 +08:00
// similar to the previous four lines, but this time we look at the region behind
qd = p - > qe - ( s - > qbeg + s - > len ) ; rd = p - > re - ( s - > rbeg + s - > len ) ;
max_gap = cal_max_gap ( opt , qd < rd ? qd : rd ) ;
w = max_gap < opt - > w ? max_gap : opt - > w ;
if ( qd - rd < w & & rd - qd < w ) break ;
2013-02-27 11:55:44 +08:00
}
2013-04-04 12:43:43 +08:00
if ( i < av - > n ) { // the seed is (almost) contained in an existing alignment
for ( i = k + 1 ; i < c - > n ; + + i ) { // check overlapping seeds in the same chain
const mem_seed_t * t ;
if ( srt [ i ] = = 0 ) continue ;
t = & c - > seeds [ ( uint32_t ) srt [ i ] ] ;
if ( t - > len < s - > len * .95 ) continue ; // only check overlapping if t is long enough; TODO: more efficient by early stopping
2013-04-11 00:18:56 +08:00
if ( s - > qbeg < = t - > qbeg & & s - > qbeg + s - > len - t - > qbeg > = s - > len > > 2 & & t - > qbeg - s - > qbeg ! = t - > rbeg - s - > rbeg ) break ;
if ( t - > qbeg < = s - > qbeg & & t - > qbeg + t - > len - s - > qbeg > = s - > len > > 2 & & s - > qbeg - t - > qbeg ! = s - > rbeg - t - > rbeg ) break ;
2013-04-04 12:43:43 +08:00
}
if ( i = = c - > n ) { // no overlapping seeds; then skip extension
srt [ k ] = 0 ; // mark that seed extension has not been performed
continue ;
}
}
2013-02-27 11:55:44 +08:00
2013-02-22 03:58:51 +08:00
a = kv_pushp ( mem_alnreg_t , * av ) ;
2013-02-08 10:20:36 +08:00
memset ( a , 0 , sizeof ( mem_alnreg_t ) ) ;
2013-03-05 06:29:07 +08:00
a - > w = aw [ 0 ] = aw [ 1 ] = opt - > w ;
2013-03-16 09:26:37 +08:00
a - > score = a - > truesc = - 1 ;
2013-02-27 11:55:44 +08:00
2013-09-09 23:36:50 +08:00
if ( bwa_verbose > = 4 ) err_printf ( " Extending from seed [%ld,%ld,%ld] \n " , ( long ) s - > len , ( long ) s - > qbeg , ( long ) s - > rbeg ) ;
2013-02-08 10:20:36 +08:00
if ( s - > qbeg ) { // left extension
uint8_t * rs , * qs ;
2013-02-28 10:13:39 +08:00
int qle , tle , gtle , gscore ;
2013-05-02 22:12:01 +08:00
qs = malloc ( s - > qbeg ) ;
2013-02-08 10:20:36 +08:00
for ( i = 0 ; i < s - > qbeg ; + + i ) qs [ i ] = query [ s - > qbeg - 1 - i ] ;
tmp = s - > rbeg - rmax [ 0 ] ;
2013-05-02 22:12:01 +08:00
rs = malloc ( tmp ) ;
2013-02-08 10:20:36 +08:00
for ( i = 0 ; i < tmp ; + + i ) rs [ i ] = rseq [ tmp - 1 - i ] ;
2013-03-05 13:57:16 +08:00
for ( i = 0 ; i < MAX_BAND_TRY ; + + i ) {
int prev = a - > score ;
aw [ 0 ] = opt - > w < < i ;
2013-08-29 03:59:05 +08:00
a - > score = ksw_extend ( s - > qbeg , qs , tmp , rs , 5 , opt - > mat , opt - > q , opt - > r , aw [ 0 ] , opt - > pen_clip5 , opt - > zdrop , s - > len * opt - > a , & qle , & tle , & gtle , & gscore , & max_off [ 0 ] ) ;
2013-03-15 23:59:05 +08:00
if ( bwa_verbose > = 4 ) { printf ( " L \t %d < %d; w=%d; max_off=%d \n " , prev , a - > score , aw [ 0 ] , max_off [ 0 ] ) ; fflush ( stdout ) ; }
2013-03-05 13:57:16 +08:00
if ( a - > score = = prev | | max_off [ 0 ] < ( aw [ 0 ] > > 1 ) + ( aw [ 0 ] > > 2 ) ) break ;
2013-03-05 06:29:07 +08:00
}
2013-02-28 10:13:39 +08:00
// check whether we prefer to reach the end of the query
2013-08-29 03:59:05 +08:00
if ( gscore < = 0 | | gscore < = a - > score - opt - > pen_clip5 ) { // local extension
2013-03-16 09:26:37 +08:00
a - > qb = s - > qbeg - qle , a - > rb = s - > rbeg - tle ;
a - > truesc = a - > score ;
} else { // to-end extension
a - > qb = 0 , a - > rb = s - > rbeg - gtle ;
a - > truesc = gscore ;
}
2013-02-08 10:20:36 +08:00
free ( qs ) ; free ( rs ) ;
2013-03-16 09:26:37 +08:00
} else a - > score = a - > truesc = s - > len * opt - > a , a - > qb = 0 , a - > rb = s - > rbeg ;
2013-02-08 10:20:36 +08:00
2013-02-27 11:55:44 +08:00
if ( s - > qbeg + s - > len ! = l_query ) { // right extension
2013-03-05 13:57:16 +08:00
int qle , tle , qe , re , gtle , gscore , sc0 = a - > score ;
2013-02-08 10:20:36 +08:00
qe = s - > qbeg + s - > len ;
re = s - > rbeg + s - > len - rmax [ 0 ] ;
2013-03-05 00:35:23 +08:00
assert ( re > = 0 ) ;
2013-03-05 13:57:16 +08:00
for ( i = 0 ; i < MAX_BAND_TRY ; + + i ) {
int prev = a - > score ;
aw [ 1 ] = opt - > w < < i ;
2013-08-29 03:59:05 +08:00
a - > score = ksw_extend ( l_query - qe , query + qe , rmax [ 1 ] - rmax [ 0 ] - re , rseq + re , 5 , opt - > mat , opt - > q , opt - > r , aw [ 1 ] , opt - > pen_clip3 , opt - > zdrop , sc0 , & qle , & tle , & gtle , & gscore , & max_off [ 1 ] ) ;
2013-03-15 23:59:05 +08:00
if ( bwa_verbose > = 4 ) { printf ( " R \t %d < %d; w=%d; max_off=%d \n " , prev , a - > score , aw [ 1 ] , max_off [ 1 ] ) ; fflush ( stdout ) ; }
2013-03-05 13:57:16 +08:00
if ( a - > score = = prev | | max_off [ 1 ] < ( aw [ 1 ] > > 1 ) + ( aw [ 1 ] > > 2 ) ) break ;
2013-03-05 06:29:07 +08:00
}
2013-02-28 10:13:39 +08:00
// similar to the above
2013-08-29 03:59:05 +08:00
if ( gscore < = 0 | | gscore < = a - > score - opt - > pen_clip3 ) { // local extension
2013-03-16 09:26:37 +08:00
a - > qe = qe + qle , a - > re = rmax [ 0 ] + re + tle ;
a - > truesc + = a - > score - sc0 ;
} else { // to-end extension
a - > qe = l_query , a - > re = rmax [ 0 ] + re + gtle ;
a - > truesc + = gscore - sc0 ;
}
2013-02-08 10:20:36 +08:00
} else a - > qe = l_query , a - > re = s - > rbeg + s - > len ;
2013-03-05 06:29:07 +08:00
if ( bwa_verbose > = 4 ) { printf ( " [%d] \t aw={%d,%d} \t score=%d \t [%d,%d) <=> [%ld,%ld) \n " , k , aw [ 0 ] , aw [ 1 ] , a - > score , a - > qb , a - > qe , ( long ) a - > rb , ( long ) a - > re ) ; fflush ( stdout ) ; }
2013-02-27 11:55:44 +08:00
2013-02-22 03:58:51 +08:00
// compute seedcov
for ( i = 0 , a - > seedcov = 0 ; i < c - > n ; + + i ) {
const mem_seed_t * t = & c - > seeds [ i ] ;
if ( t - > qbeg > = a - > qb & & t - > qbeg + t - > len < = a - > qe & & t - > rbeg > = a - > rb & & t - > rbeg + t - > len < = a - > re ) // seed fully contained
a - > seedcov + = t - > len ; // this is not very accurate, but for approx. mapQ, this is good enough
}
2013-03-05 06:29:07 +08:00
a - > w = aw [ 0 ] > aw [ 1 ] ? aw [ 0 ] : aw [ 1 ] ;
2013-02-05 04:09:47 +08:00
}
2013-02-27 11:55:44 +08:00
free ( srt ) ; free ( rseq ) ;
2013-02-02 05:39:50 +08:00
}
2013-02-06 10:49:19 +08:00
2013-02-12 04:29:03 +08:00
/*****************************
* Basic hit - > SAM conversion *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2013-02-28 12:59:50 +08:00
static inline int infer_bw ( int l1 , int l2 , int score , int a , int q , int r )
{
int w ;
2013-03-15 23:59:05 +08:00
if ( l1 = = l2 & & l1 * a - score < ( q + r - a ) < < 1 ) return 0 ; // to get equal alignment length, we need at least two gaps
2013-02-28 12:59:50 +08:00
w = ( ( double ) ( ( l1 < l2 ? l1 : l2 ) * a - score - q ) / r + 1. ) ;
if ( w < abs ( l1 - l2 ) ) w = abs ( l1 - l2 ) ;
return w ;
}
2013-03-12 09:35:32 +08:00
static inline int get_rlen ( int n_cigar , const uint32_t * cigar )
2013-02-12 04:29:03 +08:00
{
2013-03-12 09:35:32 +08:00
int k , l ;
for ( k = l = 0 ; k < n_cigar ; + + k ) {
int op = cigar [ k ] & 0xf ;
if ( op = = 0 | | op = = 2 )
l + = cigar [ k ] > > 4 ;
2013-02-23 05:38:48 +08:00
}
2013-03-12 09:35:32 +08:00
return l ;
}
2013-03-12 11:01:51 +08:00
void mem_aln2sam ( const bntseq_t * bns , kstring_t * str , bseq1_t * s , int n , const mem_aln_t * list , int which , const mem_aln_t * m_ )
2013-03-12 09:25:17 +08:00
{
2013-03-12 11:01:51 +08:00
int i ;
mem_aln_t ptmp = list [ which ] , * p = & ptmp , mtmp , * m = 0 ; // make a copy of the alignment to convert
2013-03-12 09:25:17 +08:00
2013-03-12 11:01:51 +08:00
if ( m_ ) mtmp = * m_ , m = & mtmp ;
2013-03-12 09:25:17 +08:00
// set flag
p - > flag | = m ? 0x1 : 0 ; // is paired in sequencing
2013-03-12 11:01:51 +08:00
p - > flag | = p - > rid < 0 ? 0x4 : 0 ; // is mapped
p - > flag | = m & & m - > rid < 0 ? 0x8 : 0 ; // is mate mapped
if ( p - > rid < 0 & & m & & m - > rid > = 0 ) // copy mate to alignment
p - > rid = m - > rid , p - > pos = m - > pos , p - > is_rev = m - > is_rev , p - > n_cigar = 0 ;
if ( m & & m - > rid < 0 & & p - > rid > = 0 ) // copy alignment to mate
m - > rid = p - > rid , m - > pos = p - > pos , m - > is_rev = p - > is_rev , m - > n_cigar = 0 ;
2013-03-12 09:25:17 +08:00
p - > flag | = p - > is_rev ? 0x10 : 0 ; // is on the reverse strand
p - > flag | = m & & m - > is_rev ? 0x20 : 0 ; // is mate on the reverse strand
// print up to CIGAR
kputs ( s - > name , str ) ; kputc ( ' \t ' , str ) ; // QNAME
2013-03-12 09:55:52 +08:00
kputw ( ( p - > flag & 0xffff ) | ( p - > flag & 0x10000 ? 0x100 : 0 ) , str ) ; kputc ( ' \t ' , str ) ; // FLAG
2013-03-12 09:25:17 +08:00
if ( p - > rid > = 0 ) { // with coordinate
kputs ( bns - > anns [ p - > rid ] . name , str ) ; kputc ( ' \t ' , str ) ; // RNAME
kputl ( p - > pos + 1 , str ) ; kputc ( ' \t ' , str ) ; // POS
kputw ( p - > mapq , str ) ; kputc ( ' \t ' , str ) ; // MAPQ
if ( p - > n_cigar ) { // aligned
for ( i = 0 ; i < p - > n_cigar ; + + i ) {
2013-05-23 07:45:16 +08:00
int c = p - > cigar [ i ] & 0xf ;
if ( c = = 3 | | c = = 4 ) c = which ? 4 : 3 ; // use hard clipping for supplementary alignments
kputw ( p - > cigar [ i ] > > 4 , str ) ; kputc ( " MIDSH " [ c ] , str ) ;
2013-02-12 04:29:03 +08:00
}
2013-03-12 09:25:17 +08:00
} else kputc ( ' * ' , str ) ; // having a coordinate but unaligned (e.g. when copy_mate is true)
} else kputsn ( " * \t 0 \t 0 \t * " , 7 , str ) ; // without coordinte
kputc ( ' \t ' , str ) ;
// print the mate position if applicable
2013-03-12 11:01:51 +08:00
if ( m & & m - > rid > = 0 ) {
2013-03-12 09:25:17 +08:00
if ( p - > rid = = m - > rid ) kputc ( ' = ' , str ) ;
else kputs ( bns - > anns [ m - > rid ] . name , str ) ;
2013-02-23 05:38:48 +08:00
kputc ( ' \t ' , str ) ;
2013-03-12 11:01:51 +08:00
kputl ( m - > pos + 1 , str ) ; kputc ( ' \t ' , str ) ;
2013-03-12 09:25:17 +08:00
if ( p - > rid = = m - > rid ) {
2013-03-12 12:14:36 +08:00
int64_t p0 = p - > pos + ( p - > is_rev ? get_rlen ( p - > n_cigar , p - > cigar ) - 1 : 0 ) ;
int64_t p1 = m - > pos + ( m - > is_rev ? get_rlen ( m - > n_cigar , m - > cigar ) - 1 : 0 ) ;
if ( m - > n_cigar = = 0 | | p - > n_cigar = = 0 ) kputc ( ' 0 ' , str ) ;
else kputl ( - ( p0 - p1 + ( p0 > p1 ? 1 : p0 < p1 ? - 1 : 0 ) ) , str ) ;
2013-03-12 09:25:17 +08:00
} else kputc ( ' 0 ' , str ) ;
2013-03-12 11:01:51 +08:00
} else kputsn ( " * \t 0 \t 0 " , 5 , str ) ;
kputc ( ' \t ' , str ) ;
2013-03-12 09:25:17 +08:00
// print SEQ and QUAL
if ( p - > flag & 0x100 ) { // for secondary alignments, don't write SEQ and QUAL
2013-02-25 02:23:43 +08:00
kputsn ( " * \t * " , 3 , str ) ;
2013-03-12 09:25:17 +08:00
} else if ( ! p - > is_rev ) { // the forward strand
2013-02-12 04:29:03 +08:00
int i , qb = 0 , qe = s - > l_seq ;
2013-03-12 09:25:17 +08:00
if ( p - > n_cigar ) {
2013-05-23 07:45:16 +08:00
if ( which & & ( ( p - > cigar [ 0 ] & 0xf ) = = 4 | | ( p - > cigar [ 0 ] & 0xf ) = = 3 ) ) qb + = p - > cigar [ 0 ] > > 4 ;
if ( which & & ( ( p - > cigar [ p - > n_cigar - 1 ] & 0xf ) = = 4 | | ( p - > cigar [ p - > n_cigar - 1 ] & 0xf ) = = 3 ) ) qe - = p - > cigar [ p - > n_cigar - 1 ] > > 4 ;
2013-03-12 09:25:17 +08:00
}
2013-02-12 04:29:03 +08:00
ks_resize ( str , str - > l + ( qe - qb ) + 1 ) ;
for ( i = qb ; i < qe ; + + i ) str - > s [ str - > l + + ] = " ACGTN " [ ( int ) s - > seq [ i ] ] ;
kputc ( ' \t ' , str ) ;
if ( s - > qual ) { // printf qual
ks_resize ( str , str - > l + ( qe - qb ) + 1 ) ;
for ( i = qb ; i < qe ; + + i ) str - > s [ str - > l + + ] = s - > qual [ i ] ;
str - > s [ str - > l ] = 0 ;
} else kputc ( ' * ' , str ) ;
} else { // the reverse strand
int i , qb = 0 , qe = s - > l_seq ;
2013-03-12 09:25:17 +08:00
if ( p - > n_cigar ) {
2013-05-24 00:45:14 +08:00
if ( which & & ( ( p - > cigar [ 0 ] & 0xf ) = = 4 | | ( p - > cigar [ 0 ] & 0xf ) = = 3 ) ) qe - = p - > cigar [ 0 ] > > 4 ;
if ( which & & ( ( p - > cigar [ p - > n_cigar - 1 ] & 0xf ) = = 4 | | ( p - > cigar [ p - > n_cigar - 1 ] & 0xf ) = = 3 ) ) qb + = p - > cigar [ p - > n_cigar - 1 ] > > 4 ;
2013-03-12 09:25:17 +08:00
}
2013-02-12 04:29:03 +08:00
ks_resize ( str , str - > l + ( qe - qb ) + 1 ) ;
for ( i = qe - 1 ; i > = qb ; - - i ) str - > s [ str - > l + + ] = " TGCAN " [ ( int ) s - > seq [ i ] ] ;
kputc ( ' \t ' , str ) ;
if ( s - > qual ) { // printf qual
ks_resize ( str , str - > l + ( qe - qb ) + 1 ) ;
for ( i = qe - 1 ; i > = qb ; - - i ) str - > s [ str - > l + + ] = s - > qual [ i ] ;
str - > s [ str - > l ] = 0 ;
} else kputc ( ' * ' , str ) ;
}
2013-03-12 09:25:17 +08:00
// print optional tags
2014-01-30 01:05:11 +08:00
if ( p - > n_cigar ) {
kputsn ( " \t NM:i: " , 6 , str ) ; kputw ( p - > NM , str ) ;
kputsn ( " \t MD:Z: " , 6 , str ) ; kputs ( ( char * ) ( p - > cigar + p - > n_cigar ) , str ) ;
}
2013-02-23 05:38:48 +08:00
if ( p - > score > = 0 ) { kputsn ( " \t AS:i: " , 6 , str ) ; kputw ( p - > score , str ) ; }
if ( p - > sub > = 0 ) { kputsn ( " \t XS:i: " , 6 , str ) ; kputw ( p - > sub , str ) ; }
2013-02-27 02:03:35 +08:00
if ( bwa_rg_id [ 0 ] ) { kputsn ( " \t RG:Z: " , 6 , str ) ; kputs ( bwa_rg_id , str ) ; }
2013-05-23 07:55:07 +08:00
if ( ! ( p - > flag & 0x100 ) ) { // not multi-hit
2013-05-23 07:45:16 +08:00
for ( i = 0 ; i < n ; + + i )
2013-05-23 07:55:07 +08:00
if ( i ! = which & & ! ( list [ i ] . flag & 0x100 ) ) break ;
2013-05-23 07:45:16 +08:00
if ( i < n ) { // there are other primary hits; output them
2013-05-24 00:48:18 +08:00
kputsn ( " \t SA:Z: " , 6 , str ) ;
2013-05-23 07:45:16 +08:00
for ( i = 0 ; i < n ; + + i ) {
const mem_aln_t * r = & list [ i ] ;
int k ;
2013-05-23 07:55:07 +08:00
if ( i = = which | | ( list [ i ] . flag & 0x100 ) ) continue ; // proceed if: 1) different from the current; 2) not shadowed multi hit
2013-05-23 07:45:16 +08:00
kputs ( bns - > anns [ r - > rid ] . name , str ) ; kputc ( ' , ' , str ) ;
kputl ( r - > pos + 1 , str ) ; kputc ( ' , ' , str ) ;
kputc ( " +- " [ r - > is_rev ] , str ) ; kputc ( ' , ' , str ) ;
for ( k = 0 ; k < r - > n_cigar ; + + k ) {
kputw ( r - > cigar [ k ] > > 4 , str ) ; kputc ( " MIDSH " [ r - > cigar [ k ] & 0xf ] , str ) ;
}
kputc ( ' , ' , str ) ; kputw ( r - > mapq , str ) ;
kputc ( ' , ' , str ) ; kputw ( r - > NM , str ) ;
kputc ( ' ; ' , str ) ;
2013-03-12 11:43:58 +08:00
}
}
}
2013-02-24 05:57:34 +08:00
if ( s - > comment ) { kputc ( ' \t ' , str ) ; kputs ( s - > comment , str ) ; }
2013-02-12 04:29:03 +08:00
kputc ( ' \n ' , str ) ;
}
2013-02-08 03:36:18 +08:00
/************************
* Integrated interface *
* * * * * * * * * * * * * * * * * * * * * * * */
2013-02-19 13:50:39 +08:00
int mem_approx_mapq_se ( const mem_opt_t * opt , const mem_alnreg_t * a )
2013-02-09 02:43:15 +08:00
{
int mapq , l , sub = a - > sub ? a - > sub : opt - > min_seed_len * opt - > a ;
double identity ;
2013-02-09 03:46:57 +08:00
sub = a - > csub > sub ? a - > csub : sub ;
2013-02-23 03:47:57 +08:00
if ( sub > = a - > score ) return 0 ;
2013-02-09 02:43:15 +08:00
l = a - > qe - a - > qb > a - > re - a - > rb ? a - > qe - a - > qb : a - > re - a - > rb ;
identity = 1. - ( double ) ( l * opt - > a - a - > score ) / ( opt - > a + opt - > b ) / l ;
2013-09-07 00:31:47 +08:00
if ( a - > score = = 0 ) {
mapq = 0 ;
} else if ( opt - > mapQ_coef_len > 0 ) {
double tmp ;
2013-09-07 00:37:38 +08:00
tmp = l < opt - > mapQ_coef_len ? 1. : opt - > mapQ_coef_fac / log ( l ) ;
2013-09-10 05:51:05 +08:00
tmp * = identity * identity ;
2013-09-07 00:37:38 +08:00
mapq = ( int ) ( 6.02 * ( a - > score - sub ) / opt - > a * tmp * tmp + .499 ) ;
2013-09-07 00:31:47 +08:00
} else {
mapq = ( int ) ( MEM_MAPQ_COEF * ( 1. - ( double ) sub / a - > score ) * log ( a - > seedcov ) + .499 ) ;
mapq = identity < 0.95 ? ( int ) ( mapq * identity * identity + .499 ) : mapq ;
}
2013-02-26 02:00:35 +08:00
if ( a - > sub_n > 0 ) mapq - = ( int ) ( 4.343 * log ( a - > sub_n + 1 ) + .499 ) ;
2013-02-09 02:43:15 +08:00
if ( mapq > 60 ) mapq = 60 ;
2013-02-09 06:20:44 +08:00
if ( mapq < 0 ) mapq = 0 ;
2013-02-09 02:43:15 +08:00
return mapq ;
}
2013-05-23 08:02:53 +08:00
// TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible
2013-03-12 09:59:15 +08:00
void mem_reg2sam_se ( const mem_opt_t * opt , const bntseq_t * bns , const uint8_t * pac , bseq1_t * s , mem_alnreg_v * a , int extra_flag , const mem_aln_t * m )
2013-02-08 02:13:43 +08:00
{
2013-02-08 03:36:18 +08:00
kstring_t str ;
2013-03-19 13:04:57 +08:00
kvec_t ( mem_aln_t ) aa ;
int k ;
kv_init ( aa ) ;
2013-02-08 03:36:18 +08:00
str . l = str . m = 0 ; str . s = 0 ;
2013-03-19 13:04:57 +08:00
for ( k = 0 ; k < a - > n ; + + k ) {
mem_alnreg_t * p = & a - > a [ k ] ;
mem_aln_t * q ;
if ( p - > score < opt - > T ) continue ;
if ( p - > secondary > = 0 & & ! ( opt - > flag & MEM_F_ALL ) ) continue ;
if ( p - > secondary > = 0 & & p - > score < a - > a [ p - > secondary ] . score * .5 ) continue ;
q = kv_pushp ( mem_aln_t , aa ) ;
* q = mem_reg2aln ( opt , bns , pac , s - > l_seq , s - > seq , p ) ;
2013-05-23 07:55:07 +08:00
q - > flag | = extra_flag ; // flag secondary
2013-03-19 13:04:57 +08:00
if ( p - > secondary > = 0 ) q - > sub = - 1 ; // don't output sub-optimal score
2013-05-23 07:45:16 +08:00
if ( k & & p - > secondary < 0 ) // if supplementary
q - > flag | = ( opt - > flag & MEM_F_NO_MULTI ) ? 0x10000 : 0x800 ;
2013-03-19 13:04:57 +08:00
if ( k & & q - > mapq > aa . a [ 0 ] . mapq ) q - > mapq = aa . a [ 0 ] . mapq ;
}
if ( aa . n = = 0 ) { // no alignments good enough; then write an unaligned record
2013-03-12 09:55:52 +08:00
mem_aln_t t ;
t = mem_reg2aln ( opt , bns , pac , s - > l_seq , s - > seq , 0 ) ;
2013-03-12 11:01:51 +08:00
t . flag | = extra_flag ;
2013-03-12 09:55:52 +08:00
mem_aln2sam ( bns , & str , s , 1 , & t , 0 , m ) ;
2013-03-19 13:04:57 +08:00
} else {
for ( k = 0 ; k < aa . n ; + + k )
mem_aln2sam ( bns , & str , s , aa . n , aa . a , k , m ) ;
for ( k = 0 ; k < aa . n ; + + k ) free ( aa . a [ k ] . cigar ) ;
free ( aa . a ) ;
2013-03-09 01:40:31 +08:00
}
2013-02-08 03:36:18 +08:00
s - > sam = str . s ;
}
2013-02-28 11:28:29 +08:00
mem_alnreg_v mem_align1_core ( const mem_opt_t * opt , const bwt_t * bwt , const bntseq_t * bns , const uint8_t * pac , int l_seq , char * seq )
2013-02-08 03:36:18 +08:00
{
2013-02-27 11:55:44 +08:00
int i ;
2013-02-08 03:36:18 +08:00
mem_chain_v chn ;
2013-02-27 11:55:44 +08:00
mem_alnreg_v regs ;
for ( i = 0 ; i < l_seq ; + + i ) // convert to 2-bit encoding if we have not done so
2013-02-25 02:09:29 +08:00
seq [ i ] = seq [ i ] < 4 ? seq [ i ] : nst_nt4_table [ ( int ) seq [ i ] ] ;
2013-02-27 11:55:44 +08:00
2013-03-05 00:35:23 +08:00
chn = mem_chain ( opt , bwt , bns - > l_pac , l_seq , ( uint8_t * ) seq ) ;
2013-02-08 03:36:18 +08:00
chn . n = mem_chain_flt ( opt , chn . n , chn . a ) ;
2013-02-24 05:57:34 +08:00
if ( bwa_verbose > = 4 ) mem_print_chain ( bns , & chn ) ;
2013-02-27 11:55:44 +08:00
kv_init ( regs ) ;
2013-02-08 03:36:18 +08:00
for ( i = 0 ; i < chn . n ; + + i ) {
2013-02-27 05:26:46 +08:00
mem_chain_t * p = & chn . a [ i ] ;
2013-03-05 03:43:49 +08:00
int ret ;
2013-09-09 23:36:50 +08:00
if ( bwa_verbose > = 4 ) err_printf ( " ===> Processing chain(%d) <=== \n " , i ) ;
2013-03-05 03:43:49 +08:00
ret = mem_chain2aln_short ( opt , bns - > l_pac , pac , l_seq , ( uint8_t * ) seq , p , & regs ) ;
if ( ret > 0 ) mem_chain2aln ( opt , bns - > l_pac , pac , l_seq , ( uint8_t * ) seq , p , & regs ) ;
2013-02-08 03:36:18 +08:00
free ( chn . a [ i ] . seeds ) ;
}
2013-02-27 11:55:44 +08:00
free ( chn . a ) ;
2013-09-09 23:36:50 +08:00
regs . n = mem_sort_and_dedup ( regs . n , regs . a , opt - > mask_level_redun ) ;
2013-02-08 03:36:18 +08:00
return regs ;
2013-02-08 02:13:43 +08:00
}
2013-03-02 00:14:51 +08:00
mem_alnreg_v mem_align1 ( const mem_opt_t * opt , const bwt_t * bwt , const bntseq_t * bns , const uint8_t * pac , int l_seq , const char * seq_ )
{ // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence
2013-02-28 11:28:29 +08:00
mem_alnreg_v ar ;
2013-03-02 00:14:51 +08:00
char * seq ;
2013-05-02 22:12:01 +08:00
seq = malloc ( l_seq ) ;
2013-03-02 00:14:51 +08:00
memcpy ( seq , seq_ , l_seq ) ; // makes a copy of seq_
2013-02-28 11:28:29 +08:00
ar = mem_align1_core ( opt , bwt , bns , pac , l_seq , seq ) ;
2013-11-23 22:36:26 +08:00
mem_mark_primary_se ( opt , ar . n , ar . a , lrand48 ( ) ) ;
2013-03-02 00:14:51 +08:00
free ( seq ) ;
2013-02-28 11:28:29 +08:00
return ar ;
}
// This routine is only used for the API purpose
2013-03-02 00:14:51 +08:00
mem_aln_t mem_reg2aln ( const mem_opt_t * opt , const bntseq_t * bns , const uint8_t * pac , int l_query , const char * query_ , const mem_alnreg_t * ar )
2013-02-28 11:28:29 +08:00
{
mem_aln_t a ;
2014-01-30 01:05:11 +08:00
int i , w2 , qb , qe , NM , score , is_rev , last_sc = - ( 1 < < 30 ) , l_MD ;
2013-03-12 09:25:17 +08:00
int64_t pos , rb , re ;
2013-03-02 00:14:51 +08:00
uint8_t * query ;
2013-03-12 09:25:17 +08:00
memset ( & a , 0 , sizeof ( mem_aln_t ) ) ;
if ( ar = = 0 | | ar - > rb < 0 | | ar - > re < 0 ) { // generate an unmapped record
a . rid = - 1 ; a . pos = - 1 ; a . flag | = 0x4 ;
return a ;
}
qb = ar - > qb , qe = ar - > qe ;
rb = ar - > rb , re = ar - > re ;
2013-05-02 22:12:01 +08:00
query = malloc ( l_query ) ;
2013-03-02 00:14:51 +08:00
for ( i = 0 ; i < l_query ; + + i ) // convert to the nt4 encoding
query [ i ] = query_ [ i ] < 5 ? query_ [ i ] : nst_nt4_table [ ( int ) query_ [ i ] ] ;
2013-03-12 21:56:04 +08:00
a . mapq = ar - > secondary < 0 ? mem_approx_mapq_se ( opt , ar ) : 0 ;
2013-05-23 07:55:07 +08:00
if ( ar - > secondary > = 0 ) a . flag | = 0x100 ; // secondary alignment
2013-05-23 07:45:16 +08:00
if ( bwa_fix_xref ( opt - > mat , opt - > q , opt - > r , opt - > w , bns , pac , ( uint8_t * ) query , & qb , & qe , & rb , & re ) < 0 ) {
fprintf ( stderr , " [E::%s] If you see this message, please let the developer know. Abort. Sorry. \n " , __func__ ) ;
exit ( 1 ) ;
2013-03-19 08:49:32 +08:00
}
2013-03-16 09:26:37 +08:00
w2 = infer_bw ( qe - qb , re - rb , ar - > truesc , opt - > a , opt - > q , opt - > r ) ;
2013-12-31 04:49:41 +08:00
if ( bwa_verbose > = 4 ) printf ( " Band width: infer=%d, opt=%d, alnreg=%d \n " , w2 , opt - > w , ar - > w ) ;
2013-11-20 22:50:46 +08:00
if ( w2 > opt - > w ) w2 = w2 < ar - > w ? w2 : ar - > w ;
else w2 = opt - > w ;
2013-11-20 23:04:16 +08:00
i = 0 ; a . cigar = 0 ;
do {
free ( a . cigar ) ;
a . cigar = bwa_gen_cigar ( opt - > mat , opt - > q , opt - > r , w2 , bns - > l_pac , pac , qe - qb , ( uint8_t * ) & query [ qb ] , rb , re , & score , & a . n_cigar , & NM ) ;
2013-12-31 04:49:41 +08:00
if ( bwa_verbose > = 4 ) printf ( " Final alignment: w2=%d, global_sc=%d, local_sc=%d \n " , w2 , score , ar - > truesc ) ;
2014-01-29 23:51:02 +08:00
if ( score = = last_sc ) break ; // it is possible that global alignment and local alignment give different scores
last_sc = score ;
2013-11-20 23:04:16 +08:00
w2 < < = 1 ;
} while ( + + i < 3 & & score < ar - > truesc - opt - > a ) ;
2014-01-30 01:05:11 +08:00
l_MD = strlen ( ( char * ) ( a . cigar + a . n_cigar ) ) + 1 ;
2013-02-28 11:28:29 +08:00
a . NM = NM ;
2013-02-28 12:40:46 +08:00
pos = bns_depos ( bns , rb < bns - > l_pac ? rb : re - 1 , & is_rev ) ;
a . is_rev = is_rev ;
2014-01-30 01:05:11 +08:00
if ( a . n_cigar > 0 ) { // squeeze out leading or trailing deletions
2013-04-04 11:57:19 +08:00
if ( ( a . cigar [ 0 ] & 0xf ) = = 2 ) {
pos + = a . cigar [ 0 ] > > 4 ;
- - a . n_cigar ;
2014-01-30 01:05:11 +08:00
memmove ( a . cigar , a . cigar + 1 , a . n_cigar * 4 + l_MD ) ;
} else if ( ( a . cigar [ a . n_cigar - 1 ] & 0xf ) = = 2 ) {
- - a . n_cigar ;
memmove ( a . cigar + a . n_cigar , a . cigar + a . n_cigar + 1 , l_MD ) ; // MD needs to be moved accordingly
}
2013-04-04 11:57:19 +08:00
}
2013-02-28 11:45:18 +08:00
if ( qb ! = 0 | | qe ! = l_query ) { // add clipping to CIGAR
int clip5 , clip3 ;
clip5 = is_rev ? l_query - qe : qb ;
clip3 = is_rev ? qb : l_query - qe ;
2014-01-30 01:05:11 +08:00
a . cigar = realloc ( a . cigar , 4 * ( a . n_cigar + 2 ) + l_MD ) ;
2013-02-28 11:45:18 +08:00
if ( clip5 ) {
2014-01-30 01:05:11 +08:00
memmove ( a . cigar + 1 , a . cigar , a . n_cigar * 4 + l_MD ) ; // make room for 5'-end clipping
2013-05-23 07:45:16 +08:00
a . cigar [ 0 ] = clip5 < < 4 | 3 ;
2013-02-28 11:45:18 +08:00
+ + a . n_cigar ;
}
2014-01-30 01:05:11 +08:00
if ( clip3 ) {
memmove ( a . cigar + a . n_cigar + 1 , a . cigar + a . n_cigar , l_MD ) ; // make room for 3'-end clipping
a . cigar [ a . n_cigar + + ] = clip3 < < 4 | 3 ;
}
2013-02-28 11:45:18 +08:00
}
2013-02-28 11:28:29 +08:00
a . rid = bns_pos2rid ( bns , pos ) ;
a . pos = pos - bns - > anns [ a . rid ] . offset ;
2013-03-12 11:16:18 +08:00
a . score = ar - > score ; a . sub = ar - > sub > ar - > csub ? ar - > sub : ar - > csub ;
2013-03-02 00:14:51 +08:00
free ( query ) ;
2013-02-28 11:28:29 +08:00
return a ;
}
2013-02-08 02:29:01 +08:00
typedef struct {
const mem_opt_t * opt ;
const bwt_t * bwt ;
const bntseq_t * bns ;
const uint8_t * pac ;
2013-02-12 04:29:03 +08:00
const mem_pestat_t * pes ;
2013-02-08 02:29:01 +08:00
bseq1_t * seqs ;
2013-02-08 03:36:18 +08:00
mem_alnreg_v * regs ;
2013-11-23 22:36:26 +08:00
int64_t n_processed ;
2013-02-08 03:36:18 +08:00
} worker_t ;
2013-02-08 02:29:01 +08:00
2013-11-03 00:13:11 +08:00
static void worker1 ( void * data , int i , int tid )
2013-02-08 02:29:01 +08:00
{
2013-02-08 03:36:18 +08:00
worker_t * w = ( worker_t * ) data ;
2013-02-26 04:40:15 +08:00
if ( ! ( w - > opt - > flag & MEM_F_PE ) ) {
2013-11-03 00:13:11 +08:00
w - > regs [ i ] = mem_align1_core ( w - > opt , w - > bwt , w - > bns , w - > pac , w - > seqs [ i ] . l_seq , w - > seqs [ i ] . seq ) ;
} else {
w - > regs [ i < < 1 | 0 ] = mem_align1_core ( w - > opt , w - > bwt , w - > bns , w - > pac , w - > seqs [ i < < 1 | 0 ] . l_seq , w - > seqs [ i < < 1 | 0 ] . seq ) ;
w - > regs [ i < < 1 | 1 ] = mem_align1_core ( w - > opt , w - > bwt , w - > bns , w - > pac , w - > seqs [ i < < 1 | 1 ] . l_seq , w - > seqs [ i < < 1 | 1 ] . seq ) ;
2013-02-26 04:40:15 +08:00
}
2013-02-08 03:36:18 +08:00
}
2013-11-03 00:13:11 +08:00
static void worker2 ( void * data , int i , int tid )
2013-02-08 03:36:18 +08:00
{
2013-02-19 05:33:06 +08:00
extern int mem_sam_pe ( const mem_opt_t * opt , const bntseq_t * bns , const uint8_t * pac , const mem_pestat_t pes [ 4 ] , uint64_t id , bseq1_t s [ 2 ] , mem_alnreg_v a [ 2 ] ) ;
2013-02-08 03:36:18 +08:00
worker_t * w = ( worker_t * ) data ;
2013-02-19 05:33:06 +08:00
if ( ! ( w - > opt - > flag & MEM_F_PE ) ) {
2013-11-23 22:36:26 +08:00
mem_mark_primary_se ( w - > opt , w - > regs [ i ] . n , w - > regs [ i ] . a , w - > n_processed + i ) ;
2013-11-03 00:13:11 +08:00
mem_reg2sam_se ( w - > opt , w - > bns , w - > pac , & w - > seqs [ i ] , & w - > regs [ i ] , 0 , 0 ) ;
free ( w - > regs [ i ] . a ) ;
2013-02-08 03:36:18 +08:00
} else {
2013-11-23 22:36:26 +08:00
mem_sam_pe ( w - > opt , w - > bns , w - > pac , w - > pes , ( w - > n_processed > > 1 ) + i , & w - > seqs [ i < < 1 ] , & w - > regs [ i < < 1 ] ) ;
2013-11-03 00:13:11 +08:00
free ( w - > regs [ i < < 1 | 0 ] . a ) ; free ( w - > regs [ i < < 1 | 1 ] . a ) ;
2013-02-08 03:36:18 +08:00
}
2013-02-08 02:29:01 +08:00
}
2013-11-23 22:36:26 +08:00
void mem_process_seqs ( const mem_opt_t * opt , const bwt_t * bwt , const bntseq_t * bns , const uint8_t * pac , int64_t n_processed , int n , bseq1_t * seqs , const mem_pestat_t * pes0 )
2013-02-08 02:13:43 +08:00
{
2013-11-03 00:13:11 +08:00
extern void kt_for ( int n_threads , void ( * func ) ( void * , int , int ) , void * data , int n ) ;
worker_t w ;
2013-02-08 03:57:22 +08:00
mem_alnreg_v * regs ;
2013-02-11 23:59:38 +08:00
mem_pestat_t pes [ 4 ] ;
2014-02-19 23:54:26 +08:00
double ctime , rtime ;
2013-02-11 23:59:38 +08:00
2014-02-19 23:54:26 +08:00
ctime = cputime ( ) ; rtime = realtime ( ) ;
2013-05-02 22:12:01 +08:00
regs = malloc ( n * sizeof ( mem_alnreg_v ) ) ;
2013-11-23 22:36:26 +08:00
w . opt = opt ; w . bwt = bwt ; w . bns = bns ; w . pac = pac ;
w . seqs = seqs ; w . regs = regs ; w . n_processed = n_processed ;
w . pes = & pes [ 0 ] ;
2013-11-03 00:13:11 +08:00
kt_for ( opt - > n_threads , worker1 , & w , ( opt - > flag & MEM_F_PE ) ? n > > 1 : n ) ; // find mapping positions
if ( opt - > flag & MEM_F_PE ) { // infer insert sizes if not provided
if ( pes0 ) memcpy ( pes , pes0 , 4 * sizeof ( mem_pestat_t ) ) ; // if pes0 != NULL, set the insert-size distribution as pes0
else mem_pestat ( opt , bns - > l_pac , n , regs , pes ) ; // otherwise, infer the insert size distribution from data
2013-02-08 02:29:01 +08:00
}
2013-11-03 00:13:11 +08:00
kt_for ( opt - > n_threads , worker2 , & w , ( opt - > flag & MEM_F_PE ) ? n > > 1 : n ) ; // generate alignment
free ( regs ) ;
2014-02-19 23:54:26 +08:00
if ( bwa_verbose > = 3 )
fprintf ( stderr , " [M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec \n " , __func__ , n , cputime ( ) - ctime , realtime ( ) - rtime ) ;
2013-02-08 02:13:43 +08:00
}