2013-02-01 02:59:48 +08:00
# include <stdlib.h>
2013-02-01 04:55:22 +08:00
# include <string.h>
# include <stdio.h>
2013-02-05 01:37:38 +08:00
# include <assert.h>
2013-02-08 06:15:45 +08:00
# include <math.h>
2013-02-08 02:29:01 +08:00
# ifdef HAVE_PTHREAD
# include <pthread.h>
# endif
2013-02-25 02:09:29 +08:00
2013-02-08 03:36:18 +08:00
# include "kstring.h"
2013-02-01 02:59:48 +08:00
# include "bwamem.h"
2013-02-02 05:39:50 +08:00
# include "bntseq.h"
2013-02-05 01:37:38 +08:00
# include "ksw.h"
2013-02-12 23:36:15 +08:00
# include "kvec.h"
2013-02-05 06:23:06 +08:00
# include "ksort.h"
2013-02-27 11:55:44 +08:00
# include "utils.h"
2013-02-05 01:37:38 +08:00
2013-02-20 14:11:38 +08:00
/* Theory on probability and scoring *ungapped* alignment
*
* s ' ( a , b ) = log [ P ( b | a ) / P ( b ) ] = log [ 4 P ( b | a ) ] , assuming uniform base distribution
* s ' ( a , a ) = log ( 4 ) , s ' ( a , b ) = log ( 4 e / 3 ) , where e is the error rate
*
* Scale s ' ( a , b ) to s ( a , a ) s . t . s ( a , a ) = x . Then s ( a , b ) = x * s ' ( a , b ) / log ( 4 ) , or conversely : s ' ( a , b ) = s ( a , b ) * log ( 4 ) / x
*
* If the matching score is x and mismatch penalty is - y , we can compute error rate e :
* e = .75 * exp [ - log ( 4 ) * y / x ]
*
* log P ( seq ) = \ sum_i log P ( b_i | a_i ) = \ sum_i { s ' ( a , b ) - log ( 4 ) }
* = \ sum_i { s ( a , b ) * log ( 4 ) / x - log ( 4 ) } = log ( 4 ) * ( S / x - l )
*
* where S = \ sum_i s ( a , b ) is the alignment score . Converting to the phred scale :
* Q ( seq ) = - 10 / log ( 10 ) * log P ( seq ) = 10 * log ( 4 ) / log ( 10 ) * ( l - S / x ) = 6.02 * ( l - S / x )
*
*
* Gap open ( zero gap ) : q ' = log [ P ( gap - open ) ] , r ' = log [ P ( gap - ext ) ] ( see Durbin et al . ( 1998 ) Section 4.1 )
* Then q = x * log [ P ( gap - open ) ] / log ( 4 ) , r = x * log [ P ( gap - ext ) ] / log ( 4 )
2013-02-21 08:11:44 +08:00
*
* When there are gaps , l should be the length of alignment matches ( i . e . the M operator in CIGAR )
2013-02-20 14:11:38 +08:00
*/
2013-02-02 05:39:50 +08:00
mem_opt_t * mem_opt_init ( )
2013-02-01 02:59:48 +08:00
{
2013-02-02 05:39:50 +08:00
mem_opt_t * o ;
o = calloc ( 1 , sizeof ( mem_opt_t ) ) ;
2013-02-19 05:33:06 +08:00
o - > flag = 0 ;
2013-02-28 10:13:39 +08:00
o - > a = 1 ; o - > b = 4 ; o - > q = 6 ; o - > r = 1 ; o - > w = 100 ;
o - > pen_unpaired = 9 ;
o - > pen_clip = 5 ;
2013-02-08 08:50:37 +08:00
o - > min_seed_len = 19 ;
2013-02-09 05:56:28 +08:00
o - > split_width = 10 ;
o - > max_occ = 10000 ;
2013-02-01 04:55:22 +08:00
o - > max_chain_gap = 10000 ;
2013-02-11 23:59:38 +08:00
o - > max_ins = 10000 ;
2013-02-05 13:17:20 +08:00
o - > mask_level = 0.50 ;
2013-02-05 13:41:07 +08:00
o - > chain_drop_ratio = 0.50 ;
2013-02-22 01:52:00 +08:00
o - > split_factor = 1.5 ;
2013-02-08 02:13:43 +08:00
o - > chunk_size = 10000000 ;
o - > n_threads = 1 ;
2013-02-26 00:56:02 +08:00
o - > max_matesw = 100 ;
2013-02-05 01:37:38 +08:00
mem_fill_scmat ( o - > a , o - > b , o - > mat ) ;
2013-02-01 02:59:48 +08:00
return o ;
}
2013-02-26 00:56:02 +08:00
void mem_fill_scmat ( int a , int b , int8_t mat [ 25 ] )
{
int i , j , k ;
for ( i = k = 0 ; i < 4 ; + + i ) {
for ( j = 0 ; j < 4 ; + + j )
mat [ k + + ] = i = = j ? a : - b ;
mat [ k + + ] = 0 ; // ambiguous base
}
for ( j = 0 ; j < 5 ; + + j ) mat [ k + + ] = 0 ;
}
2013-02-01 02:59:48 +08:00
/***************************
* SMEM iterator interface *
* * * * * * * * * * * * * * * * * * * * * * * * * * */
2013-02-02 03:38:44 +08:00
struct __smem_i {
const bwt_t * bwt ;
const uint8_t * query ;
int start , len ;
bwtintv_v * matches ; // matches; to be returned by smem_next()
bwtintv_v * sub ; // sub-matches inside the longest match; temporary
bwtintv_v * tmpvec [ 2 ] ; // temporary arrays
} ;
2013-02-01 02:59:48 +08:00
smem_i * smem_itr_init ( const bwt_t * bwt )
{
smem_i * itr ;
itr = calloc ( 1 , sizeof ( smem_i ) ) ;
itr - > bwt = bwt ;
itr - > tmpvec [ 0 ] = calloc ( 1 , sizeof ( bwtintv_v ) ) ;
itr - > tmpvec [ 1 ] = calloc ( 1 , sizeof ( bwtintv_v ) ) ;
itr - > matches = calloc ( 1 , sizeof ( bwtintv_v ) ) ;
2013-02-02 03:20:38 +08:00
itr - > sub = calloc ( 1 , sizeof ( bwtintv_v ) ) ;
2013-02-01 02:59:48 +08:00
return itr ;
}
void smem_itr_destroy ( smem_i * itr )
{
2013-02-02 04:26:34 +08:00
free ( itr - > tmpvec [ 0 ] - > a ) ; free ( itr - > tmpvec [ 0 ] ) ;
free ( itr - > tmpvec [ 1 ] - > a ) ; free ( itr - > tmpvec [ 1 ] ) ;
free ( itr - > matches - > a ) ; free ( itr - > matches ) ;
free ( itr - > sub - > a ) ; free ( itr - > sub ) ;
2013-02-01 02:59:48 +08:00
free ( itr ) ;
}
2013-02-02 03:20:38 +08:00
void smem_set_query ( smem_i * itr , int len , const uint8_t * query )
2013-02-01 02:59:48 +08:00
{
itr - > query = query ;
itr - > start = 0 ;
itr - > len = len ;
}
2013-02-09 05:56:28 +08:00
const bwtintv_v * smem_next ( smem_i * itr , int split_len , int split_width )
2013-02-01 02:59:48 +08:00
{
2013-02-09 05:56:28 +08:00
int i , max , max_i , ori_start ;
2013-02-02 03:38:44 +08:00
itr - > tmpvec [ 0 ] - > n = itr - > tmpvec [ 1 ] - > n = itr - > matches - > n = itr - > sub - > n = 0 ;
if ( itr - > start > = itr - > len | | itr - > start < 0 ) return 0 ;
2013-02-01 02:59:48 +08:00
while ( itr - > start < itr - > len & & itr - > query [ itr - > start ] > 3 ) + + itr - > start ; // skip ambiguous bases
2013-02-02 03:38:44 +08:00
if ( itr - > start = = itr - > len ) return 0 ;
2013-02-09 05:56:28 +08:00
ori_start = itr - > start ;
itr - > start = bwt_smem1 ( itr - > bwt , itr - > len , itr - > query , ori_start , 1 , itr - > matches , itr - > tmpvec ) ; // search for SMEM
2013-02-02 03:38:44 +08:00
if ( itr - > matches - > n = = 0 ) return itr - > matches ; // well, in theory, we should never come here
for ( i = max = 0 , max_i = 0 ; i < itr - > matches - > n ; + + i ) { // look for the longest match
2013-02-02 03:20:38 +08:00
bwtintv_t * p = & itr - > matches - > a [ i ] ;
int len = ( uint32_t ) p - > info - ( p - > info > > 32 ) ;
if ( max < len ) max = len , max_i = i ;
}
2013-02-09 05:56:28 +08:00
if ( split_len > 0 & & max > = split_len & & itr - > matches - > a [ max_i ] . x [ 2 ] < = split_width ) { // if the longest SMEM is unique and long
2013-02-02 03:20:38 +08:00
int j ;
2013-02-02 03:38:44 +08:00
bwtintv_v * a = itr - > tmpvec [ 0 ] ; // reuse tmpvec[0] for merging
2013-02-02 03:20:38 +08:00
bwtintv_t * p = & itr - > matches - > a [ max_i ] ;
2013-02-09 05:56:28 +08:00
bwt_smem1 ( itr - > bwt , itr - > len , itr - > query , ( ( uint32_t ) p - > info + ( p - > info > > 32 ) ) > > 1 , itr - > matches - > a [ max_i ] . x [ 2 ] + 1 , itr - > sub , itr - > tmpvec ) ; // starting from the middle of the longest MEM
2013-02-02 03:20:38 +08:00
i = j = 0 ; a - > n = 0 ;
while ( i < itr - > matches - > n & & j < itr - > sub - > n ) { // ordered merge
2013-02-05 05:48:11 +08:00
int64_t xi = itr - > matches - > a [ i ] . info > > 32 < < 32 | ( itr - > len - ( uint32_t ) itr - > matches - > a [ i ] . info ) ;
2013-02-05 13:41:07 +08:00
int64_t xj = itr - > sub - > a [ j ] . info > > 32 < < 32 | ( itr - > len - ( uint32_t ) itr - > sub - > a [ j ] . info ) ;
2013-02-05 05:48:11 +08:00
if ( xi < xj ) {
2013-02-02 03:20:38 +08:00
kv_push ( bwtintv_t , * a , itr - > matches - > a [ i ] ) ;
+ + i ;
2013-02-09 05:56:28 +08:00
} else if ( ( uint32_t ) itr - > sub - > a [ j ] . info - ( itr - > sub - > a [ j ] . info > > 32 ) > = max > > 1 & & ( uint32_t ) itr - > sub - > a [ j ] . info > ori_start ) {
2013-02-02 03:20:38 +08:00
kv_push ( bwtintv_t , * a , itr - > sub - > a [ j ] ) ;
+ + j ;
2013-02-08 08:50:37 +08:00
} else + + j ;
2013-02-02 03:20:38 +08:00
}
for ( ; i < itr - > matches - > n ; + + i ) kv_push ( bwtintv_t , * a , itr - > matches - > a [ i ] ) ;
2013-02-08 08:50:37 +08:00
for ( ; j < itr - > sub - > n ; + + j )
2013-02-09 05:56:28 +08:00
if ( ( uint32_t ) itr - > sub - > a [ j ] . info - ( itr - > sub - > a [ j ] . info > > 32 ) > = max > > 1 & & ( uint32_t ) itr - > sub - > a [ j ] . info > ori_start )
2013-02-08 08:50:37 +08:00
kv_push ( bwtintv_t , * a , itr - > sub - > a [ j ] ) ;
2013-02-02 03:20:38 +08:00
kv_copy ( bwtintv_t , * itr - > matches , * a ) ;
}
2013-02-02 03:38:44 +08:00
return itr - > matches ;
2013-02-01 02:59:48 +08:00
}
2013-02-05 06:23:06 +08:00
/********************************
* Chaining while finding SMEMs *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2013-02-25 01:17:29 +08:00
typedef struct {
int64_t rbeg ;
int32_t qbeg , len ;
} mem_seed_t ;
typedef struct {
int n , m ;
int64_t pos ;
mem_seed_t * seeds ;
} mem_chain_t ;
typedef struct { size_t n , m ; mem_chain_t * a ; } mem_chain_v ;
2013-02-01 04:55:22 +08:00
# include "kbtree.h"
2013-02-22 00:42:30 +08:00
# define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos))
2013-02-08 02:13:43 +08:00
KBTREE_INIT ( chn , mem_chain_t , chain_cmp )
2013-02-01 04:55:22 +08:00
2013-02-08 02:13:43 +08:00
static int test_and_merge ( const mem_opt_t * opt , mem_chain_t * c , const mem_seed_t * p )
2013-02-01 04:55:22 +08:00
{
int64_t qend , rend , x , y ;
2013-02-02 05:39:50 +08:00
const mem_seed_t * last = & c - > seeds [ c - > n - 1 ] ;
2013-02-01 04:55:22 +08:00
qend = last - > qbeg + last - > len ;
rend = last - > rbeg + last - > len ;
2013-02-01 05:39:24 +08:00
if ( p - > qbeg > = c - > seeds [ 0 ] . qbeg & & p - > qbeg + p - > len < = qend & & p - > rbeg > = c - > seeds [ 0 ] . rbeg & & p - > rbeg + p - > len < = rend )
2013-02-01 04:55:22 +08:00
return 1 ; // contained seed; do nothing
2013-02-05 05:48:11 +08:00
x = p - > qbeg - last - > qbeg ; // always non-negtive
2013-02-01 04:55:22 +08:00
y = p - > rbeg - last - > rbeg ;
2013-02-05 05:48:11 +08:00
if ( y > = 0 & & x - y < = opt - > w & & y - x < = opt - > w & & x - last - > len < opt - > max_chain_gap & & y - last - > len < opt - > max_chain_gap ) { // grow the chain
2013-02-01 04:55:22 +08:00
if ( c - > n = = c - > m ) {
c - > m < < = 1 ;
2013-02-02 05:39:50 +08:00
c - > seeds = realloc ( c - > seeds , c - > m * sizeof ( mem_seed_t ) ) ;
2013-02-01 04:55:22 +08:00
}
c - > seeds [ c - > n + + ] = * p ;
return 1 ;
}
2013-02-02 03:38:44 +08:00
return 0 ; // request to add a new chain
2013-02-01 04:55:22 +08:00
}
2013-02-02 05:39:50 +08:00
static void mem_insert_seed ( const mem_opt_t * opt , kbtree_t ( chn ) * tree , smem_i * itr )
2013-02-01 04:55:22 +08:00
{
2013-02-02 03:38:44 +08:00
const bwtintv_v * a ;
2013-02-22 01:52:00 +08:00
int split_len = ( int ) ( opt - > min_seed_len * opt - > split_factor + .499 ) ;
2013-02-26 06:29:35 +08:00
split_len = split_len < itr - > len ? split_len : itr - > len ;
2013-02-22 01:52:00 +08:00
while ( ( a = smem_next ( itr , split_len , opt - > split_width ) ) ! = 0 ) { // to find all SMEM and some internal MEM
2013-02-01 04:55:22 +08:00
int i ;
2013-02-02 03:38:44 +08:00
for ( i = 0 ; i < a - > n ; + + i ) { // go through each SMEM/MEM up to itr->start
bwtintv_t * p = & a - > a [ i ] ;
2013-02-01 04:55:22 +08:00
int slen = ( uint32_t ) p - > info - ( p - > info > > 32 ) ; // seed length
int64_t k ;
2013-02-02 03:38:44 +08:00
if ( slen < opt - > min_seed_len | | p - > x [ 2 ] > opt - > max_occ ) continue ; // ignore if too short or too repetitive
2013-02-01 04:55:22 +08:00
for ( k = 0 ; k < p - > x [ 2 ] ; + + k ) {
2013-02-08 02:13:43 +08:00
mem_chain_t tmp , * lower , * upper ;
2013-02-02 05:39:50 +08:00
mem_seed_t s ;
2013-02-01 04:55:22 +08:00
int to_add = 0 ;
2013-02-02 03:38:44 +08:00
s . rbeg = tmp . pos = bwt_sa ( itr - > bwt , p - > x [ 0 ] + k ) ; // this is the base coordinate in the forward-reverse reference
2013-02-01 05:26:05 +08:00
s . qbeg = p - > info > > 32 ;
s . len = slen ;
2013-02-01 04:55:22 +08:00
if ( kb_size ( tree ) ) {
2013-02-02 03:38:44 +08:00
kb_intervalp ( chn , tree , & tmp , & lower , & upper ) ; // find the closest chain
2013-02-01 05:26:05 +08:00
if ( ! lower | | ! test_and_merge ( opt , lower , & s ) ) to_add = 1 ;
} else to_add = 1 ;
2013-02-02 03:38:44 +08:00
if ( to_add ) { // add the seed as a new chain
2013-02-01 04:55:22 +08:00
tmp . n = 1 ; tmp . m = 4 ;
2013-02-02 05:39:50 +08:00
tmp . seeds = calloc ( tmp . m , sizeof ( mem_seed_t ) ) ;
2013-02-01 05:26:05 +08:00
tmp . seeds [ 0 ] = s ;
2013-02-01 04:55:22 +08:00
kb_putp ( chn , tree , & tmp ) ;
}
}
}
}
}
2013-02-08 05:27:11 +08:00
void mem_print_chain ( const bntseq_t * bns , mem_chain_v * chn )
{
int i , j ;
for ( i = 0 ; i < chn - > n ; + + i ) {
mem_chain_t * p = & chn - > a [ i ] ;
printf ( " %d " , p - > n ) ;
for ( j = 0 ; j < p - > n ; + + j ) {
bwtint_t pos ;
int is_rev , ref_id ;
pos = bns_depos ( bns , p - > seeds [ j ] . rbeg , & is_rev ) ;
if ( is_rev ) pos - = p - > seeds [ j ] . len - 1 ;
bns_cnt_ambi ( bns , pos , p - > seeds [ j ] . len , & ref_id ) ;
printf ( " \t %d,%d,%ld(%s:%c%ld) " , p - > seeds [ j ] . len , p - > seeds [ j ] . qbeg , ( long ) p - > seeds [ j ] . rbeg , bns - > anns [ ref_id ] . name , " +- " [ is_rev ] , ( long ) ( pos - bns - > anns [ ref_id ] . offset ) + 1 ) ;
}
putchar ( ' \n ' ) ;
}
}
2013-02-08 02:13:43 +08:00
mem_chain_v mem_chain ( const mem_opt_t * opt , const bwt_t * bwt , int len , const uint8_t * seq )
2013-02-01 04:55:22 +08:00
{
2013-02-08 02:13:43 +08:00
mem_chain_v chain ;
2013-02-01 04:55:22 +08:00
smem_i * itr ;
kbtree_t ( chn ) * tree ;
2013-02-08 02:13:43 +08:00
kv_init ( chain ) ;
2013-02-01 04:55:22 +08:00
if ( len < opt - > min_seed_len ) return chain ; // if the query is shorter than the seed length, no match
tree = kb_init ( chn , KB_DEFAULT_SIZE ) ;
itr = smem_itr_init ( bwt ) ;
2013-02-02 03:20:38 +08:00
smem_set_query ( itr , len , seq ) ;
2013-02-01 05:26:05 +08:00
mem_insert_seed ( opt , tree , itr ) ;
2013-02-08 02:13:43 +08:00
kv_resize ( mem_chain_t , chain , kb_size ( tree ) ) ;
2013-02-01 05:26:05 +08:00
2013-02-08 02:13:43 +08:00
# define traverse_func(p_) (chain.a[chain.n++] = *(p_))
__kb_traverse ( mem_chain_t , tree , traverse_func ) ;
2013-02-01 05:26:05 +08:00
# undef traverse_func
2013-02-01 04:55:22 +08:00
smem_itr_destroy ( itr ) ;
kb_destroy ( chn , tree ) ;
return chain ;
}
2013-02-02 05:39:50 +08:00
2013-02-05 05:48:11 +08:00
/********************
* Filtering chains *
* * * * * * * * * * * * * * * * * * * */
2013-02-05 06:23:06 +08:00
typedef struct {
int beg , end , w ;
void * p , * p2 ;
} flt_aux_t ;
# define flt_lt(a, b) ((a).w > (b).w)
KSORT_INIT ( mem_flt , flt_aux_t , flt_lt )
2013-02-08 02:13:43 +08:00
int mem_chain_flt ( const mem_opt_t * opt , int n_chn , mem_chain_t * chains )
2013-02-05 06:23:06 +08:00
{
flt_aux_t * a ;
int i , j , n ;
2013-02-07 02:59:32 +08:00
if ( n_chn < = 1 ) return n_chn ; // no need to filter
a = malloc ( sizeof ( flt_aux_t ) * n_chn ) ;
for ( i = 0 ; i < n_chn ; + + i ) {
2013-02-08 02:13:43 +08:00
mem_chain_t * c = & chains [ i ] ;
2013-02-09 04:34:25 +08:00
int64_t end ;
int w = 0 , tmp ;
for ( j = 0 , end = 0 ; j < c - > n ; + + j ) {
const mem_seed_t * s = & c - > seeds [ j ] ;
if ( s - > qbeg > = end ) w + = s - > len ;
else if ( s - > qbeg + s - > len > end ) w + = s - > qbeg + s - > len - end ;
end = end > s - > qbeg + s - > len ? end : s - > qbeg + s - > len ;
}
tmp = w ;
for ( j = 0 , end = 0 ; j < c - > n ; + + j ) {
const mem_seed_t * s = & c - > seeds [ j ] ;
if ( s - > rbeg > = end ) w + = s - > len ;
else if ( s - > rbeg + s - > len > end ) w + = s - > rbeg + s - > len - end ;
end = end > s - > qbeg + s - > len ? end : s - > qbeg + s - > len ;
}
w = w < tmp ? w : tmp ;
2013-02-05 06:23:06 +08:00
a [ i ] . beg = c - > seeds [ 0 ] . qbeg ;
a [ i ] . end = c - > seeds [ c - > n - 1 ] . qbeg + c - > seeds [ c - > n - 1 ] . len ;
2013-02-07 02:59:32 +08:00
a [ i ] . w = w ; a [ i ] . p = c ; a [ i ] . p2 = 0 ;
2013-02-05 06:23:06 +08:00
}
2013-02-07 02:59:32 +08:00
ks_introsort ( mem_flt , n_chn , a ) ;
2013-02-06 10:58:33 +08:00
{ // reorder chains such that the best chain appears first
2013-02-08 02:13:43 +08:00
mem_chain_t * swap ;
swap = malloc ( sizeof ( mem_chain_t ) * n_chn ) ;
2013-02-07 02:59:32 +08:00
for ( i = 0 ; i < n_chn ; + + i ) {
2013-02-08 02:13:43 +08:00
swap [ i ] = * ( ( mem_chain_t * ) a [ i ] . p ) ;
2013-02-07 02:59:32 +08:00
a [ i ] . p = & chains [ i ] ; // as we will memcpy() below, a[i].p is changed
2013-02-06 10:58:33 +08:00
}
2013-02-08 02:13:43 +08:00
memcpy ( chains , swap , sizeof ( mem_chain_t ) * n_chn ) ;
2013-02-06 10:58:33 +08:00
free ( swap ) ;
}
2013-02-07 02:59:32 +08:00
for ( i = 1 , n = 1 ; i < n_chn ; + + i ) {
2013-02-05 06:23:06 +08:00
for ( j = 0 ; j < n ; + + j ) {
int b_max = a [ j ] . beg > a [ i ] . beg ? a [ j ] . beg : a [ i ] . beg ;
2013-02-05 13:17:20 +08:00
int e_min = a [ j ] . end < a [ i ] . end ? a [ j ] . end : a [ i ] . end ;
2013-02-05 06:23:06 +08:00
if ( e_min > b_max ) { // have overlap
int min_l = a [ i ] . end - a [ i ] . beg < a [ j ] . end - a [ j ] . beg ? a [ i ] . end - a [ i ] . beg : a [ j ] . end - a [ j ] . beg ;
if ( e_min - b_max > = min_l * opt - > mask_level ) { // significant overlap
if ( a [ j ] . p2 = = 0 ) a [ j ] . p2 = a [ i ] . p ;
2013-02-22 01:25:20 +08:00
if ( a [ i ] . w < a [ j ] . w * opt - > chain_drop_ratio & & a [ j ] . w - a [ i ] . w > = opt - > min_seed_len < < 1 )
2013-02-05 06:23:06 +08:00
break ;
}
}
}
if ( j = = n ) a [ n + + ] = a [ i ] ; // if have no significant overlap with better chains, keep it.
}
2013-02-05 13:17:20 +08:00
for ( i = 0 ; i < n ; + + i ) { // mark chains to be kept
2013-02-08 02:13:43 +08:00
mem_chain_t * c = ( mem_chain_t * ) a [ i ] . p ;
2013-02-05 13:17:20 +08:00
if ( c - > n > 0 ) c - > n = - c - > n ;
2013-02-08 02:13:43 +08:00
c = ( mem_chain_t * ) a [ i ] . p2 ;
2013-02-05 13:17:20 +08:00
if ( c & & c - > n > 0 ) c - > n = - c - > n ;
}
2013-02-05 06:23:06 +08:00
free ( a ) ;
2013-02-07 02:59:32 +08:00
for ( i = 0 ; i < n_chn ; + + i ) { // free discarded chains
2013-02-08 02:13:43 +08:00
mem_chain_t * c = & chains [ i ] ;
2013-02-05 13:17:20 +08:00
if ( c - > n > = 0 ) {
free ( c - > seeds ) ;
c - > n = c - > m = 0 ;
} else c - > n = - c - > n ;
}
2013-02-07 02:59:32 +08:00
for ( i = n = 0 ; i < n_chn ; + + i ) { // squeeze out discarded chains
if ( chains [ i ] . n > 0 ) {
if ( n ! = i ) chains [ n + + ] = chains [ i ] ;
2013-02-05 13:17:20 +08:00
else + + n ;
}
}
2013-02-07 02:59:32 +08:00
return n ;
2013-02-05 06:23:06 +08:00
}
2013-02-11 01:24:33 +08:00
/******************************
* De - overlap single - end hits *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2013-02-07 03:38:40 +08:00
2013-02-11 01:24:33 +08:00
# define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb))))
KSORT_INIT ( mem_ars , mem_alnreg_t , alnreg_slt )
int mem_sort_and_dedup ( int n , mem_alnreg_t * a )
{
int m , i ;
2013-02-07 03:38:40 +08:00
if ( n < = 1 ) return n ;
2013-02-11 01:24:33 +08:00
ks_introsort ( mem_ars , n , a ) ;
2013-02-09 03:18:39 +08:00
for ( i = 1 ; i < n ; + + i ) { // mark identical hits
if ( a [ i ] . score = = a [ i - 1 ] . score & & a [ i ] . rb = = a [ i - 1 ] . rb & & a [ i ] . qb = = a [ i - 1 ] . qb )
2013-02-09 06:55:35 +08:00
a [ i ] . qe = a [ i ] . qb ;
2013-02-09 03:18:39 +08:00
}
for ( i = 1 , m = 1 ; i < n ; + + i ) // exclude identical hits
2013-02-21 09:26:57 +08:00
if ( a [ i ] . qe > a [ i ] . qb ) {
if ( m ! = i ) a [ m + + ] = a [ i ] ;
else + + m ;
}
2013-02-11 01:24:33 +08:00
return m ;
}
2013-02-13 04:34:44 +08:00
void mem_mark_primary_se ( const mem_opt_t * opt , int n , mem_alnreg_t * a ) // IMPORTANT: must run mem_sort_and_dedup() before calling this function
2013-02-11 01:24:33 +08:00
{ // similar to the loop in mem_chain_flt()
2013-02-13 04:34:44 +08:00
int i , k , tmp ;
kvec_t ( int ) z ;
if ( n = = 0 ) return ;
kv_init ( z ) ;
2013-02-13 04:52:23 +08:00
for ( i = 0 ; i < n ; + + i ) a [ i ] . sub = 0 , a [ i ] . secondary = - 1 ;
2013-02-09 06:55:35 +08:00
tmp = opt - > a + opt - > b > opt - > q + opt - > r ? opt - > a + opt - > b : opt - > q + opt - > r ;
2013-02-13 04:34:44 +08:00
kv_push ( int , z , 0 ) ;
for ( i = 1 ; i < n ; + + i ) {
for ( k = 0 ; k < z . n ; + + k ) {
int j = z . a [ k ] ;
2013-02-07 03:38:40 +08:00
int b_max = a [ j ] . qb > a [ i ] . qb ? a [ j ] . qb : a [ i ] . qb ;
int e_min = a [ j ] . qe < a [ i ] . qe ? a [ j ] . qe : a [ i ] . qe ;
if ( e_min > b_max ) { // have overlap
int min_l = a [ i ] . qe - a [ i ] . qb < a [ j ] . qe - a [ j ] . qb ? a [ i ] . qe - a [ i ] . qb : a [ j ] . qe - a [ j ] . qb ;
if ( e_min - b_max > = min_l * opt - > mask_level ) { // significant overlap
if ( a [ j ] . sub = = 0 ) a [ j ] . sub = a [ i ] . score ;
2013-02-09 06:55:35 +08:00
if ( a [ j ] . score - a [ i ] . score < = tmp ) + + a [ j ] . sub_n ;
2013-02-07 03:38:40 +08:00
break ;
}
}
}
2013-02-13 04:34:44 +08:00
if ( k = = z . n ) kv_push ( int , z , i ) ;
2013-02-13 04:52:23 +08:00
else a [ i ] . secondary = z . a [ k ] ;
2013-02-07 03:38:40 +08:00
}
2013-02-13 04:34:44 +08:00
free ( z . a ) ;
2013-02-07 03:38:40 +08:00
}
2013-02-05 05:48:11 +08:00
/****************************************
* Construct the alignment from a chain *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2013-02-05 01:37:38 +08:00
static inline int cal_max_gap ( const mem_opt_t * opt , int qlen )
{
int l = ( int ) ( ( double ) ( qlen * opt - > a - opt - > q ) / opt - > r + 1. ) ;
2013-02-27 13:29:11 +08:00
l = l > 1 ? l : 1 ;
return l < opt - > w < < 1 ? l : opt - > w < < 1 ;
2013-02-05 01:37:38 +08:00
}
2013-02-22 03:58:51 +08:00
void mem_chain2aln ( const mem_opt_t * opt , int64_t l_pac , const uint8_t * pac , int l_query , const uint8_t * query , const mem_chain_t * c , mem_alnreg_v * av )
2013-02-05 05:48:11 +08:00
{ // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds
2013-02-22 03:58:51 +08:00
int i , k ;
2013-02-24 02:26:50 +08:00
int64_t rlen , rmax [ 2 ] , tmp , max = 0 ;
2013-02-05 05:08:00 +08:00
const mem_seed_t * s ;
2013-02-05 01:37:38 +08:00
uint8_t * rseq = 0 ;
2013-02-27 11:55:44 +08:00
uint64_t * srt ;
2013-02-05 04:02:56 +08:00
2013-02-27 11:55:44 +08:00
if ( c - > n = = 0 ) return ;
2013-02-05 01:37:38 +08:00
// get the max possible span
2013-02-05 05:08:00 +08:00
rmax [ 0 ] = l_pac < < 1 ; rmax [ 1 ] = 0 ;
for ( i = 0 ; i < c - > n ; + + i ) {
int64_t b , e ;
const mem_seed_t * t = & c - > seeds [ i ] ;
b = t - > rbeg - ( t - > qbeg + cal_max_gap ( opt , t - > qbeg ) ) ;
e = t - > rbeg + t - > len + ( ( l_query - t - > qbeg - t - > len ) + cal_max_gap ( opt , l_query - t - > qbeg - t - > len ) ) ;
rmax [ 0 ] = rmax [ 0 ] < b ? rmax [ 0 ] : b ;
rmax [ 1 ] = rmax [ 1 ] > e ? rmax [ 1 ] : e ;
2013-02-24 02:26:50 +08:00
if ( t - > len > max ) max = t - > len ;
2013-02-05 05:08:00 +08:00
}
2013-02-27 13:37:17 +08:00
rmax [ 0 ] = rmax [ 0 ] > 0 ? rmax [ 0 ] : 0 ;
rmax [ 1 ] = rmax [ 1 ] < l_pac < < 1 ? rmax [ 1 ] : l_pac < < 1 ;
if ( rmax [ 0 ] < l_pac & & l_pac < rmax [ 1 ] ) { // crossing the forward-reverse boundary; then choose one side
if ( l_pac - rmax [ 0 ] > rmax [ 1 ] - l_pac ) rmax [ 1 ] = l_pac ;
else rmax [ 0 ] = l_pac ;
}
2013-02-05 01:37:38 +08:00
// retrieve the reference sequence
rseq = bns_get_seq ( l_pac , pac , rmax [ 0 ] , rmax [ 1 ] , & rlen ) ;
2013-02-16 22:48:44 +08:00
if ( rlen ! = rmax [ 1 ] - rmax [ 0 ] ) return ;
2013-02-05 01:37:38 +08:00
2013-02-27 11:55:44 +08:00
srt = malloc ( c - > n * 8 ) ;
for ( i = 0 ; i < c - > n ; + + i )
srt [ i ] = ( uint64_t ) c - > seeds [ i ] . len < < 32 | i ;
ks_introsort_64 ( c - > n , srt ) ;
for ( k = c - > n - 1 ; k > = 0 ; - - k ) {
2013-02-22 03:58:51 +08:00
mem_alnreg_t * a ;
2013-02-27 11:55:44 +08:00
s = & c - > seeds [ ( uint32_t ) srt [ k ] ] ;
for ( i = 0 ; i < av - > n ; + + i ) { // test whether extension has been made before
mem_alnreg_t * p = & av - > a [ i ] ;
int64_t rd ;
2013-02-27 13:29:11 +08:00
int qd , w , max_gap ;
if ( s - > rbeg < p - > rb | | s - > rbeg + s - > len > p - > re | | s - > qbeg < p - > qb | | s - > qbeg + s - > len > p - > qe ) continue ; // not fully contained
// qd: distance ahead of the seed on query; rd: on reference
qd = s - > qbeg - p - > qb ; rd = s - > rbeg - p - > rb ;
max_gap = cal_max_gap ( opt , qd < rd ? qd : rd ) ; // the maximal gap allowed in regions ahead of the seed
w = max_gap < opt - > w ? max_gap : opt - > w ; // bounded by the band width
2013-02-27 11:55:44 +08:00
if ( qd - rd < w & & rd - qd < w ) break ; // the seed is "around" a previous hit
2013-02-27 13:29:11 +08:00
// similar to the previous four lines, but this time we look at the region behind
qd = p - > qe - ( s - > qbeg + s - > len ) ; rd = p - > re - ( s - > rbeg + s - > len ) ;
max_gap = cal_max_gap ( opt , qd < rd ? qd : rd ) ;
w = max_gap < opt - > w ? max_gap : opt - > w ;
if ( qd - rd < w & & rd - qd < w ) break ;
2013-02-27 11:55:44 +08:00
}
if ( i < av - > n ) continue ;
2013-02-22 03:58:51 +08:00
a = kv_pushp ( mem_alnreg_t , * av ) ;
2013-02-08 10:20:36 +08:00
memset ( a , 0 , sizeof ( mem_alnreg_t ) ) ;
2013-02-27 11:55:44 +08:00
2013-02-08 10:20:36 +08:00
if ( s - > qbeg ) { // left extension
uint8_t * rs , * qs ;
2013-02-28 10:13:39 +08:00
int qle , tle , gtle , gscore ;
2013-02-08 10:20:36 +08:00
qs = malloc ( s - > qbeg ) ;
for ( i = 0 ; i < s - > qbeg ; + + i ) qs [ i ] = query [ s - > qbeg - 1 - i ] ;
tmp = s - > rbeg - rmax [ 0 ] ;
rs = malloc ( tmp ) ;
for ( i = 0 ; i < tmp ; + + i ) rs [ i ] = rseq [ tmp - 1 - i ] ;
2013-02-28 10:13:39 +08:00
a - > score = ksw_extend ( s - > qbeg , qs , tmp , rs , 5 , opt - > mat , opt - > q , opt - > r , opt - > w , s - > len * opt - > a , & qle , & tle , & gtle , & gscore ) ;
// check whether we prefer to reach the end of the query
if ( gscore < = 0 | | gscore < = a - > score - opt - > pen_clip ) a - > qb = s - > qbeg - qle , a - > rb = s - > rbeg - tle ; // local hits
else a - > qb = 0 , a - > rb = s - > rbeg - gtle ; // reach the end
2013-02-08 10:20:36 +08:00
free ( qs ) ; free ( rs ) ;
} else a - > score = s - > len * opt - > a , a - > qb = 0 , a - > rb = s - > rbeg ;
2013-02-27 11:55:44 +08:00
if ( s - > qbeg + s - > len ! = l_query ) { // right extension
2013-02-28 10:13:39 +08:00
int qle , tle , qe , re , gtle , gscore ;
2013-02-08 10:20:36 +08:00
qe = s - > qbeg + s - > len ;
re = s - > rbeg + s - > len - rmax [ 0 ] ;
2013-02-28 10:13:39 +08:00
a - > score = ksw_extend ( l_query - qe , query + qe , rmax [ 1 ] - rmax [ 0 ] - re , rseq + re , 5 , opt - > mat , opt - > q , opt - > r , opt - > w , a - > score , & qle , & tle , & gtle , & gscore ) ;
// similar to the above
if ( gscore < = 0 | | gscore < = a - > score - opt - > pen_clip ) a - > qe = qe + qle , a - > re = rmax [ 0 ] + re + tle ;
else a - > qe = l_query , a - > re = rmax [ 0 ] + re + gtle ;
2013-02-08 10:20:36 +08:00
} else a - > qe = l_query , a - > re = s - > rbeg + s - > len ;
2013-02-24 05:57:34 +08:00
if ( bwa_verbose > = 4 ) printf ( " [%d] score=%d \t [%d,%d) <=> [%ld,%ld) \n " , k , a - > score , a - > qb , a - > qe , ( long ) a - > rb , ( long ) a - > re ) ;
2013-02-27 11:55:44 +08:00
2013-02-22 03:58:51 +08:00
// compute seedcov
for ( i = 0 , a - > seedcov = 0 ; i < c - > n ; + + i ) {
const mem_seed_t * t = & c - > seeds [ i ] ;
if ( t - > qbeg > = a - > qb & & t - > qbeg + t - > len < = a - > qe & & t - > rbeg > = a - > rb & & t - > rbeg + t - > len < = a - > re ) // seed fully contained
a - > seedcov + = t - > len ; // this is not very accurate, but for approx. mapQ, this is good enough
}
2013-02-05 04:09:47 +08:00
}
2013-02-27 11:55:44 +08:00
free ( srt ) ; free ( rseq ) ;
2013-02-02 05:39:50 +08:00
}
2013-02-06 10:49:19 +08:00
2013-02-12 04:29:03 +08:00
/*****************************
* Basic hit - > SAM conversion *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2013-02-23 05:38:48 +08:00
void bwa_hit2sam ( kstring_t * str , const int8_t mat [ 25 ] , int q , int r , int w , const bntseq_t * bns , const uint8_t * pac , bseq1_t * s , const bwahit_t * p_ , int is_hard , const bwahit_t * m )
2013-02-12 04:29:03 +08:00
{
2013-02-23 05:38:48 +08:00
# define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1)
2013-02-27 02:36:01 +08:00
int score , n_cigar , is_rev = 0 , rid , mid , copy_mate = 0 , NM = - 1 ;
2013-02-12 04:29:03 +08:00
uint32_t * cigar = 0 ;
int64_t pos ;
2013-02-23 05:38:48 +08:00
bwahit_t ptmp , * p = & ptmp ;
if ( ! p_ ) { // in this case, generate an unmapped alignment
memset ( & ptmp , 0 , sizeof ( bwahit_t ) ) ;
ptmp . rb = ptmp . re = - 1 ;
} else ptmp = * p_ ;
p - > flag | = m ? 1 : 0 ; // is paired in sequencing
p - > flag | = ! is_mapped ( p ) ? 4 : 0 ; // is mapped
p - > flag | = m & & ! is_mapped ( m ) ? 8 : 0 ; // is mate mapped
if ( m & & ! is_mapped ( p ) & & is_mapped ( m ) ) {
p - > rb = m - > rb ; p - > re = m - > re ; p - > qb = 0 ; p - > qe = s - > l_seq ;
copy_mate = 1 ;
}
p - > flag | = p - > rb > = bns - > l_pac ? 0x10 : 0 ; // is reverse strand
p - > flag | = m & & m - > rb > = bns - > l_pac ? 0x20 : 0 ; // is mate on reverse strand
kputs ( s - > name , str ) ; kputc ( ' \t ' , str ) ;
if ( is_mapped ( p ) ) { // has a coordinate, no matter whether it is mapped or copied from the mate
2013-02-26 00:18:35 +08:00
int sam_flag = p - > flag & 0xff ; // the flag that will be outputed to SAM; it is not always the same as p->flag
2013-02-27 02:14:33 +08:00
if ( p - > flag & 0x10000 ) sam_flag | = 0x100 ;
2013-02-23 05:38:48 +08:00
if ( ! copy_mate ) {
2013-02-27 02:36:01 +08:00
cigar = bwa_gen_cigar ( mat , q , r , w , bns - > l_pac , pac , p - > qe - p - > qb , ( uint8_t * ) & s - > seq [ p - > qb ] , p - > rb , p - > re , & score , & n_cigar , & NM ) ;
2013-02-23 05:38:48 +08:00
p - > flag | = n_cigar = = 0 ? 4 : 0 ; // FIXME: check why this may happen (this has already happened)
} else n_cigar = 0 , cigar = 0 ;
2013-02-12 04:29:03 +08:00
pos = bns_depos ( bns , p - > rb < bns - > l_pac ? p - > rb : p - > re - 1 , & is_rev ) ;
2013-02-24 02:26:50 +08:00
bns_cnt_ambi ( bns , pos , p - > re - p - > rb , & rid ) ;
2013-02-26 00:18:35 +08:00
kputw ( sam_flag , str ) ; kputc ( ' \t ' , str ) ;
2013-02-12 04:29:03 +08:00
kputs ( bns - > anns [ rid ] . name , str ) ; kputc ( ' \t ' , str ) ; kputuw ( pos - bns - > anns [ rid ] . offset + 1 , str ) ; kputc ( ' \t ' , str ) ;
kputw ( p - > qual , str ) ; kputc ( ' \t ' , str ) ;
if ( n_cigar ) {
int i , clip5 , clip3 ;
clip5 = is_rev ? s - > l_seq - p - > qe : p - > qb ;
clip3 = is_rev ? p - > qb : s - > l_seq - p - > qe ;
if ( clip5 ) { kputw ( clip5 , str ) ; kputc ( " SH " [ ( is_hard ! = 0 ) ] , str ) ; }
for ( i = 0 ; i < n_cigar ; + + i ) {
kputw ( cigar [ i ] > > 4 , str ) ; kputc ( " MIDSH " [ cigar [ i ] & 0xf ] , str ) ;
}
if ( clip3 ) { kputw ( clip3 , str ) ; kputc ( " SH " [ ( is_hard ! = 0 ) ] , str ) ; }
} else kputc ( ' * ' , str ) ;
2013-02-23 05:38:48 +08:00
} else { // no coordinate
kputw ( p - > flag , str ) ;
kputs ( " \t * \t 0 \t 0 \t * " , str ) ;
rid = - 1 ;
2013-02-12 04:29:03 +08:00
}
2013-02-23 05:38:48 +08:00
if ( m & & is_mapped ( m ) ) { // then print mate pos and isize
pos = bns_depos ( bns , m - > rb < bns - > l_pac ? m - > rb : m - > re - 1 , & is_rev ) ;
2013-02-24 02:26:50 +08:00
bns_cnt_ambi ( bns , pos , m - > re - m - > rb , & mid ) ;
2013-02-23 05:38:48 +08:00
kputc ( ' \t ' , str ) ;
if ( mid = = rid ) kputc ( ' = ' , str ) ;
else kputs ( bns - > anns [ mid ] . name , str ) ;
kputc ( ' \t ' , str ) ; kputuw ( pos - bns - > anns [ mid ] . offset + 1 , str ) ;
kputc ( ' \t ' , str ) ;
if ( mid = = rid ) {
int64_t p0 = p - > rb < bns - > l_pac ? p - > rb : ( bns - > l_pac < < 1 ) - 1 - p - > rb ;
int64_t p1 = m - > rb < bns - > l_pac ? m - > rb : ( bns - > l_pac < < 1 ) - 1 - m - > rb ;
2013-02-25 12:00:51 +08:00
kputw ( p0 - p1 + ( p0 > p1 ? 1 : - 1 ) , str ) ;
2013-02-23 05:38:48 +08:00
} else kputw ( 0 , str ) ;
kputc ( ' \t ' , str ) ;
} else kputsn ( " \t * \t 0 \t 0 \t " , 7 , str ) ;
2013-02-25 02:23:43 +08:00
if ( p - > flag & 0x100 ) { // for secondary alignments, don't write SEQ and QUAL
kputsn ( " * \t * " , 3 , str ) ;
} else if ( ! ( p - > flag & 0x10 ) ) { // print SEQ and QUAL, the forward strand
2013-02-12 04:29:03 +08:00
int i , qb = 0 , qe = s - > l_seq ;
2013-02-23 05:38:48 +08:00
if ( ! ( p - > flag & 4 ) & & is_hard ) qb = p - > qb , qe = p - > qe ;
2013-02-12 04:29:03 +08:00
ks_resize ( str , str - > l + ( qe - qb ) + 1 ) ;
for ( i = qb ; i < qe ; + + i ) str - > s [ str - > l + + ] = " ACGTN " [ ( int ) s - > seq [ i ] ] ;
kputc ( ' \t ' , str ) ;
if ( s - > qual ) { // printf qual
ks_resize ( str , str - > l + ( qe - qb ) + 1 ) ;
for ( i = qb ; i < qe ; + + i ) str - > s [ str - > l + + ] = s - > qual [ i ] ;
str - > s [ str - > l ] = 0 ;
} else kputc ( ' * ' , str ) ;
} else { // the reverse strand
int i , qb = 0 , qe = s - > l_seq ;
2013-02-23 05:38:48 +08:00
if ( ! ( p - > flag & 4 ) & & is_hard ) qb = p - > qb , qe = p - > qe ;
2013-02-12 04:29:03 +08:00
ks_resize ( str , str - > l + ( qe - qb ) + 1 ) ;
for ( i = qe - 1 ; i > = qb ; - - i ) str - > s [ str - > l + + ] = " TGCAN " [ ( int ) s - > seq [ i ] ] ;
kputc ( ' \t ' , str ) ;
if ( s - > qual ) { // printf qual
ks_resize ( str , str - > l + ( qe - qb ) + 1 ) ;
for ( i = qe - 1 ; i > = qb ; - - i ) str - > s [ str - > l + + ] = s - > qual [ i ] ;
str - > s [ str - > l ] = 0 ;
} else kputc ( ' * ' , str ) ;
}
2013-02-27 02:36:01 +08:00
if ( NM > = 0 ) { kputsn ( " \t NM:i: " , 6 , str ) ; kputw ( NM , str ) ; }
2013-02-23 05:38:48 +08:00
if ( p - > score > = 0 ) { kputsn ( " \t AS:i: " , 6 , str ) ; kputw ( p - > score , str ) ; }
if ( p - > sub > = 0 ) { kputsn ( " \t XS:i: " , 6 , str ) ; kputw ( p - > sub , str ) ; }
2013-02-27 02:03:35 +08:00
if ( bwa_rg_id [ 0 ] ) { kputsn ( " \t RG:Z: " , 6 , str ) ; kputs ( bwa_rg_id , str ) ; }
2013-02-24 05:57:34 +08:00
if ( s - > comment ) { kputc ( ' \t ' , str ) ; kputs ( s - > comment , str ) ; }
2013-02-12 04:29:03 +08:00
kputc ( ' \n ' , str ) ;
free ( cigar ) ;
2013-02-23 05:38:48 +08:00
# undef is_mapped
2013-02-12 04:29:03 +08:00
}
2013-02-08 03:36:18 +08:00
/************************
* Integrated interface *
* * * * * * * * * * * * * * * * * * * * * * * */
2013-02-19 13:50:39 +08:00
int mem_approx_mapq_se ( const mem_opt_t * opt , const mem_alnreg_t * a )
2013-02-09 02:43:15 +08:00
{
int mapq , l , sub = a - > sub ? a - > sub : opt - > min_seed_len * opt - > a ;
double identity ;
2013-02-09 03:46:57 +08:00
sub = a - > csub > sub ? a - > csub : sub ;
2013-02-23 03:47:57 +08:00
if ( sub > = a - > score ) return 0 ;
2013-02-09 02:43:15 +08:00
l = a - > qe - a - > qb > a - > re - a - > rb ? a - > qe - a - > qb : a - > re - a - > rb ;
2013-02-13 05:15:26 +08:00
mapq = a - > score ? ( int ) ( MEM_MAPQ_COEF * ( 1. - ( double ) sub / a - > score ) * log ( a - > seedcov ) + .499 ) : 0 ;
2013-02-09 02:43:15 +08:00
identity = 1. - ( double ) ( l * opt - > a - a - > score ) / ( opt - > a + opt - > b ) / l ;
mapq = identity < 0.95 ? ( int ) ( mapq * identity * identity + .499 ) : mapq ;
2013-02-26 02:00:35 +08:00
if ( a - > sub_n > 0 ) mapq - = ( int ) ( 4.343 * log ( a - > sub_n + 1 ) + .499 ) ;
2013-02-09 02:43:15 +08:00
if ( mapq > 60 ) mapq = 60 ;
2013-02-09 06:20:44 +08:00
if ( mapq < 0 ) mapq = 0 ;
2013-02-09 02:43:15 +08:00
return mapq ;
}
2013-02-13 01:09:36 +08:00
void mem_alnreg2hit ( const mem_alnreg_t * a , bwahit_t * h )
{
h - > rb = a - > rb ; h - > re = a - > re ; h - > qb = a - > qb ; h - > qe = a - > qe ;
h - > score = a - > score ;
2013-02-25 02:23:43 +08:00
h - > sub = a - > secondary > = 0 ? - 1 : a - > sub > a - > csub ? a - > sub : a - > csub ;
2013-02-13 04:52:23 +08:00
h - > qual = 0 ; // quality unset
2013-02-19 05:33:06 +08:00
h - > flag = a - > secondary > = 0 ? 0x100 : 0 ; // only the "secondary" bit is set
2013-02-13 01:09:36 +08:00
}
2013-02-23 05:38:48 +08:00
void mem_sam_se ( const mem_opt_t * opt , const bntseq_t * bns , const uint8_t * pac , bseq1_t * s , mem_alnreg_v * a , int extra_flag , const bwahit_t * m )
2013-02-08 02:13:43 +08:00
{
2013-02-12 04:29:03 +08:00
int k ;
2013-02-08 03:36:18 +08:00
kstring_t str ;
str . l = str . m = 0 ; str . s = 0 ;
2013-02-12 04:29:03 +08:00
if ( a - > n > 0 ) {
2013-02-25 23:54:12 +08:00
int mapq0 = - 1 ;
2013-02-12 04:29:03 +08:00
for ( k = 0 ; k < a - > n ; + + k ) {
bwahit_t h ;
2013-02-26 00:18:35 +08:00
mem_alnreg_t * p = & a - > a [ k ] ;
if ( p - > secondary > = 0 & & ! ( opt - > flag & MEM_F_ALL ) ) continue ;
if ( p - > secondary > = 0 & & p - > score < a - > a [ p - > secondary ] . score * .5 ) continue ;
mem_alnreg2hit ( p , & h ) ;
2013-02-26 13:03:49 +08:00
bwa_fix_xref ( opt - > mat , opt - > q , opt - > r , opt - > w , bns , pac , ( uint8_t * ) s - > seq , & h . qb , & h . qe , & h . rb , & h . re ) ;
2013-02-19 05:33:06 +08:00
h . flag | = extra_flag ;
2013-02-26 00:18:35 +08:00
if ( ( opt - > flag & MEM_F_NO_MULTI ) & & k & & p - > secondary < 0 ) h . flag | = 0x10000 ; // print the sequence, but flag as secondary (for Picard)
h . qual = p - > secondary > = 0 ? 0 : mem_approx_mapq_se ( opt , p ) ;
2013-02-25 23:54:12 +08:00
if ( k = = 0 ) mapq0 = h . qual ;
else if ( h . qual > mapq0 ) h . qual = mapq0 ;
2013-02-23 05:38:48 +08:00
bwa_hit2sam ( & str , opt - > mat , opt - > q , opt - > r , opt - > w , bns , pac , s , & h , opt - > flag & MEM_F_HARDCLIP , m ) ;
2013-02-12 04:29:03 +08:00
}
2013-02-23 05:38:48 +08:00
} else bwa_hit2sam ( & str , opt - > mat , opt - > q , opt - > r , opt - > w , bns , pac , s , 0 , opt - > flag & MEM_F_HARDCLIP , m ) ;
2013-02-08 03:36:18 +08:00
s - > sam = str . s ;
}
2013-02-25 02:09:29 +08:00
mem_alnreg_v mem_align1 ( const mem_opt_t * opt , const bwt_t * bwt , const bntseq_t * bns , const uint8_t * pac , int l_seq , char * seq )
2013-02-08 03:36:18 +08:00
{
2013-02-27 11:55:44 +08:00
int i ;
2013-02-08 03:36:18 +08:00
mem_chain_v chn ;
2013-02-27 11:55:44 +08:00
mem_alnreg_v regs ;
for ( i = 0 ; i < l_seq ; + + i ) // convert to 2-bit encoding if we have not done so
2013-02-25 02:09:29 +08:00
seq [ i ] = seq [ i ] < 4 ? seq [ i ] : nst_nt4_table [ ( int ) seq [ i ] ] ;
2013-02-27 11:55:44 +08:00
2013-02-25 02:09:29 +08:00
chn = mem_chain ( opt , bwt , l_seq , ( uint8_t * ) seq ) ;
2013-02-08 03:36:18 +08:00
chn . n = mem_chain_flt ( opt , chn . n , chn . a ) ;
2013-02-24 05:57:34 +08:00
if ( bwa_verbose > = 4 ) mem_print_chain ( bns , & chn ) ;
2013-02-27 11:55:44 +08:00
kv_init ( regs ) ;
2013-02-08 03:36:18 +08:00
for ( i = 0 ; i < chn . n ; + + i ) {
2013-02-27 05:26:46 +08:00
mem_chain_t * p = & chn . a [ i ] ;
2013-02-27 11:55:44 +08:00
mem_chain2aln ( opt , bns - > l_pac , pac , l_seq , ( uint8_t * ) seq , p , & regs ) ;
2013-02-08 03:36:18 +08:00
free ( chn . a [ i ] . seeds ) ;
}
2013-02-27 11:55:44 +08:00
free ( chn . a ) ;
2013-02-22 03:58:51 +08:00
regs . n = mem_sort_and_dedup ( regs . n , regs . a ) ;
2013-02-08 03:36:18 +08:00
return regs ;
2013-02-08 02:13:43 +08:00
}
2013-02-08 02:29:01 +08:00
typedef struct {
int start , step , n ;
const mem_opt_t * opt ;
const bwt_t * bwt ;
const bntseq_t * bns ;
const uint8_t * pac ;
2013-02-12 04:29:03 +08:00
const mem_pestat_t * pes ;
2013-02-08 02:29:01 +08:00
bseq1_t * seqs ;
2013-02-08 03:36:18 +08:00
mem_alnreg_v * regs ;
} worker_t ;
2013-02-08 02:29:01 +08:00
static void * worker1 ( void * data )
{
2013-02-08 03:36:18 +08:00
worker_t * w = ( worker_t * ) data ;
2013-02-08 02:29:01 +08:00
int i ;
2013-02-26 04:40:15 +08:00
if ( ! ( w - > opt - > flag & MEM_F_PE ) ) {
for ( i = w - > start ; i < w - > n ; i + = w - > step )
w - > regs [ i ] = mem_align1 ( w - > opt , w - > bwt , w - > bns , w - > pac , w - > seqs [ i ] . l_seq , w - > seqs [ i ] . seq ) ;
} else { // for PE we align the two ends in the same thread in case the 2nd read is of worse quality, in which case some threads may be faster/slower
for ( i = w - > start ; i < w - > n > > 1 ; i + = w - > step ) {
w - > regs [ i < < 1 | 0 ] = mem_align1 ( w - > opt , w - > bwt , w - > bns , w - > pac , w - > seqs [ i < < 1 | 0 ] . l_seq , w - > seqs [ i < < 1 | 0 ] . seq ) ;
w - > regs [ i < < 1 | 1 ] = mem_align1 ( w - > opt , w - > bwt , w - > bns , w - > pac , w - > seqs [ i < < 1 | 1 ] . l_seq , w - > seqs [ i < < 1 | 1 ] . seq ) ;
}
}
2013-02-08 03:36:18 +08:00
return 0 ;
}
static void * worker2 ( void * data )
{
2013-02-19 05:33:06 +08:00
extern int mem_sam_pe ( const mem_opt_t * opt , const bntseq_t * bns , const uint8_t * pac , const mem_pestat_t pes [ 4 ] , uint64_t id , bseq1_t s [ 2 ] , mem_alnreg_v a [ 2 ] ) ;
2013-02-08 03:36:18 +08:00
worker_t * w = ( worker_t * ) data ;
int i ;
2013-02-19 05:33:06 +08:00
if ( ! ( w - > opt - > flag & MEM_F_PE ) ) {
2013-02-24 03:48:54 +08:00
for ( i = w - > start ; i < w - > n ; i + = w - > step ) {
2013-02-19 05:33:06 +08:00
mem_mark_primary_se ( w - > opt , w - > regs [ i ] . n , w - > regs [ i ] . a ) ;
2013-02-23 05:38:48 +08:00
mem_sam_se ( w - > opt , w - > bns , w - > pac , & w - > seqs [ i ] , & w - > regs [ i ] , 0 , 0 ) ;
2013-02-08 03:36:18 +08:00
free ( w - > regs [ i ] . a ) ;
}
} else {
2013-02-17 00:03:27 +08:00
int n = 0 ;
2013-02-24 03:48:54 +08:00
for ( i = w - > start ; i < w - > n > > 1 ; i + = w - > step ) { // not implemented yet
2013-02-19 05:33:06 +08:00
n + = mem_sam_pe ( w - > opt , w - > bns , w - > pac , w - > pes , i , & w - > seqs [ i < < 1 ] , & w - > regs [ i < < 1 ] ) ;
2013-02-08 03:36:18 +08:00
free ( w - > regs [ i < < 1 | 0 ] . a ) ; free ( w - > regs [ i < < 1 | 1 ] . a ) ;
}
2013-02-17 00:03:27 +08:00
fprintf ( stderr , " [M::%s@%d] performed mate-SW for %d reads \n " , __func__ , w - > start , n ) ;
2013-02-08 03:36:18 +08:00
}
2013-02-08 02:29:01 +08:00
return 0 ;
}
2013-02-25 02:09:29 +08:00
void mem_process_seqs ( const mem_opt_t * opt , const bwt_t * bwt , const bntseq_t * bns , const uint8_t * pac , int n , bseq1_t * seqs )
2013-02-08 02:13:43 +08:00
{
int i ;
2013-02-08 03:36:18 +08:00
worker_t * w ;
2013-02-08 03:57:22 +08:00
mem_alnreg_v * regs ;
2013-02-11 23:59:38 +08:00
mem_pestat_t pes [ 4 ] ;
2013-02-08 03:36:18 +08:00
w = calloc ( opt - > n_threads , sizeof ( worker_t ) ) ;
2013-02-08 03:57:22 +08:00
regs = malloc ( n * sizeof ( mem_alnreg_v ) ) ;
2013-02-08 02:29:01 +08:00
for ( i = 0 ; i < opt - > n_threads ; + + i ) {
2013-02-08 03:57:22 +08:00
worker_t * p = & w [ i ] ;
p - > start = i ; p - > step = opt - > n_threads ; p - > n = n ;
p - > opt = opt ; p - > bwt = bwt ; p - > bns = bns ; p - > pac = pac ;
p - > seqs = seqs ; p - > regs = regs ;
2013-02-11 23:59:38 +08:00
p - > pes = & pes [ 0 ] ;
2013-02-08 02:29:01 +08:00
}
# ifdef HAVE_PTHREAD
if ( opt - > n_threads = = 1 ) {
2013-02-11 23:59:38 +08:00
worker1 ( w ) ;
2013-02-19 05:33:06 +08:00
if ( opt - > flag & MEM_F_PE ) mem_pestat ( opt , bns - > l_pac , n , regs , pes ) ;
2013-02-11 23:59:38 +08:00
worker2 ( w ) ;
2013-02-08 02:29:01 +08:00
} else {
pthread_t * tid ;
tid = ( pthread_t * ) calloc ( opt - > n_threads , sizeof ( pthread_t ) ) ;
2013-02-08 03:36:18 +08:00
for ( i = 0 ; i < opt - > n_threads ; + + i ) pthread_create ( & tid [ i ] , 0 , worker1 , & w [ i ] ) ;
for ( i = 0 ; i < opt - > n_threads ; + + i ) pthread_join ( tid [ i ] , 0 ) ;
2013-02-19 05:33:06 +08:00
if ( opt - > flag & MEM_F_PE ) mem_pestat ( opt , bns - > l_pac , n , regs , pes ) ;
2013-02-08 03:36:18 +08:00
for ( i = 0 ; i < opt - > n_threads ; + + i ) pthread_create ( & tid [ i ] , 0 , worker2 , & w [ i ] ) ;
2013-02-08 02:29:01 +08:00
for ( i = 0 ; i < opt - > n_threads ; + + i ) pthread_join ( tid [ i ] , 0 ) ;
free ( tid ) ;
}
# else
2013-02-11 23:59:38 +08:00
worker1 ( w ) ;
2013-02-19 05:33:06 +08:00
if ( opt - > flag & MEM_F_PE ) mem_pestat ( opt , bns - > l_pac , n , regs , pes ) ;
2013-02-11 23:59:38 +08:00
worker2 ( w ) ;
2013-02-08 02:29:01 +08:00
# endif
2013-02-08 03:36:18 +08:00
for ( i = 0 ; i < n ; + + i ) {
2013-02-08 03:57:22 +08:00
fputs ( seqs [ i ] . sam , stdout ) ;
2013-02-08 03:36:18 +08:00
free ( seqs [ i ] . name ) ; free ( seqs [ i ] . comment ) ; free ( seqs [ i ] . seq ) ; free ( seqs [ i ] . qual ) ; free ( seqs [ i ] . sam ) ;
}
2013-02-08 03:57:22 +08:00
free ( regs ) ; free ( w ) ;
2013-02-08 02:13:43 +08:00
}