2017-06-30 07:35:38 +08:00
# include <string.h>
2017-09-19 07:49:15 +08:00
# include <stdlib.h>
2017-06-30 07:35:38 +08:00
# include <math.h>
# include "mmpriv.h"
# include "kalloc.h"
2017-10-05 01:05:18 +08:00
# include "khash.h"
2017-06-30 07:35:38 +08:00
2017-07-12 02:09:51 +08:00
static inline void mm_cal_fuzzy_len ( mm_reg1_t * r , const mm128_t * a )
2017-07-03 22:44:26 +08:00
{
2017-07-12 02:09:51 +08:00
int i ;
2017-10-16 22:55:18 +08:00
r - > mlen = r - > blen = 0 ;
2017-07-12 02:09:51 +08:00
if ( r - > cnt < = 0 ) return ;
2017-10-16 22:55:18 +08:00
r - > mlen = r - > blen = a [ r - > as ] . y > > 32 & 0xff ;
2017-07-12 02:09:51 +08:00
for ( i = r - > as + 1 ; i < r - > as + r - > cnt ; + + i ) {
int span = a [ i ] . y > > 32 & 0xff ;
int tl = ( int32_t ) a [ i ] . x - ( int32_t ) a [ i - 1 ] . x ;
int ql = ( int32_t ) a [ i ] . y - ( int32_t ) a [ i - 1 ] . y ;
2017-10-16 22:55:18 +08:00
r - > blen + = tl > ql ? tl : ql ;
r - > mlen + = tl > span & & ql > span ? span : tl < ql ? tl : ql ;
2017-07-12 02:09:51 +08:00
}
}
static inline void mm_reg_set_coor ( mm_reg1_t * r , int32_t qlen , const mm128_t * a )
{ // NB: r->as and r->cnt MUST BE set correctly for this function to work
2017-07-03 22:44:26 +08:00
int32_t k = r - > as , q_span = ( int32_t ) ( a [ k ] . y > > 32 & 0xff ) ;
r - > rev = a [ k ] . x > > 63 ;
r - > rid = a [ k ] . x < < 1 > > 33 ;
r - > rs = ( int32_t ) a [ k ] . x + 1 > q_span ? ( int32_t ) a [ k ] . x + 1 - q_span : 0 ; // NB: target span may be shorter, so this test is necessary
r - > re = ( int32_t ) a [ k + r - > cnt - 1 ] . x + 1 ;
if ( ! r - > rev ) {
r - > qs = ( int32_t ) a [ k ] . y + 1 - q_span ;
r - > qe = ( int32_t ) a [ k + r - > cnt - 1 ] . y + 1 ;
} else {
r - > qs = qlen - ( ( int32_t ) a [ k + r - > cnt - 1 ] . y + 1 ) ;
r - > qe = qlen - ( ( int32_t ) a [ k ] . y + 1 - q_span ) ;
}
2017-07-12 02:09:51 +08:00
mm_cal_fuzzy_len ( r , a ) ;
2017-07-03 22:44:26 +08:00
}
2017-10-05 01:05:18 +08:00
static inline uint64_t hash64 ( uint64_t key )
{
key = ( ~ key + ( key < < 21 ) ) ;
key = key ^ key > > 24 ;
key = ( ( key + ( key < < 3 ) ) + ( key < < 8 ) ) ;
key = key ^ key > > 14 ;
key = ( ( key + ( key < < 2 ) ) + ( key < < 4 ) ) ;
key = key ^ key > > 28 ;
key = ( key + ( key < < 31 ) ) ;
return key ;
}
mm_reg1_t * mm_gen_regs ( void * km , uint32_t hash , int qlen , int n_u , uint64_t * u , mm128_t * a ) // convert chains to hits
2017-06-30 07:35:38 +08:00
{
2017-07-01 00:52:28 +08:00
mm128_t * z , tmp ;
2017-06-30 07:35:38 +08:00
mm_reg1_t * r ;
int i , k ;
2017-07-01 00:52:28 +08:00
if ( n_u = = 0 ) return 0 ;
// sort by score
z = ( mm128_t * ) kmalloc ( km , n_u * 16 ) ;
2017-06-30 07:35:38 +08:00
for ( i = k = 0 ; i < n_u ; + + i ) {
2017-10-05 01:05:18 +08:00
uint32_t h ;
h = ( uint32_t ) hash64 ( ( hash64 ( a [ k ] . x ) + hash64 ( a [ k ] . y ) ) ^ hash ) ;
z [ i ] . x = u [ i ] ^ h ; // u[i] -- higher 32 bits: chain score; lower 32 bits: number of seeds in the chain
2017-07-01 00:52:28 +08:00
z [ i ] . y = ( uint64_t ) k < < 32 | ( int32_t ) u [ i ] ;
k + = ( int32_t ) u [ i ] ;
}
radix_sort_128x ( z , z + n_u ) ;
for ( i = 0 ; i < n_u > > 1 ; + + i ) // reverse, s.t. larger score first
tmp = z [ i ] , z [ i ] = z [ n_u - 1 - i ] , z [ n_u - 1 - i ] = tmp ;
// populate r[]
r = ( mm_reg1_t * ) calloc ( n_u , sizeof ( mm_reg1_t ) ) ;
for ( i = 0 ; i < n_u ; + + i ) {
2017-06-30 07:35:38 +08:00
mm_reg1_t * ri = & r [ i ] ;
ri - > id = i ;
ri - > parent = MM_PARENT_UNSET ;
2017-10-05 01:05:18 +08:00
ri - > score = z [ i ] . x > > 32 ;
ri - > hash = ( uint32_t ) z [ i ] . x ;
2017-07-01 00:52:28 +08:00
ri - > cnt = ( int32_t ) z [ i ] . y ;
ri - > as = z [ i ] . y > > 32 ;
2017-06-30 07:35:38 +08:00
mm_reg_set_coor ( ri , qlen , a ) ;
}
2017-07-01 00:52:28 +08:00
kfree ( km , z ) ;
2017-06-30 07:35:38 +08:00
return r ;
}
2017-06-30 07:44:11 +08:00
void mm_split_reg ( mm_reg1_t * r , mm_reg1_t * r2 , int n , int qlen , mm128_t * a )
{
if ( n < = 0 | | n > = r - > cnt ) return ;
* r2 = * r ;
r2 - > id = - 1 ;
2017-06-30 11:48:35 +08:00
r2 - > sam_pri = 0 ;
2017-06-30 07:44:11 +08:00
r2 - > p = 0 ;
r2 - > cnt = r - > cnt - n ;
r2 - > score = ( int32_t ) ( r - > score * ( ( float ) r2 - > cnt / r - > cnt ) + .499 ) ;
r2 - > as = r - > as + n ;
if ( r - > parent = = r - > id ) r2 - > parent = MM_PARENT_TMP_PRI ;
mm_reg_set_coor ( r2 , qlen , a ) ;
r - > cnt - = r2 - > cnt ;
r - > score - = r2 - > score ;
mm_reg_set_coor ( r , qlen , a ) ;
2017-06-30 10:08:46 +08:00
r - > split | = 1 , r2 - > split | = 2 ;
2017-06-30 07:44:11 +08:00
}
2017-09-15 00:37:03 +08:00
void mm_set_parent ( void * km , float mask_level , int n , mm_reg1_t * r , int sub_diff ) // and compute mm_reg1_t::subsc
2017-06-30 07:35:38 +08:00
{
int i , j , k , * w ;
2017-10-17 01:58:01 +08:00
uint64_t * cov ;
2017-06-30 07:35:38 +08:00
if ( n < = 0 ) return ;
2017-07-03 22:44:26 +08:00
for ( i = 0 ; i < n ; + + i ) r [ i ] . id = i ;
2017-10-17 01:58:01 +08:00
cov = ( uint64_t * ) kmalloc ( km , n * sizeof ( uint64_t ) ) ;
2017-06-30 07:35:38 +08:00
w = ( int * ) kmalloc ( km , n * sizeof ( int ) ) ;
w [ 0 ] = 0 , r [ 0 ] . parent = 0 ;
for ( i = 1 , k = 1 ; i < n ; + + i ) {
2017-07-03 22:44:26 +08:00
mm_reg1_t * ri = & r [ i ] ;
2017-10-17 01:58:01 +08:00
int si = ri - > qs , ei = ri - > qe , n_cov = 0 , uncov_len = 0 ;
for ( j = 0 ; j < k ; + + j ) { // traverse existing primary hits to find overlapping hits
2017-07-03 22:44:26 +08:00
mm_reg1_t * rp = & r [ w [ j ] ] ;
int sj = rp - > qs , ej = rp - > qe ;
2017-10-17 01:58:01 +08:00
if ( ej < = si | | sj > = ei ) continue ;
if ( sj < si ) sj = si ;
if ( ej > ei ) ej = ei ;
cov [ n_cov + + ] = ( uint64_t ) sj < < 32 | ej ;
}
if ( n_cov = = 0 ) {
goto set_parent_test ; // no overlapping primary hits; then i is a new primary hit
} else if ( n_cov > 0 ) { // there are overlapping primary hits; find the length not covered by existing primary hits
int j , x = si ;
radix_sort_64 ( cov , cov + n_cov ) ;
for ( j = 0 ; j < n_cov ; + + j ) {
if ( cov [ j ] > > 32 > x ) uncov_len + = ( cov [ j ] > > 32 ) - x ;
x = ( int32_t ) cov [ j ] > x ? ( int32_t ) cov [ j ] : x ;
}
if ( ei > x ) uncov_len + = ei - x ;
}
for ( j = 0 ; j < k ; + + j ) { // traverse existing primary hits again
mm_reg1_t * rp = & r [ w [ j ] ] ;
int sj = rp - > qs , ej = rp - > qe , min , max , ol ;
if ( ej < = si | | sj > = ei ) continue ; // no overlap
min = ej - sj < ei - si ? ej - sj : ei - si ;
max = ej - sj > ei - si ? ej - sj : ei - si ;
ol = si < sj ? ( ei < sj ? 0 : ei < ej ? ei - sj : ej - sj ) : ( ej < si ? 0 : ej < ei ? ej - si : ei - si ) ; // overlap length
if ( ( float ) ol / min - ( float ) uncov_len / max > mask_level ) {
2017-09-15 00:37:03 +08:00
int cnt_sub = 0 ;
2017-07-03 22:44:26 +08:00
ri - > parent = rp - > parent ;
rp - > subsc = rp - > subsc > ri - > score ? rp - > subsc : ri - > score ;
2017-09-15 00:37:03 +08:00
if ( ri - > cnt > = rp - > cnt ) cnt_sub = 1 ;
2017-10-10 23:59:44 +08:00
if ( rp - > p & & ri - > p & & ( rp - > rs ! = ri - > rs | | rp - > re ! = ri - > re | | ol ! = min ) ) { // the last condition excludes identical hits after DP
2017-07-03 22:44:26 +08:00
rp - > p - > dp_max2 = rp - > p - > dp_max2 > ri - > p - > dp_max ? rp - > p - > dp_max2 : ri - > p - > dp_max ;
2017-09-15 00:37:03 +08:00
if ( rp - > p - > dp_max - ri - > p - > dp_max < = sub_diff ) cnt_sub = 1 ;
}
if ( cnt_sub ) + + rp - > n_sub ;
2017-06-30 07:35:38 +08:00
break ;
}
}
2017-10-17 01:58:01 +08:00
set_parent_test :
2017-09-14 03:32:39 +08:00
if ( j = = k ) w [ k + + ] = i , ri - > parent = i , ri - > n_sub = 0 ;
2017-06-30 07:35:38 +08:00
}
2017-10-17 01:58:01 +08:00
kfree ( km , cov ) ;
2017-06-30 07:35:38 +08:00
kfree ( km , w ) ;
}
2017-07-03 22:44:26 +08:00
void mm_hit_sort_by_dp ( void * km , int * n_regs , mm_reg1_t * r )
2017-07-01 03:23:30 +08:00
{
2017-07-03 22:44:26 +08:00
int32_t i , n_aux , n = * n_regs ;
2017-10-05 01:05:18 +08:00
mm128_t * aux ;
2017-07-03 22:44:26 +08:00
mm_reg1_t * t ;
if ( n < = 1 ) return ;
2017-10-05 01:05:18 +08:00
aux = ( mm128_t * ) kmalloc ( km , n * 16 ) ;
2017-07-03 22:44:26 +08:00
t = ( mm_reg1_t * ) kmalloc ( km , n * sizeof ( mm_reg1_t ) ) ;
for ( i = n_aux = 0 ; i < n ; + + i ) {
2017-07-30 01:09:10 +08:00
if ( r [ i ] . inv | | r [ i ] . cnt > 0 ) { // squeeze out elements with cnt==0 (soft deleted)
2017-07-03 22:44:26 +08:00
assert ( r [ i ] . p ) ;
2017-10-05 01:05:18 +08:00
aux [ n_aux ] . x = ( uint64_t ) r [ i ] . p - > dp_max < < 32 | r [ i ] . hash ;
aux [ n_aux + + ] . y = i ;
2017-07-03 22:44:26 +08:00
} else if ( r [ i ] . p ) {
2017-07-30 01:09:10 +08:00
free ( r [ i ] . p ) ;
2017-07-03 22:44:26 +08:00
r [ i ] . p = 0 ;
2017-07-03 10:43:02 +08:00
}
}
2017-10-05 01:05:18 +08:00
radix_sort_128x ( aux , aux + n_aux ) ;
2017-07-03 22:44:26 +08:00
for ( i = n_aux - 1 ; i > = 0 ; - - i )
2017-10-05 01:05:18 +08:00
t [ n_aux - 1 - i ] = r [ aux [ i ] . y ] ;
2017-07-03 22:44:26 +08:00
memcpy ( r , t , sizeof ( mm_reg1_t ) * n_aux ) ;
* n_regs = n_aux ;
kfree ( km , aux ) ;
kfree ( km , t ) ;
2017-07-01 03:23:30 +08:00
}
2017-07-04 01:18:57 +08:00
int mm_set_sam_pri ( int n , mm_reg1_t * r )
{
int i , n_pri = 0 ;
for ( i = 0 ; i < n ; + + i )
if ( r [ i ] . id = = r [ i ] . parent ) {
+ + n_pri ;
r [ i ] . sam_pri = ( n_pri = = 1 ) ;
} else r [ i ] . sam_pri = 0 ;
return n_pri ;
}
2017-06-30 07:35:38 +08:00
void mm_sync_regs ( void * km , int n_regs , mm_reg1_t * regs ) / / keep mm_reg1_t : : { id , parent } in sync ; also reset id
{
2017-07-04 01:18:57 +08:00
int * tmp , i , max_id = - 1 , n_tmp ;
2017-06-30 07:35:38 +08:00
if ( n_regs < = 0 ) return ;
2017-07-03 23:36:34 +08:00
for ( i = 0 ; i < n_regs ; + + i ) // NB: doesn't work if mm_reg1_t::id is negative
2017-06-30 07:35:38 +08:00
max_id = max_id > regs [ i ] . id ? max_id : regs [ i ] . id ;
n_tmp = max_id + 1 ;
tmp = ( int * ) kmalloc ( km , n_tmp * sizeof ( int ) ) ;
for ( i = 0 ; i < n_tmp ; + + i ) tmp [ i ] = - 1 ;
for ( i = 0 ; i < n_regs ; + + i )
if ( regs [ i ] . id > = 0 ) tmp [ regs [ i ] . id ] = i ;
2017-07-04 01:18:57 +08:00
for ( i = 0 ; i < n_regs ; + + i ) {
2017-06-30 07:35:38 +08:00
mm_reg1_t * r = & regs [ i ] ;
r - > id = i ;
if ( r - > parent = = MM_PARENT_TMP_PRI )
r - > parent = i ;
else if ( r - > parent > = 0 & & tmp [ r - > parent ] > = 0 )
r - > parent = tmp [ r - > parent ] ;
else r - > parent = MM_PARENT_UNSET ;
}
kfree ( km , tmp ) ;
2017-07-04 01:18:57 +08:00
mm_set_sam_pri ( n_regs , regs ) ;
2017-06-30 07:35:38 +08:00
}
2017-09-26 23:03:36 +08:00
void mm_select_sub ( void * km , float pri_ratio , int min_diff , int best_n , int * n_ , mm_reg1_t * r )
2017-06-30 07:35:38 +08:00
{
if ( pri_ratio > 0.0f & & * n_ > 0 ) {
2017-07-03 07:08:30 +08:00
int i , k , n = * n_ , n_2nd = 0 ;
2017-10-10 23:59:44 +08:00
for ( i = k = 0 ; i < n ; + + i ) {
int p = r [ i ] . parent ;
if ( p = = i | | r [ i ] . inv ) { // primary or inversion
2017-07-26 08:02:44 +08:00
r [ k + + ] = r [ i ] ;
2017-10-10 23:59:44 +08:00
} else if ( ( r [ i ] . score > = r [ p ] . score * pri_ratio | | r [ i ] . score + min_diff > = r [ p ] . score ) & & n_2nd < best_n ) {
if ( ! ( r [ i ] . qs = = r [ p ] . qs & & r [ i ] . qe = = r [ p ] . qe & & r [ i ] . rs = = r [ p ] . rs & & r [ i ] . re = = r [ p ] . re ) ) // not identical hits
r [ k + + ] = r [ i ] , + + n_2nd ;
2017-10-12 22:12:22 +08:00
else if ( r [ i ] . p ) free ( r [ i ] . p ) ;
2017-10-10 23:59:44 +08:00
} else if ( r [ i ] . p ) free ( r [ i ] . p ) ;
}
2017-06-30 07:35:38 +08:00
if ( k ! = n ) mm_sync_regs ( km , k , r ) ; // removing hits requires sync()
* n_ = k ;
}
}
2017-07-01 10:15:45 +08:00
void mm_filter_regs ( void * km , const mm_mapopt_t * opt , int * n_regs , mm_reg1_t * regs )
2017-07-03 07:56:21 +08:00
{ // NB: after this call, mm_reg1_t::parent can be -1 if its parent filtered out
2017-07-01 10:15:45 +08:00
int i , k ;
2017-07-03 22:44:26 +08:00
for ( i = k = 0 ; i < * n_regs ; + + i ) {
2017-07-01 10:15:45 +08:00
mm_reg1_t * r = & regs [ i ] ;
int flt = 0 ;
2017-09-27 04:17:58 +08:00
if ( ! r - > inv & & ! r - > seg_split & & r - > cnt < opt - > min_cnt ) flt = 1 ;
2017-07-01 10:15:45 +08:00
if ( r - > p ) {
2017-10-16 22:55:18 +08:00
if ( r - > mlen < opt - > min_chain_score ) flt = 1 ;
2017-07-01 10:15:45 +08:00
else if ( r - > p - > dp_max < opt - > min_dp_max ) flt = 1 ;
if ( flt ) free ( r - > p ) ;
}
2017-07-03 22:44:26 +08:00
if ( ! flt ) {
2017-07-01 10:15:45 +08:00
if ( k < i ) regs [ k + + ] = regs [ i ] ;
else + + k ;
}
}
* n_regs = k ;
}
2017-10-16 22:38:22 +08:00
void mm_filter_by_identity ( void * km , int n_regs , mm_reg1_t * regs , float min_iden , int qlen , const char * qual ) // TODO: make sure it is not beyond the ends of contigs
{
int i , j , n_aux = 0 , en , blen = 0 ;
uint64_t * aux ;
float n_diff = 0.0f ;
if ( n_regs < = 0 ) return ;
for ( i = 0 ; i < n_regs ; + + i )
if ( regs [ i ] . id = = regs [ i ] . parent & & regs [ i ] . pe_thru ) // sequenced through the fragment; don't filter
return ;
for ( i = 0 ; i < n_regs ; + + i )
if ( regs [ i ] . id = = regs [ i ] . parent )
+ + n_aux ;
assert ( n_aux > = 1 ) ;
aux = ( uint64_t * ) kmalloc ( km , n_aux * 8 ) ;
for ( i = 0 , n_aux = 0 ; i < n_regs ; + + i )
if ( regs [ i ] . id = = regs [ i ] . parent )
aux [ n_aux + + ] = ( uint64_t ) regs [ i ] . qs < < 32 | i ;
radix_sort_64 ( aux , aux + n_aux ) ;
for ( i = 0 , en = 0 ; i < n_aux ; + + i ) {
mm_reg1_t * r = & regs [ ( int32_t ) aux [ i ] ] ;
if ( r - > qs > en ) {
for ( j = en ; j < r - > qs ; + + j )
2017-10-21 04:57:31 +08:00
n_diff + = qual = = 0 | | qual [ j ] > = 53 ? .25f : .05f * .25f * ( qual [ j ] - 33 ) ;
2017-10-16 22:38:22 +08:00
blen + = r - > qs - en ;
}
assert ( r - > p ) ;
blen + = r - > p - > blen2 ;
n_diff + = r - > p - > n_diff2 ;
en = en > r - > qe ? en : r - > qe ;
}
for ( j = en ; j < qlen ; + + j )
2017-10-21 04:57:31 +08:00
n_diff + = qual = = 0 | | qual [ j ] > = 53 ? .25f : .05f * .25f * ( qual [ j ] - 33 ) ;
2017-10-16 22:38:22 +08:00
blen + = qlen - en ;
kfree ( km , aux ) ;
if ( 1.0f - n_diff / blen < min_iden )
for ( i = 0 ; i < n_regs ; + + i )
regs [ i ] . iden_flt = 1 ;
}
2017-06-30 07:35:38 +08:00
int mm_squeeze_a ( void * km , int n_regs , mm_reg1_t * regs , mm128_t * a )
{ // squeeze out regions in a[] that are not referenced by regs[]
2017-07-01 01:27:47 +08:00
int i , as = 0 ;
2017-06-30 07:35:38 +08:00
uint64_t * aux ;
aux = ( uint64_t * ) kmalloc ( km , n_regs * 8 ) ;
2017-07-01 01:27:47 +08:00
for ( i = 0 ; i < n_regs ; + + i )
aux [ i ] = ( uint64_t ) regs [ i ] . as < < 32 | i ;
radix_sort_64 ( aux , aux + n_regs ) ;
2017-06-30 07:35:38 +08:00
for ( i = 0 ; i < n_regs ; + + i ) {
2017-07-01 01:27:47 +08:00
mm_reg1_t * r = & regs [ ( int32_t ) aux [ i ] ] ;
2017-06-30 07:35:38 +08:00
if ( r - > as ! = as ) {
memmove ( & a [ as ] , & a [ r - > as ] , r - > cnt * 16 ) ;
r - > as = as ;
}
as + = r - > cnt ;
}
kfree ( km , aux ) ;
return as ;
}
2017-07-03 07:08:30 +08:00
void mm_join_long ( void * km , const mm_mapopt_t * opt , int qlen , int * n_regs_ , mm_reg1_t * regs , mm128_t * a )
2017-06-30 07:35:38 +08:00
{
2017-07-03 07:08:30 +08:00
int i , n_aux , n_regs = * n_regs_ , n_drop = 0 ;
2017-06-30 07:35:38 +08:00
uint64_t * aux ;
if ( n_regs < 2 ) return ; // nothing to join
mm_squeeze_a ( km , n_regs , regs , a ) ;
aux = ( uint64_t * ) kmalloc ( km , n_regs * 8 ) ;
for ( i = n_aux = 0 ; i < n_regs ; + + i )
if ( regs [ i ] . parent = = i | | regs [ i ] . parent < 0 )
aux [ n_aux + + ] = ( uint64_t ) regs [ i ] . as < < 32 | i ;
radix_sort_64 ( aux , aux + n_aux ) ;
for ( i = n_aux - 1 ; i > = 1 ; - - i ) {
mm_reg1_t * r0 = & regs [ ( int32_t ) aux [ i - 1 ] ] , * r1 = & regs [ ( int32_t ) aux [ i ] ] ;
mm128_t * a0e , * a1s ;
2017-07-09 22:38:51 +08:00
int max_gap , min_gap , sc_thres ;
2017-06-30 07:35:38 +08:00
// test
if ( r0 - > as + r0 - > cnt ! = r1 - > as ) continue ; // not adjacent in a[]
if ( r0 - > rid ! = r1 - > rid | | r0 - > rev ! = r1 - > rev ) continue ; // make sure on the same target and strand
a0e = & a [ r0 - > as + r0 - > cnt - 1 ] ;
a1s = & a [ r1 - > as ] ;
if ( a1s - > x < = a0e - > x | | ( int32_t ) a1s - > y < = ( int32_t ) a0e - > y ) continue ; // keep colinearity
max_gap = min_gap = ( int32_t ) a1s - > y - ( int32_t ) a0e - > y ;
max_gap = max_gap > a1s - > x - a0e - > x ? max_gap : a1s - > x - a0e - > x ;
min_gap = min_gap < a1s - > x - a0e - > x ? min_gap : a1s - > x - a0e - > x ;
if ( max_gap > opt - > max_join_long | | min_gap > opt - > max_join_short ) continue ;
2017-07-10 00:14:20 +08:00
sc_thres = ( int ) ( ( float ) opt - > min_join_flank_sc / opt - > max_join_long * max_gap + .499 ) ;
2017-07-09 22:38:51 +08:00
if ( r0 - > score < sc_thres | | r1 - > score < sc_thres ) continue ; // require good flanking chains
2017-07-10 00:14:20 +08:00
if ( r0 - > re - r0 - > rs < max_gap > > 1 | | r0 - > qe - r0 - > qs < max_gap > > 1 ) continue ; // require enough flanking length
if ( r1 - > re - r1 - > rs < max_gap > > 1 | | r1 - > qe - r1 - > qs < max_gap > > 1 ) continue ;
2017-06-30 07:35:38 +08:00
// all conditions satisfied; join
2017-07-28 23:54:18 +08:00
a [ r1 - > as ] . y | = MM_SEED_LONG_JOIN ;
2017-06-30 07:35:38 +08:00
r0 - > cnt + = r1 - > cnt , r0 - > score + = r1 - > score ;
mm_reg_set_coor ( r0 , qlen , a ) ;
r1 - > cnt = 0 ;
2017-07-03 07:08:30 +08:00
r1 - > parent = r0 - > id ;
+ + n_drop ;
2017-06-30 07:35:38 +08:00
}
kfree ( km , aux ) ;
2017-07-03 07:08:30 +08:00
if ( n_drop > 0 ) { // then fix the hits hierarchy
for ( i = 0 ; i < n_regs ; + + i ) { // adjust the mm_reg1_t::parent
mm_reg1_t * r = & regs [ i ] ;
if ( r - > parent > = 0 & & r - > id ! = r - > parent ) { // fix for secondary hits only
if ( regs [ r - > parent ] . parent > = 0 & & regs [ r - > parent ] . parent ! = r - > parent )
r - > parent = regs [ r - > parent ] . parent ;
}
}
mm_filter_regs ( km , opt , n_regs_ , regs ) ;
2017-07-03 23:36:34 +08:00
mm_sync_regs ( km , * n_regs_ , regs ) ;
2017-07-03 07:08:30 +08:00
}
2017-06-30 07:35:38 +08:00
}
2017-09-21 02:35:46 +08:00
2017-10-05 01:05:18 +08:00
mm_seg_t * mm_seg_gen ( void * km , uint32_t hash , int n_segs , const int * qlens , int n_regs0 , const mm_reg1_t * regs0 , int * n_regs , mm_reg1_t * * regs , const mm128_t * a )
2017-09-21 02:35:46 +08:00
{
int s , i , j , acc_qlen [ MM_MAX_SEG + 1 ] , qlen_sum = 0 ;
mm_seg_t * seg ;
assert ( n_segs < = MM_MAX_SEG ) ;
for ( s = 1 , acc_qlen [ 0 ] = 0 ; s < n_segs ; + + s )
acc_qlen [ s ] = acc_qlen [ s - 1 ] + qlens [ s - 1 ] ;
qlen_sum = acc_qlen [ n_segs - 1 ] + qlens [ n_segs - 1 ] ;
seg = ( mm_seg_t * ) kcalloc ( km , n_segs , sizeof ( mm_seg_t ) ) ;
for ( s = 0 ; s < n_segs ; + + s ) {
seg [ s ] . u = ( uint64_t * ) kmalloc ( km , n_regs0 * 8 ) ;
for ( i = 0 ; i < n_regs0 ; + + i )
seg [ s ] . u [ i ] = ( uint64_t ) regs0 [ i ] . score < < 32 ;
}
2017-09-20 23:47:46 +08:00
for ( i = 0 ; i < n_regs0 ; + + i ) {
2017-09-21 02:35:46 +08:00
const mm_reg1_t * r = & regs0 [ i ] ;
2017-09-20 23:47:46 +08:00
for ( j = 0 ; j < r - > cnt ; + + j ) {
2017-09-21 02:35:46 +08:00
int sid = ( a [ r - > as + j ] . y & MM_SEED_SEG_MASK ) > > MM_SEED_SEG_SHIFT ;
+ + seg [ sid ] . u [ i ] ;
+ + seg [ sid ] . n_a ;
2017-09-20 23:47:46 +08:00
}
}
2017-09-21 02:35:46 +08:00
for ( s = 0 ; s < n_segs ; + + s ) {
mm_seg_t * sr = & seg [ s ] ;
for ( i = 0 , sr - > n_u = 0 ; i < n_regs0 ; + + i ) // squeeze out zero-length per-segment chains
if ( ( int32_t ) sr - > u [ i ] ! = 0 )
sr - > u [ sr - > n_u + + ] = sr - > u [ i ] ;
sr - > a = ( mm128_t * ) kmalloc ( km , sr - > n_a * sizeof ( mm128_t ) ) ;
sr - > n_a = 0 ;
}
for ( i = 0 ; i < n_regs0 ; + + i ) {
const mm_reg1_t * r = & regs0 [ i ] ;
for ( j = 0 ; j < r - > cnt ; + + j ) {
int sid = ( a [ r - > as + j ] . y & MM_SEED_SEG_MASK ) > > MM_SEED_SEG_SHIFT ;
mm128_t a1 = a [ r - > as + j ] ;
// on reverse strand, the segment position is:
// x_for_cat = qlen_sum - 1 - (int32_t)a1.y - 1 + q_span
// (int32_t)new_a1.y = qlens[sid] - (x_for_cat - acc_qlen[sid] + 1 - q_span) - 1 = (int32_t)a1.y - (qlen_sum - (qlens[sid] + acc_qlen[sid]))
a1 . y - = a1 . x > > 63 ? qlen_sum - ( qlens [ sid ] + acc_qlen [ sid ] ) : acc_qlen [ sid ] ;
seg [ sid ] . a [ seg [ sid ] . n_a + + ] = a1 ;
}
}
for ( s = 0 ; s < n_segs ; + + s ) {
2017-10-05 01:05:18 +08:00
regs [ s ] = mm_gen_regs ( km , hash , qlens [ s ] , seg [ s ] . n_u , seg [ s ] . u , seg [ s ] . a ) ;
2017-09-21 02:35:46 +08:00
n_regs [ s ] = seg [ s ] . n_u ;
2017-09-27 04:17:58 +08:00
for ( i = 0 ; i < n_regs [ s ] ; + + i )
regs [ s ] [ i ] . seg_split = 1 ;
2017-09-21 02:35:46 +08:00
}
return seg ;
}
void mm_seg_free ( void * km , int n_segs , mm_seg_t * segs )
{
int i ;
for ( i = 0 ; i < n_segs ; + + i ) kfree ( km , segs [ i ] . u ) ;
for ( i = 0 ; i < n_segs ; + + i ) kfree ( km , segs [ i ] . a ) ;
kfree ( km , segs ) ;
2017-09-20 23:47:46 +08:00
}
2017-09-21 02:35:46 +08:00
2017-11-01 02:25:09 +08:00
void mm_set_mapq ( int n_regs , mm_reg1_t * regs , int min_chain_sc , int match_sc , int rep_len , int is_sr )
2017-06-30 07:35:38 +08:00
{
2017-09-15 00:37:03 +08:00
static const float q_coef = 40.0f ;
2017-06-30 07:35:38 +08:00
int i ;
for ( i = 0 ; i < n_regs ; + + i ) {
mm_reg1_t * r = & regs [ i ] ;
2017-07-30 01:09:10 +08:00
if ( r - > inv ) {
r - > mapq = 0 ;
} else if ( r - > parent = = r - > id ) {
2017-07-26 23:34:14 +08:00
int mapq , subsc ;
2017-09-29 02:23:14 +08:00
float pen_s1 = ( r - > score > 100 ? 1.0f : 0.01f * r - > score ) * ( ( float ) r - > score / ( r - > score + rep_len ) ) ;
2017-09-14 22:46:14 +08:00
float pen_cm = r - > cnt > 10 ? 1.0f : 0.1f * r - > cnt ;
pen_cm = pen_s1 < pen_cm ? pen_s1 : pen_cm ;
2017-07-26 23:34:14 +08:00
subsc = r - > subsc > min_chain_sc ? r - > subsc : min_chain_sc ;
2017-07-17 09:45:39 +08:00
if ( r - > p & & r - > p - > dp_max2 > 0 & & r - > p - > dp_max > 0 ) {
2017-10-16 22:55:18 +08:00
float identity = ( float ) r - > mlen / r - > blen ;
2017-11-01 02:25:09 +08:00
float x = ( float ) r - > p - > dp_max2 * subsc / r - > p - > dp_max / r - > score ;
if ( is_sr ) {
mapq = ( int ) ( identity * pen_cm * q_coef * ( 1.0f - x * x ) * logf ( ( float ) r - > p - > dp_max / match_sc ) ) ;
} else {
int mapq_alt = ( int ) ( 6.02f * identity * identity * ( r - > p - > dp_max - r - > p - > dp_max2 ) / match_sc + .499f ) ; // BWA-MEM like mapQ, mostly for short reads
mapq = ( int ) ( identity * pen_cm * q_coef * ( 1.0f - x ) * logf ( r - > score ) ) ; // more for long reads
mapq = mapq < mapq_alt ? mapq : mapq_alt ; // in case the long-read heuristic fails
}
} else {
float x = ( float ) subsc / r - > score ;
if ( is_sr & & r - > p ) {
float identity = ( float ) r - > mlen / r - > blen ;
mapq = ( int ) ( identity * pen_cm * q_coef * ( 1.0f - x ) * logf ( ( float ) r - > p - > dp_max / match_sc ) ) ;
} else {
mapq = ( int ) ( pen_cm * q_coef * ( 1.0f - x ) * logf ( r - > score ) ) ;
}
}
2017-09-15 00:37:03 +08:00
mapq - = ( int ) ( 4.343f * logf ( r - > n_sub + 1 ) + .499f ) ;
2017-06-30 07:35:38 +08:00
mapq = mapq > 0 ? mapq : 0 ;
r - > mapq = mapq < 60 ? mapq : 60 ;
2017-09-28 11:51:05 +08:00
if ( r - > p & & r - > p - > dp_max > r - > p - > dp_max2 & & r - > mapq = = 0 ) r - > mapq = 1 ;
2017-06-30 07:35:38 +08:00
} else r - > mapq = 0 ;
}
}