2020-07-02 11:02:01 +08:00
/* The MIT License
Copyright ( c ) 2018 - Dana - Farber Cancer Institute
2009 - 2018 Broad Institute , Inc .
2008 - 2009 Genome Research Ltd . ( GRL )
Permission is hereby granted , free of charge , to any person obtaining
a copy of this software and associated documentation files ( the
" Software " ) , to deal in the Software without restriction , including
without limitation the rights to use , copy , modify , merge , publish ,
distribute , sublicense , and / or sell copies of the Software , and to
permit persons to whom the Software is furnished to do so , subject to
the following conditions :
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software .
THE SOFTWARE IS PROVIDED " AS IS " , WITHOUT WARRANTY OF ANY KIND ,
EXPRESS OR IMPLIED , INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY , FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT . IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM , DAMAGES OR OTHER LIABILITY , WHETHER IN AN
ACTION OF CONTRACT , TORT OR OTHERWISE , ARISING FROM , OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE .
*/
2014-08-25 23:59:27 +08:00
# include <limits.h>
2014-04-09 04:29:36 +08:00
# include "bwa.h"
# include "bwamem.h"
# include "bntseq.h"
# include "kstring.h"
/***************************
* SMEM iterator interface *
* * * * * * * * * * * * * * * * * * * * * * * * * * */
struct __smem_i {
const bwt_t * bwt ;
const uint8_t * query ;
int start , len ;
2014-08-25 23:59:27 +08:00
int min_intv , max_len ;
uint64_t max_intv ;
2014-04-09 04:29:36 +08:00
bwtintv_v * matches ; // matches; to be returned by smem_next()
bwtintv_v * sub ; // sub-matches inside the longest match; temporary
bwtintv_v * tmpvec [ 2 ] ; // temporary arrays
} ;
smem_i * smem_itr_init ( const bwt_t * bwt )
{
smem_i * itr ;
itr = calloc ( 1 , sizeof ( smem_i ) ) ;
itr - > bwt = bwt ;
itr - > tmpvec [ 0 ] = calloc ( 1 , sizeof ( bwtintv_v ) ) ;
itr - > tmpvec [ 1 ] = calloc ( 1 , sizeof ( bwtintv_v ) ) ;
itr - > matches = calloc ( 1 , sizeof ( bwtintv_v ) ) ;
itr - > sub = calloc ( 1 , sizeof ( bwtintv_v ) ) ;
2014-08-25 23:59:27 +08:00
itr - > min_intv = 1 ;
itr - > max_len = INT_MAX ;
itr - > max_intv = 0 ;
2014-04-09 04:29:36 +08:00
return itr ;
}
void smem_itr_destroy ( smem_i * itr )
{
free ( itr - > tmpvec [ 0 ] - > a ) ; free ( itr - > tmpvec [ 0 ] ) ;
free ( itr - > tmpvec [ 1 ] - > a ) ; free ( itr - > tmpvec [ 1 ] ) ;
free ( itr - > matches - > a ) ; free ( itr - > matches ) ;
free ( itr - > sub - > a ) ; free ( itr - > sub ) ;
free ( itr ) ;
}
void smem_set_query ( smem_i * itr , int len , const uint8_t * query )
{
itr - > query = query ;
itr - > start = 0 ;
itr - > len = len ;
}
2014-08-25 23:59:27 +08:00
void smem_config ( smem_i * itr , int min_intv , int max_len , uint64_t max_intv )
{
itr - > min_intv = min_intv ;
itr - > max_len = max_len ;
itr - > max_intv = max_intv ;
}
2014-04-09 04:29:36 +08:00
const bwtintv_v * smem_next ( smem_i * itr )
{
2014-09-18 01:05:35 +08:00
int ori_start ;
2014-04-09 04:29:36 +08:00
itr - > tmpvec [ 0 ] - > n = itr - > tmpvec [ 1 ] - > n = itr - > matches - > n = itr - > sub - > n = 0 ;
if ( itr - > start > = itr - > len | | itr - > start < 0 ) return 0 ;
while ( itr - > start < itr - > len & & itr - > query [ itr - > start ] > 3 ) + + itr - > start ; // skip ambiguous bases
if ( itr - > start = = itr - > len ) return 0 ;
ori_start = itr - > start ;
2014-10-21 05:00:31 +08:00
itr - > start = bwt_smem1a ( itr - > bwt , itr - > len , itr - > query , ori_start , itr - > min_intv , itr - > max_intv , itr - > matches , itr - > tmpvec ) ; // search for SMEM
2014-04-09 04:29:36 +08:00
return itr - > matches ;
}
/***********************
* * * Extra functions * * *
* * * * * * * * * * * * * * * * * * * * * * */
mem_alnreg_v mem_align1 ( const mem_opt_t * opt , const bwt_t * bwt , const bntseq_t * bns , const uint8_t * pac , int l_seq , const char * seq_ )
{ // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence
2014-05-16 11:23:04 +08:00
extern mem_alnreg_v mem_align1_core ( const mem_opt_t * opt , const bwt_t * bwt , const bntseq_t * bns , const uint8_t * pac , int l_seq , char * seq , void * buf ) ;
2014-04-09 04:29:36 +08:00
extern void mem_mark_primary_se ( const mem_opt_t * opt , int n , mem_alnreg_t * a , int64_t id ) ;
mem_alnreg_v ar ;
char * seq ;
seq = malloc ( l_seq ) ;
memcpy ( seq , seq_ , l_seq ) ; // makes a copy of seq_
2014-05-16 11:23:04 +08:00
ar = mem_align1_core ( opt , bwt , bns , pac , l_seq , seq , 0 ) ;
2014-04-09 04:29:36 +08:00
mem_mark_primary_se ( opt , ar . n , ar . a , lrand48 ( ) ) ;
free ( seq ) ;
return ar ;
}
2014-10-01 01:50:51 +08:00
static inline int get_pri_idx ( double XA_drop_ratio , const mem_alnreg_t * a , int i )
2014-09-20 04:50:21 +08:00
{
2014-10-01 01:50:51 +08:00
int k = a [ i ] . secondary_all ;
if ( k > = 0 & & a [ i ] . score > = a [ k ] . score * XA_drop_ratio ) return k ;
return - 1 ;
2014-09-20 04:50:21 +08:00
}
2014-05-01 04:46:05 +08:00
// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup.
char * * mem_gen_alt ( const mem_opt_t * opt , const bntseq_t * bns , const uint8_t * pac , const mem_alnreg_v * a , int l_query , const char * query ) // ONLY work after mem_mark_primary_se()
{
2014-10-01 01:50:51 +08:00
int i , k , r , * cnt , tot ;
2014-09-20 04:50:21 +08:00
kstring_t * aln = 0 , str = { 0 , 0 , 0 } ;
2014-10-01 01:50:51 +08:00
char * * XA = 0 , * has_alt ;
2014-05-01 04:46:05 +08:00
cnt = calloc ( a - > n , sizeof ( int ) ) ;
2014-10-01 01:50:51 +08:00
has_alt = calloc ( a - > n , 1 ) ;
2014-05-01 04:46:05 +08:00
for ( i = 0 , tot = 0 ; i < a - > n ; + + i ) {
2014-10-01 01:50:51 +08:00
r = get_pri_idx ( opt - > XA_drop_ratio , a - > a , i ) ;
if ( r > = 0 ) {
+ + cnt [ r ] , + + tot ;
if ( a - > a [ i ] . is_alt ) has_alt [ r ] = 1 ;
}
2014-05-01 04:46:05 +08:00
}
if ( tot = = 0 ) goto end_gen_alt ;
aln = calloc ( a - > n , sizeof ( kstring_t ) ) ;
for ( i = 0 ; i < a - > n ; + + i ) {
mem_aln_t t ;
2014-10-01 01:50:51 +08:00
if ( ( r = get_pri_idx ( opt - > XA_drop_ratio , a - > a , i ) ) < 0 ) continue ;
if ( cnt [ r ] > opt - > max_XA_hits_alt | | ( ! has_alt [ r ] & & cnt [ r ] > opt - > max_XA_hits ) ) continue ;
2014-05-01 04:46:05 +08:00
t = mem_reg2aln ( opt , bns , pac , l_query , query , & a - > a [ i ] ) ;
2014-09-20 04:50:21 +08:00
str . l = 0 ;
kputs ( bns - > anns [ t . rid ] . name , & str ) ;
kputc ( ' , ' , & str ) ; kputc ( " +- " [ t . is_rev ] , & str ) ; kputl ( t . pos + 1 , & str ) ;
kputc ( ' , ' , & str ) ;
2014-05-01 04:46:05 +08:00
for ( k = 0 ; k < t . n_cigar ; + + k ) {
2014-09-20 04:50:21 +08:00
kputw ( t . cigar [ k ] > > 4 , & str ) ;
kputc ( " MIDSHN " [ t . cigar [ k ] & 0xf ] , & str ) ;
2014-05-01 04:46:05 +08:00
}
2014-09-20 04:50:21 +08:00
kputc ( ' , ' , & str ) ; kputw ( t . NM , & str ) ;
2018-04-02 22:43:41 +08:00
if ( opt - > flag & MEM_F_XB ) {
kputc ( ' , ' , & str ) ;
kputw ( t . score , & str ) ;
}
2014-09-20 04:50:21 +08:00
kputc ( ' ; ' , & str ) ;
2014-05-01 04:46:05 +08:00
free ( t . cigar ) ;
2014-10-01 01:50:51 +08:00
kputsn ( str . s , str . l , & aln [ r ] ) ;
2014-05-01 04:46:05 +08:00
}
XA = calloc ( a - > n , sizeof ( char * ) ) ;
for ( k = 0 ; k < a - > n ; + + k )
XA [ k ] = aln [ k ] . s ;
end_gen_alt :
2014-10-01 01:50:51 +08:00
free ( has_alt ) ; free ( cnt ) ; free ( aln ) ; free ( str . s ) ;
2014-05-01 04:46:05 +08:00
return XA ;
}