2025-10-23 10:33:55 +08:00
/* The MIT License
Copyright ( c ) 2018 - Dana - Farber Cancer Institute
2009 - 2018 Broad Institute , Inc .
2008 - 2009 Genome Research Ltd . ( GRL )
Permission is hereby granted , free of charge , to any person obtaining
a copy of this software and associated documentation files ( the
" Software " ) , to deal in the Software without restriction , including
without limitation the rights to use , copy , modify , merge , publish ,
distribute , sublicense , and / or sell copies of the Software , and to
permit persons to whom the Software is furnished to do so , subject to
the following conditions :
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software .
THE SOFTWARE IS PROVIDED " AS IS " , WITHOUT WARRANTY OF ANY KIND ,
EXPRESS OR IMPLIED , INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY , FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT . IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM , DAMAGES OR OTHER LIABILITY , WHETHER IN AN
ACTION OF CONTRACT , TORT OR OTHERWISE , ARISING FROM , OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE .
*/
# include <limits.h>
# include "bwa.h"
# include "bwamem.h"
# include "bntseq.h"
# include "kstring.h"
/***************************
* SMEM iterator interface *
* * * * * * * * * * * * * * * * * * * * * * * * * * */
struct __smem_i {
const bwt_t * bwt ;
const uint8_t * query ;
int start , len ;
int min_intv , max_len ;
uint64_t max_intv ;
bwtintv_v * matches ; // matches; to be returned by smem_next()
bwtintv_v * sub ; // sub-matches inside the longest match; temporary
bwtintv_v * tmpvec [ 2 ] ; // temporary arrays
} ;
smem_i * smem_itr_init ( const bwt_t * bwt )
{
smem_i * itr ;
itr = calloc ( 1 , sizeof ( smem_i ) ) ;
itr - > bwt = bwt ;
itr - > tmpvec [ 0 ] = calloc ( 1 , sizeof ( bwtintv_v ) ) ;
itr - > tmpvec [ 1 ] = calloc ( 1 , sizeof ( bwtintv_v ) ) ;
itr - > matches = calloc ( 1 , sizeof ( bwtintv_v ) ) ;
itr - > sub = calloc ( 1 , sizeof ( bwtintv_v ) ) ;
itr - > min_intv = 1 ;
itr - > max_len = INT_MAX ;
itr - > max_intv = 0 ;
return itr ;
}
void smem_itr_destroy ( smem_i * itr )
{
free ( itr - > tmpvec [ 0 ] - > a ) ; free ( itr - > tmpvec [ 0 ] ) ;
free ( itr - > tmpvec [ 1 ] - > a ) ; free ( itr - > tmpvec [ 1 ] ) ;
free ( itr - > matches - > a ) ; free ( itr - > matches ) ;
free ( itr - > sub - > a ) ; free ( itr - > sub ) ;
free ( itr ) ;
}
void smem_set_query ( smem_i * itr , int len , const uint8_t * query )
{
itr - > query = query ;
itr - > start = 0 ;
itr - > len = len ;
}
void smem_config ( smem_i * itr , int min_intv , int max_len , uint64_t max_intv )
{
itr - > min_intv = min_intv ;
itr - > max_len = max_len ;
itr - > max_intv = max_intv ;
}
const bwtintv_v * smem_next ( smem_i * itr )
{
int ori_start ;
itr - > tmpvec [ 0 ] - > n = itr - > tmpvec [ 1 ] - > n = itr - > matches - > n = itr - > sub - > n = 0 ;
if ( itr - > start > = itr - > len | | itr - > start < 0 ) return 0 ;
while ( itr - > start < itr - > len & & itr - > query [ itr - > start ] > 3 ) + + itr - > start ; // skip ambiguous bases
if ( itr - > start = = itr - > len ) return 0 ;
ori_start = itr - > start ;
itr - > start = bwt_smem1a ( itr - > bwt , itr - > len , itr - > query , ori_start , itr - > min_intv , itr - > max_intv , itr - > matches , itr - > tmpvec ) ; // search for SMEM
return itr - > matches ;
}
/***********************
* * * Extra functions * * *
* * * * * * * * * * * * * * * * * * * * * * */
mem_alnreg_v mem_align1 ( const mem_opt_t * opt , const bwt_t * bwt , const bntseq_t * bns , const uint8_t * pac , int l_seq , const char * seq_ )
{ // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence
2025-11-16 01:37:21 +08:00
// extern mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf);
2025-10-23 10:33:55 +08:00
extern void mem_mark_primary_se ( const mem_opt_t * opt , int n , mem_alnreg_t * a , int64_t id ) ;
2025-11-16 01:37:21 +08:00
mem_alnreg_v ar = { 0 , 0 , 0 } ;
2025-10-23 10:33:55 +08:00
char * seq ;
seq = malloc ( l_seq ) ;
memcpy ( seq , seq_ , l_seq ) ; // makes a copy of seq_
2025-11-16 01:37:21 +08:00
// ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq, 0);
// mem_mark_primary_se(opt, ar.n, ar.a, lrand48());
2025-10-23 10:33:55 +08:00
free ( seq ) ;
return ar ;
}
static inline int get_pri_idx ( double XA_drop_ratio , const mem_alnreg_t * a , int i )
{
int k = a [ i ] . secondary_all ;
if ( k > = 0 & & a [ i ] . score > = a [ k ] . score * XA_drop_ratio ) return k ;
return - 1 ;
}
// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup.
char * * mem_gen_alt ( const mem_opt_t * opt , const bntseq_t * bns , const uint8_t * pac , const mem_alnreg_v * a , int l_query , const char * query ) // ONLY work after mem_mark_primary_se()
{
int i , k , r , * cnt , tot ;
kstring_t * aln = 0 , str = { 0 , 0 , 0 } ;
char * * XA = 0 , * has_alt ;
cnt = calloc ( a - > n , sizeof ( int ) ) ;
has_alt = calloc ( a - > n , 1 ) ;
for ( i = 0 , tot = 0 ; i < a - > n ; + + i ) {
r = get_pri_idx ( opt - > XA_drop_ratio , a - > a , i ) ;
if ( r > = 0 ) {
+ + cnt [ r ] , + + tot ;
if ( a - > a [ i ] . is_alt ) has_alt [ r ] = 1 ;
}
}
if ( tot = = 0 ) goto end_gen_alt ;
aln = calloc ( a - > n , sizeof ( kstring_t ) ) ;
for ( i = 0 ; i < a - > n ; + + i ) {
mem_aln_t t ;
if ( ( r = get_pri_idx ( opt - > XA_drop_ratio , a - > a , i ) ) < 0 ) continue ;
if ( cnt [ r ] > opt - > max_XA_hits_alt | | ( ! has_alt [ r ] & & cnt [ r ] > opt - > max_XA_hits ) ) continue ;
t = mem_reg2aln ( opt , bns , pac , l_query , query , & a - > a [ i ] ) ;
str . l = 0 ;
kputs ( bns - > anns [ t . rid ] . name , & str ) ;
kputc ( ' , ' , & str ) ; kputc ( " +- " [ t . is_rev ] , & str ) ; kputl ( t . pos + 1 , & str ) ;
kputc ( ' , ' , & str ) ;
for ( k = 0 ; k < t . n_cigar ; + + k ) {
kputw ( t . cigar [ k ] > > 4 , & str ) ;
kputc ( " MIDSHN " [ t . cigar [ k ] & 0xf ] , & str ) ;
}
kputc ( ' , ' , & str ) ; kputw ( t . NM , & str ) ;
if ( opt - > flag & MEM_F_XB ) {
kputc ( ' , ' , & str ) ;
kputw ( t . score , & str ) ;
kputc ( ' , ' , & str ) ;
kputw ( t . mapq , & str ) ;
}
kputc ( ' ; ' , & str ) ;
free ( t . cigar ) ;
kputsn ( str . s , str . l , & aln [ r ] ) ;
}
XA = calloc ( a - > n , sizeof ( char * ) ) ;
for ( k = 0 ; k < a - > n ; + + k )
XA [ k ] = aln [ k ] . s ;
end_gen_alt :
free ( has_alt ) ; free ( cnt ) ; free ( aln ) ; free ( str . s ) ;
return XA ;
}