2020-07-02 11:02:01 +08:00
/* The MIT License
Copyright ( c ) 2018 - Dana - Farber Cancer Institute
2009 - 2018 Broad Institute , Inc .
2008 - 2009 Genome Research Ltd . ( GRL )
Permission is hereby granted , free of charge , to any person obtaining
a copy of this software and associated documentation files ( the
" Software " ) , to deal in the Software without restriction , including
without limitation the rights to use , copy , modify , merge , publish ,
distribute , sublicense , and / or sell copies of the Software , and to
permit persons to whom the Software is furnished to do so , subject to
the following conditions :
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software .
THE SOFTWARE IS PROVIDED " AS IS " , WITHOUT WARRANTY OF ANY KIND ,
EXPRESS OR IMPLIED , INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY , FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT . IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM , DAMAGES OR OTHER LIABILITY , WHETHER IN AN
ACTION OF CONTRACT , TORT OR OTHERWISE , ARISING FROM , OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE .
*/
2012-04-07 12:23:01 +08:00
# include <string.h>
# include <stdio.h>
2013-02-24 04:30:46 +08:00
# include <zlib.h>
2013-02-26 13:03:49 +08:00
# include <assert.h>
2024-02-07 22:08:51 +08:00
# include <unistd.h>
2024-03-07 18:23:21 +08:00
# include <pthread.h>
2012-04-07 12:23:01 +08:00
# include "bntseq.h"
2013-02-24 04:30:46 +08:00
# include "bwa.h"
# include "ksw.h"
2013-02-24 05:41:44 +08:00
# include "utils.h"
2014-01-30 01:05:11 +08:00
# include "kstring.h"
2014-11-19 03:30:22 +08:00
# include "kvec.h"
2012-04-07 12:23:01 +08:00
2013-05-02 22:12:01 +08:00
# ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
# endif
2013-02-24 05:10:48 +08:00
int bwa_verbose = 3 ;
2021-02-23 12:26:03 +08:00
int bwa_dbg = 0 ;
2013-02-24 05:41:44 +08:00
char bwa_rg_id [ 256 ] ;
2014-10-18 04:17:28 +08:00
char * bwa_pg ;
2012-04-07 12:23:01 +08:00
2013-02-24 04:30:46 +08:00
/************************
* Batch FASTA / Q reader *
* * * * * * * * * * * * * * * * * * * * * * * */
2012-04-07 12:23:01 +08:00
2013-02-24 04:30:46 +08:00
# include "kseq.h"
KSEQ_DECLARE ( gzFile )
2012-04-07 12:23:01 +08:00
2013-02-24 04:30:46 +08:00
static inline void trim_readno ( kstring_t * s )
2012-04-07 12:23:01 +08:00
{
2013-02-24 04:30:46 +08:00
if ( s - > l > 2 & & s - > s [ s - > l - 2 ] = = ' / ' & & isdigit ( s - > s [ s - > l - 1 ] ) )
s - > l - = 2 , s - > s [ s - > l ] = 0 ;
2012-04-07 12:23:01 +08:00
}
2024-03-07 18:23:21 +08:00
static inline void dupkstring ( const kstring_t * str , int dupempty , char * * dstp , int * sm )
2017-06-30 19:46:56 +08:00
{
2024-03-07 18:23:21 +08:00
if ( ! dupempty & & str - > l = = 0 ) {
if ( * dstp ) free ( * dstp ) ;
* dstp = 0 ; * sm = 0 ;
} else if ( * dstp = = 0 | | * sm < str - > l ) {
* sm = str - > l ;
* dstp = realloc ( * dstp , str - > l + 1 ) ;
}
char * s = * dstp ;
if ( ! s ) return ;
2017-06-30 19:46:56 +08:00
memcpy ( s , str - > s , str - > l ) ;
s [ str - > l ] = ' \0 ' ;
}
2024-03-07 18:23:21 +08:00
static inline void kseq2bseq1 ( const kseq_t * ks , bseq1_t * s , int copy_comment )
2013-02-24 04:30:46 +08:00
{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice
2024-03-07 18:23:21 +08:00
dupkstring ( & ks - > name , 1 , & s - > name , & s - > m_name ) ;
if ( copy_comment ) dupkstring ( & ks - > comment , 0 , & s - > comment , & s - > m_comment ) ;
dupkstring ( & ks - > seq , 1 , & s - > seq , & s - > m_seq ) ;
dupkstring ( & ks - > qual , 0 , & s - > qual , & s - > m_qual ) ;
2017-06-30 19:46:56 +08:00
s - > l_seq = ks - > seq . l ;
2012-04-07 12:23:01 +08:00
}
2024-03-07 18:23:21 +08:00
typedef struct {
kseq_t * ks ;
bseq1_t * seq ;
int start_pos ;
int n_bound ;
int copy_comment ;
int ret_n ;
int ret_size ;
int ret_status ;
int chunk_size ;
} read_data_t ;
static void * thread_bseq_read ( void * data ) {
read_data_t * d = ( read_data_t * ) data ;
kseq_t * ks = d - > ks ;
bseq1_t * seqs = d - > seq ;
int copy_comment = d - > copy_comment ;
int chunk_size = d - > chunk_size ;
int cur_n = 0 , cur_pos = d - > start_pos , size = 0 ;
int ret_status = 1 ;
while ( cur_n < d - > n_bound & & ( ret_status = kseq_read ( ks ) ) > = 0 ) {
trim_readno ( & ks - > name ) ;
kseq2bseq1 ( ks , seqs + cur_pos , copy_comment ) ;
seqs [ cur_pos ] . id = cur_pos ;
size + = seqs [ cur_pos ] . l_seq ;
cur_pos + = 2 ; cur_n + = 1 ;
if ( size > = chunk_size ) break ;
}
d - > ret_n = cur_n ; d - > ret_size = size ; d - > ret_status = ret_status ;
return 0 ;
}
2024-03-24 04:40:09 +08:00
# define READ_ONE_SEQ(ksin) \
trim_readno ( & ( ksin ) - > name ) ; \
kseq2bseq1 ( ksin , & seqs [ n ] , copy_comment ) ; \
seqs [ n ] . id = n ; \
2024-03-07 18:23:21 +08:00
size + = seqs [ n + + ] . l_seq ;
// multi thread reading input seqs
void bseq_read_pe_mt ( int chunk_size , int * n_ , void * ks1_ , void * ks2_ , int copy_comment , int64_t * size_ , int * m_ , bseq1_t * * seqs_ptr )
2012-04-07 12:23:01 +08:00
{
2024-03-07 18:23:21 +08:00
kseq_t * ks = ( kseq_t * ) ks1_ , * ks2 = ( kseq_t * ) ks2_ ;
int size = 0 , m = * m_ , n = 0 ;
bseq1_t * seqs = * seqs_ptr ;
read_data_t d [ 2 ] ;
pthread_t tid [ 2 ] ;
2024-03-24 04:40:09 +08:00
const int chunk_size_narrow = 4 * 1024 * 1024 ;
const int init_n_reads = 20 ;
2024-03-07 18:23:21 +08:00
if ( m = = 0 ) { // 还没开辟空间,要初始化
2024-03-24 04:40:09 +08:00
seqs = calloc ( init_n_reads , sizeof ( bseq1_t ) ) ; // 先读取20个reads, 根据reads的长度和chunk size决定要读取多少条reads
# if 1
2024-03-07 18:23:21 +08:00
int ks1_ret = 0 , ks2_ret = 0 ;
2024-03-24 04:40:09 +08:00
int i = init_n_reads > > 1 ;
2024-03-07 18:23:21 +08:00
while ( i - - > 0 ) {
ks1_ret = kseq_read ( ks ) ;
if ( ks1_ret < 0 ) break ;
ks2_ret = kseq_read ( ks2 ) ;
if ( ks2_ret < 0 ) {
fprintf ( stderr , " [W::%s] the 2nd file has fewer sequences. \n " , __func__ ) ;
break ;
}
READ_ONE_SEQ ( ks ) ;
READ_ONE_SEQ ( ks2 ) ;
}
if ( ks1_ret < 0 | | ks2_ret < 0 ) {
if ( size = = 0 & & kseq_read ( ks2 ) > = 0 ) { // test if the 2nd file is finished
fprintf ( stderr , " [W::%s] the 1st file has fewer sequences. \n " , __func__ ) ;
}
* n_ = n ; * seqs_ptr = seqs ;
return ;
}
2024-03-24 04:40:09 +08:00
m = ( chunk_size + size / init_n_reads - 1 ) / ( size / init_n_reads ) ;
# else
m = 50000 ;
# endif
2024-03-07 18:23:21 +08:00
seqs = realloc ( seqs , m * sizeof ( bseq1_t ) ) ;
2024-03-24 04:40:09 +08:00
memset ( seqs + n , 0 , sizeof ( bseq1_t ) * ( m - n ) ) ;
2024-03-07 18:23:21 +08:00
}
d [ 0 ] . copy_comment = copy_comment ; d [ 1 ] . copy_comment = copy_comment ;
d [ 0 ] . ks = ks ; d [ 0 ] . seq = & seqs [ 0 ] ; d [ 0 ] . n_bound = ( m > > 1 ) - ( n > > 1 ) ; d [ 0 ] . start_pos = n ;
d [ 1 ] . ks = ks2 ; d [ 1 ] . seq = & seqs [ 0 ] ; d [ 1 ] . n_bound = ( m > > 1 ) - ( n > > 1 ) ; d [ 1 ] . start_pos = n + 1 ;
2024-03-24 04:40:09 +08:00
d [ 0 ] . chunk_size = d [ 1 ] . chunk_size = ( chunk_size - chunk_size_narrow - size ) > > 1 ;
2024-03-07 18:23:21 +08:00
pthread_create ( & tid [ 0 ] , 0 , thread_bseq_read , & d [ 0 ] ) ;
pthread_create ( & tid [ 1 ] , 0 , thread_bseq_read , & d [ 1 ] ) ;
pthread_join ( tid [ 0 ] , 0 ) ; pthread_join ( tid [ 1 ] , 0 ) ;
size + = d [ 0 ] . ret_size + d [ 1 ] . ret_size ;
2024-03-24 04:40:09 +08:00
// 如果两个线程读入的reads数量不一致
if ( d [ 0 ] . ret_n < d [ 1 ] . ret_n )
{
int num_to_read = d [ 1 ] . ret_n - d [ 0 ] . ret_n ;
int offset = n + d [ 0 ] . ret_n * 2 ;
while ( num_to_read - - > 0 & & kseq_read ( ks ) > = 0 ) {
trim_readno ( & ks - > name ) ;
kseq2bseq1 ( ks , & seqs [ offset ] , copy_comment ) ;
seqs [ offset ] . id = offset ;
size + = seqs [ offset ] . l_seq ;
offset + = 2 ;
}
d [ 0 ] . ret_n = d [ 1 ] . ret_n ;
}
else if ( d [ 1 ] . ret_n < d [ 0 ] . ret_n )
{
int num_to_read = d [ 0 ] . ret_n - d [ 1 ] . ret_n ;
int offset = n + 1 + d [ 1 ] . ret_n * 2 ;
while ( num_to_read - - > 0 & & kseq_read ( ks2 ) > = 0 ) {
trim_readno ( & ks2 - > name ) ;
kseq2bseq1 ( ks2 , & seqs [ offset ] , copy_comment ) ;
seqs [ offset ] . id = offset ;
size + = seqs [ offset ] . l_seq ;
offset + = 2 ;
}
d [ 1 ] . ret_n = d [ 0 ] . ret_n ;
}
2024-03-07 18:23:21 +08:00
n + = d [ 0 ] . ret_n + d [ 1 ] . ret_n ;
if ( size = = 0 & & kseq_read ( ks2 ) > = 0 ) { // test if the 2nd file is finished
fprintf ( stderr , " [W::%s] the 1st file has fewer sequences. \n " , __func__ ) ;
} else if ( size < chunk_size & & d [ 0 ] . ret_status > 0 & & d [ 1 ] . ret_status > 0 ) {
while ( kseq_read ( ks ) > = 0 ) {
2024-03-24 04:40:09 +08:00
if ( kseq_read ( ks2 ) < 0 ) { // the 2nd file has fewer reads
2024-03-07 18:23:21 +08:00
fprintf ( stderr , " [W::%s] the 2nd file has fewer sequences. \n " , __func__ ) ;
break ;
}
if ( n > = m ) {
m = m ? m < < 1 : 256 ;
seqs = realloc ( seqs , m * sizeof ( bseq1_t ) ) ;
memset ( seqs + n , 0 , ( m - n ) * sizeof ( bseq1_t ) ) ;
}
READ_ONE_SEQ ( ks ) ;
2024-03-24 04:40:09 +08:00
READ_ONE_SEQ ( ks2 ) ;
2024-03-07 18:23:21 +08:00
if ( size > = chunk_size & & ( n & 1 ) = = 0 ) break ;
}
if ( size = = 0 ) { // test if the 2nd file is finished
2024-03-24 04:40:09 +08:00
if ( kseq_read ( ks2 ) > = 0 ) fprintf ( stderr , " [W::%s] the 1st file has fewer sequences. \n " , __func__ ) ;
2024-03-07 18:23:21 +08:00
}
}
* n_ = n ; * size_ = size ;
if ( m > * m_ ) * m_ = m ;
* seqs_ptr = seqs ;
}
void bseq_read ( int chunk_size , int * n_ , void * ks1_ , void * ks2_ , int copy_comment , int64_t * size_ , int * m_ , bseq1_t * * seqs_ptr )
{
# ifdef USE_MT_READ
if ( ks2_ ) return bseq_read_pe_mt ( chunk_size , n_ , ks1_ , ks2_ , copy_comment , size_ , m_ , seqs_ptr ) ;
# endif
2013-02-24 04:30:46 +08:00
kseq_t * ks = ( kseq_t * ) ks1_ , * ks2 = ( kseq_t * ) ks2_ ;
int size = 0 , m , n ;
2024-03-07 18:23:21 +08:00
bseq1_t * seqs = * seqs_ptr ;
n = 0 ; m = * m_ ;
2013-02-24 04:30:46 +08:00
while ( kseq_read ( ks ) > = 0 ) {
if ( ks2 & & kseq_read ( ks2 ) < 0 ) { // the 2nd file has fewer reads
fprintf ( stderr , " [W::%s] the 2nd file has fewer sequences. \n " , __func__ ) ;
break ;
}
if ( n > = m ) {
m = m ? m < < 1 : 256 ;
2013-05-02 22:12:01 +08:00
seqs = realloc ( seqs , m * sizeof ( bseq1_t ) ) ;
2024-03-07 18:23:21 +08:00
memset ( seqs + n , 0 , ( m - n ) * sizeof ( bseq1_t ) ) ;
2013-02-24 04:30:46 +08:00
}
2024-03-07 18:23:21 +08:00
READ_ONE_SEQ ( ks ) ;
2013-02-24 04:30:46 +08:00
if ( ks2 ) {
2024-03-07 18:23:21 +08:00
READ_ONE_SEQ ( ks2 ) ;
2013-02-24 04:30:46 +08:00
}
2013-03-08 00:00:15 +08:00
if ( size > = chunk_size & & ( n & 1 ) = = 0 ) break ;
2013-02-24 04:30:46 +08:00
}
if ( size = = 0 ) { // test if the 2nd file is finished
if ( ks2 & & kseq_read ( ks2 ) > = 0 )
fprintf ( stderr , " [W::%s] the 1st file has fewer sequences. \n " , __func__ ) ;
}
2024-03-07 18:23:21 +08:00
* n_ = n ; * size_ = size ;
if ( m > * m_ ) * m_ = m ;
* seqs_ptr = seqs ;
2012-04-07 12:23:01 +08:00
}
2014-11-19 03:30:22 +08:00
void bseq_classify ( int n , bseq1_t * seqs , int m [ 2 ] , bseq1_t * sep [ 2 ] )
{
int i , has_last ;
kvec_t ( bseq1_t ) a [ 2 ] = { { 0 , 0 , 0 } , { 0 , 0 , 0 } } ;
for ( i = 1 , has_last = 1 ; i < n ; + + i ) {
if ( has_last ) {
if ( strcmp ( seqs [ i ] . name , seqs [ i - 1 ] . name ) = = 0 ) {
kv_push ( bseq1_t , a [ 1 ] , seqs [ i - 1 ] ) ;
kv_push ( bseq1_t , a [ 1 ] , seqs [ i ] ) ;
has_last = 0 ;
} else kv_push ( bseq1_t , a [ 0 ] , seqs [ i - 1 ] ) ;
} else has_last = 1 ;
}
if ( has_last ) kv_push ( bseq1_t , a [ 0 ] , seqs [ i - 1 ] ) ;
sep [ 0 ] = a [ 0 ] . a , m [ 0 ] = a [ 0 ] . n ;
sep [ 1 ] = a [ 1 ] . a , m [ 1 ] = a [ 1 ] . n ;
}
2013-02-26 11:49:15 +08:00
/*****************
* CIGAR related *
* * * * * * * * * * * * * * * * */
2012-04-07 12:23:01 +08:00
2013-03-05 22:38:12 +08:00
void bwa_fill_scmat ( int a , int b , int8_t mat [ 25 ] )
{
int i , j , k ;
for ( i = k = 0 ; i < 4 ; + + i ) {
for ( j = 0 ; j < 4 ; + + j )
mat [ k + + ] = i = = j ? a : - b ;
2013-03-10 07:03:57 +08:00
mat [ k + + ] = - 1 ; // ambiguous base
2013-03-05 22:38:12 +08:00
}
2013-03-10 07:03:57 +08:00
for ( j = 0 ; j < 5 ; + + j ) mat [ k + + ] = - 1 ;
2013-03-05 22:38:12 +08:00
}
2013-02-24 04:30:46 +08:00
// Generate CIGAR when the alignment end points are known
2014-03-29 02:54:06 +08:00
uint32_t * bwa_gen_cigar2 ( const int8_t mat [ 25 ] , int o_del , int e_del , int o_ins , int e_ins , int w_ , int64_t l_pac , const uint8_t * pac , int l_query , uint8_t * query , int64_t rb , int64_t re , int * score , int * n_cigar , int * NM )
2012-04-07 12:23:01 +08:00
{
2013-02-24 04:30:46 +08:00
uint32_t * cigar = 0 ;
uint8_t tmp , * rseq ;
2013-02-28 12:59:50 +08:00
int i ;
2013-02-24 04:30:46 +08:00
int64_t rlen ;
2014-01-30 01:05:11 +08:00
kstring_t str ;
2014-02-18 23:32:24 +08:00
const char * int2base ;
2014-01-30 01:05:11 +08:00
2014-04-17 04:38:50 +08:00
if ( n_cigar ) * n_cigar = 0 ;
if ( NM ) * NM = - 1 ;
2013-02-24 04:30:46 +08:00
if ( l_query < = 0 | | rb > = re | | ( rb < l_pac & & re > l_pac ) ) return 0 ; // reject if negative length or bridging the forward and reverse strand
rseq = bns_get_seq ( l_pac , pac , rb , re , & rlen ) ;
if ( re - rb ! = rlen ) goto ret_gen_cigar ; // possible if out of range
if ( rb > = l_pac ) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position
for ( i = 0 ; i < l_query > > 1 ; + + i )
tmp = query [ i ] , query [ i ] = query [ l_query - 1 - i ] , query [ l_query - 1 - i ] = tmp ;
for ( i = 0 ; i < rlen > > 1 ; + + i )
tmp = rseq [ i ] , rseq [ i ] = rseq [ rlen - 1 - i ] , rseq [ rlen - 1 - i ] = tmp ;
2012-04-07 12:23:01 +08:00
}
2013-02-28 12:59:50 +08:00
if ( l_query = = re - rb & & w_ = = 0 ) { // no gap; no need to do DP
2014-05-01 23:13:05 +08:00
// UPDATE: we come to this block now... FIXME: due to an issue in mem_reg2aln(), we never come to this block. This does not affect accuracy, but it hurts performance.
if ( n_cigar ) {
cigar = malloc ( 4 ) ;
cigar [ 0 ] = l_query < < 4 | 0 ;
* n_cigar = 1 ;
}
2013-02-28 12:59:50 +08:00
for ( i = 0 , * score = 0 ; i < l_query ; + + i )
* score + = mat [ rseq [ i ] * 5 + query [ i ] ] ;
} else {
2014-03-29 02:54:06 +08:00
int w , max_gap , max_ins , max_del , min_w ;
2013-02-28 12:59:50 +08:00
// set the band-width
2014-03-29 02:54:06 +08:00
max_ins = ( int ) ( ( double ) ( ( ( l_query + 1 ) > > 1 ) * mat [ 0 ] - o_ins ) / e_ins + 1. ) ;
max_del = ( int ) ( ( double ) ( ( ( l_query + 1 ) > > 1 ) * mat [ 0 ] - o_del ) / e_del + 1. ) ;
max_gap = max_ins > max_del ? max_ins : max_del ;
2013-02-28 12:59:50 +08:00
max_gap = max_gap > 1 ? max_gap : 1 ;
2017-06-26 17:45:13 +08:00
w = ( max_gap + abs ( ( int ) rlen - l_query ) + 1 ) > > 1 ;
2013-02-28 12:59:50 +08:00
w = w < w_ ? w : w_ ;
2017-06-26 17:45:13 +08:00
min_w = abs ( ( int ) rlen - l_query ) + 3 ;
2013-02-28 12:59:50 +08:00
w = w > min_w ? w : min_w ;
// NW alignment
2014-03-17 11:25:04 +08:00
if ( bwa_verbose > = 4 ) {
printf ( " * Global bandwidth: %d \n " , w ) ;
printf ( " * Global ref: " ) ; for ( i = 0 ; i < rlen ; + + i ) putchar ( " ACGTN " [ ( int ) rseq [ i ] ] ) ; putchar ( ' \n ' ) ;
printf ( " * Global query: " ) ; for ( i = 0 ; i < l_query ; + + i ) putchar ( " ACGTN " [ ( int ) query [ i ] ] ) ; putchar ( ' \n ' ) ;
}
2014-03-29 02:54:06 +08:00
* score = ksw_global2 ( l_query , query , rlen , rseq , 5 , mat , o_del , e_del , o_ins , e_ins , w , n_cigar , & cigar ) ;
2013-02-28 12:59:50 +08:00
}
2014-04-17 04:38:50 +08:00
if ( NM & & n_cigar ) { // compute NM and MD
2014-01-30 01:05:11 +08:00
int k , x , y , u , n_mm = 0 , n_gap = 0 ;
str . l = str . m = * n_cigar * 4 ; str . s = ( char * ) cigar ; // append MD to CIGAR
2014-02-18 23:32:24 +08:00
int2base = rb < l_pac ? " ACGTN " : " TGCAN " ;
2014-01-30 01:05:11 +08:00
for ( k = 0 , x = y = u = 0 ; k < * n_cigar ; + + k ) {
2014-02-01 01:58:21 +08:00
int op , len ;
cigar = ( uint32_t * ) str . s ;
op = cigar [ k ] & 0xf , len = cigar [ k ] > > 4 ;
2013-02-27 02:36:01 +08:00
if ( op = = 0 ) { // match
2014-01-30 01:05:11 +08:00
for ( i = 0 ; i < len ; + + i ) {
if ( query [ x + i ] ! = rseq [ y + i ] ) {
2014-02-18 23:32:24 +08:00
kputw ( u , & str ) ;
kputc ( int2base [ rseq [ y + i ] ] , & str ) ;
2014-01-30 01:05:11 +08:00
+ + n_mm ; u = 0 ;
} else + + u ;
}
2013-02-27 02:36:01 +08:00
x + = len ; y + = len ;
2014-01-30 01:05:11 +08:00
} else if ( op = = 2 ) { // deletion
2014-02-21 02:06:40 +08:00
if ( k > 0 & & k < * n_cigar - 1 ) { // don't do the following if D is the first or the last CIGAR
2014-02-20 00:31:54 +08:00
kputw ( u , & str ) ; kputc ( ' ^ ' , & str ) ;
for ( i = 0 ; i < len ; + + i )
kputc ( int2base [ rseq [ y + i ] ] , & str ) ;
u = 0 ; n_gap + = len ;
}
y + = len ;
2014-01-30 01:05:11 +08:00
} else if ( op = = 1 ) x + = len , n_gap + = len ; // insertion
2012-04-08 10:00:03 +08:00
}
2014-01-30 01:05:11 +08:00
kputw ( u , & str ) ; kputc ( 0 , & str ) ;
2013-02-27 02:36:01 +08:00
* NM = n_mm + n_gap ;
2014-01-30 01:05:11 +08:00
cigar = ( uint32_t * ) str . s ;
2012-04-08 10:00:03 +08:00
}
2013-02-24 04:30:46 +08:00
if ( rb > = l_pac ) // reverse back query
for ( i = 0 ; i < l_query > > 1 ; + + i )
tmp = query [ i ] , query [ i ] = query [ l_query - 1 - i ] , query [ l_query - 1 - i ] = tmp ;
ret_gen_cigar :
free ( rseq ) ;
return cigar ;
2012-04-08 10:00:03 +08:00
}
2014-03-29 02:54:06 +08:00
uint32_t * bwa_gen_cigar ( const int8_t mat [ 25 ] , int q , int r , int w_ , int64_t l_pac , const uint8_t * pac , int l_query , uint8_t * query , int64_t rb , int64_t re , int * score , int * n_cigar , int * NM )
{
return bwa_gen_cigar2 ( mat , q , r , q , r , w_ , l_pac , pac , l_query , query , rb , re , score , n_cigar , NM ) ;
}
2013-02-24 04:55:55 +08:00
/*********************
* Full index reader *
* * * * * * * * * * * * * * * * * * * * */
2012-04-08 12:55:52 +08:00
2013-02-24 05:10:48 +08:00
char * bwa_idx_infer_prefix ( const char * hint )
2012-04-08 12:02:06 +08:00
{
2013-02-24 05:10:48 +08:00
char * prefix ;
int l_hint ;
FILE * fp ;
l_hint = strlen ( hint ) ;
2024-02-07 22:08:51 +08:00
prefix = malloc ( l_hint + 3 + 4 + 1 + 10 ) ;
2013-02-24 05:10:48 +08:00
strcpy ( prefix , hint ) ;
2024-02-07 22:08:51 +08:00
//strcpy(prefix + l_hint, ".64.bwt");
strcpy ( prefix + l_hint , " .ne.bwt " ) ;
2013-02-24 05:10:48 +08:00
if ( ( fp = fopen ( prefix , " rb " ) ) ! = 0 ) {
fclose ( fp ) ;
prefix [ l_hint + 3 ] = 0 ;
return prefix ;
} else {
strcpy ( prefix + l_hint , " .bwt " ) ;
if ( ( fp = fopen ( prefix , " rb " ) ) = = 0 ) {
free ( prefix ) ;
return 0 ;
} else {
fclose ( fp ) ;
prefix [ l_hint ] = 0 ;
return prefix ;
2012-04-08 12:02:06 +08:00
}
}
2013-02-24 05:10:48 +08:00
}
bwt_t * bwa_idx_load_bwt ( const char * hint )
{
char * tmp , * prefix ;
bwt_t * bwt ;
prefix = bwa_idx_infer_prefix ( hint ) ;
if ( prefix = = 0 ) {
2013-02-24 05:41:44 +08:00
if ( bwa_verbose > = 1 ) fprintf ( stderr , " [E::%s] fail to locate the index files \n " , __func__ ) ;
2013-02-24 05:10:48 +08:00
return 0 ;
}
2013-05-02 22:12:01 +08:00
tmp = calloc ( strlen ( prefix ) + 5 , 1 ) ;
2013-02-24 05:10:48 +08:00
strcat ( strcpy ( tmp , prefix ) , " .bwt " ) ; // FM-index
bwt = bwt_restore_bwt ( tmp ) ;
2024-02-21 15:21:56 +08:00
//strcat(strcpy(tmp, prefix), ".33.4.sa"); // partial suffix array (SA)
strcat ( strcpy ( tmp , prefix ) , " .sa " ) ; // partial suffix array (SA)
2013-02-24 05:10:48 +08:00
bwt_restore_sa ( tmp , bwt ) ;
free ( tmp ) ; free ( prefix ) ;
return bwt ;
}
2024-02-07 22:08:51 +08:00
FMTIndex * bwa_idx_load_fmt ( const char * hint )
{
2024-02-20 01:12:02 +08:00
char * fmt_idx_fn , * kmer_idx_fn , * sa_fn ;
// char *kmer_bit_fn;
2024-02-07 22:08:51 +08:00
FMTIndex * fmt ;
char suffix [ 32 ] ;
int l_hint = strlen ( hint ) ;
fmt_idx_fn = malloc ( l_hint + 32 ) ;
kmer_idx_fn = malloc ( l_hint + 32 ) ;
2024-02-20 01:12:02 +08:00
//kmer_bit_fn = malloc(l_hint + 32);
2024-02-07 22:08:51 +08:00
sa_fn = malloc ( l_hint + 32 ) ;
2024-04-02 07:50:22 +08:00
// sprintf(suffix, ".256.%d.fmt", FMT_MID_INTERVAL);
sprintf ( suffix , " .fmt " ) ;
2024-02-07 22:08:51 +08:00
strcpy ( fmt_idx_fn , hint ) ;
strcpy ( fmt_idx_fn + l_hint , suffix ) ;
2024-04-02 07:50:22 +08:00
// sprintf(suffix, ".%d.kmer", HASH_KMER_LEN);
sprintf ( suffix , " .kmer " ) ;
2024-02-07 22:08:51 +08:00
strcpy ( kmer_idx_fn , hint ) ;
strcpy ( kmer_idx_fn + l_hint , suffix ) ;
if ( access ( fmt_idx_fn , F_OK ) ! = 0 | | access ( kmer_idx_fn , F_OK ) ! = 0 )
{
if ( bwa_verbose > = 1 )
fprintf ( stderr , " [E::%s] fail to locate the index files \n " , __func__ ) ;
return 0 ;
}
fmt = fmt_restore_fmt ( fmt_idx_fn ) ;
fprintf ( stderr , " %s \n " , kmer_idx_fn ) ;
2024-02-12 20:54:57 +08:00
fmt - > kmer_hash = fmt_restore_kmer_idx ( kmer_idx_fn ) ;
2024-02-07 22:08:51 +08:00
strcpy ( sa_fn , hint ) ;
2024-04-02 07:50:22 +08:00
// sprintf(suffix, ".33.%d.sa", SA_INTV);
sprintf ( suffix , " .bytesa " ) ;
2024-02-07 22:08:51 +08:00
strcpy ( sa_fn + l_hint , suffix ) ; // partial suffix array (SA)
2024-02-21 15:21:56 +08:00
fmt_restore_sa ( sa_fn , fmt ) ;
2024-02-07 22:08:51 +08:00
free ( fmt_idx_fn ) ;
free ( kmer_idx_fn ) ;
free ( sa_fn ) ;
return fmt ;
}
2014-10-16 03:44:06 +08:00
bwaidx_t * bwa_idx_load_from_disk ( const char * hint , int which )
2013-02-24 04:55:55 +08:00
{
bwaidx_t * idx ;
2013-02-24 05:10:48 +08:00
char * prefix ;
prefix = bwa_idx_infer_prefix ( hint ) ;
2013-02-24 05:41:44 +08:00
if ( prefix = = 0 ) {
if ( bwa_verbose > = 1 ) fprintf ( stderr , " [E::%s] fail to locate the index files \n " , __func__ ) ;
return 0 ;
2012-04-08 12:02:06 +08:00
}
2013-05-02 22:12:01 +08:00
idx = calloc ( 1 , sizeof ( bwaidx_t ) ) ;
2024-03-07 18:23:21 +08:00
2024-02-22 01:26:57 +08:00
if ( which & BWA_IDX_BWT ) idx - > fmt = bwa_idx_load_fmt ( hint ) ;
2024-02-23 01:09:08 +08:00
//if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
//idx->bwt->kmer_hash = idx->fmt->kmer_hash;
2024-02-22 01:26:57 +08:00
2024-02-07 22:08:51 +08:00
if ( which & BWA_IDX_BNS )
{
2014-10-15 11:37:24 +08:00
int i , c ;
2013-02-24 04:55:55 +08:00
idx - > bns = bns_restore ( prefix ) ;
2014-10-15 11:37:24 +08:00
for ( i = c = 0 ; i < idx - > bns - > n_seqs ; + + i )
if ( idx - > bns - > anns [ i ] . is_alt ) + + c ;
if ( bwa_verbose > = 3 )
fprintf ( stderr , " [M::%s] read %d ALT contigs \n " , __func__ , c ) ;
2013-02-24 04:55:55 +08:00
if ( which & BWA_IDX_PAC ) {
2013-05-02 22:12:01 +08:00
idx - > pac = calloc ( idx - > bns - > l_pac / 4 + 1 , 1 ) ;
2013-03-01 17:37:46 +08:00
err_fread_noeof ( idx - > pac , 1 , idx - > bns - > l_pac / 4 + 1 , idx - > bns - > fp_pac ) ; // concatenated 2-bit encoded sequence
err_fclose ( idx - > bns - > fp_pac ) ;
2013-02-24 05:10:48 +08:00
idx - > bns - > fp_pac = 0 ;
2024-02-20 01:12:02 +08:00
// 赋值到fmt中对应的pac
idx - > fmt - > l_pac = idx - > bns - > l_pac ;
idx - > fmt - > pac = idx - > pac ;
2013-02-24 04:55:55 +08:00
}
2012-04-08 12:02:06 +08:00
}
2013-02-24 05:10:48 +08:00
free ( prefix ) ;
2013-02-24 04:55:55 +08:00
return idx ;
2012-04-08 12:02:06 +08:00
}
2014-10-16 03:44:06 +08:00
bwaidx_t * bwa_idx_load ( const char * hint , int which )
{
return bwa_idx_load_from_disk ( hint , which ) ;
}
2013-02-24 04:55:55 +08:00
void bwa_idx_destroy ( bwaidx_t * idx )
2012-04-08 12:02:06 +08:00
{
2013-02-24 04:55:55 +08:00
if ( idx = = 0 ) return ;
2014-10-16 00:27:45 +08:00
if ( idx - > mem = = 0 ) {
if ( idx - > bwt ) bwt_destroy ( idx - > bwt ) ;
if ( idx - > bns ) bns_destroy ( idx - > bns ) ;
if ( idx - > pac ) free ( idx - > pac ) ;
2014-10-16 03:44:06 +08:00
} else {
free ( idx - > bwt ) ; free ( idx - > bns - > anns ) ; free ( idx - > bns ) ;
if ( ! idx - > is_shm ) free ( idx - > mem ) ;
}
2013-02-24 04:55:55 +08:00
free ( idx ) ;
2012-04-08 12:02:06 +08:00
}
2012-04-08 12:55:52 +08:00
2014-10-16 02:44:08 +08:00
int bwa_mem2idx ( int64_t l_mem , uint8_t * mem , bwaidx_t * idx )
2014-10-16 00:27:45 +08:00
{
int64_t k = 0 , x ;
int i ;
// generate idx->bwt
2014-10-16 03:44:06 +08:00
x = sizeof ( bwt_t ) ; idx - > bwt = malloc ( x ) ; memcpy ( idx - > bwt , mem + k , x ) ; k + = x ;
2014-10-16 00:27:45 +08:00
x = idx - > bwt - > bwt_size * 4 ; idx - > bwt - > bwt = ( uint32_t * ) ( mem + k ) ; k + = x ;
2024-02-22 01:26:57 +08:00
x = SA_BYTES ( idx - > bwt - > n_sa ) ; idx - > bwt - > sa = ( bwtint_t * ) ( mem + k ) ; k + = x ;
2014-10-16 00:27:45 +08:00
// generate idx->bns and idx->pac
2014-10-16 03:44:06 +08:00
x = sizeof ( bntseq_t ) ; idx - > bns = malloc ( x ) ; memcpy ( idx - > bns , mem + k , x ) ; k + = x ;
2014-10-16 00:27:45 +08:00
x = idx - > bns - > n_holes * sizeof ( bntamb1_t ) ; idx - > bns - > ambs = ( bntamb1_t * ) ( mem + k ) ; k + = x ;
2014-10-16 03:44:06 +08:00
x = idx - > bns - > n_seqs * sizeof ( bntann1_t ) ; idx - > bns - > anns = malloc ( x ) ; memcpy ( idx - > bns - > anns , mem + k , x ) ; k + = x ;
2014-10-16 00:27:45 +08:00
for ( i = 0 ; i < idx - > bns - > n_seqs ; + + i ) {
idx - > bns - > anns [ i ] . name = ( char * ) ( mem + k ) ; k + = strlen ( idx - > bns - > anns [ i ] . name ) + 1 ;
idx - > bns - > anns [ i ] . anno = ( char * ) ( mem + k ) ; k + = strlen ( idx - > bns - > anns [ i ] . anno ) + 1 ;
}
2014-10-16 03:44:06 +08:00
idx - > pac = ( uint8_t * ) ( mem + k ) ; k + = idx - > bns - > l_pac / 4 + 1 ;
assert ( k = = l_mem ) ;
2014-10-16 00:27:45 +08:00
idx - > l_mem = k ; idx - > mem = mem ;
return 0 ;
}
int bwa_idx2mem ( bwaidx_t * idx )
{
int i ;
2014-10-16 23:22:09 +08:00
int64_t k , x , tmp ;
uint8_t * mem ;
2014-10-16 00:27:45 +08:00
// copy idx->bwt
2014-10-16 23:22:09 +08:00
x = idx - > bwt - > bwt_size * 4 ;
mem = realloc ( idx - > bwt - > bwt , sizeof ( bwt_t ) + x ) ; idx - > bwt - > bwt = 0 ;
memmove ( mem + sizeof ( bwt_t ) , mem , x ) ;
memcpy ( mem , idx - > bwt , sizeof ( bwt_t ) ) ; k = sizeof ( bwt_t ) + x ;
2023-12-27 10:42:12 +08:00
x = SA_BYTES ( idx - > bwt - > n_sa ) ; mem = realloc ( mem , k + x ) ; memcpy ( mem + k , idx - > bwt - > sa , x ) ; k + = x ;
2014-10-16 00:27:45 +08:00
free ( idx - > bwt - > sa ) ;
free ( idx - > bwt ) ; idx - > bwt = 0 ;
// copy idx->bns
tmp = idx - > bns - > n_seqs * sizeof ( bntann1_t ) + idx - > bns - > n_holes * sizeof ( bntamb1_t ) ;
for ( i = 0 ; i < idx - > bns - > n_seqs ; + + i ) // compute the size of heap-allocated memory
tmp + = strlen ( idx - > bns - > anns [ i ] . name ) + strlen ( idx - > bns - > anns [ i ] . anno ) + 2 ;
mem = realloc ( mem , k + sizeof ( bntseq_t ) + tmp ) ;
x = sizeof ( bntseq_t ) ; memcpy ( mem + k , idx - > bns , x ) ; k + = x ;
x = idx - > bns - > n_holes * sizeof ( bntamb1_t ) ; memcpy ( mem + k , idx - > bns - > ambs , x ) ; k + = x ;
free ( idx - > bns - > ambs ) ;
x = idx - > bns - > n_seqs * sizeof ( bntann1_t ) ; memcpy ( mem + k , idx - > bns - > anns , x ) ; k + = x ;
for ( i = 0 ; i < idx - > bns - > n_seqs ; + + i ) {
x = strlen ( idx - > bns - > anns [ i ] . name ) + 1 ; memcpy ( mem + k , idx - > bns - > anns [ i ] . name , x ) ; k + = x ;
x = strlen ( idx - > bns - > anns [ i ] . anno ) + 1 ; memcpy ( mem + k , idx - > bns - > anns [ i ] . anno , x ) ; k + = x ;
free ( idx - > bns - > anns [ i ] . name ) ; free ( idx - > bns - > anns [ i ] . anno ) ;
}
free ( idx - > bns - > anns ) ;
// copy idx->pac
x = idx - > bns - > l_pac / 4 + 1 ;
mem = realloc ( mem , k + x ) ;
memcpy ( mem + k , idx - > pac , x ) ; k + = x ;
free ( idx - > bns ) ; idx - > bns = 0 ;
free ( idx - > pac ) ; idx - > pac = 0 ;
return bwa_mem2idx ( k , mem , idx ) ;
}
2013-02-24 05:41:44 +08:00
/***********************
* SAM header routines *
* * * * * * * * * * * * * * * * * * * * * * */
2014-11-19 23:59:05 +08:00
void bwa_print_sam_hdr ( const bntseq_t * bns , const char * hdr_line )
2013-02-24 05:41:44 +08:00
{
2021-12-14 23:02:05 +08:00
int i , n_HD = 0 , n_SQ = 0 ;
2013-11-20 00:08:45 +08:00
extern char * bwa_pg ;
2021-12-14 23:02:05 +08:00
2014-12-13 05:56:54 +08:00
if ( hdr_line ) {
2021-12-14 23:02:05 +08:00
// check for HD line
2014-12-13 05:56:54 +08:00
const char * p = hdr_line ;
2021-12-14 23:02:05 +08:00
if ( ( p = strstr ( p , " @HD " ) ) ! = 0 ) {
+ + n_HD ;
}
// check for SQ lines
p = hdr_line ;
2014-12-13 05:56:54 +08:00
while ( ( p = strstr ( p , " @SQ \t " ) ) ! = 0 ) {
if ( p = = hdr_line | | * ( p - 1 ) = = ' \n ' ) + + n_SQ ;
p + = 4 ;
}
2014-12-11 23:38:36 +08:00
}
if ( n_SQ = = 0 ) {
2016-04-29 03:39:18 +08:00
for ( i = 0 ; i < bns - > n_seqs ; + + i ) {
err_printf ( " @SQ \t SN:%s \t LN:%d " , bns - > anns [ i ] . name , bns - > anns [ i ] . len ) ;
2016-05-03 23:28:58 +08:00
if ( bns - > anns [ i ] . is_alt ) err_printf ( " \t AH:* \n " ) ;
2016-04-29 03:39:18 +08:00
else err_fputc ( ' \n ' , stdout ) ;
}
2014-12-11 23:38:36 +08:00
} else if ( n_SQ ! = bns - > n_seqs & & bwa_verbose > = 2 )
fprintf ( stderr , " [W::%s] %d @SQ lines provided with -H; %d sequences in the index. Continue anyway. \n " , __func__ , n_SQ , bns - > n_seqs ) ;
2021-12-14 23:02:05 +08:00
if ( n_HD = = 0 ) {
err_printf ( " @HD \t VN:1.5 \t SO:unsorted \t GO:query \n " ) ;
}
2014-11-19 23:59:05 +08:00
if ( hdr_line ) err_printf ( " %s \n " , hdr_line ) ;
2014-10-18 04:17:28 +08:00
if ( bwa_pg ) err_printf ( " %s \n " , bwa_pg ) ;
2013-02-24 05:41:44 +08:00
}
2012-04-08 12:55:52 +08:00
2013-02-24 05:41:44 +08:00
static char * bwa_escape ( char * s )
2012-04-08 12:55:52 +08:00
{
2013-02-24 05:41:44 +08:00
char * p , * q ;
for ( p = q = s ; * p ; + + p ) {
if ( * p = = ' \\ ' ) {
+ + p ;
if ( * p = = ' t ' ) * q + + = ' \t ' ;
else if ( * p = = ' n ' ) * q + + = ' \n ' ;
else if ( * p = = ' r ' ) * q + + = ' \r ' ;
else if ( * p = = ' \\ ' ) * q + + = ' \\ ' ;
} else * q + + = * p ;
}
* q = ' \0 ' ;
return s ;
2012-04-08 12:55:52 +08:00
}
2013-02-24 05:41:44 +08:00
char * bwa_set_rg ( const char * s )
2012-04-08 12:55:52 +08:00
{
2013-02-24 05:41:44 +08:00
char * p , * q , * r , * rg_line = 0 ;
memset ( bwa_rg_id , 0 , 256 ) ;
2013-02-24 05:44:02 +08:00
if ( strstr ( s , " @RG " ) ! = s ) {
if ( bwa_verbose > = 1 ) fprintf ( stderr , " [E::%s] the read group line is not started with @RG \n " , __func__ ) ;
goto err_set_rg ;
}
2016-08-10 05:05:51 +08:00
if ( strstr ( s , " \t " ) ! = NULL ) {
if ( bwa_verbose > = 1 ) fprintf ( stderr , " [E::%s] the read group line contained literal <tab> characters -- replace with escaped tabs: \\ t \n " , __func__ ) ;
goto err_set_rg ;
}
2013-05-02 22:12:01 +08:00
rg_line = strdup ( s ) ;
2013-02-24 05:41:44 +08:00
bwa_escape ( rg_line ) ;
if ( ( p = strstr ( rg_line , " \t ID: " ) ) = = 0 ) {
2016-08-10 05:05:51 +08:00
if ( bwa_verbose > = 1 ) fprintf ( stderr , " [E::%s] no ID within the read group line \n " , __func__ ) ;
2013-02-24 05:41:44 +08:00
goto err_set_rg ;
}
p + = 4 ;
for ( q = p ; * q & & * q ! = ' \t ' & & * q ! = ' \n ' ; + + q ) ;
if ( q - p + 1 > 256 ) {
2013-02-24 05:44:02 +08:00
if ( bwa_verbose > = 1 ) fprintf ( stderr , " [E::%s] @RG:ID is longer than 255 characters \n " , __func__ ) ;
2013-02-24 05:41:44 +08:00
goto err_set_rg ;
}
for ( q = p , r = bwa_rg_id ; * q & & * q ! = ' \t ' & & * q ! = ' \n ' ; + + q )
* r + + = * q ;
return rg_line ;
err_set_rg :
free ( rg_line ) ;
return 0 ;
2012-04-08 12:55:52 +08:00
}
2013-02-24 05:41:44 +08:00
2014-11-19 23:59:05 +08:00
char * bwa_insert_header ( const char * s , char * hdr )
{
int len = 0 ;
if ( s = = 0 | | s [ 0 ] ! = ' @ ' ) return hdr ;
if ( hdr ) {
len = strlen ( hdr ) ;
hdr = realloc ( hdr , len + strlen ( s ) + 2 ) ;
hdr [ len + + ] = ' \n ' ;
strcpy ( hdr + len , s ) ;
} else hdr = strdup ( s ) ;
bwa_escape ( hdr + len ) ;
return hdr ;
}