2012-04-07 12:23:01 +08:00
# include <string.h>
# include <stdio.h>
2013-02-24 04:30:46 +08:00
# include <zlib.h>
2013-02-26 13:03:49 +08:00
# include <assert.h>
2012-04-07 12:23:01 +08:00
# include "bntseq.h"
2013-02-24 04:30:46 +08:00
# include "bwa.h"
# include "ksw.h"
2013-02-24 05:41:44 +08:00
# include "utils.h"
2014-01-30 01:05:11 +08:00
# include "kstring.h"
2012-04-07 12:23:01 +08:00
2013-05-02 22:12:01 +08:00
# ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
# endif
2013-02-24 05:10:48 +08:00
int bwa_verbose = 3 ;
2013-02-24 05:41:44 +08:00
char bwa_rg_id [ 256 ] ;
2012-04-07 12:23:01 +08:00
2013-02-24 04:30:46 +08:00
/************************
* Batch FASTA / Q reader *
* * * * * * * * * * * * * * * * * * * * * * * */
2012-04-07 12:23:01 +08:00
2013-02-24 04:30:46 +08:00
# include "kseq.h"
KSEQ_DECLARE ( gzFile )
2012-04-07 12:23:01 +08:00
2013-02-24 04:30:46 +08:00
static inline void trim_readno ( kstring_t * s )
2012-04-07 12:23:01 +08:00
{
2013-02-24 04:30:46 +08:00
if ( s - > l > 2 & & s - > s [ s - > l - 2 ] = = ' / ' & & isdigit ( s - > s [ s - > l - 1 ] ) )
s - > l - = 2 , s - > s [ s - > l ] = 0 ;
2012-04-07 12:23:01 +08:00
}
2013-02-24 04:30:46 +08:00
static inline void kseq2bseq1 ( const kseq_t * ks , bseq1_t * s )
{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice
2013-05-02 22:12:01 +08:00
s - > name = strdup ( ks - > name . s ) ;
s - > comment = ks - > comment . l ? strdup ( ks - > comment . s ) : 0 ;
s - > seq = strdup ( ks - > seq . s ) ;
s - > qual = ks - > qual . l ? strdup ( ks - > qual . s ) : 0 ;
2013-02-24 04:30:46 +08:00
s - > l_seq = strlen ( s - > seq ) ;
2012-04-07 12:23:01 +08:00
}
2013-02-24 04:30:46 +08:00
bseq1_t * bseq_read ( int chunk_size , int * n_ , void * ks1_ , void * ks2_ )
2012-04-07 12:23:01 +08:00
{
2013-02-24 04:30:46 +08:00
kseq_t * ks = ( kseq_t * ) ks1_ , * ks2 = ( kseq_t * ) ks2_ ;
int size = 0 , m , n ;
bseq1_t * seqs ;
m = n = 0 ; seqs = 0 ;
while ( kseq_read ( ks ) > = 0 ) {
if ( ks2 & & kseq_read ( ks2 ) < 0 ) { // the 2nd file has fewer reads
fprintf ( stderr , " [W::%s] the 2nd file has fewer sequences. \n " , __func__ ) ;
break ;
}
if ( n > = m ) {
m = m ? m < < 1 : 256 ;
2013-05-02 22:12:01 +08:00
seqs = realloc ( seqs , m * sizeof ( bseq1_t ) ) ;
2013-02-24 04:30:46 +08:00
}
trim_readno ( & ks - > name ) ;
kseq2bseq1 ( ks , & seqs [ n ] ) ;
size + = seqs [ n + + ] . l_seq ;
if ( ks2 ) {
trim_readno ( & ks2 - > name ) ;
kseq2bseq1 ( ks2 , & seqs [ n ] ) ;
size + = seqs [ n + + ] . l_seq ;
}
2013-03-08 00:00:15 +08:00
if ( size > = chunk_size & & ( n & 1 ) = = 0 ) break ;
2013-02-24 04:30:46 +08:00
}
if ( size = = 0 ) { // test if the 2nd file is finished
if ( ks2 & & kseq_read ( ks2 ) > = 0 )
fprintf ( stderr , " [W::%s] the 1st file has fewer sequences. \n " , __func__ ) ;
}
* n_ = n ;
return seqs ;
2012-04-07 12:23:01 +08:00
}
2013-02-26 11:49:15 +08:00
/*****************
* CIGAR related *
* * * * * * * * * * * * * * * * */
2012-04-07 12:23:01 +08:00
2013-03-05 22:38:12 +08:00
void bwa_fill_scmat ( int a , int b , int8_t mat [ 25 ] )
{
int i , j , k ;
for ( i = k = 0 ; i < 4 ; + + i ) {
for ( j = 0 ; j < 4 ; + + j )
mat [ k + + ] = i = = j ? a : - b ;
2013-03-10 07:03:57 +08:00
mat [ k + + ] = - 1 ; // ambiguous base
2013-03-05 22:38:12 +08:00
}
2013-03-10 07:03:57 +08:00
for ( j = 0 ; j < 5 ; + + j ) mat [ k + + ] = - 1 ;
2013-03-05 22:38:12 +08:00
}
2013-02-24 04:30:46 +08:00
// Generate CIGAR when the alignment end points are known
2014-03-29 02:54:06 +08:00
uint32_t * bwa_gen_cigar2 ( const int8_t mat [ 25 ] , int o_del , int e_del , int o_ins , int e_ins , int w_ , int64_t l_pac , const uint8_t * pac , int l_query , uint8_t * query , int64_t rb , int64_t re , int * score , int * n_cigar , int * NM )
2012-04-07 12:23:01 +08:00
{
2013-02-24 04:30:46 +08:00
uint32_t * cigar = 0 ;
uint8_t tmp , * rseq ;
2013-02-28 12:59:50 +08:00
int i ;
2013-02-24 04:30:46 +08:00
int64_t rlen ;
2014-01-30 01:05:11 +08:00
kstring_t str ;
2014-02-18 23:32:24 +08:00
const char * int2base ;
2014-01-30 01:05:11 +08:00
2014-04-17 04:38:50 +08:00
if ( n_cigar ) * n_cigar = 0 ;
if ( NM ) * NM = - 1 ;
2013-02-24 04:30:46 +08:00
if ( l_query < = 0 | | rb > = re | | ( rb < l_pac & & re > l_pac ) ) return 0 ; // reject if negative length or bridging the forward and reverse strand
rseq = bns_get_seq ( l_pac , pac , rb , re , & rlen ) ;
if ( re - rb ! = rlen ) goto ret_gen_cigar ; // possible if out of range
if ( rb > = l_pac ) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position
for ( i = 0 ; i < l_query > > 1 ; + + i )
tmp = query [ i ] , query [ i ] = query [ l_query - 1 - i ] , query [ l_query - 1 - i ] = tmp ;
for ( i = 0 ; i < rlen > > 1 ; + + i )
tmp = rseq [ i ] , rseq [ i ] = rseq [ rlen - 1 - i ] , rseq [ rlen - 1 - i ] = tmp ;
2012-04-07 12:23:01 +08:00
}
2013-02-28 12:59:50 +08:00
if ( l_query = = re - rb & & w_ = = 0 ) { // no gap; no need to do DP
2014-05-01 23:13:05 +08:00
// UPDATE: we come to this block now... FIXME: due to an issue in mem_reg2aln(), we never come to this block. This does not affect accuracy, but it hurts performance.
if ( n_cigar ) {
cigar = malloc ( 4 ) ;
cigar [ 0 ] = l_query < < 4 | 0 ;
* n_cigar = 1 ;
}
2013-02-28 12:59:50 +08:00
for ( i = 0 , * score = 0 ; i < l_query ; + + i )
* score + = mat [ rseq [ i ] * 5 + query [ i ] ] ;
} else {
2014-03-29 02:54:06 +08:00
int w , max_gap , max_ins , max_del , min_w ;
2013-02-28 12:59:50 +08:00
// set the band-width
2014-03-29 02:54:06 +08:00
max_ins = ( int ) ( ( double ) ( ( ( l_query + 1 ) > > 1 ) * mat [ 0 ] - o_ins ) / e_ins + 1. ) ;
max_del = ( int ) ( ( double ) ( ( ( l_query + 1 ) > > 1 ) * mat [ 0 ] - o_del ) / e_del + 1. ) ;
max_gap = max_ins > max_del ? max_ins : max_del ;
2013-02-28 12:59:50 +08:00
max_gap = max_gap > 1 ? max_gap : 1 ;
w = ( max_gap + abs ( rlen - l_query ) + 1 ) > > 1 ;
w = w < w_ ? w : w_ ;
min_w = abs ( rlen - l_query ) + 3 ;
w = w > min_w ? w : min_w ;
// NW alignment
2014-03-17 11:25:04 +08:00
if ( bwa_verbose > = 4 ) {
printf ( " * Global bandwidth: %d \n " , w ) ;
printf ( " * Global ref: " ) ; for ( i = 0 ; i < rlen ; + + i ) putchar ( " ACGTN " [ ( int ) rseq [ i ] ] ) ; putchar ( ' \n ' ) ;
printf ( " * Global query: " ) ; for ( i = 0 ; i < l_query ; + + i ) putchar ( " ACGTN " [ ( int ) query [ i ] ] ) ; putchar ( ' \n ' ) ;
}
2014-03-29 02:54:06 +08:00
* score = ksw_global2 ( l_query , query , rlen , rseq , 5 , mat , o_del , e_del , o_ins , e_ins , w , n_cigar , & cigar ) ;
2013-02-28 12:59:50 +08:00
}
2014-04-17 04:38:50 +08:00
if ( NM & & n_cigar ) { // compute NM and MD
2014-01-30 01:05:11 +08:00
int k , x , y , u , n_mm = 0 , n_gap = 0 ;
str . l = str . m = * n_cigar * 4 ; str . s = ( char * ) cigar ; // append MD to CIGAR
2014-02-18 23:32:24 +08:00
int2base = rb < l_pac ? " ACGTN " : " TGCAN " ;
2014-01-30 01:05:11 +08:00
for ( k = 0 , x = y = u = 0 ; k < * n_cigar ; + + k ) {
2014-02-01 01:58:21 +08:00
int op , len ;
cigar = ( uint32_t * ) str . s ;
op = cigar [ k ] & 0xf , len = cigar [ k ] > > 4 ;
2013-02-27 02:36:01 +08:00
if ( op = = 0 ) { // match
2014-01-30 01:05:11 +08:00
for ( i = 0 ; i < len ; + + i ) {
if ( query [ x + i ] ! = rseq [ y + i ] ) {
2014-02-18 23:32:24 +08:00
kputw ( u , & str ) ;
kputc ( int2base [ rseq [ y + i ] ] , & str ) ;
2014-01-30 01:05:11 +08:00
+ + n_mm ; u = 0 ;
} else + + u ;
}
2013-02-27 02:36:01 +08:00
x + = len ; y + = len ;
2014-01-30 01:05:11 +08:00
} else if ( op = = 2 ) { // deletion
2014-02-21 02:06:40 +08:00
if ( k > 0 & & k < * n_cigar - 1 ) { // don't do the following if D is the first or the last CIGAR
2014-02-20 00:31:54 +08:00
kputw ( u , & str ) ; kputc ( ' ^ ' , & str ) ;
for ( i = 0 ; i < len ; + + i )
kputc ( int2base [ rseq [ y + i ] ] , & str ) ;
u = 0 ; n_gap + = len ;
}
y + = len ;
2014-01-30 01:05:11 +08:00
} else if ( op = = 1 ) x + = len , n_gap + = len ; // insertion
2012-04-08 10:00:03 +08:00
}
2014-01-30 01:05:11 +08:00
kputw ( u , & str ) ; kputc ( 0 , & str ) ;
2013-02-27 02:36:01 +08:00
* NM = n_mm + n_gap ;
2014-01-30 01:05:11 +08:00
cigar = ( uint32_t * ) str . s ;
2012-04-08 10:00:03 +08:00
}
2013-02-24 04:30:46 +08:00
if ( rb > = l_pac ) // reverse back query
for ( i = 0 ; i < l_query > > 1 ; + + i )
tmp = query [ i ] , query [ i ] = query [ l_query - 1 - i ] , query [ l_query - 1 - i ] = tmp ;
ret_gen_cigar :
free ( rseq ) ;
return cigar ;
2012-04-08 10:00:03 +08:00
}
2014-03-29 02:54:06 +08:00
uint32_t * bwa_gen_cigar ( const int8_t mat [ 25 ] , int q , int r , int w_ , int64_t l_pac , const uint8_t * pac , int l_query , uint8_t * query , int64_t rb , int64_t re , int * score , int * n_cigar , int * NM )
{
return bwa_gen_cigar2 ( mat , q , r , q , r , w_ , l_pac , pac , l_query , query , rb , re , score , n_cigar , NM ) ;
}
2013-02-24 04:55:55 +08:00
/*********************
* Full index reader *
* * * * * * * * * * * * * * * * * * * * */
2012-04-08 12:55:52 +08:00
2013-02-24 05:10:48 +08:00
char * bwa_idx_infer_prefix ( const char * hint )
2012-04-08 12:02:06 +08:00
{
2013-02-24 05:10:48 +08:00
char * prefix ;
int l_hint ;
FILE * fp ;
l_hint = strlen ( hint ) ;
2013-05-02 22:12:01 +08:00
prefix = malloc ( l_hint + 3 + 4 + 1 ) ;
2013-02-24 05:10:48 +08:00
strcpy ( prefix , hint ) ;
strcpy ( prefix + l_hint , " .64.bwt " ) ;
if ( ( fp = fopen ( prefix , " rb " ) ) ! = 0 ) {
fclose ( fp ) ;
prefix [ l_hint + 3 ] = 0 ;
return prefix ;
} else {
strcpy ( prefix + l_hint , " .bwt " ) ;
if ( ( fp = fopen ( prefix , " rb " ) ) = = 0 ) {
free ( prefix ) ;
return 0 ;
} else {
fclose ( fp ) ;
prefix [ l_hint ] = 0 ;
return prefix ;
2012-04-08 12:02:06 +08:00
}
}
2013-02-24 05:10:48 +08:00
}
bwt_t * bwa_idx_load_bwt ( const char * hint )
{
char * tmp , * prefix ;
bwt_t * bwt ;
prefix = bwa_idx_infer_prefix ( hint ) ;
if ( prefix = = 0 ) {
2013-02-24 05:41:44 +08:00
if ( bwa_verbose > = 1 ) fprintf ( stderr , " [E::%s] fail to locate the index files \n " , __func__ ) ;
2013-02-24 05:10:48 +08:00
return 0 ;
}
2013-05-02 22:12:01 +08:00
tmp = calloc ( strlen ( prefix ) + 5 , 1 ) ;
2013-02-24 05:10:48 +08:00
strcat ( strcpy ( tmp , prefix ) , " .bwt " ) ; // FM-index
bwt = bwt_restore_bwt ( tmp ) ;
strcat ( strcpy ( tmp , prefix ) , " .sa " ) ; // partial suffix array (SA)
bwt_restore_sa ( tmp , bwt ) ;
free ( tmp ) ; free ( prefix ) ;
return bwt ;
}
bwaidx_t * bwa_idx_load ( const char * hint , int which )
2013-02-24 04:55:55 +08:00
{
bwaidx_t * idx ;
2013-02-24 05:10:48 +08:00
char * prefix ;
prefix = bwa_idx_infer_prefix ( hint ) ;
2013-02-24 05:41:44 +08:00
if ( prefix = = 0 ) {
if ( bwa_verbose > = 1 ) fprintf ( stderr , " [E::%s] fail to locate the index files \n " , __func__ ) ;
return 0 ;
2012-04-08 12:02:06 +08:00
}
2013-05-02 22:12:01 +08:00
idx = calloc ( 1 , sizeof ( bwaidx_t ) ) ;
2013-02-24 05:10:48 +08:00
if ( which & BWA_IDX_BWT ) idx - > bwt = bwa_idx_load_bwt ( hint ) ;
2013-02-24 04:55:55 +08:00
if ( which & BWA_IDX_BNS ) {
2014-10-15 11:37:24 +08:00
int i , c ;
2013-02-24 04:55:55 +08:00
idx - > bns = bns_restore ( prefix ) ;
2014-10-15 11:37:24 +08:00
for ( i = c = 0 ; i < idx - > bns - > n_seqs ; + + i )
if ( idx - > bns - > anns [ i ] . is_alt ) + + c ;
if ( bwa_verbose > = 3 )
fprintf ( stderr , " [M::%s] read %d ALT contigs \n " , __func__ , c ) ;
2013-02-24 04:55:55 +08:00
if ( which & BWA_IDX_PAC ) {
2013-05-02 22:12:01 +08:00
idx - > pac = calloc ( idx - > bns - > l_pac / 4 + 1 , 1 ) ;
2013-03-01 17:37:46 +08:00
err_fread_noeof ( idx - > pac , 1 , idx - > bns - > l_pac / 4 + 1 , idx - > bns - > fp_pac ) ; // concatenated 2-bit encoded sequence
err_fclose ( idx - > bns - > fp_pac ) ;
2013-02-24 05:10:48 +08:00
idx - > bns - > fp_pac = 0 ;
2013-02-24 04:55:55 +08:00
}
2012-04-08 12:02:06 +08:00
}
2013-02-24 05:10:48 +08:00
free ( prefix ) ;
2013-02-24 04:55:55 +08:00
return idx ;
2012-04-08 12:02:06 +08:00
}
2013-02-24 04:55:55 +08:00
void bwa_idx_destroy ( bwaidx_t * idx )
2012-04-08 12:02:06 +08:00
{
2013-02-24 04:55:55 +08:00
if ( idx = = 0 ) return ;
if ( idx - > bwt ) bwt_destroy ( idx - > bwt ) ;
if ( idx - > bns ) bns_destroy ( idx - > bns ) ;
if ( idx - > pac ) free ( idx - > pac ) ;
free ( idx ) ;
2012-04-08 12:02:06 +08:00
}
2012-04-08 12:55:52 +08:00
2013-02-24 05:41:44 +08:00
/***********************
* SAM header routines *
* * * * * * * * * * * * * * * * * * * * * * */
void bwa_print_sam_hdr ( const bntseq_t * bns , const char * rg_line )
{
int i ;
2013-11-20 00:08:45 +08:00
extern char * bwa_pg ;
2013-02-24 05:41:44 +08:00
for ( i = 0 ; i < bns - > n_seqs ; + + i )
err_printf ( " @SQ \t SN:%s \t LN:%d \n " , bns - > anns [ i ] . name , bns - > anns [ i ] . len ) ;
if ( rg_line ) err_printf ( " %s \n " , rg_line ) ;
2013-11-20 00:08:45 +08:00
err_printf ( " %s \n " , bwa_pg ) ;
2013-02-24 05:41:44 +08:00
}
2012-04-08 12:55:52 +08:00
2013-02-24 05:41:44 +08:00
static char * bwa_escape ( char * s )
2012-04-08 12:55:52 +08:00
{
2013-02-24 05:41:44 +08:00
char * p , * q ;
for ( p = q = s ; * p ; + + p ) {
if ( * p = = ' \\ ' ) {
+ + p ;
if ( * p = = ' t ' ) * q + + = ' \t ' ;
else if ( * p = = ' n ' ) * q + + = ' \n ' ;
else if ( * p = = ' r ' ) * q + + = ' \r ' ;
else if ( * p = = ' \\ ' ) * q + + = ' \\ ' ;
} else * q + + = * p ;
}
* q = ' \0 ' ;
return s ;
2012-04-08 12:55:52 +08:00
}
2013-02-24 05:41:44 +08:00
char * bwa_set_rg ( const char * s )
2012-04-08 12:55:52 +08:00
{
2013-02-24 05:41:44 +08:00
char * p , * q , * r , * rg_line = 0 ;
memset ( bwa_rg_id , 0 , 256 ) ;
2013-02-24 05:44:02 +08:00
if ( strstr ( s , " @RG " ) ! = s ) {
if ( bwa_verbose > = 1 ) fprintf ( stderr , " [E::%s] the read group line is not started with @RG \n " , __func__ ) ;
goto err_set_rg ;
}
2013-05-02 22:12:01 +08:00
rg_line = strdup ( s ) ;
2013-02-24 05:41:44 +08:00
bwa_escape ( rg_line ) ;
if ( ( p = strstr ( rg_line , " \t ID: " ) ) = = 0 ) {
2013-02-24 05:44:02 +08:00
if ( bwa_verbose > = 1 ) fprintf ( stderr , " [E::%s] no ID at the read group line \n " , __func__ ) ;
2013-02-24 05:41:44 +08:00
goto err_set_rg ;
}
p + = 4 ;
for ( q = p ; * q & & * q ! = ' \t ' & & * q ! = ' \n ' ; + + q ) ;
if ( q - p + 1 > 256 ) {
2013-02-24 05:44:02 +08:00
if ( bwa_verbose > = 1 ) fprintf ( stderr , " [E::%s] @RG:ID is longer than 255 characters \n " , __func__ ) ;
2013-02-24 05:41:44 +08:00
goto err_set_rg ;
}
for ( q = p , r = bwa_rg_id ; * q & & * q ! = ' \t ' & & * q ! = ' \n ' ; + + q )
* r + + = * q ;
return rg_line ;
err_set_rg :
free ( rg_line ) ;
return 0 ;
2012-04-08 12:55:52 +08:00
}
2013-02-24 05:41:44 +08:00