2017-04-08 03:30:30 +08:00
# include <stdlib.h>
# include <assert.h>
2017-09-03 05:52:33 +08:00
# if defined(WIN32) || defined(_WIN32)
# include <io.h> // for open(2)
2017-08-30 23:25:20 +08:00
# else
2017-07-19 22:25:11 +08:00
# include <unistd.h>
2017-08-30 23:25:20 +08:00
# endif
2017-07-19 22:25:11 +08:00
# include <fcntl.h>
2017-04-08 03:30:30 +08:00
# include <stdio.h>
2018-01-19 23:40:18 +08:00
# define __STDC_LIMIT_MACROS
2017-04-08 03:30:30 +08:00
# include "kthread.h"
# include "bseq.h"
# include "minimap.h"
2017-06-06 22:16:33 +08:00
# include "mmpriv.h"
2017-04-08 03:30:30 +08:00
# include "kvec.h"
# include "khash.h"
# define idx_hash(a) ((a)>>1)
# define idx_eq(a, b) ((a)>>1 == (b)>>1)
KHASH_INIT ( idx , uint64_t , uint64_t , 1 , idx_hash , idx_eq )
typedef khash_t ( idx ) idxhash_t ;
2018-02-23 23:18:26 +08:00
KHASH_MAP_INIT_STR ( str , uint32_t )
2017-04-08 03:30:30 +08:00
# define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x))
2017-09-15 09:18:13 +08:00
typedef struct mm_idx_bucket_s {
mm128_v a ; // (minimizer, position) array
int32_t n ; // size of the _p_ array
uint64_t * p ; // position array for minimizers appearing >1 times
void * h ; // hash table indexing _p_ and minimizers appearing once
} mm_idx_bucket_t ;
2019-04-29 02:52:47 +08:00
typedef struct {
int32_t st , en , max ; // max is not used for now
int32_t score : 30 , strand : 2 ;
} mm_idx_intv1_t ;
typedef struct mm_idx_intv_s {
int32_t n , m ;
mm_idx_intv1_t * a ;
} mm_idx_intv_t ;
2017-11-12 08:54:06 +08:00
mm_idx_t * mm_idx_init ( int w , int k , int b , int flag )
2017-04-08 03:30:30 +08:00
{
mm_idx_t * mi ;
if ( k * 2 < b ) b = k * 2 ;
if ( w < 1 ) w = 1 ;
mi = ( mm_idx_t * ) calloc ( 1 , sizeof ( mm_idx_t ) ) ;
2017-11-12 08:54:06 +08:00
mi - > w = w , mi - > k = k , mi - > b = b , mi - > flag = flag ;
2017-04-08 03:30:30 +08:00
mi - > B = ( mm_idx_bucket_t * ) calloc ( 1 < < b , sizeof ( mm_idx_bucket_t ) ) ;
2017-07-25 07:36:05 +08:00
if ( ! ( mm_dbg_flag & 1 ) ) mi - > km = km_init ( ) ;
2017-04-08 03:30:30 +08:00
return mi ;
}
void mm_idx_destroy ( mm_idx_t * mi )
{
2018-06-20 03:26:58 +08:00
uint32_t i ;
2017-04-08 03:30:30 +08:00
if ( mi = = 0 ) return ;
2018-02-23 23:18:26 +08:00
if ( mi - > h ) kh_destroy ( str , ( khash_t ( str ) * ) mi - > h ) ;
2018-07-16 10:11:32 +08:00
if ( mi - > B ) {
for ( i = 0 ; i < 1U < < mi - > b ; + + i ) {
free ( mi - > B [ i ] . p ) ;
free ( mi - > B [ i ] . a . a ) ;
kh_destroy ( idx , ( idxhash_t * ) mi - > B [ i ] . h ) ;
}
2017-04-08 03:30:30 +08:00
}
2019-04-29 02:52:47 +08:00
if ( mi - > I ) {
for ( i = 0 ; i < mi - > n_seq ; + + i )
free ( mi - > I [ i ] . a ) ;
free ( mi - > I ) ;
}
2017-07-25 07:36:05 +08:00
if ( ! mi - > km ) {
for ( i = 0 ; i < mi - > n_seq ; + + i )
free ( mi - > seq [ i ] . name ) ;
free ( mi - > seq ) ;
} else km_destroy ( mi - > km ) ;
free ( mi - > B ) ; free ( mi - > S ) ; free ( mi ) ;
2017-04-08 03:30:30 +08:00
}
const uint64_t * mm_idx_get ( const mm_idx_t * mi , uint64_t minier , int * n )
{
int mask = ( 1 < < mi - > b ) - 1 ;
khint_t k ;
mm_idx_bucket_t * b = & mi - > B [ minier & mask ] ;
idxhash_t * h = ( idxhash_t * ) b - > h ;
* n = 0 ;
if ( h = = 0 ) return 0 ;
k = kh_get ( idx , h , minier > > mi - > b < < 1 ) ;
if ( k = = kh_end ( h ) ) return 0 ;
2017-04-13 01:31:12 +08:00
if ( kh_key ( h , k ) & 1 ) { // special casing when there is only one k-mer
2017-04-08 03:30:30 +08:00
* n = 1 ;
return & kh_val ( h , k ) ;
} else {
* n = ( uint32_t ) kh_val ( h , k ) ;
return & b - > p [ kh_val ( h , k ) > > 32 ] ;
}
}
2017-04-26 22:52:28 +08:00
void mm_idx_stat ( const mm_idx_t * mi )
{
2018-06-20 03:26:58 +08:00
int n = 0 , n1 = 0 ;
uint32_t i ;
2017-04-26 22:52:28 +08:00
uint64_t sum = 0 , len = 0 ;
2017-11-12 08:54:06 +08:00
fprintf ( stderr , " [M::%s] kmer size: %d; skip: %d; is_hpc: %d; #seq: %d \n " , __func__ , mi - > k , mi - > w , mi - > flag & MM_I_HPC , mi - > n_seq ) ;
2017-04-26 22:52:28 +08:00
for ( i = 0 ; i < mi - > n_seq ; + + i )
len + = mi - > seq [ i ] . len ;
2018-06-20 03:26:58 +08:00
for ( i = 0 ; i < 1U < < mi - > b ; + + i )
2017-04-26 22:52:28 +08:00
if ( mi - > B [ i ] . h ) n + = kh_size ( ( idxhash_t * ) mi - > B [ i ] . h ) ;
2018-06-20 03:26:58 +08:00
for ( i = 0 ; i < 1U < < mi - > b ; + + i ) {
2017-04-26 22:52:28 +08:00
idxhash_t * h = ( idxhash_t * ) mi - > B [ i ] . h ;
khint_t k ;
if ( h = = 0 ) continue ;
for ( k = 0 ; k < kh_end ( h ) ; + + k )
2017-05-09 22:20:57 +08:00
if ( kh_exist ( h , k ) ) {
2017-04-26 22:52:28 +08:00
sum + = kh_key ( h , k ) & 1 ? 1 : ( uint32_t ) kh_val ( h , k ) ;
2017-05-09 22:20:57 +08:00
if ( kh_key ( h , k ) & 1 ) + + n1 ;
}
2017-04-26 22:52:28 +08:00
}
2020-01-07 10:13:33 +08:00
fprintf ( stderr , " [M::%s::%.3f*%.2f] distinct minimizers: %d (%.2f%% are singletons); average occurrences: %.3lf; average spacing: %.3lf; total length: %ld \n " ,
__func__ , realtime ( ) - mm_realtime0 , cputime ( ) / ( realtime ( ) - mm_realtime0 ) , n , 100.0 * n1 / n , ( double ) sum / n , ( double ) len / sum , ( long ) len ) ;
2017-04-26 22:52:28 +08:00
}
2018-02-23 23:18:26 +08:00
int mm_idx_index_name ( mm_idx_t * mi )
{
khash_t ( str ) * h ;
uint32_t i ;
int has_dup = 0 , absent ;
if ( mi - > h ) return 0 ;
h = kh_init ( str ) ;
for ( i = 0 ; i < mi - > n_seq ; + + i ) {
khint_t k ;
k = kh_put ( str , h , mi - > seq [ i ] . name , & absent ) ;
if ( absent ) kh_val ( h , k ) = i ;
else has_dup = 1 ;
}
mi - > h = h ;
if ( has_dup & & mm_verbose > = 2 )
fprintf ( stderr , " [WARNING] some database sequences have identical sequence names \n " ) ;
return has_dup ;
}
int mm_idx_name2id ( const mm_idx_t * mi , const char * name )
{
khash_t ( str ) * h = ( khash_t ( str ) * ) mi - > h ;
khint_t k ;
if ( h = = 0 ) return - 2 ;
k = kh_get ( str , h , name ) ;
return k = = kh_end ( h ) ? - 1 : kh_val ( h , k ) ;
}
2017-06-24 01:44:45 +08:00
int mm_idx_getseq ( const mm_idx_t * mi , uint32_t rid , uint32_t st , uint32_t en , uint8_t * seq )
{
uint64_t i , st1 , en1 ;
if ( rid > = mi - > n_seq | | st > = mi - > seq [ rid ] . len ) return - 1 ;
if ( en > mi - > seq [ rid ] . len ) en = mi - > seq [ rid ] . len ;
st1 = mi - > seq [ rid ] . offset + st ;
en1 = mi - > seq [ rid ] . offset + en ;
for ( i = st1 ; i < en1 ; + + i )
seq [ i - st1 ] = mm_seq4_get ( mi - > S , i ) ;
return en - st ;
}
2017-09-13 23:37:00 +08:00
int32_t mm_idx_cal_max_occ ( const mm_idx_t * mi , float f )
2017-04-08 03:30:30 +08:00
{
int i ;
size_t n = 0 ;
uint32_t thres ;
khint_t * a , k ;
2017-09-13 23:37:00 +08:00
if ( f < = 0. ) return INT32_MAX ;
2017-04-08 03:30:30 +08:00
for ( i = 0 ; i < 1 < < mi - > b ; + + i )
if ( mi - > B [ i ] . h ) n + = kh_size ( ( idxhash_t * ) mi - > B [ i ] . h ) ;
a = ( uint32_t * ) malloc ( n * 4 ) ;
for ( i = n = 0 ; i < 1 < < mi - > b ; + + i ) {
idxhash_t * h = ( idxhash_t * ) mi - > B [ i ] . h ;
if ( h = = 0 ) continue ;
for ( k = 0 ; k < kh_end ( h ) ; + + k ) {
if ( ! kh_exist ( h , k ) ) continue ;
a [ n + + ] = kh_key ( h , k ) & 1 ? 1 : ( uint32_t ) kh_val ( h , k ) ;
}
}
thres = ks_ksmall_uint32_t ( n , a , ( uint32_t ) ( ( 1. - f ) * n ) ) + 1 ;
free ( a ) ;
return thres ;
}
/*********************************
* Sort and generate hash tables *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
static void worker_post ( void * g , long i , int tid )
{
2018-06-20 03:26:58 +08:00
int n , n_keys ;
size_t j , start_a , start_p ;
2017-04-08 03:30:30 +08:00
idxhash_t * h ;
mm_idx_t * mi = ( mm_idx_t * ) g ;
mm_idx_bucket_t * b = & mi - > B [ i ] ;
if ( b - > a . n = = 0 ) return ;
// sort by minimizer
radix_sort_128x ( b - > a . a , b - > a . a + b - > a . n ) ;
// count and preallocate
for ( j = 1 , n = 1 , n_keys = 0 , b - > n = 0 ; j < = b - > a . n ; + + j ) {
if ( j = = b - > a . n | | b - > a . a [ j ] . x > > 8 ! = b - > a . a [ j - 1 ] . x > > 8 ) {
+ + n_keys ;
if ( n > 1 ) b - > n + = n ;
n = 1 ;
} else + + n ;
}
h = kh_init ( idx ) ;
kh_resize ( idx , h , n_keys ) ;
b - > p = ( uint64_t * ) calloc ( b - > n , 8 ) ;
// create the hash table
for ( j = 1 , n = 1 , start_a = start_p = 0 ; j < = b - > a . n ; + + j ) {
if ( j = = b - > a . n | | b - > a . a [ j ] . x > > 8 ! = b - > a . a [ j - 1 ] . x > > 8 ) {
khint_t itr ;
int absent ;
mm128_t * p = & b - > a . a [ j - 1 ] ;
itr = kh_put ( idx , h , p - > x > > 8 > > mi - > b < < 1 , & absent ) ;
2018-06-20 03:26:58 +08:00
assert ( absent & & j = = start_a + n ) ;
2017-04-08 03:30:30 +08:00
if ( n = = 1 ) {
kh_key ( h , itr ) | = 1 ;
kh_val ( h , itr ) = p - > y ;
} else {
int k ;
for ( k = 0 ; k < n ; + + k )
b - > p [ start_p + k ] = b - > a . a [ start_a + k ] . y ;
2017-04-13 01:31:12 +08:00
radix_sort_64 ( & b - > p [ start_p ] , & b - > p [ start_p + n ] ) ; // sort by position; needed as in-place radix_sort_128x() is not stable
2017-04-08 03:30:30 +08:00
kh_val ( h , itr ) = ( uint64_t ) start_p < < 32 | n ;
start_p + = n ;
}
start_a = j , n = 1 ;
} else + + n ;
}
b - > h = h ;
2018-06-20 03:26:58 +08:00
assert ( b - > n = = ( int32_t ) start_p ) ;
2017-04-08 03:30:30 +08:00
// deallocate and clear b->a
2017-09-23 22:43:22 +08:00
kfree ( 0 , b - > a . a ) ;
2017-04-08 03:30:30 +08:00
b - > a . n = b - > a . m = 0 , b - > a . a = 0 ;
}
static void mm_idx_post ( mm_idx_t * mi , int n_threads )
{
kt_for ( n_threads , worker_post , mi , 1 < < mi - > b ) ;
}
/******************
* Generate index *
* * * * * * * * * * * * * * * * * */
# include <string.h>
# include <zlib.h>
# include "bseq.h"
typedef struct {
2017-11-12 08:54:06 +08:00
int mini_batch_size ;
2017-04-19 23:06:24 +08:00
uint64_t batch_size , sum_len ;
2017-07-19 21:26:46 +08:00
mm_bseq_file_t * fp ;
2017-04-08 03:30:30 +08:00
mm_idx_t * mi ;
} pipeline_t ;
typedef struct {
int n_seq ;
2017-07-19 21:26:46 +08:00
mm_bseq1_t * seq ;
2017-04-08 03:30:30 +08:00
mm128_v a ;
} step_t ;
static void mm_idx_add ( mm_idx_t * mi , int n , const mm128_t * a )
{
int i , mask = ( 1 < < mi - > b ) - 1 ;
for ( i = 0 ; i < n ; + + i ) {
2017-04-10 02:17:37 +08:00
mm128_v * p = & mi - > B [ a [ i ] . x > > 8 & mask ] . a ;
2017-04-08 03:30:30 +08:00
kv_push ( mm128_t , 0 , * p , a [ i ] ) ;
}
}
static void * worker_pipeline ( void * shared , int step , void * in )
{
int i ;
pipeline_t * p = ( pipeline_t * ) shared ;
if ( step = = 0 ) { // step 0: read sequences
step_t * s ;
2017-04-19 23:06:24 +08:00
if ( p - > sum_len > p - > batch_size ) return 0 ;
2017-04-08 03:30:30 +08:00
s = ( step_t * ) calloc ( 1 , sizeof ( step_t ) ) ;
2017-07-19 21:26:46 +08:00
s - > seq = mm_bseq_read ( p - > fp , p - > mini_batch_size , 0 , & s - > n_seq ) ; // read a mini-batch
2017-04-08 03:30:30 +08:00
if ( s - > seq ) {
uint32_t old_m , m ;
assert ( ( uint64_t ) p - > mi - > n_seq + s - > n_seq < = UINT32_MAX ) ; // to prevent integer overflow
// make room for p->mi->seq
old_m = p - > mi - > n_seq , m = p - > mi - > n_seq + s - > n_seq ;
kroundup32 ( m ) ; kroundup32 ( old_m ) ;
if ( old_m ! = m )
2017-07-25 07:36:05 +08:00
p - > mi - > seq = ( mm_idx_seq_t * ) krealloc ( p - > mi - > km , p - > mi - > seq , m * sizeof ( mm_idx_seq_t ) ) ;
2017-04-08 03:30:30 +08:00
// make room for p->mi->S
2017-11-12 10:38:38 +08:00
if ( ! ( p - > mi - > flag & MM_I_NO_SEQ ) ) {
uint64_t sum_len , old_max_len , max_len ;
for ( i = 0 , sum_len = 0 ; i < s - > n_seq ; + + i ) sum_len + = s - > seq [ i ] . l_seq ;
old_max_len = ( p - > sum_len + 7 ) / 8 ;
max_len = ( p - > sum_len + sum_len + 7 ) / 8 ;
kroundup64 ( old_max_len ) ; kroundup64 ( max_len ) ;
if ( old_max_len ! = max_len ) {
p - > mi - > S = ( uint32_t * ) realloc ( p - > mi - > S , max_len * 4 ) ;
memset ( & p - > mi - > S [ old_max_len ] , 0 , 4 * ( max_len - old_max_len ) ) ;
}
2017-04-08 03:30:30 +08:00
}
// populate p->mi->seq
for ( i = 0 ; i < s - > n_seq ; + + i ) {
2017-04-10 02:17:37 +08:00
mm_idx_seq_t * seq = & p - > mi - > seq [ p - > mi - > n_seq ] ;
2017-04-08 03:30:30 +08:00
uint32_t j ;
2017-11-12 08:54:06 +08:00
if ( ! ( p - > mi - > flag & MM_I_NO_NAME ) ) {
2017-07-25 07:36:05 +08:00
seq - > name = ( char * ) kmalloc ( p - > mi - > km , strlen ( s - > seq [ i ] . name ) + 1 ) ;
strcpy ( seq - > name , s - > seq [ i ] . name ) ;
2017-04-08 03:30:30 +08:00
} else seq - > name = 0 ;
seq - > len = s - > seq [ i ] . l_seq ;
2017-04-19 23:06:24 +08:00
seq - > offset = p - > sum_len ;
2020-01-21 08:32:31 +08:00
seq - > is_alt = 0 ;
2017-04-08 03:30:30 +08:00
// copy the sequence
2017-11-12 10:38:38 +08:00
if ( ! ( p - > mi - > flag & MM_I_NO_SEQ ) ) {
for ( j = 0 ; j < seq - > len ; + + j ) { // TODO: this is not the fastest way, but let's first see if speed matters here
uint64_t o = p - > sum_len + j ;
int c = seq_nt4_table [ ( uint8_t ) s - > seq [ i ] . seq [ j ] ] ;
mm_seq4_set ( p - > mi - > S , o , c ) ;
}
2017-04-08 03:30:30 +08:00
}
2017-04-19 23:06:24 +08:00
// update p->sum_len and p->mi->n_seq
p - > sum_len + = seq - > len ;
2017-04-08 03:30:30 +08:00
s - > seq [ i ] . rid = p - > mi - > n_seq + + ;
}
return s ;
} else free ( s ) ;
} else if ( step = = 1 ) { // step 1: compute sketch
step_t * s = ( step_t * ) in ;
for ( i = 0 ; i < s - > n_seq ; + + i ) {
2017-07-19 21:26:46 +08:00
mm_bseq1_t * t = & s - > seq [ i ] ;
2017-12-08 10:05:39 +08:00
if ( t - > l_seq > 0 )
mm_sketch ( 0 , t - > seq , t - > l_seq , p - > mi - > w , p - > mi - > k , t - > rid , p - > mi - > flag & MM_I_HPC , & s - > a ) ;
else if ( mm_verbose > = 2 )
fprintf ( stderr , " [WARNING] the length database sequence '%s' is 0 \n " , t - > name ) ;
2017-04-08 03:30:30 +08:00
free ( t - > seq ) ; free ( t - > name ) ;
}
free ( s - > seq ) ; s - > seq = 0 ;
return s ;
} else if ( step = = 2 ) { // dispatch sketch to buckets
step_t * s = ( step_t * ) in ;
mm_idx_add ( p - > mi , s - > a . n , s - > a . a ) ;
2017-09-23 22:43:22 +08:00
kfree ( 0 , s - > a . a ) ; free ( s ) ;
2017-04-08 03:30:30 +08:00
}
return 0 ;
}
2017-11-12 08:54:06 +08:00
mm_idx_t * mm_idx_gen ( mm_bseq_file_t * fp , int w , int k , int b , int flag , int mini_batch_size , int n_threads , uint64_t batch_size )
2017-04-08 03:30:30 +08:00
{
pipeline_t pl ;
2017-08-25 10:35:58 +08:00
if ( fp = = 0 | | mm_bseq_eof ( fp ) ) return 0 ;
2017-04-08 03:30:30 +08:00
memset ( & pl , 0 , sizeof ( pipeline_t ) ) ;
2018-06-20 03:26:58 +08:00
pl . mini_batch_size = ( uint64_t ) mini_batch_size < batch_size ? mini_batch_size : batch_size ;
2017-04-08 03:30:30 +08:00
pl . batch_size = batch_size ;
pl . fp = fp ;
2017-11-12 08:54:06 +08:00
pl . mi = mm_idx_init ( w , k , b , flag ) ;
2017-04-08 03:30:30 +08:00
kt_pipeline ( n_threads < 3 ? n_threads : 3 , worker_pipeline , & pl , 3 ) ;
if ( mm_verbose > = 3 )
fprintf ( stderr , " [M::%s::%.3f*%.2f] collected minimizers \n " , __func__ , realtime ( ) - mm_realtime0 , cputime ( ) / ( realtime ( ) - mm_realtime0 ) ) ;
mm_idx_post ( pl . mi , n_threads ) ;
if ( mm_verbose > = 3 )
fprintf ( stderr , " [M::%s::%.3f*%.2f] sorted minimizers \n " , __func__ , realtime ( ) - mm_realtime0 , cputime ( ) / ( realtime ( ) - mm_realtime0 ) ) ;
return pl . mi ;
}
2017-12-19 11:29:46 +08:00
mm_idx_t * mm_idx_build ( const char * fn , int w , int k , int flag , int n_threads ) / / a simpler interface ; deprecated
2017-04-08 03:30:30 +08:00
{
2017-07-19 21:26:46 +08:00
mm_bseq_file_t * fp ;
2017-04-08 03:30:30 +08:00
mm_idx_t * mi ;
2017-07-19 21:26:46 +08:00
fp = mm_bseq_open ( fn ) ;
2017-04-08 03:30:30 +08:00
if ( fp = = 0 ) return 0 ;
2017-11-12 08:54:06 +08:00
mi = mm_idx_gen ( fp , w , k , 14 , flag , 1 < < 18 , n_threads , UINT64_MAX ) ;
2017-07-19 21:26:46 +08:00
mm_bseq_close ( fp ) ;
2017-04-08 03:30:30 +08:00
return mi ;
}
2017-12-19 11:29:46 +08:00
mm_idx_t * mm_idx_str ( int w , int k , int is_hpc , int bucket_bits , int n , const char * * seq , const char * * name )
{
uint64_t sum_len = 0 ;
mm128_v a = { 0 , 0 , 0 } ;
mm_idx_t * mi ;
2018-08-06 08:57:05 +08:00
khash_t ( str ) * h ;
2017-12-19 11:29:46 +08:00
int i , flag = 0 ;
2018-08-06 08:57:05 +08:00
2017-12-19 11:29:46 +08:00
if ( n < = 0 ) return 0 ;
for ( i = 0 ; i < n ; + + i ) // get the total length
sum_len + = strlen ( seq [ i ] ) ;
if ( is_hpc ) flag | = MM_I_HPC ;
if ( name = = 0 ) flag | = MM_I_NO_NAME ;
if ( bucket_bits < 0 ) bucket_bits = 14 ;
mi = mm_idx_init ( w , k , bucket_bits , flag ) ;
mi - > n_seq = n ;
mi - > seq = ( mm_idx_seq_t * ) kcalloc ( mi - > km , n , sizeof ( mm_idx_seq_t ) ) ; // ->seq is allocated from km
mi - > S = ( uint32_t * ) calloc ( ( sum_len + 7 ) / 8 , 4 ) ;
2018-08-06 08:57:05 +08:00
mi - > h = h = kh_init ( str ) ;
2017-12-19 11:29:46 +08:00
for ( i = 0 , sum_len = 0 ; i < n ; + + i ) {
const char * s = seq [ i ] ;
mm_idx_seq_t * p = & mi - > seq [ i ] ;
uint32_t j ;
if ( name & & name [ i ] ) {
2018-08-06 08:57:05 +08:00
int absent ;
2017-12-19 11:29:46 +08:00
p - > name = ( char * ) kmalloc ( mi - > km , strlen ( name [ i ] ) + 1 ) ;
strcpy ( p - > name , name [ i ] ) ;
2018-08-06 08:57:05 +08:00
kh_put ( str , h , p - > name , & absent ) ;
assert ( absent ) ;
2017-12-19 11:29:46 +08:00
}
p - > offset = sum_len ;
p - > len = strlen ( s ) ;
2020-01-21 08:32:31 +08:00
p - > is_alt = 0 ;
2017-12-19 11:29:46 +08:00
for ( j = 0 ; j < p - > len ; + + j ) {
int c = seq_nt4_table [ ( uint8_t ) s [ j ] ] ;
uint64_t o = sum_len + j ;
mm_seq4_set ( mi - > S , o , c ) ;
}
sum_len + = p - > len ;
if ( p - > len > 0 ) {
a . n = 0 ;
mm_sketch ( 0 , s , p - > len , w , k , i , is_hpc , & a ) ;
mm_idx_add ( mi , a . n , a . a ) ;
}
}
free ( a . a ) ;
mm_idx_post ( mi , 1 ) ;
return mi ;
}
2017-04-08 03:30:30 +08:00
/*************
* index I / O *
* * * * * * * * * * * * */
void mm_idx_dump ( FILE * fp , const mm_idx_t * mi )
{
2017-04-19 23:06:24 +08:00
uint64_t sum_len = 0 ;
2018-06-20 03:26:58 +08:00
uint32_t x [ 5 ] , i ;
2017-04-19 23:06:24 +08:00
2017-11-12 08:54:06 +08:00
x [ 0 ] = mi - > w , x [ 1 ] = mi - > k , x [ 2 ] = mi - > b , x [ 3 ] = mi - > n_seq , x [ 4 ] = mi - > flag ;
2017-04-08 03:30:30 +08:00
fwrite ( MM_IDX_MAGIC , 1 , 4 , fp ) ;
2017-04-08 03:42:33 +08:00
fwrite ( x , 4 , 5 , fp ) ;
2017-04-08 03:30:30 +08:00
for ( i = 0 ; i < mi - > n_seq ; + + i ) {
2018-06-07 05:04:59 +08:00
if ( mi - > seq [ i ] . name ) {
uint8_t l = strlen ( mi - > seq [ i ] . name ) ;
fwrite ( & l , 1 , 1 , fp ) ;
fwrite ( mi - > seq [ i ] . name , 1 , l , fp ) ;
} else {
uint8_t l = 0 ;
fwrite ( & l , 1 , 1 , fp ) ;
}
2017-04-08 03:30:30 +08:00
fwrite ( & mi - > seq [ i ] . len , 4 , 1 , fp ) ;
2017-04-19 23:06:24 +08:00
sum_len + = mi - > seq [ i ] . len ;
2017-04-08 03:30:30 +08:00
}
for ( i = 0 ; i < 1 < < mi - > b ; + + i ) {
mm_idx_bucket_t * b = & mi - > B [ i ] ;
khint_t k ;
idxhash_t * h = ( idxhash_t * ) b - > h ;
uint32_t size = h ? h - > size : 0 ;
fwrite ( & b - > n , 4 , 1 , fp ) ;
fwrite ( b - > p , 8 , b - > n , fp ) ;
fwrite ( & size , 4 , 1 , fp ) ;
if ( size = = 0 ) continue ;
for ( k = 0 ; k < kh_end ( h ) ; + + k ) {
uint64_t x [ 2 ] ;
if ( ! kh_exist ( h , k ) ) continue ;
x [ 0 ] = kh_key ( h , k ) , x [ 1 ] = kh_val ( h , k ) ;
fwrite ( x , 8 , 2 , fp ) ;
}
}
2017-11-12 10:38:38 +08:00
if ( ! ( mi - > flag & MM_I_NO_SEQ ) )
fwrite ( mi - > S , 4 , ( sum_len + 7 ) / 8 , fp ) ;
2017-04-19 23:06:24 +08:00
fflush ( fp ) ;
2017-04-08 03:30:30 +08:00
}
mm_idx_t * mm_idx_load ( FILE * fp )
{
char magic [ 4 ] ;
2018-06-20 03:26:58 +08:00
uint32_t x [ 5 ] , i ;
2017-04-19 23:06:24 +08:00
uint64_t sum_len = 0 ;
2017-04-08 03:30:30 +08:00
mm_idx_t * mi ;
2017-04-19 23:06:24 +08:00
2017-04-08 03:30:30 +08:00
if ( fread ( magic , 1 , 4 , fp ) ! = 4 ) return 0 ;
if ( strncmp ( magic , MM_IDX_MAGIC , 4 ) ! = 0 ) return 0 ;
2017-04-26 19:36:46 +08:00
if ( fread ( x , 4 , 5 , fp ) ! = 5 ) return 0 ;
2017-04-08 03:42:33 +08:00
mi = mm_idx_init ( x [ 0 ] , x [ 1 ] , x [ 2 ] , x [ 4 ] ) ;
2017-04-08 03:30:30 +08:00
mi - > n_seq = x [ 3 ] ;
2017-07-25 07:36:05 +08:00
mi - > seq = ( mm_idx_seq_t * ) kcalloc ( mi - > km , mi - > n_seq , sizeof ( mm_idx_seq_t ) ) ;
2017-04-08 03:30:30 +08:00
for ( i = 0 ; i < mi - > n_seq ; + + i ) {
uint8_t l ;
2017-04-19 23:06:24 +08:00
mm_idx_seq_t * s = & mi - > seq [ i ] ;
2017-04-08 03:30:30 +08:00
fread ( & l , 1 , 1 , fp ) ;
2018-06-07 05:04:59 +08:00
if ( l ) {
s - > name = ( char * ) kmalloc ( mi - > km , l + 1 ) ;
fread ( s - > name , 1 , l , fp ) ;
s - > name [ l ] = 0 ;
}
2017-04-19 23:06:24 +08:00
fread ( & s - > len , 4 , 1 , fp ) ;
2017-06-27 01:56:25 +08:00
s - > offset = sum_len ;
2020-01-21 08:32:31 +08:00
s - > is_alt = 0 ;
2017-04-19 23:06:24 +08:00
sum_len + = s - > len ;
2017-04-08 03:30:30 +08:00
}
for ( i = 0 ; i < 1 < < mi - > b ; + + i ) {
mm_idx_bucket_t * b = & mi - > B [ i ] ;
uint32_t j , size ;
khint_t k ;
idxhash_t * h ;
fread ( & b - > n , 4 , 1 , fp ) ;
b - > p = ( uint64_t * ) malloc ( b - > n * 8 ) ;
fread ( b - > p , 8 , b - > n , fp ) ;
fread ( & size , 4 , 1 , fp ) ;
if ( size = = 0 ) continue ;
b - > h = h = kh_init ( idx ) ;
kh_resize ( idx , h , size ) ;
for ( j = 0 ; j < size ; + + j ) {
uint64_t x [ 2 ] ;
int absent ;
fread ( x , 8 , 2 , fp ) ;
k = kh_put ( idx , h , x [ 0 ] , & absent ) ;
assert ( absent ) ;
kh_val ( h , k ) = x [ 1 ] ;
}
}
2017-11-12 10:38:38 +08:00
if ( ! ( mi - > flag & MM_I_NO_SEQ ) ) {
mi - > S = ( uint32_t * ) malloc ( ( sum_len + 7 ) / 8 * 4 ) ;
fread ( mi - > S , 4 , ( sum_len + 7 ) / 8 , fp ) ;
}
2017-04-08 03:30:30 +08:00
return mi ;
}
2017-07-19 22:25:11 +08:00
2017-10-05 05:32:58 +08:00
int64_t mm_idx_is_idx ( const char * fn )
2017-07-19 22:25:11 +08:00
{
int fd , is_idx = 0 ;
2018-07-17 17:17:44 +08:00
int64_t ret , off_end ;
2017-07-19 22:25:11 +08:00
char magic [ 4 ] ;
if ( strcmp ( fn , " - " ) = = 0 ) return 0 ; // read from pipe; not an index
fd = open ( fn , O_RDONLY ) ;
if ( fd < 0 ) return - 1 ; // error
2018-07-17 17:17:44 +08:00
# ifdef WIN32
if ( ( off_end = _lseeki64 ( fd , 0 , SEEK_END ) ) > = 4 ) {
_lseeki64 ( fd , 0 , SEEK_SET ) ;
# else
2017-10-05 05:32:58 +08:00
if ( ( off_end = lseek ( fd , 0 , SEEK_END ) ) > = 4 ) {
2017-07-19 22:25:11 +08:00
lseek ( fd , 0 , SEEK_SET ) ;
2018-07-17 17:17:44 +08:00
# endif // WIN32
2017-07-19 22:25:11 +08:00
ret = read ( fd , magic , 4 ) ;
if ( ret = = 4 & & strncmp ( magic , MM_IDX_MAGIC , 4 ) = = 0 )
is_idx = 1 ;
}
close ( fd ) ;
2017-10-05 05:32:58 +08:00
return is_idx ? off_end : 0 ;
2017-07-19 22:25:11 +08:00
}
2017-09-15 05:02:01 +08:00
2017-09-15 09:18:13 +08:00
mm_idx_reader_t * mm_idx_reader_open ( const char * fn , const mm_idxopt_t * opt , const char * fn_out )
2017-09-15 05:02:01 +08:00
{
2017-10-05 05:32:58 +08:00
int64_t is_idx ;
2017-09-15 05:02:01 +08:00
mm_idx_reader_t * r ;
is_idx = mm_idx_is_idx ( fn ) ;
if ( is_idx < 0 ) return 0 ; // failed to open the index
r = ( mm_idx_reader_t * ) calloc ( 1 , sizeof ( mm_idx_reader_t ) ) ;
r - > is_idx = is_idx ;
2017-09-15 23:29:49 +08:00
if ( opt ) r - > opt = * opt ;
else mm_idxopt_init ( & r - > opt ) ;
2017-10-05 05:32:58 +08:00
if ( r - > is_idx ) {
r - > fp . idx = fopen ( fn , " rb " ) ;
r - > idx_size = is_idx ;
} else r - > fp . seq = mm_bseq_open ( fn ) ;
2017-09-15 09:18:13 +08:00
if ( fn_out ) r - > fp_out = fopen ( fn_out , " wb " ) ;
2017-09-15 05:02:01 +08:00
return r ;
}
void mm_idx_reader_close ( mm_idx_reader_t * r )
{
if ( r - > is_idx ) fclose ( r - > fp . idx ) ;
else mm_bseq_close ( r - > fp . seq ) ;
2017-09-15 09:18:13 +08:00
if ( r - > fp_out ) fclose ( r - > fp_out ) ;
free ( r ) ;
2017-09-15 05:02:01 +08:00
}
mm_idx_t * mm_idx_reader_read ( mm_idx_reader_t * r , int n_threads )
{
mm_idx_t * mi ;
if ( r - > is_idx ) {
mi = mm_idx_load ( r - > fp . idx ) ;
2017-11-12 08:54:06 +08:00
if ( mi & & mm_verbose > = 2 & & ( mi - > k ! = r - > opt . k | | mi - > w ! = r - > opt . w | | ( mi - > flag & MM_I_HPC ) ! = ( r - > opt . flag & MM_I_HPC ) ) )
2017-10-07 01:02:25 +08:00
fprintf ( stderr , " [WARNING] \033 [1;31m Indexing parameters (-k, -w or -H) overridden by parameters used in the prebuilt index. \033 [0m \n " ) ;
2017-09-15 05:02:01 +08:00
} else
2017-11-12 08:54:06 +08:00
mi = mm_idx_gen ( r - > fp . seq , r - > opt . w , r - > opt . k , r - > opt . bucket_bits , r - > opt . flag , r - > opt . mini_batch_size , n_threads , r - > opt . batch_size ) ;
2017-09-15 09:18:13 +08:00
if ( mi ) {
if ( r - > fp_out ) mm_idx_dump ( r - > fp_out , mi ) ;
2018-07-15 00:15:10 +08:00
mi - > index = r - > n_parts + + ;
2017-09-15 09:18:13 +08:00
}
2017-09-15 05:02:01 +08:00
return mi ;
}
2017-10-05 05:32:58 +08:00
int mm_idx_reader_eof ( const mm_idx_reader_t * r ) // TODO: in extremely rare cases, mm_bseq_eof() might not work
{
return r - > is_idx ? ( feof ( r - > fp . idx ) | | ftell ( r - > fp . idx ) = = r - > idx_size ) : mm_bseq_eof ( r - > fp . seq ) ;
}
2019-04-28 09:50:02 +08:00
# include <ctype.h>
# include <zlib.h>
# include "ksort.h"
# include "kseq.h"
KSTREAM_DECLARE ( gzFile , gzread )
2020-01-21 08:32:31 +08:00
int mm_idx_alt_read ( mm_idx_t * mi , const char * fn )
{
int n_alt = 0 ;
gzFile fp ;
kstream_t * ks ;
kstring_t str = { 0 , 0 , 0 } ;
fp = fn & & strcmp ( fn , " - " ) ? gzopen ( fn , " r " ) : gzdopen ( fileno ( stdin ) , " r " ) ;
if ( fp = = 0 ) return - 1 ;
ks = ks_init ( fp ) ;
if ( mi - > h = = 0 ) mm_idx_index_name ( mi ) ;
while ( ks_getuntil ( ks , KS_SEP_LINE , & str , 0 ) > = 0 ) {
char * p ;
int id ;
for ( p = str . s ; * p & & ! isspace ( * p ) ; + + p ) { }
* p = 0 ;
id = mm_idx_name2id ( mi , str . s ) ;
if ( id > = 0 ) mi - > seq [ id ] . is_alt = 1 , + + n_alt ;
}
mi - > n_alt = n_alt ;
if ( mm_verbose > = 3 )
fprintf ( stderr , " [M::%s] found %d ALT contigs \n " , __func__ , n_alt ) ;
return n_alt ;
}
2019-04-28 09:50:02 +08:00
# define sort_key_bed(a) ((a).st)
KRADIX_SORT_INIT ( bed , mm_idx_intv1_t , sort_key_bed , 4 )
2019-04-29 08:12:28 +08:00
mm_idx_intv_t * mm_idx_read_bed ( const mm_idx_t * mi , const char * fn , int read_junc )
2019-04-28 09:50:02 +08:00
{
gzFile fp ;
kstream_t * ks ;
kstring_t str = { 0 , 0 , 0 } ;
mm_idx_intv_t * I ;
fp = fn & & strcmp ( fn , " - " ) ? gzopen ( fn , " r " ) : gzdopen ( fileno ( stdin ) , " r " ) ;
if ( fp = = 0 ) return 0 ;
I = ( mm_idx_intv_t * ) calloc ( mi - > n_seq , sizeof ( * I ) ) ;
ks = ks_init ( fp ) ;
while ( ks_getuntil ( ks , KS_SEP_LINE , & str , 0 ) > = 0 ) {
mm_idx_intv_t * r ;
mm_idx_intv1_t t = { - 1 , - 1 , - 1 , - 1 , 0 } ;
2019-04-29 08:12:28 +08:00
char * p , * q , * bl , * bs ;
int32_t i , id = - 1 , n_blk = 0 ;
2019-04-28 09:50:02 +08:00
for ( p = q = str . s , i = 0 ; ; + + p ) {
if ( * p = = 0 | | isspace ( * p ) ) {
int32_t c = * p ;
* p = 0 ;
if ( i = = 0 ) { // chr
id = mm_idx_name2id ( mi , q ) ;
if ( id < 0 ) break ; // unknown name; TODO: throw a warning
} else if ( i = = 1 ) { // start
t . st = atol ( q ) ; // TODO: watch out integer overflow!
if ( t . st < 0 ) break ;
} else if ( i = = 2 ) { // end
t . en = atol ( q ) ;
if ( t . en < 0 ) break ;
} else if ( i = = 4 ) { // BED score
t . score = atol ( q ) ;
} else if ( i = = 5 ) { // strand
t . strand = * q = = ' + ' ? 1 : * q = = ' - ' ? - 1 : 0 ;
2019-04-29 08:12:28 +08:00
} else if ( i = = 9 ) {
if ( ! isdigit ( * q ) ) break ;
n_blk = atol ( q ) ;
} else if ( i = = 10 ) {
bl = q ;
} else if ( i = = 11 ) {
bs = q ;
break ;
}
2019-04-28 09:50:02 +08:00
if ( c = = 0 ) break ;
+ + i , q = p + 1 ;
}
}
if ( id < 0 | | t . st < 0 | | t . st > = t . en ) continue ;
r = & I [ id ] ;
2019-04-29 08:12:28 +08:00
if ( i > = 11 & & read_junc ) { // BED12
int32_t st , sz , en ;
st = strtol ( bs , & bs , 10 ) ; + + bs ;
sz = strtol ( bl , & bl , 10 ) ; + + bl ;
en = t . st + st + sz ;
for ( i = 1 ; i < n_blk ; + + i ) {
mm_idx_intv1_t s = t ;
if ( r - > n = = r - > m ) {
r - > m = r - > m ? r - > m + ( r - > m > > 1 ) : 16 ;
r - > a = ( mm_idx_intv1_t * ) realloc ( r - > a , sizeof ( * r - > a ) * r - > m ) ;
}
st = strtol ( bs , & bs , 10 ) ; + + bs ;
sz = strtol ( bl , & bl , 10 ) ; + + bl ;
s . st = en , s . en = t . st + st ;
en = t . st + st + sz ;
if ( s . en > s . st ) r - > a [ r - > n + + ] = s ;
}
} else {
if ( r - > n = = r - > m ) {
r - > m = r - > m ? r - > m + ( r - > m > > 1 ) : 16 ;
r - > a = ( mm_idx_intv1_t * ) realloc ( r - > a , sizeof ( * r - > a ) * r - > m ) ;
}
r - > a [ r - > n + + ] = t ;
2019-04-28 09:50:02 +08:00
}
}
2019-04-29 02:52:47 +08:00
free ( str . s ) ;
2019-04-28 09:50:02 +08:00
ks_destroy ( ks ) ;
gzclose ( fp ) ;
return I ;
}
2019-04-29 08:12:28 +08:00
int mm_idx_bed_read ( mm_idx_t * mi , const char * fn , int read_junc )
2019-04-28 09:50:02 +08:00
{
int32_t i ;
if ( mi - > h = = 0 ) mm_idx_index_name ( mi ) ;
2019-04-29 08:12:28 +08:00
mi - > I = mm_idx_read_bed ( mi , fn , read_junc ) ;
2019-04-28 09:50:02 +08:00
if ( mi - > I = = 0 ) return - 1 ;
2019-04-29 02:52:47 +08:00
for ( i = 0 ; i < mi - > n_seq ; + + i ) // TODO: eliminate redundant intervals
2019-04-28 09:50:02 +08:00
radix_sort_bed ( mi - > I [ i ] . a , mi - > I [ i ] . a + mi - > I [ i ] . n ) ;
2019-04-29 02:52:47 +08:00
return 0 ;
2019-04-28 09:50:02 +08:00
}
2019-04-28 10:39:26 +08:00
int mm_idx_bed_junc ( const mm_idx_t * mi , int32_t ctg , int32_t st , int32_t en , uint8_t * s )
2019-04-28 09:50:02 +08:00
{
int32_t i , left , right ;
mm_idx_intv_t * r ;
memset ( s , 0 , en - st ) ;
if ( mi - > I = = 0 | | ctg < 0 | | ctg > = mi - > n_seq ) return - 1 ;
r = & mi - > I [ ctg ] ;
left = 0 , right = r - > n ;
while ( right > left ) {
int32_t mid = left + ( ( right - left ) > > 1 ) ;
if ( r - > a [ mid ] . st > = st ) right = mid ;
else left = mid + 1 ;
}
for ( i = left ; i < r - > n ; + + i ) {
2019-04-28 10:39:26 +08:00
if ( st < = r - > a [ i ] . st & & en > = r - > a [ i ] . en & & r - > a [ i ] . strand ! = 0 ) {
2019-04-28 09:50:02 +08:00
if ( r - > a [ i ] . strand > 0 ) {
2019-04-29 04:44:30 +08:00
s [ r - > a [ i ] . st - st ] | = 1 , s [ r - > a [ i ] . en - 1 - st ] | = 2 ;
2019-04-28 09:50:02 +08:00
} else {
2019-04-29 04:44:30 +08:00
s [ r - > a [ i ] . st - st ] | = 8 , s [ r - > a [ i ] . en - 1 - st ] | = 4 ;
2019-04-28 09:50:02 +08:00
}
}
}
return left ;
}