diff --git a/index.c b/index.c index 22fb063..40d111a 100644 --- a/index.c +++ b/index.c @@ -21,6 +21,13 @@ typedef khash_t(idx) idxhash_t; #define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x)) +typedef struct mm_idx_bucket_s { + mm128_v a; // (minimizer, position) array + int32_t n; // size of the _p_ array + uint64_t *p; // position array for minimizers appearing >1 times + void *h; // hash table indexing _p_ and minimizers appearing once +} mm_idx_bucket_t; + void mm_idxopt_init(mm_idxopt_t *opt) { memset(opt, 0, sizeof(mm_idxopt_t)); @@ -439,7 +446,7 @@ int mm_idx_is_idx(const char *fn) return is_idx; } -mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt) +mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out) { int is_idx; mm_idx_reader_t *r; @@ -450,6 +457,7 @@ mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt) r->opt = *opt; if (r->is_idx) r->fp.idx = fopen(fn, "rb"); else r->fp.seq = mm_bseq_open(fn); + if (fn_out) r->fp_out = fopen(fn_out, "wb"); return r; } @@ -457,6 +465,8 @@ void mm_idx_reader_close(mm_idx_reader_t *r) { if (r->is_idx) fclose(r->fp.idx); else mm_bseq_close(r->fp.seq); + if (r->fp_out) fclose(r->fp_out); + free(r); } mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads) @@ -468,6 +478,9 @@ mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads) fprintf(stderr, "[WARNING] Indexing parameters (-k, -w or -H) overridden by parameters used in the prebuilt index.\n"); } else mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.is_hpc, r->opt.mini_batch_size, n_threads, r->opt.batch_size, 1); - if (mi) ++r->n_parts; + if (mi) { + if (r->fp_out) mm_idx_dump(r->fp_out, mi); + ++r->n_parts; + } return mi; } diff --git a/main.c b/main.c index e5bd5ad..07c987a 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.1.1-r365-dirty" +#define MM_VERSION "2.1.1-r367-dirty" #ifdef __linux__ #include @@ -65,13 +65,12 @@ int main(int argc, char *argv[]) mm_idxopt_t ipt; int i, c, n_threads = 3, long_idx, max_intron_len = 0; char *fnw = 0, *rg = 0, *s; - FILE *fpw = 0, *fp_help = stderr; + FILE *fp_help = stderr; mm_idx_reader_t *idx_rdr; liftrlimit(); mm_realtime0 = realtime(); - mm_mapopt_init(&opt); - mm_idxopt_init(&ipt); + mm_set_opt(0, &ipt, &opt); while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:h", long_options, &long_idx)) >= 0) { if (c == 'w') ipt.w = atoi(optarg); @@ -139,7 +138,7 @@ int main(int argc, char *argv[]) opt.e = opt.e2 = strtol(optarg, &s, 10); if (*s == ',') opt.e2 = strtol(s + 1, &s, 10); } else if (c == 'x') { - if (mm_preset(optarg, &ipt, &opt) < 0) { + if (mm_set_opt(optarg, &ipt, &opt) < 0) { fprintf(stderr, "[E::%s] unknown preset '%s'\n", __func__, optarg); return 1; } @@ -200,7 +199,7 @@ int main(int argc, char *argv[]) return fp_help == stdout? 0 : 1; } - idx_rdr = mm_idx_reader_open(argv[optind], &ipt); + idx_rdr = mm_idx_reader_open(argv[optind], &ipt, fnw); if (idx_rdr == 0) { fprintf(stderr, "[ERROR] failed to open file '%s'\n", argv[optind]); return 1; @@ -209,7 +208,6 @@ int main(int argc, char *argv[]) fprintf(stderr, "[ERROR] missing input: please specify a query file to map or option -d to keep the index\n"); return 1; } - if (fnw) fpw = fopen(fnw, "wb"); if (opt.flag & MM_F_OUT_SAM) mm_write_sam_hdr_no_SQ(rg, MM_VERSION, argc, argv); for (;;) { @@ -220,18 +218,13 @@ int main(int argc, char *argv[]) if (mm_verbose >= 3) fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq); - if (fpw) { - mm_idx_dump(fpw, mi); - if (mm_verbose >= 3) - fprintf(stderr, "[M::%s::%.3f*%.2f] dumpped the current part of the index to disk\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0)); - } if (argc != optind + 1) mm_mapopt_update(&opt, mi); if (mm_verbose >= 3) mm_idx_stat(mi); for (i = optind + 1; i < argc; ++i) mm_map_file(mi, argv[i], &opt, n_threads); mm_idx_destroy(mi); } - if (fpw) fclose(fpw); + mm_idx_reader_close(idx_rdr); fprintf(stderr, "[M::%s] Version: %s\n", __func__, MM_VERSION); fprintf(stderr, "[M::%s] CMD:", __func__); diff --git a/map.c b/map.c index 46e820b..00d88cc 100644 --- a/map.c +++ b/map.c @@ -11,7 +11,6 @@ void mm_mapopt_init(mm_mapopt_t *opt) { memset(opt, 0, sizeof(mm_mapopt_t)); - opt->max_occ_frac = 1e-5f; opt->mid_occ_frac = 2e-4f; opt->sdust_thres = 0; @@ -41,18 +40,18 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi) { if (opt->flag & MM_F_SPLICE_BOTH) opt->flag &= ~(MM_F_SPLICE_FOR|MM_F_SPLICE_REV); - if (opt->max_occ <= 0) - opt->max_occ = mm_idx_cal_max_occ(mi, opt->max_occ_frac); if (opt->mid_occ <= 0) opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac); if (mm_verbose >= 3) - fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d; max_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), - opt->mid_occ, opt->max_occ); + fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ); } -int mm_preset(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) +int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) { - if (strcmp(preset, "ava-ont") == 0) { + if (preset == 0) { + mm_idxopt_init(io); + mm_mapopt_init(mo); + } else if (strcmp(preset, "ava-ont") == 0) { io->is_hpc = 0, io->k = 15, io->w = 5; mo->flag |= MM_F_AVA | MM_F_NO_SELF; mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25; diff --git a/minimap.h b/minimap.h index c825c69..7295902 100644 --- a/minimap.h +++ b/minimap.h @@ -25,21 +25,16 @@ extern "C" { #endif -typedef struct { - uint64_t x, y; -} mm128_t; +// hidden structures +struct mm_idx_bucket_s; +struct mm_bseq_file_s; +struct mm_tbuf_s; +// emulate 128-bit integers and arrays +typedef struct { uint64_t x, y; } mm128_t; typedef struct { size_t n, m; mm128_t *a; } mm128_v; -typedef struct { size_t n, m; uint64_t *a; } uint64_v; -typedef struct { size_t n, m; uint32_t *a; } uint32_v; - -typedef struct { - mm128_v a; // (minimizer, position) array - int32_t n; // size of the _p_ array - uint64_t *p; // position array for minimizers appearing >1 times - void *h; // hash table indexing _p_ and minimizers appearing once -} mm_idx_bucket_t; +// minimap2 index typedef struct { char *name; // name of the db sequence uint64_t offset; // offset in mm_idx_t::S @@ -48,43 +43,50 @@ typedef struct { typedef struct { int32_t b, w, k, is_hpc; - uint32_t n_seq; // number of reference sequences - mm_idx_seq_t *seq; // sequence name, length and offset - uint32_t *S; // 4-bit packed sequence - mm_idx_bucket_t *B; // index + uint32_t n_seq; // number of reference sequences + mm_idx_seq_t *seq; // sequence name, length and offset + uint32_t *S; // 4-bit packed sequence + struct mm_idx_bucket_s *B; // index (hidden) void *km; } mm_idx_t; +// minimap2 alignment typedef struct { - uint32_t capacity; - int32_t dp_score, dp_max, dp_max2; - uint32_t blen; - uint32_t n_diff; - uint32_t n_ambi:30, trans_strand:2; - uint32_t n_cigar; + uint32_t capacity; // the capacity of cigar[] + int32_t dp_score, dp_max, dp_max2; // DP score; score of the max-scoring segment; score of the best alternate mappings + uint32_t blen; // block length + uint32_t n_diff; // number of differences, including ambiguous bases + uint32_t n_ambi:30, trans_strand:2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for - + uint32_t n_cigar; // number of cigar operations in cigar[] uint32_t cigar[]; } mm_extra_t; typedef struct { - int32_t id; - uint32_t cnt:31, rev:1; - uint32_t rid:31, inv:1; - int32_t score; - int32_t qs, qe, rs, re; - int32_t parent, subsc; - int32_t as; - int32_t fuzzy_mlen, fuzzy_blen; - uint32_t mapq:8, split:2, sam_pri:1, n_sub:21; // TODO: n_sub is not used for now + int32_t id; // ID for internal uses (see also parent below) + uint32_t cnt:31, rev:1; // number of minimizers; if on the reverse strand + uint32_t rid:31, inv:1; // reference index; if this is an alignment from inversion rescue + int32_t score; // DP alignment score + int32_t qs, qe, rs, re; // query start and end; reference start and end + int32_t parent, subsc; // parent==id if primary; best alternate mapping score + int32_t as; // offset in the a[] array (for internal uses only) + int32_t fuzzy_mlen, fuzzy_blen; // seeded exact match length; seeded alignment block length (approximate) + uint32_t mapq:8, split:2, sam_pri:1, n_sub:21; // mapQ; split pattern; if SAM primary; number of suboptimal mappings mm_extra_t *p; } mm_reg1_t; +// indexing and mapping options typedef struct { - float max_occ_frac; - float mid_occ_frac; - int sdust_thres; // score threshold for SDUST; 0 to disable - int flag; // see MM_F_* macros + short k, w, is_hpc, bucket_bits; + int mini_batch_size; + uint64_t batch_size; +} mm_idxopt_t; - int bw; // bandwidth +typedef struct { + float mid_occ_frac; + int sdust_thres; // score threshold for SDUST; 0 to disable + int flag; // see MM_F_* macros + + int bw; // bandwidth int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window int max_chain_skip; int min_cnt; @@ -104,63 +106,44 @@ typedef struct { int min_ksw_len; int mini_batch_size; - int32_t max_occ, mid_occ; + int32_t mid_occ; } mm_mapopt_t; -typedef struct { - short k, w, is_hpc, bucket_bits; - int mini_batch_size; - uint64_t batch_size; -} mm_idxopt_t; - -struct mm_bseq_file_s; - +// index reader typedef struct { int is_idx, n_parts; mm_idxopt_t opt; + FILE *fp_out; union { struct mm_bseq_file_s *seq; FILE *idx; } fp; } mm_idx_reader_t; -extern int mm_verbose, mm_dbg_flag; -extern double mm_realtime0; - -struct mm_tbuf_s; +// memory buffer for thread-local storage during mapping typedef struct mm_tbuf_s mm_tbuf_t; -#define mm_seq4_set(s, i, c) ((s)[(i)>>3] |= (uint32_t)(c) << (((i)&7)<<2)) -#define mm_seq4_get(s, i) ((s)[(i)>>3] >> (((i)&7)<<2) & 0xf) +// global variables +extern int mm_verbose, mm_dbg_flag; // verbose level: 0 for no info, 1 for error, 2 for warning, 3 for message (default); debugging flag +extern double mm_realtime0; // wall-clock timer -void mm_idxopt_init(mm_idxopt_t *opt); -void mm_mapopt_init(mm_mapopt_t *opt); -int mm_preset(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo); +int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo); void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi); -mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt); -int mm_idx_reader_is_idx(const mm_idx_reader_t *r); +mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out); mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads); void mm_idx_reader_close(mm_idx_reader_t *r); +void mm_idx_stat(const mm_idx_t *idx); +int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq); void mm_idx_destroy(mm_idx_t *mi); -// minimizer indexing -void mm_idx_stat(const mm_idx_t *idx); -const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n); -int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq); - -// minimizer index I/O -void mm_idx_dump(FILE *fp, const mm_idx_t *mi); -mm_idx_t *mm_idx_load(FILE *fp); - -// mapping mm_tbuf_t *mm_tbuf_init(void); void mm_tbuf_destroy(mm_tbuf_t *b); mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name); - int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads); -// obsolete APIs (for backward compatibility) +// deprecated APIs for backward compatibility +void mm_mapopt_init(mm_mapopt_t *opt); mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads); #ifdef __cplusplus diff --git a/mmpriv.h b/mmpriv.h index 29141e2..c650e28 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -21,6 +21,9 @@ #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif +#define mm_seq4_set(s, i, c) ((s)[(i)>>3] |= (uint32_t)(c) << (((i)&7)<<2)) +#define mm_seq4_get(s, i) ((s)[(i)>>3] >> (((i)&7)<<2) & 0xf) + #ifdef __cplusplus extern "C" { #endif @@ -47,6 +50,8 @@ void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *arg void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag); void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs); +void mm_idxopt_init(mm_idxopt_t *opt); +const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n); int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f); int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km); mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);