diff --git a/index.c b/index.c index 70f82d6..22fb063 100644 --- a/index.c +++ b/index.c @@ -21,6 +21,15 @@ typedef khash_t(idx) idxhash_t; #define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x)) +void mm_idxopt_init(mm_idxopt_t *opt) +{ + memset(opt, 0, sizeof(mm_idxopt_t)); + opt->k = 15, opt->w = 10, opt->is_hpc = 0; + opt->bucket_bits = 14; + opt->mini_batch_size = 50000000; + opt->batch_size = 4000000000ULL; +} + mm_idx_t *mm_idx_init(int w, int k, int b, int is_hpc) { mm_idx_t *mi; @@ -314,7 +323,7 @@ mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads) mm_idx_t *mi; fp = mm_bseq_open(fn); if (fp == 0) return 0; - mi = mm_idx_gen(fp, w, k, MM_IDX_DEF_B, is_hpc, 1<<18, n_threads, UINT64_MAX, 1); + mi = mm_idx_gen(fp, w, k, 14, is_hpc, 1<<18, n_threads, UINT64_MAX, 1); mm_bseq_close(fp); return mi; } @@ -429,3 +438,36 @@ int mm_idx_is_idx(const char *fn) close(fd); return is_idx; } + +mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt) +{ + int is_idx; + mm_idx_reader_t *r; + is_idx = mm_idx_is_idx(fn); + if (is_idx < 0) return 0; // failed to open the index + r = (mm_idx_reader_t*)calloc(1, sizeof(mm_idx_reader_t)); + r->is_idx = is_idx; + r->opt = *opt; + if (r->is_idx) r->fp.idx = fopen(fn, "rb"); + else r->fp.seq = mm_bseq_open(fn); + return r; +} + +void mm_idx_reader_close(mm_idx_reader_t *r) +{ + if (r->is_idx) fclose(r->fp.idx); + else mm_bseq_close(r->fp.seq); +} + +mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads) +{ + mm_idx_t *mi; + if (r->is_idx) { + mi = mm_idx_load(r->fp.idx); + if (mi && mm_verbose >= 2 && (mi->k != r->opt.k || mi->w != r->opt.w || mi->is_hpc != r->opt.is_hpc)) + fprintf(stderr, "[WARNING] Indexing parameters (-k, -w or -H) overridden by parameters used in the prebuilt index.\n"); + } else + mi = mm_idx_gen(r->fp.seq, r->opt.w, r->opt.k, r->opt.bucket_bits, r->opt.is_hpc, r->opt.mini_batch_size, n_threads, r->opt.batch_size, 1); + if (mi) ++r->n_parts; + return mi; +} diff --git a/main.c b/main.c index 57fb5b5..e5bd5ad 100644 --- a/main.c +++ b/main.c @@ -25,7 +25,7 @@ void liftrlimit() {} static struct option long_options[] = { { "bucket-bits", required_argument, 0, 0 }, { "mb-size", required_argument, 0, 'K' }, - { "int-rname", no_argument, 0, 0 }, + { "int-rname", no_argument, 0, 0 }, // obsolete; kept as a placeholder { "no-kalloc", no_argument, 0, 0 }, { "print-qname", no_argument, 0, 0 }, { "no-self", no_argument, 0, 0 }, @@ -62,21 +62,21 @@ static inline int64_t mm_parse_num(const char *str) int main(int argc, char *argv[]) { mm_mapopt_t opt; - int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0, max_intron_len = 0, n_idx_part = 0; - int minibatch_size = 200000000; - uint64_t batch_size = 4000000000ULL; - mm_bseq_file_t *fp = 0; + mm_idxopt_t ipt; + int i, c, n_threads = 3, long_idx, max_intron_len = 0; char *fnw = 0, *rg = 0, *s; - FILE *fpr = 0, *fpw = 0, *fp_help = stderr; + FILE *fpw = 0, *fp_help = stderr; + mm_idx_reader_t *idx_rdr; liftrlimit(); mm_realtime0 = realtime(); mm_mapopt_init(&opt); + mm_idxopt_init(&ipt); while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:h", long_options, &long_idx)) >= 0) { - if (c == 'w') w = atoi(optarg), idx_par_set = 1; - else if (c == 'k') k = atoi(optarg), idx_par_set = 1; - else if (c == 'H') is_hpc = 1, idx_par_set = 1; + if (c == 'w') ipt.w = atoi(optarg); + else if (c == 'k') ipt.k = atoi(optarg); + else if (c == 'H') ipt.is_hpc = 1; else if (c == 'd') fnw = optarg; // the above are indexing related options, except -I else if (c == 'r') opt.bw = (int)mm_parse_num(optarg); else if (c == 't') n_threads = atoi(optarg); @@ -98,12 +98,11 @@ int main(int argc, char *argv[]) else if (c == 'B') opt.b = atoi(optarg); else if (c == 'z') opt.zdrop = atoi(optarg); else if (c == 's') opt.min_dp_max = atoi(optarg); - else if (c == 'I') batch_size = mm_parse_num(optarg); - else if (c == 'K') minibatch_size = (int)mm_parse_num(optarg); + else if (c == 'I') ipt.batch_size = mm_parse_num(optarg); + else if (c == 'K') ipt.mini_batch_size = (int)mm_parse_num(optarg); else if (c == 'R') rg = optarg; else if (c == 'h') fp_help = stdout; - else if (c == 0 && long_idx == 0) bucket_bits = atoi(optarg); // --bucket-bits - else if (c == 0 && long_idx == 2) keep_name = 0; // --int-rname + else if (c == 0 && long_idx == 0) ipt.bucket_bits = atoi(optarg); // --bucket-bits else if (c == 0 && long_idx == 3) mm_dbg_flag |= MM_DBG_NO_KALLOC; // --no-kalloc else if (c == 0 && long_idx == 4) mm_dbg_flag |= MM_DBG_PRINT_QNAME; // --print-qname else if (c == 0 && long_idx == 5) opt.flag |= MM_F_NO_SELF; // --no-self @@ -140,55 +139,12 @@ int main(int argc, char *argv[]) opt.e = opt.e2 = strtol(optarg, &s, 10); if (*s == ',') opt.e2 = strtol(s + 1, &s, 10); } else if (c == 'x') { - if (strcmp(optarg, "ava-ont") == 0) { - opt.flag |= MM_F_AVA | MM_F_NO_SELF; - opt.min_chain_score = 100, opt.pri_ratio = 0.0f, opt.max_gap = 10000, opt.max_chain_skip = 25; - minibatch_size = 500000000; - k = 15, w = 5; - } else if (strcmp(optarg, "ava-pb") == 0) { - opt.flag |= MM_F_AVA | MM_F_NO_SELF; - opt.min_chain_score = 100, opt.pri_ratio = 0.0f, opt.max_gap = 10000, opt.max_chain_skip = 25; - minibatch_size = 500000000; - is_hpc = 1, k = 19, w = 5; - } else if (strcmp(optarg, "map10k") == 0 || strcmp(optarg, "map-pb") == 0) { - is_hpc = 1, k = 19; - } else if (strcmp(optarg, "map-ont") == 0) { - is_hpc = 0, k = 15; - } else if (strcmp(optarg, "asm5") == 0) { - k = 19, w = 19; - opt.a = 1, opt.b = 19, opt.q = 39, opt.q2 = 81, opt.e = 3, opt.e2 = 1, opt.zdrop = 200; - opt.min_dp_max = 200; - } else if (strcmp(optarg, "asm10") == 0) { - k = 19, w = 19; - opt.a = 1, opt.b = 9, opt.q = 16, opt.q2 = 41, opt.e = 2, opt.e2 = 1, opt.zdrop = 200; - opt.min_dp_max = 200; - } else if (strcmp(optarg, "short") == 0 || strcmp(optarg, "sr") == 0) { - k = 21, w = 11, is_hpc = 0; - minibatch_size = 50000000; - opt.flag |= MM_F_APPROX_EXT; - opt.a = 2, opt.b = 8, opt.q = 12, opt.e = 2, opt.q2 = 32, opt.e2 = 1; - opt.max_gap = 100; - opt.pri_ratio = 0.5f; - opt.min_cnt = 2; - opt.min_chain_score = 20; - opt.min_dp_max = 40; - opt.best_n = 20; - opt.bw = 50; - opt.mid_occ = 1000; - } else if (strcmp(optarg, "splice") == 0 || strcmp(optarg, "cdna") == 0) { - k = 15, w = 5; - opt.flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV; - opt.max_gap = 2000, opt.max_gap_ref = opt.bw = 200000; - opt.a = 1, opt.b = 2, opt.q = 2, opt.e = 1, opt.q2 = 32, opt.e2 = 0; - opt.noncan = 5; - opt.zdrop = 200; - } else { + if (mm_preset(optarg, &ipt, &opt) < 0) { fprintf(stderr, "[E::%s] unknown preset '%s'\n", __func__, optarg); return 1; } } } - if (w < 0) w = (int)(.6666667 * k + .499); if ((opt.flag & MM_F_SPLICE) && max_intron_len > 0) opt.max_gap_ref = opt.bw = max_intron_len; @@ -197,8 +153,8 @@ int main(int argc, char *argv[]) fprintf(fp_help, "Options:\n"); fprintf(fp_help, " Indexing:\n"); fprintf(fp_help, " -H use homopolymer-compressed k-mer\n"); - fprintf(fp_help, " -k INT k-mer size (no larger than 28) [%d]\n", k); - fprintf(fp_help, " -w INT minizer window size [{-k}*2/3]\n"); + fprintf(fp_help, " -k INT k-mer size (no larger than 28) [%d]\n", ipt.k); + fprintf(fp_help, " -w INT minizer window size [%d]\n", ipt.w); fprintf(fp_help, " -I NUM split index for every ~NUM input bases [4G]\n"); fprintf(fp_help, " -d FILE dump index to FILE []\n"); fprintf(fp_help, " Mapping:\n"); @@ -227,7 +183,7 @@ int main(int argc, char *argv[]) fprintf(fp_help, " -c output CIGAR in PAF\n"); fprintf(fp_help, " -S output the cs tag in PAF (cs encodes both query and ref sequences)\n"); fprintf(fp_help, " -t INT number of threads [%d]\n", n_threads); - fprintf(fp_help, " -K NUM minibatch size [200M]\n"); + fprintf(fp_help, " -K NUM minibatch size for mapping [200M]\n"); // fprintf(fp_help, " -v INT verbose level [%d]\n", mm_verbose); fprintf(fp_help, " --version show version number\n"); fprintf(fp_help, " Preset:\n"); @@ -244,33 +200,22 @@ int main(int argc, char *argv[]) return fp_help == stdout? 0 : 1; } - is_idx = mm_idx_is_idx(argv[optind]); - if (is_idx < 0) { + idx_rdr = mm_idx_reader_open(argv[optind], &ipt); + if (idx_rdr == 0) { fprintf(stderr, "[ERROR] failed to open file '%s'\n", argv[optind]); return 1; } - if (!is_idx && fnw == 0 && argc - optind < 2) { + if (!idx_rdr->is_idx && fnw == 0 && argc - optind < 2) { fprintf(stderr, "[ERROR] missing input: please specify a query file to map or option -d to keep the index\n"); return 1; } - if (is_idx) fpr = fopen(argv[optind], "rb"); - else fp = mm_bseq_open(argv[optind]); if (fnw) fpw = fopen(fnw, "wb"); if (opt.flag & MM_F_OUT_SAM) mm_write_sam_hdr_no_SQ(rg, MM_VERSION, argc, argv); for (;;) { mm_idx_t *mi; - if (fpr) { - mi = mm_idx_load(fpr); - if (mi == 0) break; - if (idx_par_set && mm_verbose >= 2 && (mi->k != k || mi->w != w || mi->is_hpc != is_hpc)) - fprintf(stderr, "[WARNING] \033[1;31mIndexing parameters on the command line (-k/-w/-H) overridden by parameters in the prebuilt index.\033[0m\n"); - } else { - mi = mm_idx_gen(fp, w, k, bucket_bits, is_hpc, minibatch_size, n_threads, batch_size, keep_name); - } - if (mi == 0) break; - ++n_idx_part; - if (mm_verbose >= 2 && n_idx_part > 1 && (opt.flag&MM_F_OUT_SAM) && !(opt.flag&MM_F_NO_SAM_SQ)) + if ((mi = mm_idx_reader_read(idx_rdr, n_threads)) == 0) break; + if (mm_verbose >= 2 && idx_rdr->n_parts > 1 && (opt.flag&MM_F_OUT_SAM) && !(opt.flag&MM_F_NO_SAM_SQ)) fprintf(stderr, "[WARNING] \033[1;31mSAM output is malformated due to internal @SQ lines. Please add option --no-sam-sq or filter afterwards.\033[0m\n"); if (mm_verbose >= 3) fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n", @@ -278,17 +223,15 @@ int main(int argc, char *argv[]) if (fpw) { mm_idx_dump(fpw, mi); if (mm_verbose >= 3) - fprintf(stderr, "[M::%s::%.3f*%.2f] dumpped the (partial) index to disk\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0)); + fprintf(stderr, "[M::%s::%.3f*%.2f] dumpped the current part of the index to disk\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0)); } if (argc != optind + 1) mm_mapopt_update(&opt, mi); if (mm_verbose >= 3) mm_idx_stat(mi); for (i = optind + 1; i < argc; ++i) - mm_map_file(mi, argv[i], &opt, n_threads, minibatch_size); + mm_map_file(mi, argv[i], &opt, n_threads); mm_idx_destroy(mi); } if (fpw) fclose(fpw); - if (fpr) fclose(fpr); - if (fp) mm_bseq_close(fp); fprintf(stderr, "[M::%s] Version: %s\n", __func__, MM_VERSION); fprintf(stderr, "[M::%s] CMD:", __func__); diff --git a/map.c b/map.c index dad6423..46e820b 100644 --- a/map.c +++ b/map.c @@ -34,6 +34,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->zdrop = 400; opt->min_dp_max = opt->min_chain_score * opt->a; opt->min_ksw_len = 200; + opt->mini_batch_size = 200000000; } void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi) @@ -49,6 +50,54 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi) opt->mid_occ, opt->max_occ); } +int mm_preset(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) +{ + if (strcmp(preset, "ava-ont") == 0) { + io->is_hpc = 0, io->k = 15, io->w = 5; + mo->flag |= MM_F_AVA | MM_F_NO_SELF; + mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25; + mo->mini_batch_size = 500000000; + } else if (strcmp(preset, "ava-pb") == 0) { + io->is_hpc = 1, io->k = 19, io->w = 5; + mo->flag |= MM_F_AVA | MM_F_NO_SELF; + mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25; + mo->mini_batch_size = 500000000; + } else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) { + io->is_hpc = 1, io->k = 19; + } else if (strcmp(preset, "map-ont") == 0) { + io->is_hpc = 0, io->k = 15; + } else if (strcmp(preset, "asm5") == 0) { + io->is_hpc = 0, io->k = 19, io->w = 19; + mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = 200; + mo->min_dp_max = 200; + } else if (strcmp(preset, "asm10") == 0) { + io->is_hpc = 0, io->k = 19, io->w = 19; + mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = 200; + mo->min_dp_max = 200; + } else if (strcmp(preset, "short") == 0 || strcmp(preset, "sr") == 0) { + io->is_hpc = 0, io->k = 21, io->w = 11; + mo->flag |= MM_F_APPROX_EXT; + mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 32, mo->e2 = 1; + mo->max_gap = 100; + mo->pri_ratio = 0.5f; + mo->min_cnt = 2; + mo->min_chain_score = 20; + mo->min_dp_max = 40; + mo->best_n = 20; + mo->bw = 50; + mo->mid_occ = 1000; + mo->mini_batch_size = 50000000; + } else if (strcmp(preset, "splice") == 0 || strcmp(preset, "cdna") == 0) { + io->is_hpc = 0, io->k = 15, io->w = 5; + mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV; + mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000; + mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0; + mo->noncan = 5; + mo->zdrop = 200; + } else return -1; + return 0; +} + typedef struct { uint32_t n; uint32_t qpos; @@ -304,7 +353,7 @@ static void *worker_pipeline(void *shared, int step, void *in) return 0; } -int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads, int mini_batch_size) +int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads) { pipeline_t pl; memset(&pl, 0, sizeof(pipeline_t)); @@ -315,7 +364,7 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int return -1; } pl.opt = opt, pl.mi = idx; - pl.n_threads = n_threads, pl.mini_batch_size = mini_batch_size; + pl.n_threads = n_threads, pl.mini_batch_size = opt->mini_batch_size; if ((opt->flag & MM_F_OUT_SAM) && !(opt->flag & MM_F_NO_SAM_SQ)) mm_write_sam_SQ(idx); kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3); diff --git a/minimap.h b/minimap.h index 4add2fe..c825c69 100644 --- a/minimap.h +++ b/minimap.h @@ -5,8 +5,6 @@ #include #include -#define MM_IDX_DEF_B 14 - #define MM_F_NO_SELF 0x001 #define MM_F_AVA 0x002 #define MM_F_CIGAR 0x004 @@ -105,46 +103,65 @@ typedef struct { int min_dp_max; int min_ksw_len; + int mini_batch_size; int32_t max_occ, mid_occ; } mm_mapopt_t; +typedef struct { + short k, w, is_hpc, bucket_bits; + int mini_batch_size; + uint64_t batch_size; +} mm_idxopt_t; + +struct mm_bseq_file_s; + +typedef struct { + int is_idx, n_parts; + mm_idxopt_t opt; + union { + struct mm_bseq_file_s *seq; + FILE *idx; + } fp; +} mm_idx_reader_t; + extern int mm_verbose, mm_dbg_flag; extern double mm_realtime0; struct mm_tbuf_s; typedef struct mm_tbuf_s mm_tbuf_t; -struct mm_bseq_file_s; - #define mm_seq4_set(s, i, c) ((s)[(i)>>3] |= (uint32_t)(c) << (((i)&7)<<2)) #define mm_seq4_get(s, i) ((s)[(i)>>3] >> (((i)&7)<<2) & 0xf) -// compute minimizers -void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p); +void mm_idxopt_init(mm_idxopt_t *opt); +void mm_mapopt_init(mm_mapopt_t *opt); +int mm_preset(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo); +void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi); + +mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt); +int mm_idx_reader_is_idx(const mm_idx_reader_t *r); +mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads); +void mm_idx_reader_close(mm_idx_reader_t *r); +void mm_idx_destroy(mm_idx_t *mi); // minimizer indexing -mm_idx_t *mm_idx_init(int w, int k, int b, int is_hpc); -void mm_idx_destroy(mm_idx_t *mi); -mm_idx_t *mm_idx_gen(struct mm_bseq_file_s *fp, int w, int k, int b, int is_hpc, int mini_batch_size, int n_threads, uint64_t batch_size, int keep_name); void mm_idx_stat(const mm_idx_t *idx); const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n); int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq); -mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads); -int mm_idx_is_idx(const char *fn); - // minimizer index I/O void mm_idx_dump(FILE *fp, const mm_idx_t *mi); mm_idx_t *mm_idx_load(FILE *fp); // mapping -void mm_mapopt_init(mm_mapopt_t *opt); -void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi); mm_tbuf_t *mm_tbuf_init(void); void mm_tbuf_destroy(mm_tbuf_t *b); mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name); -int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads, int tbatch_size); +int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads); + +// obsolete APIs (for backward compatibility) +mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads); #ifdef __cplusplus } diff --git a/mmpriv.h b/mmpriv.h index 11fba89..29141e2 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -40,6 +40,8 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end); void radix_sort_64(uint64_t *beg, uint64_t *end); uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); +void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p); + void mm_write_sam_SQ(const mm_idx_t *idx); void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *argv[]); void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);