diff --git a/Makefile b/Makefile index c142c0b..1bfc70a 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ CFLAGS= -g -Wall -O2 -Wc++-compat CPPFLAGS= -DHAVE_KALLOC INCLUDES= -OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o ksw2_ll_sse.o +OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o ksw2_ll_sse.o PROG= minimap2 PROG_EXTRA= sdust minimap2-lite LIBS= -lm -lz -lpthread @@ -98,9 +98,12 @@ ksw2_extd2_sse.o: ksw2.h kalloc.h ksw2_exts2_sse.o: ksw2.h kalloc.h ksw2_extz2_sse.o: ksw2.h kalloc.h ksw2_ll_sse.o: ksw2.h kalloc.h +kthread.o: kthread.h main.o: bseq.h minimap.h mmpriv.h getopt.h map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h khash.h -misc.o: minimap.h ksort.h +map.o: ksort.h +misc.o: mmpriv.h minimap.h bseq.h ksort.h +options.o: mmpriv.h minimap.h bseq.h pe.o: mmpriv.h minimap.h bseq.h kvec.h kalloc.h ksort.h sdust.o: kalloc.h kdq.h kvec.h sdust.h -sketch.o: kvec.h kalloc.h minimap.h +sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h diff --git a/main.c b/main.c index 8779282..dce85c5 100644 --- a/main.c +++ b/main.c @@ -47,6 +47,7 @@ static struct option long_options[] = { { "end-seed-pen", required_argument, 0, 0 }, // 21 { "for-only", no_argument, 0, 0 }, // 22 { "rev-only", no_argument, 0, 0 }, // 23 + { "heap-sort", optional_argument, 0, 0 }, // 24 { "help", no_argument, 0, 'h' }, { "max-intron-len", required_argument, 0, 'G' }, { "version", no_argument, 0, 'V' }, @@ -169,6 +170,10 @@ int main(int argc, char *argv[]) if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0) opt.flag |= MM_F_SPLICE_FLANK; else opt.flag &= ~MM_F_SPLICE_FLANK; + } else if (c == 0 && long_idx == 24) { // --heap-sort + if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0) + opt.flag |= MM_F_HEAP_SORT; + else opt.flag &= ~MM_F_HEAP_SORT; } else if (c == 'S') { opt.flag |= MM_F_OUT_CS | MM_F_CIGAR | MM_F_OUT_CS_LONG; if (mm_verbose >= 2) diff --git a/map.c b/map.c index a4e0d2a..afdc31e 100644 --- a/map.c +++ b/map.c @@ -9,144 +9,6 @@ #include "bseq.h" #include "khash.h" -void mm_mapopt_init(mm_mapopt_t *opt) -{ - memset(opt, 0, sizeof(mm_mapopt_t)); - opt->seed = 11; - opt->mid_occ_frac = 2e-4f; - opt->sdust_thres = 0; // no SDUST masking - - opt->min_cnt = 3; - opt->min_chain_score = 40; - opt->bw = 500; - opt->max_gap = 5000; - opt->max_gap_ref = -1; - opt->max_chain_skip = 25; - - opt->mask_level = 0.5f; - opt->pri_ratio = 0.8f; - opt->best_n = 5; - - opt->max_join_long = 20000; - opt->max_join_short = 2000; - opt->min_join_flank_sc = 1000; - - opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1; - opt->zdrop = 400; - opt->end_bonus = -1; - opt->min_dp_max = opt->min_chain_score * opt->a; - opt->min_ksw_len = 200; - opt->anchor_ext_len = 20, opt->anchor_ext_shift = 6; - opt->mini_batch_size = 500000000; - - opt->pe_ori = 0; // FF - opt->pe_bonus = 33; -} - -void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi) -{ - if ((opt->flag & MM_F_SPLICE_FOR) && (opt->flag & MM_F_SPLICE_REV)) - opt->flag |= MM_F_SPLICE; - if (opt->mid_occ <= 0) - opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac); - if (mm_verbose >= 3) - fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ); -} - -void mm_mapopt_max_intron_len(mm_mapopt_t *opt, int max_intron_len) -{ - if ((opt->flag & MM_F_SPLICE) && max_intron_len > 0) - opt->max_gap_ref = opt->bw = max_intron_len; -} - -int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) -{ - if (preset == 0) { - mm_idxopt_init(io); - mm_mapopt_init(mo); - } else if (strcmp(preset, "ava-ont") == 0) { - io->flag = 0, io->k = 15, io->w = 5; - mo->flag |= MM_F_AVA | MM_F_NO_SELF; - mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25; - } else if (strcmp(preset, "ava-pb") == 0) { - io->flag |= MM_I_HPC, io->k = 19, io->w = 5; - mo->flag |= MM_F_AVA | MM_F_NO_SELF; - mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25; - } else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) { - io->flag |= MM_I_HPC, io->k = 19; - } else if (strcmp(preset, "map-ont") == 0) { - io->flag = 0, io->k = 15; - } else if (strcmp(preset, "asm5") == 0) { - io->flag = 0, io->k = 19, io->w = 19; - mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = 200; - mo->min_dp_max = 200; - mo->best_n = 50; - } else if (strcmp(preset, "asm10") == 0) { - io->flag = 0, io->k = 19, io->w = 19; - mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = 200; - mo->min_dp_max = 200; - mo->best_n = 50; - } else if (strcmp(preset, "short") == 0 || strcmp(preset, "sr") == 0) { - io->flag = 0, io->k = 21, io->w = 11; - mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS; - mo->pe_ori = 0<<1|1; // FR - mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 24, mo->e2 = 1; - mo->zdrop = 100; - mo->end_bonus = 10; - mo->max_frag_len = 800; - mo->max_gap = 100; - mo->bw = 100; - mo->pri_ratio = 0.5f; - mo->min_cnt = 2; - mo->min_chain_score = 25; - mo->min_dp_max = 40; - mo->best_n = 20; - mo->mid_occ = 1000; - mo->max_occ = 5000; - mo->mini_batch_size = 50000000; - } else if (strcmp(preset, "splice") == 0 || strcmp(preset, "cdna") == 0) { - io->flag = 0, io->k = 15, io->w = 5; - mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK; - mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000; - mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0; - mo->noncan = 9; - mo->zdrop = 200; - } else return -1; - return 0; -} - -int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo) -{ - if (mo->best_n < 0) { - if (mm_verbose >= 1) - fprintf(stderr, "[ERROR]\033[1;31m -N must be no less than 0\033[0m\n"); - return -4; - } - if (mo->best_n == 0 && mm_verbose >= 2) - fprintf(stderr, "[WARNING]\033[1;31m '-N 0' reduces mapping accuracy. Please use '--secondary=no' instead.\033[0m\n"); - if (mo->pri_ratio < 0.0f || mo->pri_ratio > 1.0f) { - if (mm_verbose >= 1) - fprintf(stderr, "[ERROR]\033[1;31m -p must be within 0 and 1 (including 0 and 1)\033[0m\n"); - return -4; - } - if ((mo->flag & MM_F_FOR_ONLY) && (mo->flag & MM_F_REV_ONLY)) { - if (mm_verbose >= 1) - fprintf(stderr, "[ERROR]\033[1;31m --for-only and --rev-only can't be applied at the same time\033[0m\n"); - return -3; - } - if ((mo->q != mo->q2 || mo->e != mo->e2) && !(mo->e > mo->e2 && mo->q + mo->e < mo->q2 + mo->e2)) { - if (mm_verbose >= 1) - fprintf(stderr, "[ERROR]\033[1;31m dual gap penalties violating E1>E2 and O1+E1q + mo->e) + (mo->q2 + mo->e2) > 127) { - if (mm_verbose >= 1) - fprintf(stderr, "[ERROR]\033[1;31m scoring system violating ({-O}+{-E})+({-O2}+{-E2}) <= 127\033[0m\n"); - return -1; - } - return 0; -} - struct mm_tbuf_s { void *km; }; @@ -436,7 +298,8 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** hash = __ac_Wang_hash(hash); collect_minimizers(b->km, opt, mi, n_segs, qlens, seqs, &mv); - a = collect_seed_hits(b->km, opt, opt->mid_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); + if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->mid_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); + else a = collect_seed_hits(b->km, opt, opt->mid_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); if (mm_dbg_flag & MM_DBG_PRINT_SEED) { fprintf(stderr, "RS\t%d\n", rep_len); @@ -476,7 +339,8 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** kfree(b->km, a); kfree(b->km, u); kfree(b->km, mini_pos); - a = collect_seed_hits(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); + if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); + else a = collect_seed_hits(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); } } diff --git a/minimap.h b/minimap.h index 26cf187..b244639 100644 --- a/minimap.h +++ b/minimap.h @@ -27,6 +27,7 @@ #define MM_F_SOFTCLIP 0x80000 #define MM_F_FOR_ONLY 0x100000 #define MM_F_REV_ONLY 0x200000 +#define MM_F_HEAP_SORT 0x400000 #define MM_I_HPC 0x1 #define MM_I_NO_SEQ 0x2 diff --git a/options.c b/options.c new file mode 100644 index 0000000..edf75de --- /dev/null +++ b/options.c @@ -0,0 +1,140 @@ +#include +#include "mmpriv.h" + +void mm_mapopt_init(mm_mapopt_t *opt) +{ + memset(opt, 0, sizeof(mm_mapopt_t)); + opt->seed = 11; + opt->mid_occ_frac = 2e-4f; + opt->sdust_thres = 0; // no SDUST masking + + opt->min_cnt = 3; + opt->min_chain_score = 40; + opt->bw = 500; + opt->max_gap = 5000; + opt->max_gap_ref = -1; + opt->max_chain_skip = 25; + + opt->mask_level = 0.5f; + opt->pri_ratio = 0.8f; + opt->best_n = 5; + + opt->max_join_long = 20000; + opt->max_join_short = 2000; + opt->min_join_flank_sc = 1000; + + opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1; + opt->zdrop = 400; + opt->end_bonus = -1; + opt->min_dp_max = opt->min_chain_score * opt->a; + opt->min_ksw_len = 200; + opt->anchor_ext_len = 20, opt->anchor_ext_shift = 6; + opt->mini_batch_size = 500000000; + + opt->pe_ori = 0; // FF + opt->pe_bonus = 33; +} + +void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi) +{ + if ((opt->flag & MM_F_SPLICE_FOR) && (opt->flag & MM_F_SPLICE_REV)) + opt->flag |= MM_F_SPLICE; + if (opt->mid_occ <= 0) + opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac); + if (mm_verbose >= 3) + fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ); +} + +void mm_mapopt_max_intron_len(mm_mapopt_t *opt, int max_intron_len) +{ + if ((opt->flag & MM_F_SPLICE) && max_intron_len > 0) + opt->max_gap_ref = opt->bw = max_intron_len; +} + +int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) +{ + if (preset == 0) { + mm_idxopt_init(io); + mm_mapopt_init(mo); + } else if (strcmp(preset, "ava-ont") == 0) { + io->flag = 0, io->k = 15, io->w = 5; + mo->flag |= MM_F_AVA | MM_F_NO_SELF; + mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25; + } else if (strcmp(preset, "ava-pb") == 0) { + io->flag |= MM_I_HPC, io->k = 19, io->w = 5; + mo->flag |= MM_F_AVA | MM_F_NO_SELF; + mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25; + } else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) { + io->flag |= MM_I_HPC, io->k = 19; + } else if (strcmp(preset, "map-ont") == 0) { + io->flag = 0, io->k = 15; + } else if (strcmp(preset, "asm5") == 0) { + io->flag = 0, io->k = 19, io->w = 19; + mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = 200; + mo->min_dp_max = 200; + mo->best_n = 50; + } else if (strcmp(preset, "asm10") == 0) { + io->flag = 0, io->k = 19, io->w = 19; + mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = 200; + mo->min_dp_max = 200; + mo->best_n = 50; + } else if (strcmp(preset, "short") == 0 || strcmp(preset, "sr") == 0) { + io->flag = 0, io->k = 21, io->w = 11; + mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS; + mo->pe_ori = 0<<1|1; // FR + mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 24, mo->e2 = 1; + mo->zdrop = 100; + mo->end_bonus = 10; + mo->max_frag_len = 800; + mo->max_gap = 100; + mo->bw = 100; + mo->pri_ratio = 0.5f; + mo->min_cnt = 2; + mo->min_chain_score = 25; + mo->min_dp_max = 40; + mo->best_n = 20; + mo->mid_occ = 1000; + mo->max_occ = 5000; + mo->mini_batch_size = 50000000; + } else if (strcmp(preset, "splice") == 0 || strcmp(preset, "cdna") == 0) { + io->flag = 0, io->k = 15, io->w = 5; + mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK; + mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000; + mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0; + mo->noncan = 9; + mo->zdrop = 200; + } else return -1; + return 0; +} + +int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo) +{ + if (mo->best_n < 0) { + if (mm_verbose >= 1) + fprintf(stderr, "[ERROR]\033[1;31m -N must be no less than 0\033[0m\n"); + return -4; + } + if (mo->best_n == 0 && mm_verbose >= 2) + fprintf(stderr, "[WARNING]\033[1;31m '-N 0' reduces mapping accuracy. Please use '--secondary=no' instead.\033[0m\n"); + if (mo->pri_ratio < 0.0f || mo->pri_ratio > 1.0f) { + if (mm_verbose >= 1) + fprintf(stderr, "[ERROR]\033[1;31m -p must be within 0 and 1 (including 0 and 1)\033[0m\n"); + return -4; + } + if ((mo->flag & MM_F_FOR_ONLY) && (mo->flag & MM_F_REV_ONLY)) { + if (mm_verbose >= 1) + fprintf(stderr, "[ERROR]\033[1;31m --for-only and --rev-only can't be applied at the same time\033[0m\n"); + return -3; + } + if ((mo->q != mo->q2 || mo->e != mo->e2) && !(mo->e > mo->e2 && mo->q + mo->e < mo->q2 + mo->e2)) { + if (mm_verbose >= 1) + fprintf(stderr, "[ERROR]\033[1;31m dual gap penalties violating E1>E2 and O1+E1q + mo->e) + (mo->q2 + mo->e2) > 127) { + if (mm_verbose >= 1) + fprintf(stderr, "[ERROR]\033[1;31m scoring system violating ({-O}+{-E})+({-O2}+{-E2}) <= 127\033[0m\n"); + return -1; + } + return 0; +} diff --git a/setup.py b/setup.py index 13365e7..9ed5556 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ setup( keywords = 'sequence-alignment', scripts = ['python/minimap2.py'], ext_modules = [Extension('mappy', - sources = [module_src, 'align.c', 'bseq.c', 'chain.c', 'format.c', 'hit.c', 'index.c', 'pe.c', + sources = [module_src, 'align.c', 'bseq.c', 'chain.c', 'format.c', 'hit.c', 'index.c', 'pe.c', 'options.c', 'ksw2_extd2_sse.c', 'ksw2_exts2_sse.c', 'ksw2_extz2_sse.c', 'ksw2_ll_sse.c', 'kalloc.c', 'kthread.c', 'map.c', 'misc.c', 'sdust.c', 'sketch.c', 'esterr.c'], depends = ['minimap.h', 'bseq.h', 'kalloc.h', 'kdq.h', 'khash.h', 'kseq.h', 'ksort.h',