From bbb4f97e52955e9574b516e211b91c02052432e1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 3 May 2021 09:27:04 -0400 Subject: [PATCH] support RMQ --- main.c | 3 +++ map.c | 11 ++++++++--- minimap.h | 2 ++ options.c | 47 ++++++++++++++++++++++++++--------------------- 4 files changed, 39 insertions(+), 24 deletions(-) diff --git a/main.c b/main.c index 5234e86..246e0f6 100644 --- a/main.c +++ b/main.c @@ -71,6 +71,7 @@ static ko_longopt_t long_options[] = { { "alt", ko_required_argument, 344 }, { "alt-drop", ko_required_argument, 345 }, { "mask-len", ko_required_argument, 346 }, + { "rmq", ko_optional_argument, 347 }, { "help", ko_no_argument, 'h' }, { "max-intron-len", ko_required_argument, 'G' }, { "version", ko_no_argument, 'V' }, @@ -241,6 +242,8 @@ int main(int argc, char *argv[]) yes_or_no(&opt, MM_F_HEAP_SORT, o.longidx, o.arg, 1); } else if (c == 326) { // --dual yes_or_no(&opt, MM_F_NO_DUAL, o.longidx, o.arg, 0); + } else if (c == 347) { // --rmq + yes_or_no(&opt, MM_F_RMQ, o.longidx, o.arg, 1); } else if (c == 'S') { opt.flag |= MM_F_OUT_CS | MM_F_CIGAR | MM_F_OUT_CS_LONG; if (mm_verbose >= 2) diff --git a/map.c b/map.c index 48d1f09..fd648c4 100644 --- a/map.c +++ b/map.c @@ -271,10 +271,15 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** if (max_chain_gap_ref < opt->max_gap) max_chain_gap_ref = opt->max_gap; } else max_chain_gap_ref = opt->max_gap; - a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, - opt->chain_gap_scale * 0.2f, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); + if (opt->flag & MM_F_RMQ) { + a = mg_lchain_rmq(opt->max_gap, opt->rmq_inner_dist, opt->bw, opt->max_chain_skip, opt->rmq_size_cap, opt->min_cnt, opt->min_chain_score, + opt->chain_gap_scale * 0.2f, 0.0f, n_a, a, &n_regs0, &u, b->km); + } else { + a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, + opt->chain_gap_scale * 0.2f, 0.0f, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); + } - if (opt->max_occ > opt->mid_occ && rep_len > 0) { + if (opt->max_occ > opt->mid_occ && rep_len > 0 && !(opt->flag & MM_F_RMQ)) { int rechain = 0; if (n_regs0 > 0) { // test if the best chain has all the segments int n_chained_segs = 1, max = 0, max_i = -1, max_off = -1, off = 0; diff --git a/minimap.h b/minimap.h index 543a0ed..281ca3d 100644 --- a/minimap.h +++ b/minimap.h @@ -36,6 +36,7 @@ #define MM_F_NO_END_FLT 0x10000000 #define MM_F_HARD_MLEVEL 0x20000000 #define MM_F_SAM_HIT_ONLY 0x40000000 +#define MM_F_RMQ 0x80000000LL #define MM_I_HPC 0x1 #define MM_I_NO_SEQ 0x2 @@ -120,6 +121,7 @@ typedef struct { int min_cnt; // min number of minimizers on each chain int min_chain_score; // min chaining score float chain_gap_scale; + int rmq_size_cap, rmq_inner_dist; float mask_level; int mask_len; diff --git a/options.c b/options.c index 2044b62..4bba3ec 100644 --- a/options.c +++ b/options.c @@ -22,13 +22,15 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->min_cnt = 3; opt->min_chain_score = 40; opt->bw = 500; - opt->max_gap = 5000; + opt->max_gap = 10000; opt->max_gap_ref = -1; opt->max_chain_skip = 25; opt->max_chain_iter = 5000; + opt->rmq_inner_dist = 1000; + opt->rmq_size_cap = 100000; opt->chain_gap_scale = 1.0f; opt->max_max_occ = 4095; - opt->occ_dist = 0; + opt->occ_dist = 500; opt->mask_level = 0.5f; opt->mask_len = INT_MAX; @@ -51,6 +53,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->anchor_ext_len = 20, opt->anchor_ext_shift = 6; opt->max_clip_ratio = 1.0f; opt->mini_batch_size = 500000000; + opt->max_sw_mat = 100000000; opt->pe_ori = 0; // FF opt->pe_bonus = 33; @@ -81,19 +84,18 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) if (preset == 0) { mm_idxopt_init(io); mm_mapopt_init(mo); + } else if (strcmp(preset, "map-ont") == 0) { // this is the same as the default } else if (strcmp(preset, "ava-ont") == 0) { io->flag = 0, io->k = 15, io->w = 5; mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN; mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25; mo->bw = 2000; + } else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) { + io->flag |= MM_I_HPC, io->k = 19; } else if (strcmp(preset, "ava-pb") == 0) { io->flag |= MM_I_HPC, io->k = 19, io->w = 5; mo->flag |= MM_F_ALL_CHAINS | MM_F_NO_DIAG | MM_F_NO_DUAL | MM_F_NO_LJOIN; mo->min_chain_score = 100, mo->pri_ratio = 0.0f, mo->max_gap = 10000, mo->max_chain_skip = 25; - } else if (strcmp(preset, "map10k") == 0 || strcmp(preset, "map-pb") == 0) { - io->flag |= MM_I_HPC, io->k = 19; - } else if (strcmp(preset, "map-ont") == 0) { - io->flag = 0, io->k = 15; } else if (strcmp(preset, "map-hifi") == 0 || strcmp(preset, "map-ccs") == 0) { io->flag = 0, io->k = 19, io->w = 19; mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1; @@ -101,24 +103,21 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mo->occ_dist = 500; mo->min_mid_occ = 100, mo->max_mid_occ = 500; mo->min_dp_max = 200; - } else if (strcmp(preset, "asm5") == 0) { + } else if (strncmp(preset, "asm", 3) == 0) { io->flag = 0, io->k = 19, io->w = 19; - mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; - mo->min_mid_occ = 100; - mo->min_dp_max = 200; - mo->best_n = 50; - } else if (strcmp(preset, "asm10") == 0) { - io->flag = 0, io->k = 19, io->w = 19; - mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; - mo->min_mid_occ = 100; - mo->min_dp_max = 200; - mo->best_n = 50; - } else if (strcmp(preset, "asm20") == 0) { - io->flag = 0, io->k = 19, io->w = 10; - mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; - mo->min_mid_occ = 100; + mo->bw = 100000; + mo->flag |= MM_F_RMQ | MM_F_NO_LJOIN; + mo->min_mid_occ = 100, mo->max_mid_occ = 500; mo->min_dp_max = 200; mo->best_n = 50; + if (strcmp(preset, "asm5") == 0) { + mo->a = 1, mo->b = 19, mo->q = 39, mo->q2 = 81, mo->e = 3, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; + } else if (strcmp(preset, "asm10") == 0) { + mo->a = 1, mo->b = 9, mo->q = 16, mo->q2 = 41, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; + } else if (strcmp(preset, "asm20") == 0) { + mo->a = 1, mo->b = 4, mo->q = 6, mo->q2 = 26, mo->e = 2, mo->e2 = 1, mo->zdrop = mo->zdrop_inv = 200; + io->w = 10; + } else return -1; } else if (strcmp(preset, "short") == 0 || strcmp(preset, "sr") == 0) { io->flag = 0, io->k = 21, io->w = 11; mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS | MM_F_HEAP_SORT; @@ -140,6 +139,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) } else if (strncmp(preset, "splice", 6) == 0 || strcmp(preset, "cdna") == 0) { io->flag = 0, io->k = 15, io->w = 5; mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK; + mo->max_sw_mat = 0; mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000; mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0; mo->noncan = 9; @@ -153,6 +153,11 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo) { + if ((mo->flag & MM_F_RMQ) && (mo->flag & (MM_F_SR|MM_F_SPLICE))) { + if (mm_verbose >= 1) + fprintf(stderr, "[ERROR]\033[1;31m --rmq doesn't work with --sr or --splice\033[0m\n"); + return -7; + } if (mo->split_prefix && (mo->flag & (MM_F_OUT_CS|MM_F_OUT_MD))) { if (mm_verbose >= 1) fprintf(stderr, "[ERROR]\033[1;31m --cs or --MD doesn't work with --split-prefix\033[0m\n");