diff --git a/chain.c b/chain.c index f7c05cd..be9d51b 100644 --- a/chain.c +++ b/chain.c @@ -19,7 +19,7 @@ static inline int ilog2_32(uint32_t v) return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v]; } -mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km) +mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km) { // TODO: make sure this works when n has more than 32 bits int32_t k, *f, *p, *t, *v, n_u, n_v; int64_t i, j, st = 0; @@ -49,6 +49,7 @@ mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int m int32_t max_f = q_span, n_skip = 0, min_d; int32_t sidi = (a[i].y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT; while (st < i && ri > a[st].x + max_dist_x) ++st; + if (i - st > max_iter) st = i - max_iter; for (j = i - 1; j >= st; --j) { int64_t dr = ri - a[j].x; int32_t dq = qi - (int32_t)a[j].y, dd, sc, log_dd; diff --git a/main.c b/main.c index 34c4696..6ff3a4b 100644 --- a/main.c +++ b/main.c @@ -62,6 +62,7 @@ static ko_longopt_t long_options[] = { { "hard-mask-level",ko_no_argument, 336 }, { "cap-sw-mem", ko_required_argument, 337 }, { "max-qlen", ko_required_argument, 338 }, + { "max-chain-iter", ko_required_argument, 339 }, { "help", ko_no_argument, 'h' }, { "max-intron-len", ko_required_argument, 'G' }, { "version", ko_no_argument, 'V' }, @@ -179,6 +180,7 @@ int main(int argc, char *argv[]) else if (c == 304) mm_dbg_flag |= MM_DBG_PRINT_QNAME; // --print-qname else if (c == 306) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_SEED, n_threads = 1; // --print-seed else if (c == 307) opt.max_chain_skip = atoi(o.arg); // --max-chain-skip + else if (c == 339) opt.max_chain_iter = atoi(o.arg); // --max-chain-iter else if (c == 308) opt.min_ksw_len = atoi(o.arg); // --min-dp-len else if (c == 309) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ, n_threads = 1; // --print-aln-seq else if (c == 310) opt.flag |= MM_F_SPLICE; // --splice diff --git a/map.c b/map.c index 0f1bac2..41de91e 100644 --- a/map.c +++ b/map.c @@ -312,7 +312,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** if (max_chain_gap_ref < opt->max_gap) max_chain_gap_ref = opt->max_gap; } else max_chain_gap_ref = opt->max_gap; - a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); + a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); if (opt->max_occ > opt->mid_occ && rep_len > 0) { int rechain = 0; @@ -334,7 +334,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** kfree(b->km, mini_pos); if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); else a = collect_seed_hits(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); - a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); + a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km); } } b->frag_gap = max_chain_gap_ref; diff --git a/minimap.h b/minimap.h index 0b44325..176342b 100644 --- a/minimap.h +++ b/minimap.h @@ -112,7 +112,7 @@ typedef struct { int bw; // bandwidth int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window int max_frag_len; - int max_chain_skip; + int max_chain_skip, max_chain_iter; int min_cnt; // min number of minimizers on each chain int min_chain_score; // min chaining score diff --git a/minimap2.1 b/minimap2.1 index 016af5e..ca5f4ba 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "5 Feburary 2019" "minimap2-2.15-dirty (r913)" "Bioinformatics tools" +.TH minimap2 1 "27 Feburary 2019" "minimap2-2.15-dirty (r917)" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences @@ -232,13 +232,19 @@ Honor option and disable a heurstic to save unmapped subsequences. .TP .BI --max-chain-skip \ INT -A heuristics that stops chaining early [50]. Minimap2 uses dynamic programming +A heuristics that stops chaining early [25]. Minimap2 uses dynamic programming for chaining. The time complexity is quadratic in the number of seeds. This option makes minimap2 exits the inner loop if it repeatedly sees seeds already on chains. Set .I INT to a large number to switch off this heurstics. .TP +.BI --max-chain-iter \ INT +Check up to +.I INT +partial chains during chaining [5000]. This is a heuristic to avoid quadratic +time complexity in the worst case. +.TP .B --no-long-join Disable the long gap patching heuristic. When this option is applied, the maximum alignment gap is mostly controlled by diff --git a/mmpriv.h b/mmpriv.h index a059701..c9b91cd 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -69,7 +69,7 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se void mm_idxopt_init(mm_idxopt_t *opt); const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n); int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f); -mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km); +mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km); mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a); mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a); diff --git a/options.c b/options.c index b6d9004..eac73e3 100644 --- a/options.c +++ b/options.c @@ -23,6 +23,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->max_gap = 5000; opt->max_gap_ref = -1; opt->max_chain_skip = 25; + opt->max_chain_iter = 5000; opt->mask_level = 0.5f; opt->pri_ratio = 0.8f; diff --git a/python/cmappy.pxd b/python/cmappy.pxd index 5a3927e..1bbe749 100644 --- a/python/cmappy.pxd +++ b/python/cmappy.pxd @@ -13,10 +13,11 @@ cdef extern from "minimap.h": int seed int sdust_thres int flag + int max_qlen int bw int max_gap, max_gap_ref int max_frag_len - int max_chain_skip + int max_chain_skip, max_chain_iter int min_cnt int min_chain_score float mask_level @@ -24,7 +25,7 @@ cdef extern from "minimap.h": int best_n int max_join_long, max_join_short int min_join_flank_sc - float min_join_flank_ratio; + float min_join_flank_ratio int a, b, q, e, q2, e2 int sc_ambi int noncan