r917: added --max-chain-iter to avoid worst case

Resolves #324
This commit is contained in:
Heng Li 2019-02-27 14:41:01 -05:00
parent ccf1680aaf
commit d431dc0181
8 changed files with 20 additions and 9 deletions

View File

@ -19,7 +19,7 @@ static inline int ilog2_32(uint32_t v)
return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v];
}
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km)
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km)
{ // TODO: make sure this works when n has more than 32 bits
int32_t k, *f, *p, *t, *v, n_u, n_v;
int64_t i, j, st = 0;
@ -49,6 +49,7 @@ mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int m
int32_t max_f = q_span, n_skip = 0, min_d;
int32_t sidi = (a[i].y & MM_SEED_SEG_MASK) >> MM_SEED_SEG_SHIFT;
while (st < i && ri > a[st].x + max_dist_x) ++st;
if (i - st > max_iter) st = i - max_iter;
for (j = i - 1; j >= st; --j) {
int64_t dr = ri - a[j].x;
int32_t dq = qi - (int32_t)a[j].y, dd, sc, log_dd;

2
main.c
View File

@ -62,6 +62,7 @@ static ko_longopt_t long_options[] = {
{ "hard-mask-level",ko_no_argument, 336 },
{ "cap-sw-mem", ko_required_argument, 337 },
{ "max-qlen", ko_required_argument, 338 },
{ "max-chain-iter", ko_required_argument, 339 },
{ "help", ko_no_argument, 'h' },
{ "max-intron-len", ko_required_argument, 'G' },
{ "version", ko_no_argument, 'V' },
@ -179,6 +180,7 @@ int main(int argc, char *argv[])
else if (c == 304) mm_dbg_flag |= MM_DBG_PRINT_QNAME; // --print-qname
else if (c == 306) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_SEED, n_threads = 1; // --print-seed
else if (c == 307) opt.max_chain_skip = atoi(o.arg); // --max-chain-skip
else if (c == 339) opt.max_chain_iter = atoi(o.arg); // --max-chain-iter
else if (c == 308) opt.min_ksw_len = atoi(o.arg); // --min-dp-len
else if (c == 309) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ, n_threads = 1; // --print-aln-seq
else if (c == 310) opt.flag |= MM_F_SPLICE; // --splice

4
map.c
View File

@ -312,7 +312,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
if (max_chain_gap_ref < opt->max_gap) max_chain_gap_ref = opt->max_gap;
} else max_chain_gap_ref = opt->max_gap;
a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
if (opt->max_occ > opt->mid_occ && rep_len > 0) {
int rechain = 0;
@ -334,7 +334,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
kfree(b->km, mini_pos);
if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
else a = collect_seed_hits(b->km, opt, opt->max_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos);
a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
a = mm_chain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_chain_skip, opt->max_chain_iter, opt->min_cnt, opt->min_chain_score, is_splice, n_segs, n_a, a, &n_regs0, &u, b->km);
}
}
b->frag_gap = max_chain_gap_ref;

View File

@ -112,7 +112,7 @@ typedef struct {
int bw; // bandwidth
int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window
int max_frag_len;
int max_chain_skip;
int max_chain_skip, max_chain_iter;
int min_cnt; // min number of minimizers on each chain
int min_chain_score; // min chaining score

View File

@ -1,4 +1,4 @@
.TH minimap2 1 "5 Feburary 2019" "minimap2-2.15-dirty (r913)" "Bioinformatics tools"
.TH minimap2 1 "27 Feburary 2019" "minimap2-2.15-dirty (r917)" "Bioinformatics tools"
.SH NAME
.PP
minimap2 - mapping and alignment between collections of DNA sequences
@ -232,13 +232,19 @@ Honor option
and disable a heurstic to save unmapped subsequences.
.TP
.BI --max-chain-skip \ INT
A heuristics that stops chaining early [50]. Minimap2 uses dynamic programming
A heuristics that stops chaining early [25]. Minimap2 uses dynamic programming
for chaining. The time complexity is quadratic in the number of seeds. This
option makes minimap2 exits the inner loop if it repeatedly sees seeds already
on chains. Set
.I INT
to a large number to switch off this heurstics.
.TP
.BI --max-chain-iter \ INT
Check up to
.I INT
partial chains during chaining [5000]. This is a heuristic to avoid quadratic
time complexity in the worst case.
.TP
.B --no-long-join
Disable the long gap patching heuristic. When this option is applied, the
maximum alignment gap is mostly controlled by

View File

@ -69,7 +69,7 @@ void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
void mm_idxopt_init(mm_idxopt_t *opt);
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f);
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a);

View File

@ -23,6 +23,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->max_gap = 5000;
opt->max_gap_ref = -1;
opt->max_chain_skip = 25;
opt->max_chain_iter = 5000;
opt->mask_level = 0.5f;
opt->pri_ratio = 0.8f;

View File

@ -13,10 +13,11 @@ cdef extern from "minimap.h":
int seed
int sdust_thres
int flag
int max_qlen
int bw
int max_gap, max_gap_ref
int max_frag_len
int max_chain_skip
int max_chain_skip, max_chain_iter
int min_cnt
int min_chain_score
float mask_level
@ -24,7 +25,7 @@ cdef extern from "minimap.h":
int best_n
int max_join_long, max_join_short
int min_join_flank_sc
float min_join_flank_ratio;
float min_join_flank_ratio
int a, b, q, e, q2, e2
int sc_ambi
int noncan