From eeeb2ffb6852c0e74b9baf551b39eec62fc61ff2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 12 Jul 2017 10:08:06 -0400 Subject: [PATCH] r174: make max-chain-skip work The max-chain-skip heuristics did not work due to a bug. Without this heuristics, chaining is too slow for long-read overlap. --- chain.c | 14 ++++++++------ main.c | 4 +++- map.c | 2 +- minimap2.1 | 19 ++++++++++++++++++- 4 files changed, 30 insertions(+), 9 deletions(-) diff --git a/chain.c b/chain.c index 3887053..0706a71 100644 --- a/chain.c +++ b/chain.c @@ -42,11 +42,6 @@ int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int int64_t dr = ri - a[j].x; int32_t dq = qi - (int32_t)a[j].y, dd, sc; if (dr == 0 || dq <= 0 || dq > max_dist) continue; - if (t[j] == i) { // FIXME: never enter this block. i.e. max_skip has no effect at all. This may affect speed. - if (p[j] >= 0) t[p[j]] = i; - if (++n_skip > max_skip) break; - continue; - } dd = dr > dq? dr - dq : dq - dr; if (dd > bw) continue; min_d = dq < dr? dq : dr; @@ -54,7 +49,14 @@ int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int sc -= dd? ilog2_32(dd) * 2 : 0; if (min_d > q_span) sc -= ilog2_32(min_d) / 2; sc += f[j]; - if (sc > max_f) max_f = sc, max_j = j; + if (sc > max_f) { + max_f = sc, max_j = j; + if (n_skip > 0) --n_skip; + } else if (t[j] == i) { + if (++n_skip > max_skip) + break; + } + if (p[j] >= 0) t[p[j]] = i; } if (max_j >= 0) f[i] = max_f, p[i] = max_j; else f[i] = q_span, p[i] = -1; diff --git a/main.c b/main.c index 71b646e..e562756 100644 --- a/main.c +++ b/main.c @@ -10,7 +10,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r173-pre" +#define MM_VERSION "2.0-r174-pre" void liftrlimit() { @@ -49,6 +49,7 @@ static struct option long_options[] = { { "print-qname", no_argument, 0, 0 }, { "no-self", no_argument, 0, 0 }, { "print-seed", no_argument, 0, 0 }, + { "max-chain-skip", required_argument, 0, 0 }, { "version", no_argument, 0, 'V' }, { "min-count", required_argument, 0, 'n' }, { "min-chain-score",required_argument, 0, 'm' }, @@ -103,6 +104,7 @@ int main(int argc, char *argv[]) else if (c == 0 && long_idx == 4) mm_dbg_flag |= MM_DBG_PRINT_QNAME; // --print-qname else if (c == 0 && long_idx == 5) opt.flag |= MM_F_NO_SELF; // --no-self else if (c == 0 && long_idx == 6) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_SEED; // --print-seed + else if (c == 0 && long_idx == 7) opt.max_chain_skip = atoi(optarg); // --max-chain-skip else if (c == 'V') { puts(MM_VERSION); return 0; diff --git a/map.c b/map.c index a08cb1d..f883632 100644 --- a/map.c +++ b/map.c @@ -19,7 +19,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->min_chain_score = 40; opt->bw = 500; opt->max_gap = 5000; - opt->max_chain_skip = 15; + opt->max_chain_skip = 50; opt->min_seedcov_ratio = 0.0f; opt->mask_level = 0.5f; diff --git a/minimap2.1 b/minimap2.1 index c7a6b67..4612b9d 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "11 July 2017" "minimap2-2.0-r172-pre" "Bioinformatics tools" +.TH minimap2 1 "12 July 2017" "minimap2-2.0-r174-pre" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences @@ -192,6 +192,14 @@ PacBio/Oxford Nanopore read to reference mapping (-Hk19) .B asm1m Long assembly to reference mapping (-k19 -w19) .RE +.TP +.BI --max-chain-skip \ INT +A heuristics that stops chaining early [50]. Minimap2 uses dynamic programming +for chaining. The time complexity is quadratic in the number of seeds. This +option makes minimap2 exits the inner loop if it repeatedly sees seeds already +on chains. Set +.I INT +to a large number to switch off this heurstics. .SS Alignment options .TP 10 .BI -A \ INT @@ -243,6 +251,12 @@ takes little CPU time). .TP .B -V Print version number to stdout +.TP +.BI --mb-size \ STR +Number of bases loaded into memory to process in a mini-batch [200M]. +Similar to option +.BR -I , +K/M/G/k/m/g suffix is accepted. This option affects both indexing and mapping. .SS Miscellaneous options .TP 10 .B --no-kalloc @@ -253,6 +267,9 @@ multi-threading mode. .TP .B --print-qname Print query names to stderr, mostly to see which query is crashing minimap2. +.TP +.B --print-seed +Print seed positions to stderr, for debugging only. .SH OUTPUT FORMAT .PP Minimap2 outputs mapping positions in the Pairwise mApping Format (PAF) by