r174: make max-chain-skip work

The max-chain-skip heuristics did not work due to a bug. Without this
heuristics, chaining is too slow for long-read overlap.
This commit is contained in:
Heng Li 2017-07-12 10:08:06 -04:00
parent 33451aba45
commit eeeb2ffb68
4 changed files with 30 additions and 9 deletions

14
chain.c
View File

@ -42,11 +42,6 @@ int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int
int64_t dr = ri - a[j].x;
int32_t dq = qi - (int32_t)a[j].y, dd, sc;
if (dr == 0 || dq <= 0 || dq > max_dist) continue;
if (t[j] == i) { // FIXME: never enter this block. i.e. max_skip has no effect at all. This may affect speed.
if (p[j] >= 0) t[p[j]] = i;
if (++n_skip > max_skip) break;
continue;
}
dd = dr > dq? dr - dq : dq - dr;
if (dd > bw) continue;
min_d = dq < dr? dq : dr;
@ -54,7 +49,14 @@ int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int
sc -= dd? ilog2_32(dd) * 2 : 0;
if (min_d > q_span) sc -= ilog2_32(min_d) / 2;
sc += f[j];
if (sc > max_f) max_f = sc, max_j = j;
if (sc > max_f) {
max_f = sc, max_j = j;
if (n_skip > 0) --n_skip;
} else if (t[j] == i) {
if (++n_skip > max_skip)
break;
}
if (p[j] >= 0) t[p[j]] = i;
}
if (max_j >= 0) f[i] = max_f, p[i] = max_j;
else f[i] = q_span, p[i] = -1;

4
main.c
View File

@ -10,7 +10,7 @@
#include "minimap.h"
#include "mmpriv.h"
#define MM_VERSION "2.0-r173-pre"
#define MM_VERSION "2.0-r174-pre"
void liftrlimit()
{
@ -49,6 +49,7 @@ static struct option long_options[] = {
{ "print-qname", no_argument, 0, 0 },
{ "no-self", no_argument, 0, 0 },
{ "print-seed", no_argument, 0, 0 },
{ "max-chain-skip", required_argument, 0, 0 },
{ "version", no_argument, 0, 'V' },
{ "min-count", required_argument, 0, 'n' },
{ "min-chain-score",required_argument, 0, 'm' },
@ -103,6 +104,7 @@ int main(int argc, char *argv[])
else if (c == 0 && long_idx == 4) mm_dbg_flag |= MM_DBG_PRINT_QNAME; // --print-qname
else if (c == 0 && long_idx == 5) opt.flag |= MM_F_NO_SELF; // --no-self
else if (c == 0 && long_idx == 6) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_SEED; // --print-seed
else if (c == 0 && long_idx == 7) opt.max_chain_skip = atoi(optarg); // --max-chain-skip
else if (c == 'V') {
puts(MM_VERSION);
return 0;

2
map.c
View File

@ -19,7 +19,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->min_chain_score = 40;
opt->bw = 500;
opt->max_gap = 5000;
opt->max_chain_skip = 15;
opt->max_chain_skip = 50;
opt->min_seedcov_ratio = 0.0f;
opt->mask_level = 0.5f;

View File

@ -1,4 +1,4 @@
.TH minimap2 1 "11 July 2017" "minimap2-2.0-r172-pre" "Bioinformatics tools"
.TH minimap2 1 "12 July 2017" "minimap2-2.0-r174-pre" "Bioinformatics tools"
.SH NAME
.PP
minimap2 - mapping and alignment between collections of DNA sequences
@ -192,6 +192,14 @@ PacBio/Oxford Nanopore read to reference mapping (-Hk19)
.B asm1m
Long assembly to reference mapping (-k19 -w19)
.RE
.TP
.BI --max-chain-skip \ INT
A heuristics that stops chaining early [50]. Minimap2 uses dynamic programming
for chaining. The time complexity is quadratic in the number of seeds. This
option makes minimap2 exits the inner loop if it repeatedly sees seeds already
on chains. Set
.I INT
to a large number to switch off this heurstics.
.SS Alignment options
.TP 10
.BI -A \ INT
@ -243,6 +251,12 @@ takes little CPU time).
.TP
.B -V
Print version number to stdout
.TP
.BI --mb-size \ STR
Number of bases loaded into memory to process in a mini-batch [200M].
Similar to option
.BR -I ,
K/M/G/k/m/g suffix is accepted. This option affects both indexing and mapping.
.SS Miscellaneous options
.TP 10
.B --no-kalloc
@ -253,6 +267,9 @@ multi-threading mode.
.TP
.B --print-qname
Print query names to stderr, mostly to see which query is crashing minimap2.
.TP
.B --print-seed
Print seed positions to stderr, for debugging only.
.SH OUTPUT FORMAT
.PP
Minimap2 outputs mapping positions in the Pairwise mApping Format (PAF) by