r287: refined CLI options and manpage

This commit is contained in:
Heng Li 2017-08-12 12:26:04 -04:00
parent 0f4c823b0c
commit d240318741
5 changed files with 61 additions and 26 deletions

View File

@ -135,7 +135,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
for (i = 0; i < tlen; ++i) fputc("ACGTN"[tseq[i]], stderr); fputc('\n', stderr);
for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr); fputc('\n', stderr);
}
if (opt->flag & MM_F_CDNA)
if (opt->flag & MM_F_SPLICE)
ksw_extd2_noins_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, 0, -1, opt->zdrop, flag, ez);
else if (opt->q == opt->q2 && opt->e == opt->e2)
ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, opt->zdrop, flag, ez);
@ -259,7 +259,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
bw = (int)(opt->bw * 1.5 + 1.);
r2->cnt = 0;
if (!(opt->flag & MM_F_CDNA))
if (!(opt->flag & MM_F_SPLICE))
mm_fix_bad_ends(r, a, opt->bw, &as1, &cnt1);
else as1 = r->as, cnt1 = r->cnt;
mm_filter_bad_seeds(km, as1, cnt1, a, 10, 40, opt->max_gap>>1, 10);
@ -362,7 +362,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
assert(re1 - rs1 <= re0 - rs0);
if (r->p) {
mm_idx_getseq(mi, rid, rs1, re1, tseq);
mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e, (opt->flag&MM_F_CDNA)? opt->q2 : 0);
mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e, (opt->flag&MM_F_SPLICE)? opt->q2 : 0);
}
kfree(km, tseq);
@ -403,7 +403,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
if (ez->n_cigar == 0) goto end_align1_inv; // should never be here
mm_append_cigar(r_inv, ez->n_cigar, ez->cigar);
r_inv->p->dp_score = ez->max;
mm_update_extra(r_inv->p, qseq + q_off, tseq + t_off, mat, opt->q, opt->e, (opt->flag&MM_F_CDNA)? opt->q2 : 0);
mm_update_extra(r_inv->p, qseq + q_off, tseq + t_off, mat, opt->q, opt->e, (opt->flag&MM_F_SPLICE)? opt->q2 : 0);
r_inv->id = -1;
r_inv->parent = MM_PARENT_UNSET;
r_inv->inv = 1;

43
main.c
View File

@ -8,7 +8,7 @@
#include "minimap.h"
#include "mmpriv.h"
#define MM_VERSION "2.0-r286-dirty"
#define MM_VERSION "2.0-r287-dirty"
void liftrlimit()
{
@ -31,6 +31,8 @@ static struct option long_options[] = {
{ "max-chain-skip", required_argument, 0, 0 },
{ "min-dp-len", required_argument, 0, 0 },
{ "print-aln-seq", no_argument, 0, 0 },
{ "splice", no_argument, 0, 0 },
{ "max-intron-len", required_argument, 0, 'G' },
{ "version", no_argument, 0, 'V' },
{ "min-count", required_argument, 0, 'n' },
{ "min-chain-score",required_argument, 0, 'm' },
@ -40,10 +42,21 @@ static struct option long_options[] = {
{ 0, 0, 0, 0}
};
static inline int64_t mm_parse_num(const char *str)
{
double x;
char *p;
x = strtod(optarg, &p);
if (*p == 'G' || *p == 'g') x *= 1e9;
else if (*p == 'M' || *p == 'm') x *= 1e6;
else if (*p == 'K' || *p == 'k') x *= 1e3;
return (int64_t)(x + .499);
}
int main(int argc, char *argv[])
{
mm_mapopt_t opt;
int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0;
int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0, max_intron_len = 0;
int minibatch_size = 200000000;
uint64_t batch_size = 4000000000ULL;
mm_bseq_file_t *fp = 0;
@ -59,12 +72,12 @@ int main(int argc, char *argv[])
else if (c == 'k') k = atoi(optarg), idx_par_set = 1;
else if (c == 'H') is_hpc = 1, idx_par_set = 1;
else if (c == 'd') fnw = optarg; // the above are indexing related options, except -I
else if (c == 'r') opt.bw = atoi(optarg);
else if (c == 'r') opt.bw = (int)mm_parse_num(optarg);
else if (c == 'f') opt.mid_occ_frac = atof(optarg);
else if (c == 't') n_threads = atoi(optarg);
else if (c == 'v') mm_verbose = atoi(optarg);
else if (c == 'g') opt.max_gap = atoi(optarg);
else if (c == 'G') opt.max_gap_ref = atoi(optarg);
else if (c == 'g') opt.max_gap = (int)mm_parse_num(optarg);
else if (c == 'G') max_intron_len = (int)mm_parse_num(optarg);
else if (c == 'N') opt.best_n = atoi(optarg);
else if (c == 'p') opt.pri_ratio = atof(optarg);
else if (c == 'M') opt.mask_level = atof(optarg);
@ -80,6 +93,8 @@ int main(int argc, char *argv[])
else if (c == 'B') opt.b = atoi(optarg);
else if (c == 'z') opt.zdrop = atoi(optarg);
else if (c == 's') opt.min_dp_max = atoi(optarg);
else if (c == 'I') batch_size = mm_parse_num(optarg);
else if (c == 'K') minibatch_size = (int)mm_parse_num(optarg);
else if (c == 0 && long_idx == 0) bucket_bits = atoi(optarg); // --bucket-bits
else if (c == 0 && long_idx == 2) keep_name = 0; // --int-rname
else if (c == 0 && long_idx == 3) mm_dbg_flag |= MM_DBG_NO_KALLOC; // --no-kalloc
@ -89,6 +104,7 @@ int main(int argc, char *argv[])
else if (c == 0 && long_idx == 7) opt.max_chain_skip = atoi(optarg); // --max-chain-skip
else if (c == 0 && long_idx == 8) opt.min_ksw_len = atoi(optarg); // --min-dp-len
else if (c == 0 && long_idx == 9) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ; // --print-aln-seq
else if (c == 0 && long_idx ==10) opt.flag |= MM_F_SPLICE; // --splice
else if (c == 'V') {
puts(MM_VERSION);
return 0;
@ -98,15 +114,6 @@ int main(int argc, char *argv[])
} else if (c == 'E') {
opt.e = opt.e2 = strtol(optarg, &s, 10);
if (*s == ',') opt.e2 = strtol(s + 1, &s, 10);
} else if (c == 'I' || c == 'K') {
double x;
char *p;
x = strtod(optarg, &p);
if (*p == 'G' || *p == 'g') x *= 1e9;
else if (*p == 'M' || *p == 'm') x *= 1e6;
else if (*p == 'K' || *p == 'k') x *= 1e3;
if (c == 'I') batch_size = (uint64_t)(x + .499);
else minibatch_size = (uint64_t)(x + .499);
} else if (c == 'x') {
if (strcmp(optarg, "ava-ont") == 0) {
opt.flag |= MM_F_AVA | MM_F_NO_SELF;
@ -130,9 +137,9 @@ int main(int argc, char *argv[])
k = 19, w = 19;
opt.a = 1, opt.b = 9, opt.q = 16, opt.q2 = 41, opt.e = 2, opt.e2 = 1, opt.zdrop = 200;
opt.min_dp_max = 200;
} else if (strcmp(optarg, "cdna") == 0) {
} else if (strcmp(optarg, "splice") == 0 || strcmp(optarg, "cdna") == 0) {
k = 15, w = 5;
opt.flag |= MM_F_CDNA;
opt.flag |= MM_F_SPLICE;
opt.max_gap = 2000, opt.max_gap_ref = opt.bw = 100000;
opt.a = 1, opt.b = 2, opt.q = 2, opt.e = 1, opt.q2 = 32, opt.e2 = 0;
opt.zdrop = 200;
@ -143,6 +150,8 @@ int main(int argc, char *argv[])
}
}
if (w < 0) w = (int)(.6666667 * k + .499);
if ((opt.flag & MM_F_SPLICE) && max_intron_len > 0)
opt.max_gap_ref = opt.bw = max_intron_len;
if (argc == optind) {
fprintf(stderr, "Usage: minimap2 [options] <target.fa>|<target.idx> [query.fa] [...]\n");
@ -163,6 +172,7 @@ int main(int argc, char *argv[])
fprintf(stderr, " -X skip self and dual mappings (for the all-vs-all mode)\n");
fprintf(stderr, " -p FLOAT min secondary-to-primary score ratio [%g]\n", opt.pri_ratio);
fprintf(stderr, " -N INT retain at most INT secondary alignments [%d]\n", opt.best_n);
fprintf(stderr, " -G NUM max intron length (only effective following -x splice) [100k]\n");
fprintf(stderr, " Alignment:\n");
fprintf(stderr, " -A INT matching score [%d]\n", opt.a);
fprintf(stderr, " -B INT mismatch penalty [%d]\n", opt.b);
@ -187,6 +197,7 @@ int main(int argc, char *argv[])
fprintf(stderr, " asm10: -k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10%% div.)\n");
fprintf(stderr, " ava-pb: -Hk19 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (PacBio read overlap)\n");
fprintf(stderr, " ava-ont: -k15 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (ONT read overlap)\n");
fprintf(stderr, " splice: -k15 -w5 --splice -g2000 -G100k -A1 -B2 -O2,32 -E1,0 -z200 (long-read spliced aln)\n");
fprintf(stderr, "\nSee `man ./minimap2.1' for detailed description of command-line options.\n");
return 1;
}

6
map.c
View File

@ -245,7 +245,7 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b,
i == 0? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x));
max_gap_ref = opt->max_gap_ref >= 0? opt->max_gap_ref : opt->max_gap;
n_u = mm_chain_dp(max_gap_ref, opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, !!(opt->flag&MM_F_CDNA), n_a, a, &u, b->km);
n_u = mm_chain_dp(max_gap_ref, opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, !!(opt->flag&MM_F_SPLICE), n_a, a, &u, b->km);
regs = mm_gen_regs(b->km, qlen, n_u, u, a);
*n_regs = n_u;
@ -258,7 +258,7 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b,
if (!(opt->flag & MM_F_AVA)) { // don't choose primary mapping(s) for read overlap
mm_set_parent(b->km, opt->mask_level, *n_regs, regs);
mm_select_sub(b->km, opt->mask_level, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
if (!(opt->flag & MM_F_CDNA))
if (!(opt->flag & MM_F_SPLICE))
mm_join_long(b->km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle
}
if (opt->flag & MM_F_CIGAR) {
@ -347,7 +347,7 @@ static void *worker_pipeline(void *shared, int step, void *in)
int intron_thres = -1;
for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]);
free(s->buf);
if (p->opt->flag & MM_F_CDNA)
if (p->opt->flag & MM_F_SPLICE)
intron_thres = mm_min_intron_len(p->opt->q, p->opt->e, p->opt->q2);
if ((p->opt->flag & MM_F_OUT_CS) && !(mm_dbg_flag & MM_DBG_NO_KALLOC)) km = km_init();
for (i = 0; i < s->n_seq; ++i) {

View File

@ -14,7 +14,7 @@
#define MM_F_NO_QUAL 0x10
#define MM_F_OUT_CG 0x20
#define MM_F_OUT_CS 0x40
#define MM_F_CDNA 0x80
#define MM_F_SPLICE 0x80
#define MM_IDX_MAGIC "MMI\2"

View File

@ -162,6 +162,12 @@ secondary alignments [5]. This option has no effect when
.B -X
is applied.
.TP
.BI -G \ NUM
Maximal intron length in the splice mode. This option also changes the
bandwidth to
.IR NUM .
Increasing this option slows down spliced alignment.
.TP
.BI --max-chain-skip \ INT
A heuristics that stops chaining early [50]. Minimap2 uses dynamic programming
for chaining. The time complexity is quadratic in the number of seeds. This
@ -277,13 +283,13 @@ Long assembly to reference mapping
.B -w19 -A1 -B9 -O16,41 -E2,1 -s200
.BR -z200 ).
Up to 10% sequence divergence.
.TP 8
.TP
.B ava-pb
PacBio all-vs-all overlap mapping
.RB ( -Hk19
.B -w5 -Xp0 -m100 -K500m -g10000 --max-chain-skip
.BR 25 ).
.TP 8
.TP
.B ava-ont
Oxford Nanopore all-vs-all overlap mapping
.RB ( -k15
@ -292,6 +298,20 @@ Oxford Nanopore all-vs-all overlap mapping
Similarly, the major difference from
.B ava-pb
is that this preset is not using HPC minimizers.
.TP
.B splice
Long-read spliced alignment
.RB ( -k15
.B -w5 --splice -g2000 -G100k -A1 -B2 -O2,32 -E1,0
.BR -z200 ).
As of now, minimap2 only finds approximate exon boundaries. The true boundaries
are usually within 10bp around the reported positions. In the splice mode,
1) long deletions are taken as introns and represented as the
.RB ` N '
CIGAR operator; 2) long insertions are disabled; 3) deletion and insertion gap
costs are different during chaining; 4) the computation of the
.RB ` ms '
tag ignores introns to demote hits to pseudogenes.
.RE
.SS Miscellaneous options
.TP 10
@ -367,6 +387,10 @@ Minimap2 does not work well with Illumina short reads as of now.
*
Minimap2 requires SSE2 instructions to compile. It is possible to add
non-SSE2 support, but it would make minimap2 slower by several times.
.TP
*
In the splice mode, minimap2 is unable to find the precise exon boundaries.
The true bounraries are usually within 10bp around the reported locations.
.SH SEE ALSO
.PP
miniasm(1), minimap(1), bwa(1).