From d24031874136b83b5dd30ea3a4375ee4a74c7a28 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Aug 2017 12:26:04 -0400 Subject: [PATCH] r287: refined CLI options and manpage --- align.c | 8 ++++---- main.c | 43 +++++++++++++++++++++++++++---------------- map.c | 6 +++--- minimap.h | 2 +- minimap2.1 | 28 ++++++++++++++++++++++++++-- 5 files changed, 61 insertions(+), 26 deletions(-) diff --git a/align.c b/align.c index f96d4a9..a348e59 100644 --- a/align.c +++ b/align.c @@ -135,7 +135,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint for (i = 0; i < tlen; ++i) fputc("ACGTN"[tseq[i]], stderr); fputc('\n', stderr); for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr); fputc('\n', stderr); } - if (opt->flag & MM_F_CDNA) + if (opt->flag & MM_F_SPLICE) ksw_extd2_noins_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, 0, -1, opt->zdrop, flag, ez); else if (opt->q == opt->q2 && opt->e == opt->e2) ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, opt->zdrop, flag, ez); @@ -259,7 +259,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int bw = (int)(opt->bw * 1.5 + 1.); r2->cnt = 0; - if (!(opt->flag & MM_F_CDNA)) + if (!(opt->flag & MM_F_SPLICE)) mm_fix_bad_ends(r, a, opt->bw, &as1, &cnt1); else as1 = r->as, cnt1 = r->cnt; mm_filter_bad_seeds(km, as1, cnt1, a, 10, 40, opt->max_gap>>1, 10); @@ -362,7 +362,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int assert(re1 - rs1 <= re0 - rs0); if (r->p) { mm_idx_getseq(mi, rid, rs1, re1, tseq); - mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e, (opt->flag&MM_F_CDNA)? opt->q2 : 0); + mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e, (opt->flag&MM_F_SPLICE)? opt->q2 : 0); } kfree(km, tseq); @@ -403,7 +403,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i if (ez->n_cigar == 0) goto end_align1_inv; // should never be here mm_append_cigar(r_inv, ez->n_cigar, ez->cigar); r_inv->p->dp_score = ez->max; - mm_update_extra(r_inv->p, qseq + q_off, tseq + t_off, mat, opt->q, opt->e, (opt->flag&MM_F_CDNA)? opt->q2 : 0); + mm_update_extra(r_inv->p, qseq + q_off, tseq + t_off, mat, opt->q, opt->e, (opt->flag&MM_F_SPLICE)? opt->q2 : 0); r_inv->id = -1; r_inv->parent = MM_PARENT_UNSET; r_inv->inv = 1; diff --git a/main.c b/main.c index ecf2f7c..c35a1bd 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r286-dirty" +#define MM_VERSION "2.0-r287-dirty" void liftrlimit() { @@ -31,6 +31,8 @@ static struct option long_options[] = { { "max-chain-skip", required_argument, 0, 0 }, { "min-dp-len", required_argument, 0, 0 }, { "print-aln-seq", no_argument, 0, 0 }, + { "splice", no_argument, 0, 0 }, + { "max-intron-len", required_argument, 0, 'G' }, { "version", no_argument, 0, 'V' }, { "min-count", required_argument, 0, 'n' }, { "min-chain-score",required_argument, 0, 'm' }, @@ -40,10 +42,21 @@ static struct option long_options[] = { { 0, 0, 0, 0} }; +static inline int64_t mm_parse_num(const char *str) +{ + double x; + char *p; + x = strtod(optarg, &p); + if (*p == 'G' || *p == 'g') x *= 1e9; + else if (*p == 'M' || *p == 'm') x *= 1e6; + else if (*p == 'K' || *p == 'k') x *= 1e3; + return (int64_t)(x + .499); +} + int main(int argc, char *argv[]) { mm_mapopt_t opt; - int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0; + int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0, max_intron_len = 0; int minibatch_size = 200000000; uint64_t batch_size = 4000000000ULL; mm_bseq_file_t *fp = 0; @@ -59,12 +72,12 @@ int main(int argc, char *argv[]) else if (c == 'k') k = atoi(optarg), idx_par_set = 1; else if (c == 'H') is_hpc = 1, idx_par_set = 1; else if (c == 'd') fnw = optarg; // the above are indexing related options, except -I - else if (c == 'r') opt.bw = atoi(optarg); + else if (c == 'r') opt.bw = (int)mm_parse_num(optarg); else if (c == 'f') opt.mid_occ_frac = atof(optarg); else if (c == 't') n_threads = atoi(optarg); else if (c == 'v') mm_verbose = atoi(optarg); - else if (c == 'g') opt.max_gap = atoi(optarg); - else if (c == 'G') opt.max_gap_ref = atoi(optarg); + else if (c == 'g') opt.max_gap = (int)mm_parse_num(optarg); + else if (c == 'G') max_intron_len = (int)mm_parse_num(optarg); else if (c == 'N') opt.best_n = atoi(optarg); else if (c == 'p') opt.pri_ratio = atof(optarg); else if (c == 'M') opt.mask_level = atof(optarg); @@ -80,6 +93,8 @@ int main(int argc, char *argv[]) else if (c == 'B') opt.b = atoi(optarg); else if (c == 'z') opt.zdrop = atoi(optarg); else if (c == 's') opt.min_dp_max = atoi(optarg); + else if (c == 'I') batch_size = mm_parse_num(optarg); + else if (c == 'K') minibatch_size = (int)mm_parse_num(optarg); else if (c == 0 && long_idx == 0) bucket_bits = atoi(optarg); // --bucket-bits else if (c == 0 && long_idx == 2) keep_name = 0; // --int-rname else if (c == 0 && long_idx == 3) mm_dbg_flag |= MM_DBG_NO_KALLOC; // --no-kalloc @@ -89,6 +104,7 @@ int main(int argc, char *argv[]) else if (c == 0 && long_idx == 7) opt.max_chain_skip = atoi(optarg); // --max-chain-skip else if (c == 0 && long_idx == 8) opt.min_ksw_len = atoi(optarg); // --min-dp-len else if (c == 0 && long_idx == 9) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ; // --print-aln-seq + else if (c == 0 && long_idx ==10) opt.flag |= MM_F_SPLICE; // --splice else if (c == 'V') { puts(MM_VERSION); return 0; @@ -98,15 +114,6 @@ int main(int argc, char *argv[]) } else if (c == 'E') { opt.e = opt.e2 = strtol(optarg, &s, 10); if (*s == ',') opt.e2 = strtol(s + 1, &s, 10); - } else if (c == 'I' || c == 'K') { - double x; - char *p; - x = strtod(optarg, &p); - if (*p == 'G' || *p == 'g') x *= 1e9; - else if (*p == 'M' || *p == 'm') x *= 1e6; - else if (*p == 'K' || *p == 'k') x *= 1e3; - if (c == 'I') batch_size = (uint64_t)(x + .499); - else minibatch_size = (uint64_t)(x + .499); } else if (c == 'x') { if (strcmp(optarg, "ava-ont") == 0) { opt.flag |= MM_F_AVA | MM_F_NO_SELF; @@ -130,9 +137,9 @@ int main(int argc, char *argv[]) k = 19, w = 19; opt.a = 1, opt.b = 9, opt.q = 16, opt.q2 = 41, opt.e = 2, opt.e2 = 1, opt.zdrop = 200; opt.min_dp_max = 200; - } else if (strcmp(optarg, "cdna") == 0) { + } else if (strcmp(optarg, "splice") == 0 || strcmp(optarg, "cdna") == 0) { k = 15, w = 5; - opt.flag |= MM_F_CDNA; + opt.flag |= MM_F_SPLICE; opt.max_gap = 2000, opt.max_gap_ref = opt.bw = 100000; opt.a = 1, opt.b = 2, opt.q = 2, opt.e = 1, opt.q2 = 32, opt.e2 = 0; opt.zdrop = 200; @@ -143,6 +150,8 @@ int main(int argc, char *argv[]) } } if (w < 0) w = (int)(.6666667 * k + .499); + if ((opt.flag & MM_F_SPLICE) && max_intron_len > 0) + opt.max_gap_ref = opt.bw = max_intron_len; if (argc == optind) { fprintf(stderr, "Usage: minimap2 [options] | [query.fa] [...]\n"); @@ -163,6 +172,7 @@ int main(int argc, char *argv[]) fprintf(stderr, " -X skip self and dual mappings (for the all-vs-all mode)\n"); fprintf(stderr, " -p FLOAT min secondary-to-primary score ratio [%g]\n", opt.pri_ratio); fprintf(stderr, " -N INT retain at most INT secondary alignments [%d]\n", opt.best_n); + fprintf(stderr, " -G NUM max intron length (only effective following -x splice) [100k]\n"); fprintf(stderr, " Alignment:\n"); fprintf(stderr, " -A INT matching score [%d]\n", opt.a); fprintf(stderr, " -B INT mismatch penalty [%d]\n", opt.b); @@ -187,6 +197,7 @@ int main(int argc, char *argv[]) fprintf(stderr, " asm10: -k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10%% div.)\n"); fprintf(stderr, " ava-pb: -Hk19 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (PacBio read overlap)\n"); fprintf(stderr, " ava-ont: -k15 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (ONT read overlap)\n"); + fprintf(stderr, " splice: -k15 -w5 --splice -g2000 -G100k -A1 -B2 -O2,32 -E1,0 -z200 (long-read spliced aln)\n"); fprintf(stderr, "\nSee `man ./minimap2.1' for detailed description of command-line options.\n"); return 1; } diff --git a/map.c b/map.c index df13811..f4ea657 100644 --- a/map.c +++ b/map.c @@ -245,7 +245,7 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, i == 0? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x)); max_gap_ref = opt->max_gap_ref >= 0? opt->max_gap_ref : opt->max_gap; - n_u = mm_chain_dp(max_gap_ref, opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, !!(opt->flag&MM_F_CDNA), n_a, a, &u, b->km); + n_u = mm_chain_dp(max_gap_ref, opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, !!(opt->flag&MM_F_SPLICE), n_a, a, &u, b->km); regs = mm_gen_regs(b->km, qlen, n_u, u, a); *n_regs = n_u; @@ -258,7 +258,7 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, if (!(opt->flag & MM_F_AVA)) { // don't choose primary mapping(s) for read overlap mm_set_parent(b->km, opt->mask_level, *n_regs, regs); mm_select_sub(b->km, opt->mask_level, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs); - if (!(opt->flag & MM_F_CDNA)) + if (!(opt->flag & MM_F_SPLICE)) mm_join_long(b->km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle } if (opt->flag & MM_F_CIGAR) { @@ -347,7 +347,7 @@ static void *worker_pipeline(void *shared, int step, void *in) int intron_thres = -1; for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]); free(s->buf); - if (p->opt->flag & MM_F_CDNA) + if (p->opt->flag & MM_F_SPLICE) intron_thres = mm_min_intron_len(p->opt->q, p->opt->e, p->opt->q2); if ((p->opt->flag & MM_F_OUT_CS) && !(mm_dbg_flag & MM_DBG_NO_KALLOC)) km = km_init(); for (i = 0; i < s->n_seq; ++i) { diff --git a/minimap.h b/minimap.h index dbd2e3e..6e0569b 100644 --- a/minimap.h +++ b/minimap.h @@ -14,7 +14,7 @@ #define MM_F_NO_QUAL 0x10 #define MM_F_OUT_CG 0x20 #define MM_F_OUT_CS 0x40 -#define MM_F_CDNA 0x80 +#define MM_F_SPLICE 0x80 #define MM_IDX_MAGIC "MMI\2" diff --git a/minimap2.1 b/minimap2.1 index 2c85f46..b9113f1 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -162,6 +162,12 @@ secondary alignments [5]. This option has no effect when .B -X is applied. .TP +.BI -G \ NUM +Maximal intron length in the splice mode. This option also changes the +bandwidth to +.IR NUM . +Increasing this option slows down spliced alignment. +.TP .BI --max-chain-skip \ INT A heuristics that stops chaining early [50]. Minimap2 uses dynamic programming for chaining. The time complexity is quadratic in the number of seeds. This @@ -277,13 +283,13 @@ Long assembly to reference mapping .B -w19 -A1 -B9 -O16,41 -E2,1 -s200 .BR -z200 ). Up to 10% sequence divergence. -.TP 8 +.TP .B ava-pb PacBio all-vs-all overlap mapping .RB ( -Hk19 .B -w5 -Xp0 -m100 -K500m -g10000 --max-chain-skip .BR 25 ). -.TP 8 +.TP .B ava-ont Oxford Nanopore all-vs-all overlap mapping .RB ( -k15 @@ -292,6 +298,20 @@ Oxford Nanopore all-vs-all overlap mapping Similarly, the major difference from .B ava-pb is that this preset is not using HPC minimizers. +.TP +.B splice +Long-read spliced alignment +.RB ( -k15 +.B -w5 --splice -g2000 -G100k -A1 -B2 -O2,32 -E1,0 +.BR -z200 ). +As of now, minimap2 only finds approximate exon boundaries. The true boundaries +are usually within 10bp around the reported positions. In the splice mode, +1) long deletions are taken as introns and represented as the +.RB ` N ' +CIGAR operator; 2) long insertions are disabled; 3) deletion and insertion gap +costs are different during chaining; 4) the computation of the +.RB ` ms ' +tag ignores introns to demote hits to pseudogenes. .RE .SS Miscellaneous options .TP 10 @@ -367,6 +387,10 @@ Minimap2 does not work well with Illumina short reads as of now. * Minimap2 requires SSE2 instructions to compile. It is possible to add non-SSE2 support, but it would make minimap2 slower by several times. +.TP +* +In the splice mode, minimap2 is unable to find the precise exon boundaries. +The true bounraries are usually within 10bp around the reported locations. .SH SEE ALSO .PP miniasm(1), minimap(1), bwa(1).