From 4badf2fcbf4e1fa8906bd2b28350b54371223fe7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 8 Aug 2017 11:29:46 -0400 Subject: [PATCH 01/39] avoid wasted memory allocation for backtrack --- ksw2_extd2_sse.c | 5 ++++- ksw2_extz2_sse.c | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/ksw2_extd2_sse.c b/ksw2_extd2_sse.c index e0f5337..c7f5843 100644 --- a/ksw2_extd2_sse.c +++ b/ksw2_extd2_sse.c @@ -1,5 +1,6 @@ #include #include +#include #include "ksw2.h" #ifdef __SSE2__ @@ -66,7 +67,8 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin if (w < 0) w = tlen > qlen? tlen : qlen; wl = wr = w; tlen_ = (tlen + 15) / 16; - n_col_ = ((w + 1 < tlen? (w + 1 < qlen? w + 1 : qlen): tlen) + 15) / 16 + 1; + n_col_ = qlen < tlen? qlen : tlen; + n_col_ = ((n_col_ < w + 1? n_col_ : w + 1) + 15) / 16 + 1; qlen_ = (qlen + 15) / 16; for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) { max_sc = max_sc > mat[t]? max_sc : mat[t]; @@ -161,6 +163,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin x21_ = _mm_cvtsi32_si128((uint8_t)x21); v1_ = _mm_cvtsi32_si128((uint8_t)v1); st_ = st / 16, en_ = en / 16; + assert(en_ - st_ + 1 <= n_col_); if (!with_cigar) { // score only for (t = st_; t <= en_; ++t) { __m128i z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp; diff --git a/ksw2_extz2_sse.c b/ksw2_extz2_sse.c index d665c6f..04669b9 100644 --- a/ksw2_extz2_sse.c +++ b/ksw2_extz2_sse.c @@ -1,4 +1,5 @@ #include +#include #include "ksw2.h" #ifdef __SSE2__ @@ -58,7 +59,8 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin if (w < 0) w = tlen > qlen? tlen : qlen; wl = wr = w; tlen_ = (tlen + 15) / 16; - n_col_ = ((w + 1 < tlen? (w + 1 < qlen? w + 1 : qlen): tlen) + 15) / 16 + 1; + n_col_ = qlen < tlen? qlen : tlen; + n_col_ = ((n_col_ < w + 1? 
n_col_ : w + 1) + 15) / 16 + 1; qlen_ = (qlen + 15) / 16; for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) { max_sc = max_sc > mat[t]? max_sc : mat[t]; @@ -130,6 +132,7 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin x1_ = _mm_cvtsi32_si128(x1); v1_ = _mm_cvtsi32_si128(v1); st_ = st / 16, en_ = en / 16; + assert(en_ - st_ + 1 <= n_col_); if (!with_cigar) { // score only for (t = st_; t <= en_; ++t) { __m128i z, a, b, xt1, vt1, ut, tmp; From 1a7d782131864819f6e66a2b7eea91e9d6cd737b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 8 Aug 2017 11:31:49 -0400 Subject: [PATCH 02/39] r273: cdna mapping mode for testing Differences from the typical mapping mode: * banded alignment disabled * log gap cost during chaining * zero long-gap extension during alignment * up to 100kb (by default) reference gap * bad seeding not filtered (to tune later) --- align.c | 13 ++++++++----- chain.c | 13 +++++++++---- main.c | 8 +++++++- map.c | 9 ++++++--- minimap.h | 3 ++- mmpriv.h | 2 +- 6 files changed, 33 insertions(+), 15 deletions(-) diff --git a/align.c b/align.c index 595f0b1..b57edcb 100644 --- a/align.c +++ b/align.c @@ -126,16 +126,17 @@ static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) // static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint8_t *qseq, int tlen, const uint8_t *tseq, const int8_t *mat, int w, int flag, ksw_extz_t *ez) { + int bw = (opt->flag & MM_F_CDNA)? 
-1 : w; if (mm_dbg_flag & MM_DBG_PRINT_ALN_SEQ) { int i; - fprintf(stderr, "===> q=(%d,%d), e=(%d,%d), bw=%d, flag=%d, zdrop=%d <===\n", opt->q, opt->q2, opt->e, opt->e2, w, flag, opt->zdrop); + fprintf(stderr, "===> q=(%d,%d), e=(%d,%d), bw=%d, flag=%d, zdrop=%d <===\n", opt->q, opt->q2, opt->e, opt->e2, bw, flag, opt->zdrop); for (i = 0; i < tlen; ++i) fputc("ACGTN"[tseq[i]], stderr); fputc('\n', stderr); for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr); fputc('\n', stderr); } if (opt->q == opt->q2 && opt->e == opt->e2) - ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, opt->zdrop, flag, ez); + ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, bw, opt->zdrop, flag, ez); else - ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, w, opt->zdrop, flag, ez); + ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, bw, opt->zdrop, flag, ez); } static inline int mm_get_hplen_back(const mm_idx_t *mi, uint32_t rid, uint32_t x) @@ -254,8 +255,10 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int bw = (int)(opt->bw * 1.5 + 1.); r2->cnt = 0; - mm_fix_bad_ends(r, a, opt->bw, &as1, &cnt1); - mm_filter_bad_seeds(km, as1, cnt1, a, 10, 40, opt->max_gap>>1, 10); + if (!(opt->flag & MM_F_CDNA)) { + mm_fix_bad_ends(r, a, opt->bw, &as1, &cnt1); + mm_filter_bad_seeds(km, as1, cnt1, a, 10, 40, opt->max_gap>>1, 10); + } else as1 = r->as, cnt1 = r->cnt; mm_adjust_minier(mi, qseq0, &a[as1], &rs, &qs); mm_adjust_minier(mi, qseq0, &a[as1 + cnt1 - 1], &re, &qe); diff --git a/chain.c b/chain.c index 1c72631..8894d44 100644 --- a/chain.c +++ b/chain.c @@ -19,7 +19,7 @@ static inline int ilog2_32(uint32_t v) return (t = v>>8) ? 
8 + LogTable256[t] : LogTable256[v]; } -int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int64_t n, mm128_t *a, uint64_t **_u, void *km) +int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km) { // TODO: make sure this works when n has more than 32 bits int32_t st = 0, k, *f, *p, *t, *v, n_u, n_v; int64_t i, j; @@ -42,17 +42,22 @@ int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int uint64_t ri = a[i].x; int32_t qi = (int32_t)a[i].y, q_span = a[i].y>>32&0xff; // NB: only 8 bits of span is used!!! int32_t max_f = q_span, max_j = -1, n_skip = 0, min_d, max_f_past = -INT32_MAX; - while (st < i && ri - a[st].x > max_dist) ++st; + while (st < i && ri - a[st].x > max_dist_x) ++st; for (j = i - 1; j >= st; --j) { int64_t dr = ri - a[j].x; int32_t dq = qi - (int32_t)a[j].y, dd, sc; - if (dr == 0 || dq <= 0 || dq > max_dist) continue; + if (dr == 0 || dq <= 0 || dq > max_dist_y) continue; dd = dr > dq? dr - dq : dq - dr; if (dd > bw) continue; max_f_past = max_f_past > f[j]? max_f_past : f[j]; min_d = dq < dr? dq : dr; sc = min_d > q_span? q_span : dq < dr? dq : dr; - sc -= (int)(dd * .01 * avg_qspan) + (ilog2_32(dd)>>1); + if (is_cdna) { + int c_log, c_lin; + c_lin = (int)(dd * .01 * avg_qspan); + c_log = ilog2_32(dd); + sc -= c_lin < c_log? 
c_lin : c_log; + } else sc -= (int)(dd * .01 * avg_qspan) + (ilog2_32(dd)>>1); sc += f[j]; if (sc > max_f) { max_f = sc, max_j = j; diff --git a/main.c b/main.c index 7244265..680d35f 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0rc1-r238-dirty" +#define MM_VERSION "2.0rc1-r273-dirty" void liftrlimit() { @@ -129,6 +129,12 @@ int main(int argc, char *argv[]) k = 19, w = 19; opt.a = 1, opt.b = 9, opt.q = 16, opt.q2 = 41, opt.e = 2, opt.e2 = 1, opt.zdrop = 200; opt.min_dp_max = 200; + } else if (strcmp(optarg, "cdna") == 0) { + k = 15, w = 5; + opt.flag |= MM_F_CDNA; + opt.max_gap = 2000, opt.max_gap_ref = opt.bw = 100000; + opt.a = 1, opt.b = 2, opt.q = 2, opt.e = 1, opt.q2 = 70, opt.e2 = 0; + opt.zdrop = 200; } else { fprintf(stderr, "[E::%s] unknown preset '%s'\n", __func__, optarg); return 1; diff --git a/map.c b/map.c index 4ea111e..6e7b869 100644 --- a/map.c +++ b/map.c @@ -19,6 +19,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->min_chain_score = 40; opt->bw = 500; opt->max_gap = 5000; + opt->max_gap_ref = -1; opt->max_chain_skip = 25; opt->mask_level = 0.5f; @@ -167,7 +168,7 @@ void mm_pair_thin(mm_tbuf_t *b, int radius, mm_match_t *m1, mm_match_t *m2) #endif mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, uint32_t m_st, uint32_t m_en, const char *qname, int qlen, const char *seq, int *n_regs) { - int i, n = m_en - m_st, j, n_u; + int i, n = m_en - m_st, j, n_u, max_gap_ref; int64_t n_a; uint64_t *u; mm_match_t *m; @@ -243,7 +244,8 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, fprintf(stderr, "SD\t%s\t%d\t%c\t%d\t%d\t%d\n", mi->seq[a[i].x<<1>>33].name, (int32_t)a[i].x, "+-"[a[i].x>>63], (int32_t)a[i].y, (int32_t)(a[i].y>>32&0xff), i == 0? 
0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x)); - n_u = mm_chain_dp(opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, n_a, a, &u, b->km); + max_gap_ref = opt->max_gap_ref >= 0? opt->max_gap_ref : opt->max_gap; + n_u = mm_chain_dp(max_gap_ref, opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, !!(opt->flag&MM_F_CDNA), n_a, a, &u, b->km); regs = mm_gen_regs(b->km, qlen, n_u, u, a); *n_regs = n_u; @@ -256,7 +258,8 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, if (!(opt->flag & MM_F_AVA)) { // don't choose primary mapping(s) for read overlap mm_set_parent(b->km, opt->mask_level, *n_regs, regs); mm_select_sub(b->km, opt->mask_level, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs); - mm_join_long(b->km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle + if (!(opt->flag & MM_F_CDNA)) + mm_join_long(b->km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle } if (opt->flag & MM_F_CIGAR) { regs = mm_align_skeleton(b->km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs() diff --git a/minimap.h b/minimap.h index 076017f..dbd2e3e 100644 --- a/minimap.h +++ b/minimap.h @@ -14,6 +14,7 @@ #define MM_F_NO_QUAL 0x10 #define MM_F_OUT_CG 0x20 #define MM_F_OUT_CS 0x40 +#define MM_F_CDNA 0x80 #define MM_IDX_MAGIC "MMI\2" @@ -80,7 +81,7 @@ typedef struct { int flag; // see MM_F_* macros int bw; // bandwidth - int max_gap; // break a chain if there are no minimizers in a max_gap window + int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window int max_chain_skip; int min_cnt; int min_chain_score; diff --git a/mmpriv.h b/mmpriv.h index 32b3fd1..1c03ae7 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -42,7 +42,7 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const 
mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag); void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs); -int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int64_t n, mm128_t *a, uint64_t **_u, void *km); +int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km); mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a); mm_reg1_t *mm_gen_regs(void *km, int qlen, int n_u, uint64_t *u, mm128_t *a); From 7f9f659b6aa0e8dc1030501aec463a676cfb7dd0 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 8 Aug 2017 11:39:23 -0400 Subject: [PATCH 03/39] r274: CLI option to change max ref gap --- main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/main.c b/main.c index 680d35f..4e1f897 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0rc1-r273-dirty" +#define MM_VERSION "2.0rc1-r274-dirty" void liftrlimit() { @@ -54,7 +54,7 @@ int main(int argc, char *argv[]) mm_realtime0 = realtime(); mm_mapopt_init(&opt); - while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Q", long_options, &long_idx)) >= 0) { + while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Q", long_options, &long_idx)) >= 0) { if (c == 'w') w = atoi(optarg), idx_par_set = 1; else if (c == 'k') k = atoi(optarg), idx_par_set = 1; else if (c == 'H') is_hpc = 1, idx_par_set = 1; @@ -64,6 +64,7 @@ int main(int argc, char *argv[]) else if (c == 't') n_threads = atoi(optarg); else if (c == 'v') mm_verbose = atoi(optarg); else if (c == 'g') opt.max_gap = atoi(optarg); + else if (c == 'G') opt.max_gap_ref = atoi(optarg); else if (c == 'N') opt.best_n = 
atoi(optarg); else if (c == 'p') opt.pri_ratio = atof(optarg); else if (c == 'M') opt.mask_level = atof(optarg); From 6840370f3c00fd677cf3af04483463ee0945af2b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 8 Aug 2017 21:16:25 -0400 Subject: [PATCH 04/39] Release minimap2-2.0 (r275) --- NEWS.md | 13 +++++++++++++ main.c | 2 +- minimap2.1 | 42 ++++++++++++++++++++++++++++-------------- 3 files changed, 42 insertions(+), 15 deletions(-) diff --git a/NEWS.md b/NEWS.md index eb0c522..b8bdd62 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,16 @@ +Release 2.0-r275 (8 August 2017) +-------------------------------- + +This release is identical to version 2.0rc1, except the version number. It is +described and evaluated in the following technical report: + + * Li, H. (2017). Minimap2: fast pairwise alignment for long DNA sequences. + [arXiv:1708.01492v1](https://arxiv.org/abs/1708.01492v1). + +(2.0: 8 August 2017, r275) + + + Release 2.0rc1-r232 (30 July 2017) ---------------------------------- diff --git a/main.c b/main.c index c8650d0..9cc467d 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0rc1-r232" +#define MM_VERSION "2.0-r275" void liftrlimit() { diff --git a/minimap2.1 b/minimap2.1 index 0e63a11..2c85f46 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "30 July 2017" "minimap2-2.0rc1-r232" "Bioinformatics tools" +.TH minimap2 1 "8 August 2017" "minimap2-2.0-r275" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences @@ -247,35 +247,49 @@ are: .RS .TP 8 .B map-pb -PacBio/Oxford Nanopore read to reference mapping (-Hk19) +PacBio/Oxford Nanopore read to reference mapping +.RB ( -Hk19 ) .TP .B map10k The same as .B map-pb -(-Hk19) +.RB ( -Hk19 ) .TP .B map-ont -Slightly more sensitive for Oxford Nanopore to reference mapping (-k15). 
For -PacBio reads, HPC minimizers consistently leads to faster performance and more -sensitive results in comparison to normal minimizers. For Oxford Nanopore data, -normal minimizers are better, though not much. The effectiveness of HPC is -determined by the sequencing error mode. +Slightly more sensitive for Oxford Nanopore to reference mapping +.RB ( -k15 ). +For PacBio reads, HPC minimizers consistently leads to faster performance and +more sensitive results in comparison to normal minimizers. For Oxford Nanopore +data, normal minimizers are better, though not much. The effectiveness of HPC +is determined by the sequencing error mode. .TP .B asm5 -Long assembly to reference mapping (-k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200). +Long assembly to reference mapping +.RB ( -k19 +.B -w19 -A1 -B19 -O39,81 -E3,1 -s200 +.BR -z200 ). Typically, the alignment will not extend to regions with 5% or higher sequence divergence. Only use this preset if the average divergence is far below 5%. .TP .B asm10 -Long assembly to reference mapping (-k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200). Up -to 10% sequence divergence. +Long assembly to reference mapping +.RB ( -k19 +.B -w19 -A1 -B9 -O16,41 -E2,1 -s200 +.BR -z200 ). +Up to 10% sequence divergence. .TP 8 .B ava-pb -PacBio all-vs-all overlap mapping (-Hk19 -w5 -Xp0 -m100 -K500m -g10000 --max-chain-skip 25) +PacBio all-vs-all overlap mapping +.RB ( -Hk19 +.B -w5 -Xp0 -m100 -K500m -g10000 --max-chain-skip +.BR 25 ). .TP 8 .B ava-ont -Oxford Nanopore all-vs-all overlap mapping (-k15 -w5 -Xp0 -m100 -K500m -g10000 ---max-chain-skip 25). Similarly, the major difference from +Oxford Nanopore all-vs-all overlap mapping +.RB ( -k15 +.B -w5 -Xp0 -m100 -K500m -g10000 --max-chain-skip +.BR 25 ). +Similarly, the major difference from .B ava-pb is that this preset is not using HPC minimizers. 
.RE From 4db0d1034bc9892e4b3c5553af57a16331ac3061 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 8 Aug 2017 21:32:56 -0400 Subject: [PATCH 05/39] Show travis CI status --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 482ca9d..21f69f0 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,4 @@ +[![Build Status](https://travis-ci.org/lh3/minimap2.svg?branch=master)](https://travis-ci.org/lh3/minimap2) ## Getting Started ```sh git clone https://github.com/lh3/minimap2 From 9e1125eddaff22cbb72e63e695b6835f1e013d67 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 8 Aug 2017 21:46:15 -0400 Subject: [PATCH 06/39] r277: abort if query/-d missing (#11) --- main.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/main.c b/main.c index 54c49d8..f80d642 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r276-dirty" +#define MM_VERSION "2.0-r277-dirty" void liftrlimit() { @@ -189,6 +189,10 @@ int main(int argc, char *argv[]) fprintf(stderr, "[E::%s] failed to open file '%s'\n", __func__, argv[optind]); return 1; } + if (!is_idx && fnw == 0 && argc - optind < 2) { + fprintf(stderr, "[E::%s] missing input: please specify a query file or option -d\n", __func__); + return 1; + } if (is_idx) fpr = fopen(argv[optind], "rb"); else fp = mm_bseq_open(argv[optind]); if (fnw) fpw = fopen(fnw, "wb"); From c59b0781bc71114ceb6071e1c303c855d574d153 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 9 Aug 2017 11:45:02 -0400 Subject: [PATCH 07/39] r280: output introns as "N" in the cdna mode --- chain.c | 3 ++- format.c | 20 ++++++++++++++------ main.c | 2 +- map.c | 11 ++++++++--- mmpriv.h | 4 ++-- 5 files changed, 27 insertions(+), 13 deletions(-) diff --git a/chain.c b/chain.c index 8894d44..37c920a 100644 --- a/chain.c +++ b/chain.c @@ -56,7 +56,8 @@ int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cn int c_log, c_lin; c_lin = (int)(dd * 
.01 * avg_qspan); c_log = ilog2_32(dd); - sc -= c_lin < c_log? c_lin : c_log; + if (dr > dq) sc -= c_lin < c_log? c_lin : c_log; + else sc -= c_lin + (c_log>>1); } else sc -= (int)(dd * .01 * avg_qspan) + (ilog2_32(dd)>>1); sc += f[j]; if (sc > max_f) { diff --git a/format.c b/format.c index fd51c00..02cf3be 100644 --- a/format.c +++ b/format.c @@ -122,7 +122,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r) if (r->p) mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi); } -void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag) +void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int intron_thres) { s->l = 0; mm_sprintf_lite(s, "%s\t%d\t%d\t%d\t%c\t", t->name, t->l_seq, r->qs, r->qe, "+-"[r->rev]); @@ -136,8 +136,12 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m if (r->p && (opt_flag & MM_F_OUT_CG)) { uint32_t k; mm_sprintf_lite(s, "\tcg:Z:"); - for (k = 0; k < r->p->n_cigar; ++k) - mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MID"[r->p->cigar[k]&0xf]); + for (k = 0; k < r->p->n_cigar; ++k) { + int op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4; + if (intron_thres > 0 && op == 2 && len >= intron_thres) + mm_sprintf_lite(s, "%dN", len); + else mm_sprintf_lite(s, "%d%c", len, "MID"[op]); + } } if (r->p && (opt_flag & MM_F_OUT_CS)) write_cs(km, s, mi, t, r); @@ -167,7 +171,7 @@ static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp) } else str_copy(s, seq, seq + l); } -void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs) +void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs, int intron_thres) { int flag = 0; s->l = 0; @@ -186,8 +190,12 @@ void 
mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m uint32_t k, clip_len = r->rev? t->l_seq - r->qe : r->qs; int clip_char = (flag&0x800)? 'H' : 'S'; if (clip_len) mm_sprintf_lite(s, "%d%c", clip_len, clip_char); - for (k = 0; k < r->p->n_cigar; ++k) - mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MID"[r->p->cigar[k]&0xf]); + for (k = 0; k < r->p->n_cigar; ++k) { + int op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4; + if (intron_thres > 0 && op == 2 && len >= intron_thres) + mm_sprintf_lite(s, "%dN", len); + else mm_sprintf_lite(s, "%d%c", len, "MID"[op]); + } clip_len = r->rev? r->qs : t->l_seq - r->qe; if (clip_len) mm_sprintf_lite(s, "%d%c", clip_len, clip_char); } else mm_sprintf_lite(s, "*"); diff --git a/main.c b/main.c index 5ac5262..32a2bc7 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r279-dirty" +#define MM_VERSION "2.0-r280-dirty" void liftrlimit() { diff --git a/map.c b/map.c index 6e7b869..dd1624c 100644 --- a/map.c +++ b/map.c @@ -344,19 +344,24 @@ static void *worker_pipeline(void *shared, int step, void *in) void *km = 0; step_t *s = (step_t*)in; const mm_idx_t *mi = p->mi; + int intron_thres = -1; for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]); free(s->buf); + if (p->opt->flag & MM_F_CDNA) + intron_thres = (int)((float)(p->opt->q2 - p->opt->q) / p->opt->e + 0.999f); if ((p->opt->flag & MM_F_OUT_CS) && !(mm_dbg_flag & MM_DBG_NO_KALLOC)) km = km_init(); for (i = 0; i < s->n_seq; ++i) { mm_bseq1_t *t = &s->seq[i]; for (j = 0; j < s->n_reg[i]; ++j) { mm_reg1_t *r = &s->reg[i][j]; - if (p->opt->flag & MM_F_OUT_SAM) mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i]); - else mm_write_paf(&p->str, mi, t, r, km, p->opt->flag); + if (p->opt->flag & MM_F_OUT_SAM) + mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i], intron_thres); + else + mm_write_paf(&p->str, mi, t, r, km, p->opt->flag, intron_thres); puts(p->str.s); } if (s->n_reg[i] == 0 
&& (p->opt->flag & MM_F_OUT_SAM)) { - mm_write_sam(&p->str, 0, t, 0, 0, 0); + mm_write_sam(&p->str, 0, t, 0, 0, 0, 0); puts(p->str.s); } for (j = 0; j < s->n_reg[i]; ++j) free(s->reg[i][j].p); diff --git a/mmpriv.h b/mmpriv.h index 1c03ae7..6f22b0b 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -40,8 +40,8 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end); void radix_sort_64(uint64_t *beg, uint64_t *end); uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); -void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag); -void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs); +void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int intron_thres); +void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs, int intron_thres); int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km); mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a); From 163fa36ee6300d9dbfde3b9571c8d5ec00c4945b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 10 Aug 2017 15:04:59 -0400 Subject: [PATCH 08/39] r281: don't open long gaps on query --- Makefile | 9 ++++++--- align.c | 17 +++++++++-------- ksw2.h | 3 +++ ksw2_extd2_sse.c | 21 +++++++++++++++++++++ main.c | 4 ++-- 5 files changed, 41 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index 31eb857..839636e 100644 --- a/Makefile +++ b/Makefile @@ -2,8 +2,8 @@ CC= gcc CFLAGS= -g -Wall -O2 -Wc++-compat CPPFLAGS= -DHAVE_KALLOC INCLUDES= -I. 
-OBJS= kthread.o kalloc.o ksw2_extz2_sse.o ksw2_extd2_sse.o ksw2_ll_sse.o misc.o bseq.o \ - sketch.o sdust.o index.o chain.o align.o hit.o map.o format.o +OBJS= kthread.o kalloc.o ksw2_extz2_sse.o ksw2_extd2_sse.o ksw2_extd2_noins_sse.o ksw2_ll_sse.o \ + misc.o bseq.o sketch.o sdust.o index.o chain.o align.o hit.o map.o format.o PROG= minimap2 PROG_EXTRA= sdust minimap2-lite LIBS= -lm -lz -lpthread @@ -33,6 +33,9 @@ libminimap2.a:$(OBJS) sdust:sdust.c kalloc.o kalloc.h kdq.h kvec.h kseq.h sdust.h $(CC) -D_SDUST_MAIN $(CFLAGS) $< kalloc.o -o $@ -lz +ksw2_extd2_noins_sse.o:ksw2_extd2_sse.c ksw2.h kalloc.h + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_NO_LONG_INS $(INCLUDES) $< -o $@ + clean: rm -fr gmon.out *.o a.out $(PROG) $(PROG_EXTRA) *~ *.a *.dSYM session* @@ -45,7 +48,7 @@ align.o: minimap.h mmpriv.h bseq.h ksw2.h kalloc.h bseq.o: bseq.h kseq.h chain.o: minimap.h mmpriv.h bseq.h kalloc.h example.o: minimap.h kseq.h -format.o: mmpriv.h minimap.h bseq.h +format.o: kalloc.h mmpriv.h minimap.h bseq.h hit.o: mmpriv.h minimap.h bseq.h kalloc.h index.o: kthread.h bseq.h minimap.h mmpriv.h kvec.h kalloc.h khash.h kalloc.o: kalloc.h diff --git a/align.c b/align.c index b57edcb..b9aa5bf 100644 --- a/align.c +++ b/align.c @@ -126,17 +126,18 @@ static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) // static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint8_t *qseq, int tlen, const uint8_t *tseq, const int8_t *mat, int w, int flag, ksw_extz_t *ez) { - int bw = (opt->flag & MM_F_CDNA)? 
-1 : w; if (mm_dbg_flag & MM_DBG_PRINT_ALN_SEQ) { int i; - fprintf(stderr, "===> q=(%d,%d), e=(%d,%d), bw=%d, flag=%d, zdrop=%d <===\n", opt->q, opt->q2, opt->e, opt->e2, bw, flag, opt->zdrop); + fprintf(stderr, "===> q=(%d,%d), e=(%d,%d), bw=%d, flag=%d, zdrop=%d <===\n", opt->q, opt->q2, opt->e, opt->e2, w, flag, opt->zdrop); for (i = 0; i < tlen; ++i) fputc("ACGTN"[tseq[i]], stderr); fputc('\n', stderr); for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr); fputc('\n', stderr); } - if (opt->q == opt->q2 && opt->e == opt->e2) - ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, bw, opt->zdrop, flag, ez); + if (opt->flag & MM_F_CDNA) + ksw_extd2_noins_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, -1, opt->zdrop, flag, ez); + else if (opt->q == opt->q2 && opt->e == opt->e2) + ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, opt->zdrop, flag, ez); else - ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, bw, opt->zdrop, flag, ez); + ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, w, opt->zdrop, flag, ez); } static inline int mm_get_hplen_back(const mm_idx_t *mi, uint32_t rid, uint32_t x) @@ -255,10 +256,10 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int bw = (int)(opt->bw * 1.5 + 1.); r2->cnt = 0; - if (!(opt->flag & MM_F_CDNA)) { + if (!(opt->flag & MM_F_CDNA)) mm_fix_bad_ends(r, a, opt->bw, &as1, &cnt1); - mm_filter_bad_seeds(km, as1, cnt1, a, 10, 40, opt->max_gap>>1, 10); - } else as1 = r->as, cnt1 = r->cnt; + else as1 = r->as, cnt1 = r->cnt; + mm_filter_bad_seeds(km, as1, cnt1, a, 10, 40, opt->max_gap>>1, 10); mm_adjust_minier(mi, qseq0, &a[as1], &rs, &qs); mm_adjust_minier(mi, qseq0, &a[as1 + cnt1 - 1], &re, &qe); diff --git a/ksw2.h b/ksw2.h index b7774a8..5f885eb 100644 --- a/ksw2.h +++ b/ksw2.h @@ -53,6 +53,9 @@ void ksw_extd(void *km, int qlen, const uint8_t *query, int tlen, const 
uint8_t void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int flag, ksw_extz_t *ez); +void ksw_extd2_noins_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez); + void ksw_extf2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez); /** diff --git a/ksw2_extd2_sse.c b/ksw2_extd2_sse.c index c7f5843..532159b 100644 --- a/ksw2_extd2_sse.c +++ b/ksw2_extd2_sse.c @@ -10,8 +10,13 @@ #include #endif +#if !defined(KSW_NO_LONG_INS) void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez) +#else +void ksw_extd2_noins_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez) +#endif { #define __dp_code_block1 \ z = _mm_load_si128(&s[t]); \ @@ -172,7 +177,9 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin z = _mm_max_epi8(z, a); z = _mm_max_epi8(z, b); z = _mm_max_epi8(z, a2); + #ifndef KSW_NO_LONG_INS z = _mm_max_epi8(z, b2); + #endif z = _mm_min_epi8(z, sc_mch_); __dp_code_block2; // save u[] and v[]; update a, b, a2 and b2 _mm_store_si128(&x[t], _mm_sub_epi8(_mm_max_epi8(a, zero_), qe_)); @@ -197,8 +204,10 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin _mm_store_si128(&y[t], _mm_sub_epi8(_mm_and_si128(tmp, b), qe_)); tmp = _mm_cmpgt_epi8(a2, zero_); _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_and_si128(tmp, a2), qe2_)); + #ifndef KSW_NO_LONG_INS 
tmp = _mm_cmpgt_epi8(b2, zero_); _mm_store_si128(&y2[t], _mm_sub_epi8(_mm_and_si128(tmp, b2), qe2_)); + #endif #endif } } else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment @@ -214,8 +223,10 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin z = _mm_max_epi8(z, b); d = _mm_blendv_epi8(d, _mm_set1_epi8(3), _mm_cmpgt_epi8(a2, z)); // d = a2 > z? 3 : d z = _mm_max_epi8(z, a2); + #ifndef KSW_NO_LONG_INS d = _mm_blendv_epi8(d, _mm_set1_epi8(4), _mm_cmpgt_epi8(b2, z)); // d = a2 > z? 3 : d z = _mm_max_epi8(z, b2); + #endif z = _mm_min_epi8(z, sc_mch_); #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8() tmp = _mm_cmpgt_epi8(a, z); @@ -227,9 +238,11 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin tmp = _mm_cmpgt_epi8(a2, z); d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(3))); z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2)); + #ifndef KSW_NO_LONG_INS tmp = _mm_cmpgt_epi8(b2, z); d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(4))); z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b2)); + #endif tmp = _mm_cmplt_epi8(sc_mch_, z); z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z)); #endif @@ -243,9 +256,11 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin tmp = _mm_cmpgt_epi8(a2, zero_); _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_and_si128(tmp, a2), qe2_)); d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0 + #ifndef KSW_NO_LONG_INS tmp = _mm_cmpgt_epi8(b2, zero_); _mm_store_si128(&y2[t], _mm_sub_epi8(_mm_and_si128(tmp, b2), qe2_)); d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x40))); // d = b > 0? 
1<<6 : 0 + #endif _mm_store_si128(&pr[t], d); } } else { // gap right-alignment @@ -261,8 +276,10 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin z = _mm_max_epi8(z, b); d = _mm_blendv_epi8(_mm_set1_epi8(3), d, _mm_cmpgt_epi8(z, a2)); // d = z > a2? d : 3 z = _mm_max_epi8(z, a2); + #ifndef KSW_NO_LONG_INS d = _mm_blendv_epi8(_mm_set1_epi8(4), d, _mm_cmpgt_epi8(z, b2)); // d = z > b2? d : 4 z = _mm_max_epi8(z, b2); + #endif z = _mm_min_epi8(z, sc_mch_); #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8() tmp = _mm_cmpgt_epi8(z, a); @@ -274,9 +291,11 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin tmp = _mm_cmpgt_epi8(z, a2); d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(3))); z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a2)); + #ifndef KSW_NO_LONG_INS tmp = _mm_cmpgt_epi8(z, b2); d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(4))); z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b2)); + #endif tmp = _mm_cmplt_epi8(sc_mch_, z); z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z)); #endif @@ -290,9 +309,11 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin tmp = _mm_cmpgt_epi8(zero_, a2); _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_andnot_si128(tmp, a2), qe2_)); d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0 + #ifndef KSW_NO_LONG_INS tmp = _mm_cmpgt_epi8(zero_, b2); _mm_store_si128(&y2[t], _mm_sub_epi8(_mm_andnot_si128(tmp, b2), qe2_)); d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x40))); // d = b > 0? 
1<<6 : 0 + #endif _mm_store_si128(&pr[t], d); } } diff --git a/main.c b/main.c index 32a2bc7..10f38ed 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r280-dirty" +#define MM_VERSION "2.0-r281-dirty" void liftrlimit() { @@ -134,7 +134,7 @@ int main(int argc, char *argv[]) k = 15, w = 5; opt.flag |= MM_F_CDNA; opt.max_gap = 2000, opt.max_gap_ref = opt.bw = 100000; - opt.a = 1, opt.b = 2, opt.q = 2, opt.e = 1, opt.q2 = 70, opt.e2 = 0; + opt.a = 1, opt.b = 2, opt.q = 2, opt.e = 1, opt.q2 = 42, opt.e2 = 0; opt.zdrop = 200; } else { fprintf(stderr, "[E::%s] unknown preset '%s'\n", __func__, optarg); From a99358bc3da62c2bf9b027e61f7c3debad8b89e3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 11 Aug 2017 00:06:01 -0400 Subject: [PATCH 09/39] r282: reduced intron cost; added eval script --- main.c | 4 +- misc/exon-eval.js | 235 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 237 insertions(+), 2 deletions(-) create mode 100644 misc/exon-eval.js diff --git a/main.c b/main.c index 10f38ed..17d3e29 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r281-dirty" +#define MM_VERSION "2.0-r282-dirty" void liftrlimit() { @@ -134,7 +134,7 @@ int main(int argc, char *argv[]) k = 15, w = 5; opt.flag |= MM_F_CDNA; opt.max_gap = 2000, opt.max_gap_ref = opt.bw = 100000; - opt.a = 1, opt.b = 2, opt.q = 2, opt.e = 1, opt.q2 = 42, opt.e2 = 0; + opt.a = 1, opt.b = 2, opt.q = 2, opt.e = 1, opt.q2 = 32, opt.e2 = 0; opt.zdrop = 200; } else { fprintf(stderr, "[E::%s] unknown preset '%s'\n", __func__, optarg); diff --git a/misc/exon-eval.js b/misc/exon-eval.js new file mode 100644 index 0000000..4947151 --- /dev/null +++ b/misc/exon-eval.js @@ -0,0 +1,235 @@ +/******************************* + * Command line option parsing * + *******************************/ + +var getopt = function(args, ostr) { + var oli; // option letter list index + if 
(typeof(getopt.place) == 'undefined') + getopt.ind = 0, getopt.arg = null, getopt.place = -1; + if (getopt.place == -1) { // update scanning pointer + if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') { + getopt.place = -1; + return null; + } + if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--" + ++getopt.ind; + getopt.place = -1; + return null; + } + } + var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity + if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) { + if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null. + if (getopt.place < 0) ++getopt.ind; + return '?'; + } + if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument + getopt.arg = null; + if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1; + } else { // need an argument + if (getopt.place >= 0 && getopt.place < args[getopt.ind].length) + getopt.arg = args[getopt.ind].substr(getopt.place); + else if (args.length <= ++getopt.ind) { // no arg + getopt.place = -1; + if (ostr.length > 0 && ostr.charAt(0) == ':') return ':'; + return '?'; + } else getopt.arg = args[getopt.ind]; // white space + getopt.place = -1; + ++getopt.ind; + } + return optopt; +} + +/*********************** + * Interval operations * + ***********************/ + +Interval = {}; + +Interval.sort = function(a) +{ + if (typeof a[0] == 'number') + a.sort(function(x, y) { return x - y }); + else a.sort(function(x, y) { return x[0] - y[0] }); +} + +Interval.merge = function(a, sorted) +{ + if (typeof sorted == 'undefined') sorted = true; + if (!sorted) Interval.sort(a); + var k = 0; + for (var i = 1; i < a.length; ++i) { + if (a[k][1] >= a[i][0]) + a[k][1] = a[k][1] > a[i][1]? 
a[k][1] : a[i][1]; + else a[++k] = a[i].slice(0); + } + a.length = k + 1; +} + +Interval.index_end = function(a, sorted) +{ + if (a.length == 0) return; + if (typeof sorted == 'undefined') sorted = true; + if (!sorted) Interval.sort(a); + a[0].push(0); + var k = 0, k_en = a[0][1]; + for (var i = 1; i < a.length; ++i) { + if (k_en <= a[i][0]) { + for (++k; k < i; ++k) + if (a[k][1] > a[i][0]) + break; + k_en = a[k][1]; + } + a[i].push(k); + } +} + +Interval.find_intv = function(a, x) +{ + var left = -1, right = a.length; + if (typeof a[0] == 'number') { + while (right - left > 1) { + var mid = left + ((right - left) >> 1); + if (a[mid] > x) right = mid; + else if (a[mid] < x) left = mid; + else return mid; + } + } else { + while (right - left > 1) { + var mid = left + ((right - left) >> 1); + if (a[mid][0] > x) right = mid; + else if (a[mid][0] < x) left = mid; + else return mid; + } + } + return left; +} + +Interval.find_ovlp = function(a, st, en) +{ + if (a.length == 0 || st >= en) return []; + var l = Interval.find_intv(a, st); + var k = l < 0? 
0 : a[l][a[l].length - 1]; + var b = []; + for (var i = k; i < a.length; ++i) { + if (a[i][0] >= en) break; + else if (st < a[i][1]) + b.push(a[i]); + } + return b; +} + +/***************** + * Main function * + *****************/ + +var c, l_fuzzy = 10, min_ov_ratio = 0.95; +while ((c = getopt(arguments, "l:r:")) != null) { + if (c == 'l') l_fuzzy = parseInt(getopt.arg); + else if (c == 'r') min_ov_ratio = parseFloat(getopt.arg); +} + +if (arguments.length - getopt.ind < 2) { + print("Usage: k8 cdna-eval.js [options] "); + exit(1); +} + +var file, buf = new Bytes(); + +var anno = {}; +file = new File(arguments[getopt.ind]); +while (file.readline(buf) >= 0) { + var m, t = buf.toString().split("\t"); + if (t[0].charAt(0) == '#') continue; + if (t[2] != 'exon') continue; + var st = parseInt(t[3]) - 1; + var en = parseInt(t[4]); + if (anno[t[0]] == null) anno[t[0]] = []; + anno[t[0]].push([st, en]); +} +file.close(); + +for (var chr in anno) { + var e = anno[chr]; + Interval.sort(e); + var k = 0; + for (var i = 1; i < e.length; ++i) // dedup + if (e[i][0] != e[k][0] || e[i][1] != e[k][1]) + e[k++] = e[i]; + e.length = k; + Interval.index_end(e); +} + +var n_novel = 0, n_partial = 0, n_unmapped = 0, n_mapped = 0, n_exon = 0; +var n_ext_hit = 0, n_int_hit = 0, n_sgl_hit = 0; + +file = new File(arguments[getopt.ind+1]); +var last_qname = null; +var re_cigar = /(\d+)([MIDNSH])/g; +while (file.readline(buf) >= 0) { + var m, t = buf.toString().split("\t"); + if (t[0].charAt(0) == '@') continue; + if (last_qname == t[0]) continue; + if (t[2] == '*') { + ++n_unmapped; + continue; + } else ++n_mapped; + var st = parseInt(t[3]) - 1, en = st, exon_st = st; + var exon = []; + while ((m = re_cigar.exec(t[5])) != null) { + var len = parseInt(m[1]), op = m[2]; + if (op == 'N') { + exon.push([exon_st, en]); + en += len; + exon_st = en; + } else if (op == 'M' || op == 'D') en += len; + } + exon.push([exon_st, en]); + n_exon += exon.length; + var chr = anno[t[2]]; + if (chr == null) { 
+ n_novel += exon.length; + } else { + for (var i = 0; i < exon.length; ++i) { + var o = Interval.find_ovlp(chr, exon[i][0], exon[i][1]); + if (o.length > 0) { + var hit = false; + for (var j = 0; j < o.length; ++j) { + var st_diff = exon[i][0] - o[j][0]; + var en_diff = exon[i][1] - o[j][1]; + if (st_diff < 0) st_diff = -st_diff; + if (en_diff < 0) en_diff = -en_diff; + var max_st = exon[i][0] > o[j][0]? exon[i][0] : o[j][0]; + var min_en = exon[i][1] < o[j][1]? exon[i][1] : o[j][1]; + var ol = min_en - max_st; + var l0 = exon[i][1] - exon[i][0]; + var l1 = o[j][1] - o[j][0]; + var min = l0 < l1? l0 : l1; + var ov_ratio = ol / min; + if (ov_ratio >= min_ov_ratio) { + if (i == 0 && exon.length == 1) { + ++n_sgl_hit, hit = true; + } else if (i == 0) { + if (en_diff <= l_fuzzy) ++n_ext_hit, hit = true; + } else if (i == exon.length - 1) { + if (st_diff <= l_fuzzy) ++n_ext_hit, hit = true; + } else { + if (en_diff <= l_fuzzy && st_diff <= l_fuzzy) ++n_int_hit, hit = true; + } + } + if (hit) break; + } + } else ++n_novel; + } + } + last_qname = t[0]; +} +file.close(); + +buf.destroy(); + +print("Number of unmapped reads: " + n_unmapped); +print("Number of mapped reads: " + n_mapped); +print("Number of mapped exons: " + n_exon); +print("Number of novel exons: " + n_novel); +print("Number of correct exons: " + (n_ext_hit + n_int_hit + n_sgl_hit) + " (" + ((n_ext_hit + n_int_hit + n_sgl_hit) / n_exon * 100).toFixed(2) + "%)"); From 0c1760bc86335f083ee139c9a2447f9b345873ab Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 11 Aug 2017 16:28:52 -0400 Subject: [PATCH 10/39] bugfix; print errors; better rules --- misc/exon-eval.js | 48 +++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/misc/exon-eval.js b/misc/exon-eval.js index 4947151..5127522 100644 --- a/misc/exon-eval.js +++ b/misc/exon-eval.js @@ -50,7 +50,7 @@ Interval.sort = function(a) { if (typeof a[0] == 'number') a.sort(function(x, y) { return x - y }); - 
else a.sort(function(x, y) { return x[0] - y[0] }); + else a.sort(function(x, y) { return x[0] != y[0]? x[0] - y[0] : x[1] - y[1] }); } Interval.merge = function(a, sorted) @@ -123,10 +123,11 @@ Interval.find_ovlp = function(a, st, en) * Main function * *****************/ -var c, l_fuzzy = 10, min_ov_ratio = 0.95; -while ((c = getopt(arguments, "l:r:")) != null) { +var c, l_fuzzy = 10, min_ov_ratio = 0.95, print_err = false; +while ((c = getopt(arguments, "l:r:e")) != null) { if (c == 'l') l_fuzzy = parseInt(getopt.arg); else if (c == 'r') min_ov_ratio = parseFloat(getopt.arg); + else if (c == 'e') print_err = true; } if (arguments.length - getopt.ind < 2) { @@ -155,8 +156,8 @@ for (var chr in anno) { var k = 0; for (var i = 1; i < e.length; ++i) // dedup if (e[i][0] != e[k][0] || e[i][1] != e[k][1]) - e[k++] = e[i]; - e.length = k; + e[++k] = e[i].slice(0); + e.length = k + 1; Interval.index_end(e); } @@ -195,18 +196,20 @@ while (file.readline(buf) >= 0) { if (o.length > 0) { var hit = false; for (var j = 0; j < o.length; ++j) { - var st_diff = exon[i][0] - o[j][0]; - var en_diff = exon[i][1] - o[j][1]; - if (st_diff < 0) st_diff = -st_diff; - if (en_diff < 0) en_diff = -en_diff; + var min_st = exon[i][0] < o[j][0]? exon[i][0] : o[j][0]; var max_st = exon[i][0] > o[j][0]? exon[i][0] : o[j][0]; var min_en = exon[i][1] < o[j][1]? exon[i][1] : o[j][1]; - var ol = min_en - max_st; + var max_en = exon[i][1] > o[j][1]? exon[i][1] : o[j][1]; + var ol = min_en - max_st, span = max_en - min_st; var l0 = exon[i][1] - exon[i][0]; var l1 = o[j][1] - o[j][0]; var min = l0 < l1? 
l0 : l1; var ov_ratio = ol / min; if (ov_ratio >= min_ov_ratio) { + var st_diff = exon[i][0] - o[j][0]; + var en_diff = exon[i][1] - o[j][1]; + if (st_diff < 0) st_diff = -st_diff; + if (en_diff < 0) en_diff = -en_diff; if (i == 0 && exon.length == 1) { ++n_sgl_hit, hit = true; } else if (i == 0) { @@ -214,11 +217,22 @@ while (file.readline(buf) >= 0) { } else if (i == exon.length - 1) { if (st_diff <= l_fuzzy) ++n_ext_hit, hit = true; } else { - if (en_diff <= l_fuzzy && st_diff <= l_fuzzy) ++n_int_hit, hit = true; + //if (en_diff <= l_fuzzy && st_diff <= l_fuzzy && ol / span >= min_ov_ratio) + if (en_diff + st_diff <= l_fuzzy || ol / span >= min_ov_ratio) + ++n_int_hit, hit = true; } } if (hit) break; } + if (!hit && print_err) { + var x = '['; + for (var j = 0; j < o.length; ++j) { + if (j) x += ', '; + x += '(' + o[j][0] + "," + o[j][1] + ')'; + } + x += ']'; + print(t[0], i+1, exon[i][0], exon[i][1], x); + } } else ++n_novel; } } @@ -228,8 +242,10 @@ file.close(); buf.destroy(); -print("Number of unmapped reads: " + n_unmapped); -print("Number of mapped reads: " + n_mapped); -print("Number of mapped exons: " + n_exon); -print("Number of novel exons: " + n_novel); -print("Number of correct exons: " + (n_ext_hit + n_int_hit + n_sgl_hit) + " (" + ((n_ext_hit + n_int_hit + n_sgl_hit) / n_exon * 100).toFixed(2) + "%)"); +if (!print_err) { + print("Number of unmapped reads: " + n_unmapped); + print("Number of mapped reads: " + n_mapped); + print("Number of mapped exons: " + n_exon); + print("Number of novel exons: " + n_novel); + print("Number of correct exons: " + (n_ext_hit + n_int_hit + n_sgl_hit) + " (" + ((n_ext_hit + n_int_hit + n_sgl_hit) / n_exon * 100).toFixed(2) + "%)"); +} From 30b8cb46728bd7696f056ae8f89766e9cbd3fbb2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 11 Aug 2017 17:50:21 -0400 Subject: [PATCH 11/39] option to print correct exons --- misc/exon-eval.js | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git 
a/misc/exon-eval.js b/misc/exon-eval.js index 5127522..8ba14b7 100644 --- a/misc/exon-eval.js +++ b/misc/exon-eval.js @@ -123,11 +123,12 @@ Interval.find_ovlp = function(a, st, en) * Main function * *****************/ -var c, l_fuzzy = 10, min_ov_ratio = 0.95, print_err = false; -while ((c = getopt(arguments, "l:r:e")) != null) { +var c, l_fuzzy = 10, min_ov_ratio = 0.95, print_err = false, print_corr = false; +while ((c = getopt(arguments, "l:r:ec")) != null) { if (c == 'l') l_fuzzy = parseInt(getopt.arg); else if (c == 'r') min_ov_ratio = parseFloat(getopt.arg); else if (c == 'e') print_err = true; + else if (c == 'c') print_corr = true; } if (arguments.length - getopt.ind < 2) { @@ -224,14 +225,14 @@ while (file.readline(buf) >= 0) { } if (hit) break; } - if (!hit && print_err) { + if ((print_err && !hit) || (print_corr && hit)) { var x = '['; for (var j = 0; j < o.length; ++j) { if (j) x += ', '; x += '(' + o[j][0] + "," + o[j][1] + ')'; } x += ']'; - print(t[0], i+1, exon[i][0], exon[i][1], x); + print(t[0], i+1, t[2], exon[i][0], exon[i][1], x); } } else ++n_novel; } @@ -242,7 +243,7 @@ file.close(); buf.destroy(); -if (!print_err) { +if (!print_err && !print_corr) { print("Number of unmapped reads: " + n_unmapped); print("Number of mapped reads: " + n_mapped); print("Number of mapped exons: " + n_exon); From 2d11aaa830103b30789436743477122479dbfff7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Aug 2017 10:04:16 -0400 Subject: [PATCH 12/39] option to print all exons for debugging --- misc/exon-eval.js | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/misc/exon-eval.js b/misc/exon-eval.js index 8ba14b7..e6a9871 100644 --- a/misc/exon-eval.js +++ b/misc/exon-eval.js @@ -123,12 +123,13 @@ Interval.find_ovlp = function(a, st, en) * Main function * *****************/ -var c, l_fuzzy = 10, min_ov_ratio = 0.95, print_err = false, print_corr = false; -while ((c = getopt(arguments, "l:r:ec")) != null) { +var 
c, l_fuzzy = 10, min_ov_ratio = 0.95, print_ovlp = false, print_err_only = false, first_only = false; +while ((c = getopt(arguments, "l:r:ep1")) != null) { if (c == 'l') l_fuzzy = parseInt(getopt.arg); else if (c == 'r') min_ov_ratio = parseFloat(getopt.arg); - else if (c == 'e') print_err = true; - else if (c == 'c') print_corr = true; + else if (c == 'p') print_ovlp = true; + else if (c == 'e') print_err_only = print_ovlp = true; + else if (c == '1') first_only = true; } if (arguments.length - getopt.ind < 2) { @@ -162,7 +163,7 @@ for (var chr in anno) { Interval.index_end(e); } -var n_novel = 0, n_partial = 0, n_unmapped = 0, n_mapped = 0, n_exon = 0; +var n_novel = 0, n_partial = 0, n_unmapped = 0, n_mapped = 0, n_exon = 0, n_pri = 0; var n_ext_hit = 0, n_int_hit = 0, n_sgl_hit = 0; file = new File(arguments[getopt.ind+1]); @@ -171,11 +172,16 @@ var re_cigar = /(\d+)([MIDNSH])/g; while (file.readline(buf) >= 0) { var m, t = buf.toString().split("\t"); if (t[0].charAt(0) == '@') continue; - if (last_qname == t[0]) continue; + var flag = parseInt(t[1]); + if (flag&0x100) continue; + if (first_only && last_qname == t[0]) continue; if (t[2] == '*') { ++n_unmapped; continue; - } else ++n_mapped; + } else { + ++n_pri; + if (last_qname != t[0]) ++n_mapped; + } var st = parseInt(t[3]) - 1, en = st, exon_st = st; var exon = []; while ((m = re_cigar.exec(t[5])) != null) { @@ -225,16 +231,22 @@ while (file.readline(buf) >= 0) { } if (hit) break; } - if ((print_err && !hit) || (print_corr && hit)) { + if (print_ovlp) { + var type = hit? 
'C' : 'P'; + if (hit && print_err_only) continue; var x = '['; for (var j = 0; j < o.length; ++j) { if (j) x += ', '; x += '(' + o[j][0] + "," + o[j][1] + ')'; } x += ']'; - print(t[0], i+1, t[2], exon[i][0], exon[i][1], x); + print(type, t[0], i+1, t[2], exon[i][0], exon[i][1], x); } - } else ++n_novel; + } else { + ++n_novel; + if (print_ovlp) + print('N', t[0], i+1, t[2], exon[i][0], exon[i][1]); + } } } last_qname = t[0]; @@ -243,9 +255,10 @@ file.close(); buf.destroy(); -if (!print_err && !print_corr) { +if (!print_ovlp) { print("Number of unmapped reads: " + n_unmapped); print("Number of mapped reads: " + n_mapped); + print("Number of primary alignments: " + n_pri); print("Number of mapped exons: " + n_exon); print("Number of novel exons: " + n_novel); print("Number of correct exons: " + (n_ext_hit + n_int_hit + n_sgl_hit) + " (" + ((n_ext_hit + n_int_hit + n_sgl_hit) / n_exon * 100).toFixed(2) + "%)"); From 0f4c823b0c3427b5a56b38ee9e221b60ab113745 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Aug 2017 10:58:16 -0400 Subject: [PATCH 13/39] r286: ignore introns when computing max seg score --- align.c | 25 ++++++++++++++----------- main.c | 2 +- map.c | 2 +- mmpriv.h | 5 +++++ 4 files changed, 21 insertions(+), 13 deletions(-) diff --git a/align.c b/align.c index b9aa5bf..f96d4a9 100644 --- a/align.c +++ b/align.c @@ -61,10 +61,11 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c return 0; } -static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e) +static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int q_intron) { uint32_t k, l, toff = 0, qoff = 0; - int32_t s = 0, max = 0; + int32_t s = 0, max = 0, min_intron_len; + min_intron_len = mm_min_intron_len(q, e, q_intron); if (p == 0) return; for (k = 0; k < p->n_cigar; ++k) { uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4; @@ 
-88,12 +89,14 @@ static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *t if (s < 0) s = 0; } else if (op == 2) { int n_ambi = 0; - for (l = 0; l < len; ++l) - if (tseq[toff + l] > 3) ++n_ambi; - toff += len, p->blen += len; - p->n_ambi += n_ambi, p->n_diff += len - n_ambi; - s -= q + e * len; - if (s < 0) s = 0; + if (len < min_intron_len) { + for (l = 0; l < len; ++l) + if (tseq[toff + l] > 3) ++n_ambi; + toff += len, p->blen += len; + p->n_ambi += n_ambi, p->n_diff += len - n_ambi; + s -= q + e * len; + if (s < 0) s = 0; + } else toff += len, p->blen += len; } } p->dp_max = max; @@ -133,7 +136,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr); fputc('\n', stderr); } if (opt->flag & MM_F_CDNA) - ksw_extd2_noins_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, -1, opt->zdrop, flag, ez); + ksw_extd2_noins_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, 0, -1, opt->zdrop, flag, ez); else if (opt->q == opt->q2 && opt->e == opt->e2) ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, opt->zdrop, flag, ez); else @@ -359,7 +362,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int assert(re1 - rs1 <= re0 - rs0); if (r->p) { mm_idx_getseq(mi, rid, rs1, re1, tseq); - mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e); + mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e, (opt->flag&MM_F_CDNA)? 
opt->q2 : 0); } kfree(km, tseq); @@ -400,7 +403,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i if (ez->n_cigar == 0) goto end_align1_inv; // should never be here mm_append_cigar(r_inv, ez->n_cigar, ez->cigar); r_inv->p->dp_score = ez->max; - mm_update_extra(r_inv->p, qseq + q_off, tseq + t_off, mat, opt->q, opt->e); + mm_update_extra(r_inv->p, qseq + q_off, tseq + t_off, mat, opt->q, opt->e, (opt->flag&MM_F_CDNA)? opt->q2 : 0); r_inv->id = -1; r_inv->parent = MM_PARENT_UNSET; r_inv->inv = 1; diff --git a/main.c b/main.c index 17d3e29..ecf2f7c 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r282-dirty" +#define MM_VERSION "2.0-r286-dirty" void liftrlimit() { diff --git a/map.c b/map.c index dd1624c..df13811 100644 --- a/map.c +++ b/map.c @@ -348,7 +348,7 @@ static void *worker_pipeline(void *shared, int step, void *in) for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]); free(s->buf); if (p->opt->flag & MM_F_CDNA) - intron_thres = (int)((float)(p->opt->q2 - p->opt->q) / p->opt->e + 0.999f); + intron_thres = mm_min_intron_len(p->opt->q, p->opt->e, p->opt->q2); if ((p->opt->flag & MM_F_OUT_CS) && !(mm_dbg_flag & MM_DBG_NO_KALLOC)) km = km_init(); for (i = 0; i < s->n_seq; ++i) { mm_bseq1_t *t = &s->seq[i]; diff --git a/mmpriv.h b/mmpriv.h index 6f22b0b..6f69467 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -60,4 +60,9 @@ void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc); } #endif +static inline int32_t mm_min_intron_len(int32_t q, int32_t e, int32_t q_intron) +{ + return q_intron > q? 
(int)((float)(q_intron - q) / e + .999) : INT32_MAX; +} + #endif From d24031874136b83b5dd30ea3a4375ee4a74c7a28 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Aug 2017 12:26:04 -0400 Subject: [PATCH 14/39] r287: refined CLI options and manpage --- align.c | 8 ++++---- main.c | 43 +++++++++++++++++++++++++++---------------- map.c | 6 +++--- minimap.h | 2 +- minimap2.1 | 28 ++++++++++++++++++++++++++-- 5 files changed, 61 insertions(+), 26 deletions(-) diff --git a/align.c b/align.c index f96d4a9..a348e59 100644 --- a/align.c +++ b/align.c @@ -135,7 +135,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint for (i = 0; i < tlen; ++i) fputc("ACGTN"[tseq[i]], stderr); fputc('\n', stderr); for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr); fputc('\n', stderr); } - if (opt->flag & MM_F_CDNA) + if (opt->flag & MM_F_SPLICE) ksw_extd2_noins_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, 0, -1, opt->zdrop, flag, ez); else if (opt->q == opt->q2 && opt->e == opt->e2) ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, opt->zdrop, flag, ez); @@ -259,7 +259,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int bw = (int)(opt->bw * 1.5 + 1.); r2->cnt = 0; - if (!(opt->flag & MM_F_CDNA)) + if (!(opt->flag & MM_F_SPLICE)) mm_fix_bad_ends(r, a, opt->bw, &as1, &cnt1); else as1 = r->as, cnt1 = r->cnt; mm_filter_bad_seeds(km, as1, cnt1, a, 10, 40, opt->max_gap>>1, 10); @@ -362,7 +362,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int assert(re1 - rs1 <= re0 - rs0); if (r->p) { mm_idx_getseq(mi, rid, rs1, re1, tseq); - mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e, (opt->flag&MM_F_CDNA)? opt->q2 : 0); + mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e, (opt->flag&MM_F_SPLICE)? 
opt->q2 : 0); } kfree(km, tseq); @@ -403,7 +403,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i if (ez->n_cigar == 0) goto end_align1_inv; // should never be here mm_append_cigar(r_inv, ez->n_cigar, ez->cigar); r_inv->p->dp_score = ez->max; - mm_update_extra(r_inv->p, qseq + q_off, tseq + t_off, mat, opt->q, opt->e, (opt->flag&MM_F_CDNA)? opt->q2 : 0); + mm_update_extra(r_inv->p, qseq + q_off, tseq + t_off, mat, opt->q, opt->e, (opt->flag&MM_F_SPLICE)? opt->q2 : 0); r_inv->id = -1; r_inv->parent = MM_PARENT_UNSET; r_inv->inv = 1; diff --git a/main.c b/main.c index ecf2f7c..c35a1bd 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r286-dirty" +#define MM_VERSION "2.0-r287-dirty" void liftrlimit() { @@ -31,6 +31,8 @@ static struct option long_options[] = { { "max-chain-skip", required_argument, 0, 0 }, { "min-dp-len", required_argument, 0, 0 }, { "print-aln-seq", no_argument, 0, 0 }, + { "splice", no_argument, 0, 0 }, + { "max-intron-len", required_argument, 0, 'G' }, { "version", no_argument, 0, 'V' }, { "min-count", required_argument, 0, 'n' }, { "min-chain-score",required_argument, 0, 'm' }, @@ -40,10 +42,21 @@ static struct option long_options[] = { { 0, 0, 0, 0} }; +static inline int64_t mm_parse_num(const char *str) +{ + double x; + char *p; + x = strtod(optarg, &p); + if (*p == 'G' || *p == 'g') x *= 1e9; + else if (*p == 'M' || *p == 'm') x *= 1e6; + else if (*p == 'K' || *p == 'k') x *= 1e3; + return (int64_t)(x + .499); +} + int main(int argc, char *argv[]) { mm_mapopt_t opt; - int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0; + int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0, max_intron_len = 0; int minibatch_size = 200000000; uint64_t batch_size = 4000000000ULL; mm_bseq_file_t *fp = 0; @@ 
-59,12 +72,12 @@ int main(int argc, char *argv[]) else if (c == 'k') k = atoi(optarg), idx_par_set = 1; else if (c == 'H') is_hpc = 1, idx_par_set = 1; else if (c == 'd') fnw = optarg; // the above are indexing related options, except -I - else if (c == 'r') opt.bw = atoi(optarg); + else if (c == 'r') opt.bw = (int)mm_parse_num(optarg); else if (c == 'f') opt.mid_occ_frac = atof(optarg); else if (c == 't') n_threads = atoi(optarg); else if (c == 'v') mm_verbose = atoi(optarg); - else if (c == 'g') opt.max_gap = atoi(optarg); - else if (c == 'G') opt.max_gap_ref = atoi(optarg); + else if (c == 'g') opt.max_gap = (int)mm_parse_num(optarg); + else if (c == 'G') max_intron_len = (int)mm_parse_num(optarg); else if (c == 'N') opt.best_n = atoi(optarg); else if (c == 'p') opt.pri_ratio = atof(optarg); else if (c == 'M') opt.mask_level = atof(optarg); @@ -80,6 +93,8 @@ int main(int argc, char *argv[]) else if (c == 'B') opt.b = atoi(optarg); else if (c == 'z') opt.zdrop = atoi(optarg); else if (c == 's') opt.min_dp_max = atoi(optarg); + else if (c == 'I') batch_size = mm_parse_num(optarg); + else if (c == 'K') minibatch_size = (int)mm_parse_num(optarg); else if (c == 0 && long_idx == 0) bucket_bits = atoi(optarg); // --bucket-bits else if (c == 0 && long_idx == 2) keep_name = 0; // --int-rname else if (c == 0 && long_idx == 3) mm_dbg_flag |= MM_DBG_NO_KALLOC; // --no-kalloc @@ -89,6 +104,7 @@ int main(int argc, char *argv[]) else if (c == 0 && long_idx == 7) opt.max_chain_skip = atoi(optarg); // --max-chain-skip else if (c == 0 && long_idx == 8) opt.min_ksw_len = atoi(optarg); // --min-dp-len else if (c == 0 && long_idx == 9) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ; // --print-aln-seq + else if (c == 0 && long_idx ==10) opt.flag |= MM_F_SPLICE; // --splice else if (c == 'V') { puts(MM_VERSION); return 0; @@ -98,15 +114,6 @@ int main(int argc, char *argv[]) } else if (c == 'E') { opt.e = opt.e2 = strtol(optarg, &s, 10); if (*s == ',') opt.e2 = strtol(s + 1, 
&s, 10); - } else if (c == 'I' || c == 'K') { - double x; - char *p; - x = strtod(optarg, &p); - if (*p == 'G' || *p == 'g') x *= 1e9; - else if (*p == 'M' || *p == 'm') x *= 1e6; - else if (*p == 'K' || *p == 'k') x *= 1e3; - if (c == 'I') batch_size = (uint64_t)(x + .499); - else minibatch_size = (uint64_t)(x + .499); } else if (c == 'x') { if (strcmp(optarg, "ava-ont") == 0) { opt.flag |= MM_F_AVA | MM_F_NO_SELF; @@ -130,9 +137,9 @@ int main(int argc, char *argv[]) k = 19, w = 19; opt.a = 1, opt.b = 9, opt.q = 16, opt.q2 = 41, opt.e = 2, opt.e2 = 1, opt.zdrop = 200; opt.min_dp_max = 200; - } else if (strcmp(optarg, "cdna") == 0) { + } else if (strcmp(optarg, "splice") == 0 || strcmp(optarg, "cdna") == 0) { k = 15, w = 5; - opt.flag |= MM_F_CDNA; + opt.flag |= MM_F_SPLICE; opt.max_gap = 2000, opt.max_gap_ref = opt.bw = 100000; opt.a = 1, opt.b = 2, opt.q = 2, opt.e = 1, opt.q2 = 32, opt.e2 = 0; opt.zdrop = 200; @@ -143,6 +150,8 @@ int main(int argc, char *argv[]) } } if (w < 0) w = (int)(.6666667 * k + .499); + if ((opt.flag & MM_F_SPLICE) && max_intron_len > 0) + opt.max_gap_ref = opt.bw = max_intron_len; if (argc == optind) { fprintf(stderr, "Usage: minimap2 [options] | [query.fa] [...]\n"); @@ -163,6 +172,7 @@ int main(int argc, char *argv[]) fprintf(stderr, " -X skip self and dual mappings (for the all-vs-all mode)\n"); fprintf(stderr, " -p FLOAT min secondary-to-primary score ratio [%g]\n", opt.pri_ratio); fprintf(stderr, " -N INT retain at most INT secondary alignments [%d]\n", opt.best_n); + fprintf(stderr, " -G NUM max intron length (only effective following -x splice) [100k]\n"); fprintf(stderr, " Alignment:\n"); fprintf(stderr, " -A INT matching score [%d]\n", opt.a); fprintf(stderr, " -B INT mismatch penalty [%d]\n", opt.b); @@ -187,6 +197,7 @@ int main(int argc, char *argv[]) fprintf(stderr, " asm10: -k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10%% div.)\n"); fprintf(stderr, " ava-pb: -Hk19 -w5 -Xp0 -m100 -g10000 -K500m 
--max-chain-skip 25 (PacBio read overlap)\n"); fprintf(stderr, " ava-ont: -k15 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (ONT read overlap)\n"); + fprintf(stderr, " splice: -k15 -w5 --splice -g2000 -G100k -A1 -B2 -O2,32 -E1,0 -z200 (long-read spliced aln)\n"); fprintf(stderr, "\nSee `man ./minimap2.1' for detailed description of command-line options.\n"); return 1; } diff --git a/map.c b/map.c index df13811..f4ea657 100644 --- a/map.c +++ b/map.c @@ -245,7 +245,7 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, i == 0? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x)); max_gap_ref = opt->max_gap_ref >= 0? opt->max_gap_ref : opt->max_gap; - n_u = mm_chain_dp(max_gap_ref, opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, !!(opt->flag&MM_F_CDNA), n_a, a, &u, b->km); + n_u = mm_chain_dp(max_gap_ref, opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, !!(opt->flag&MM_F_SPLICE), n_a, a, &u, b->km); regs = mm_gen_regs(b->km, qlen, n_u, u, a); *n_regs = n_u; @@ -258,7 +258,7 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, if (!(opt->flag & MM_F_AVA)) { // don't choose primary mapping(s) for read overlap mm_set_parent(b->km, opt->mask_level, *n_regs, regs); mm_select_sub(b->km, opt->mask_level, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs); - if (!(opt->flag & MM_F_CDNA)) + if (!(opt->flag & MM_F_SPLICE)) mm_join_long(b->km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle } if (opt->flag & MM_F_CIGAR) { @@ -347,7 +347,7 @@ static void *worker_pipeline(void *shared, int step, void *in) int intron_thres = -1; for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]); free(s->buf); - if (p->opt->flag & MM_F_CDNA) + if (p->opt->flag & MM_F_SPLICE) intron_thres = mm_min_intron_len(p->opt->q, p->opt->e, p->opt->q2); if ((p->opt->flag & MM_F_OUT_CS) && !(mm_dbg_flag & 
MM_DBG_NO_KALLOC)) km = km_init(); for (i = 0; i < s->n_seq; ++i) { diff --git a/minimap.h b/minimap.h index dbd2e3e..6e0569b 100644 --- a/minimap.h +++ b/minimap.h @@ -14,7 +14,7 @@ #define MM_F_NO_QUAL 0x10 #define MM_F_OUT_CG 0x20 #define MM_F_OUT_CS 0x40 -#define MM_F_CDNA 0x80 +#define MM_F_SPLICE 0x80 #define MM_IDX_MAGIC "MMI\2" diff --git a/minimap2.1 b/minimap2.1 index 2c85f46..b9113f1 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -162,6 +162,12 @@ secondary alignments [5]. This option has no effect when .B -X is applied. .TP +.BI -G \ NUM +Maximal intron length in the splice mode. This option also changes the +bandwidth to +.IR NUM . +Increasing this option slows down spliced alignment. +.TP .BI --max-chain-skip \ INT A heuristics that stops chaining early [50]. Minimap2 uses dynamic programming for chaining. The time complexity is quadratic in the number of seeds. This @@ -277,13 +283,13 @@ Long assembly to reference mapping .B -w19 -A1 -B9 -O16,41 -E2,1 -s200 .BR -z200 ). Up to 10% sequence divergence. -.TP 8 +.TP .B ava-pb PacBio all-vs-all overlap mapping .RB ( -Hk19 .B -w5 -Xp0 -m100 -K500m -g10000 --max-chain-skip .BR 25 ). -.TP 8 +.TP .B ava-ont Oxford Nanopore all-vs-all overlap mapping .RB ( -k15 @@ -292,6 +298,20 @@ Oxford Nanopore all-vs-all overlap mapping Similarly, the major difference from .B ava-pb is that this preset is not using HPC minimizers. +.TP +.B splice +Long-read spliced alignment +.RB ( -k15 +.B -w5 --splice -g2000 -G100k -A1 -B2 -O2,32 -E1,0 +.BR -z200 ). +As of now, minimap2 only finds approximate exon boundaries. The true boundaries +are usually within 10bp around the reported positions. In the splice mode, +1) long deletions are taken as introns and represented as the +.RB ` N ' +CIGAR operator; 2) long insertions are disabled; 3) deletion and insertion gap +costs are different during chaining; 4) the computation of the +.RB ` ms ' +tag ignores introns to demote hits to pseudogenes. 
.RE .SS Miscellaneous options .TP 10 @@ -367,6 +387,10 @@ Minimap2 does not work well with Illumina short reads as of now. * Minimap2 requires SSE2 instructions to compile. It is possible to add non-SSE2 support, but it would make minimap2 slower by several times. +.TP +* +In the splice mode, minimap2 is unable to find the precise exon boundaries. +The true boundaries are usually within 10bp around the reported locations. .SH SEE ALSO .PP miniasm(1), minimap(1), bwa(1). From 5a74088b74dc16c3c9db1d9b609f1ecaed7139e1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Aug 2017 12:39:21 -0400 Subject: [PATCH 15/39] r288: changed max intron length to 200k --- main.c | 4 ++-- minimap2.1 | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/main.c b/main.c index c35a1bd..167a6f5 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r287-dirty" +#define MM_VERSION "2.0-r288-dirty" void liftrlimit() { @@ -140,7 +140,7 @@ int main(int argc, char *argv[]) } else if (strcmp(optarg, "splice") == 0 || strcmp(optarg, "cdna") == 0) { k = 15, w = 5; opt.flag |= MM_F_SPLICE; - opt.max_gap = 2000, opt.max_gap_ref = opt.bw = 100000; + opt.max_gap = 2000, opt.max_gap_ref = opt.bw = 200000; opt.a = 1, opt.b = 2, opt.q = 2, opt.e = 1, opt.q2 = 32, opt.e2 = 0; opt.zdrop = 200; } else { diff --git a/minimap2.1 b/minimap2.1 index b9113f1..e403731 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -163,7 +163,7 @@ secondary alignments [5]. This option has no effect when is applied. .TP .BI -G \ NUM -Maximal intron length in the splice mode. This option also changes the +Maximal intron length in the splice mode [200k]. This option also changes the bandwidth to .IR NUM . Increasing this option slows down spliced alignment. @@ -302,7 +302,7 @@ is that this preset is not using HPC minimizers.
.B splice Long-read spliced alignment .RB ( -k15 -.B -w5 --splice -g2000 -G100k -A1 -B2 -O2,32 -E1,0 +.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 .BR -z200 ). As of now, minimap2 only finds approximate exon boundaries. The true boundaries are usually within 10bp around the reported positions. In the splice mode, From a23df2dc9138392ec7e61e6155c03a80ba8fce8c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Aug 2017 12:40:07 -0400 Subject: [PATCH 16/39] r289: changed CLI help only --- main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.c b/main.c index 167a6f5..b85e4f1 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r288-dirty" +#define MM_VERSION "2.0-r289-dirty" void liftrlimit() { @@ -172,7 +172,7 @@ int main(int argc, char *argv[]) fprintf(stderr, " -X skip self and dual mappings (for the all-vs-all mode)\n"); fprintf(stderr, " -p FLOAT min secondary-to-primary score ratio [%g]\n", opt.pri_ratio); fprintf(stderr, " -N INT retain at most INT secondary alignments [%d]\n", opt.best_n); - fprintf(stderr, " -G NUM max intron length (only effective following -x splice) [100k]\n"); + fprintf(stderr, " -G NUM max intron length (only effective following -x splice) [200k]\n"); fprintf(stderr, " Alignment:\n"); fprintf(stderr, " -A INT matching score [%d]\n", opt.a); fprintf(stderr, " -B INT mismatch penalty [%d]\n", opt.b); From 53b3265d8447bd8f7889d6e7b74fd8723057ef7f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Aug 2017 15:40:49 -0400 Subject: [PATCH 17/39] r290: in techrep, explain spliced alignment --- main.c | 4 ++-- minimap2.1 | 2 +- tex/minimap2.tex | 53 +++++++++++++++++++++++++++++++++++------------- 3 files changed, 42 insertions(+), 17 deletions(-) diff --git a/main.c b/main.c index b85e4f1..4802fbb 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r289-dirty" +#define MM_VERSION 
"2.0-r290-dirty" void liftrlimit() { @@ -197,7 +197,7 @@ int main(int argc, char *argv[]) fprintf(stderr, " asm10: -k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10%% div.)\n"); fprintf(stderr, " ava-pb: -Hk19 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (PacBio read overlap)\n"); fprintf(stderr, " ava-ont: -k15 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (ONT read overlap)\n"); - fprintf(stderr, " splice: -k15 -w5 --splice -g2000 -G100k -A1 -B2 -O2,32 -E1,0 -z200 (long-read spliced aln)\n"); + fprintf(stderr, " splice: -k15 -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -z200 (long-read spliced aln)\n"); fprintf(stderr, "\nSee `man ./minimap2.1' for detailed description of command-line options.\n"); return 1; } diff --git a/minimap2.1 b/minimap2.1 index e403731..ee8efb5 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "8 August 2017" "minimap2-2.0-r275" "Bioinformatics tools" +.TH minimap2 1 "12 August 2017" "minimap2-2.0-r290-dirty" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences diff --git a/tex/minimap2.tex b/tex/minimap2.tex index cd20bf7..57f8a37 100644 --- a/tex/minimap2.tex +++ b/tex/minimap2.tex @@ -72,17 +72,17 @@ sorted by ending reference position $x$, let $f(i)$ be the maximal chaining score up to the $i$-th anchor in the list. $f(i)$ can be calculated with dynamic programming (DP): \begin{equation}\label{eq:chain} -f(i)=\max\big\{\max_{i>j\ge 1} \{ f(j)+d(j,i)-\gamma(j,i) \},w_i\big\} +f(i)=\max\big\{\max_{i>j\ge 1} \{ f(j)+d(j,i)-\beta_c(j,i) \},w_i\big\} \end{equation} where $d(j,i)=\min\big\{\min\{y_i-y_j,x_i-x_j\},w_i\big\}$ is the number of -matching bases between the two anchors. $\gamma(j,i)>0$ is the gap cost. It +matching bases between the two anchors. $\beta_c(j,i)>0$ is the gap cost. It equals $\infty$ if $y_j\ge y_i$ or $\max\{y_i-y_j,x_i-x_j\}>G$ (i.e. 
the distance between two anchors is too large); otherwise -\[ -\gamma(j,i)=\gamma'(\max\{y_i-y_j,x_i-x_j\}-\min\{y_i-y_j,x_i-x_j\}) -\] -In implementation, a gap of length $l$ costs $\gamma'(l)=\alpha\cdot -l+\beta\log_2(l)$. For $m$ anchors, directly computing all $f(\cdot)$ with +\begin{equation}\label{eq:chain-gap} +\beta_c(j,i)=\gamma_c\big((y_i-y_j)-(x_i-x_j)\big) +\end{equation} +In implementation, a gap of length $l$ costs $\gamma_c(l)=0.01\cdot \bar{w}\cdot +|l|+0.5\log_2|l|$, where $\bar{w}$ is the average seed length. For $m$ anchors, directly computing all $f(\cdot)$ with Eq.~(\ref{eq:chain}) takes $O(m^2)$ time. Although theoretically faster chaining algorithms exist~\citep{Abouelhoda:2005aa}, they are inapplicable to generic gap cost, complex to implement and usually @@ -134,13 +134,15 @@ but with a 1000bp band, it is considerably faster. When performing global alignment between anchors, we expect the alignment to stay close to the diagonal of the DP matrix. Banding is often applicable. -Minimap2 uses a 2-piece affine gap cost -$\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\}$. +Minimap2 uses a 2-piece affine gap cost~\citep{Gotoh:1990aa}: +\begin{equation}\label{eq:2-piece} +\gamma_a(l)=\min\{q+|l|\cdot e,\tilde{q}+|l|\cdot\tilde{e}\} +\end{equation} On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, this -cost function is concave. It applies cost $q+l\cdot e$ to gaps shorter than +cost function is concave. It applies cost $q+|l|\cdot e$ to gaps shorter than $\lceil(\tilde{q}-q)/(e-\tilde{e})\rceil$ and applies -$\tilde{q}+l\cdot\tilde{e}$ to longer gaps. This scheme helps to recover -longer insertions and deletions~(INDEL; \citealp{Gotoh:1990aa}). +$\tilde{q}+|l|\cdot\tilde{e}$ to longer gaps. This scheme helps to recover +longer insertions and deletions~(INDEL). With global alignment, minimap2 may force to align unrelated sequences between two adjacent anchors. 
To avoid such an artifact, we compute accumulative @@ -150,7 +152,7 @@ the alignment score along the alignment path ending at cell $(i,j)$ in the DP matrix. We break the alignment if there exist $(i',j')$ and $(i,j)$, $i'<i$ and $j'<j$, such that \[ -S(i',j')-S(i,j)>Z+e\cdot(\max\{i-i',j-j'\}-\min\{i-i',j-j'\}) +S(i',j')-S(i,j)>Z+e\cdot|(i-i')-(j-j')| \] where $e$ is the gap extension cost and $Z$ is an arbitrary threshold. This strategy is similar to X-drop employed in BLAST~\citep{Altschul:1997vn}. @@ -162,6 +164,30 @@ alignment between the two subsequences involved in the global alignment, but this time with the one subsequence reverse complemented. This additional alignment step may identify short inversions that are missed during chaining. +\subsection{Spliced alignment} + +The algorithm described above can be adapted to spliced alignment. In this +mode, the chaining gap cost distinguishes insertions to and deletions from the +reference: $\gamma_c(l)$ in Eq.~(\ref{eq:chain-gap}) takes the form of +\[ +\gamma_c(l)=\left\{\begin{array}{ll} +0.01\cdot\bar{w}\cdot l+0.5\log_2 l & (l>0) \\ +\min\{0.01\cdot\bar{w}\cdot|l|,\log_2|l|\} & (l<0) +\end{array}\right. +\] +Similarly, the gap cost function used for DP-based alignment is changed to +\[ +\gamma_a(l)=\left\{\begin{array}{ll} +q+l\cdot e & (l>0) \\ +\min\{q+|l|\cdot e,\tilde{q}\} & (l<0) +\end{array}\right. +\] +In alignment, a deletion no shorter than $\lceil(\tilde{q}-q)/e\rceil$ is +regarded as an intron, which pays no cost to gap extensions. With these +modifications, minimap2 can retain multiple reasonably long introns in one +alignment, rather than fragment the alignment into local hits, which often +leads to the loss of small exons especially given noisy reads. + \end{methods} \section{Results} @@ -232,7 +258,6 @@ issues.
\bibliography{minimap2} -\pagebreak \appendix \begin{methods} From 61eef0575cfa904ee79d831a2510e381c26b85ef Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Aug 2017 18:54:32 -0400 Subject: [PATCH 18/39] separate out spliced alignment; not right yet --- Makefile | 6 +- align.c | 2 +- ksw2.h | 4 +- ksw2_extd2_sse.c | 21 --- ksw2_exts2_sse.c | 327 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 332 insertions(+), 28 deletions(-) create mode 100644 ksw2_exts2_sse.c diff --git a/Makefile b/Makefile index 839636e..f7892e8 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ CC= gcc CFLAGS= -g -Wall -O2 -Wc++-compat CPPFLAGS= -DHAVE_KALLOC INCLUDES= -I. -OBJS= kthread.o kalloc.o ksw2_extz2_sse.o ksw2_extd2_sse.o ksw2_extd2_noins_sse.o ksw2_ll_sse.o \ +OBJS= kthread.o kalloc.o ksw2_extz2_sse.o ksw2_extd2_sse.o ksw2_exts2_sse.o ksw2_ll_sse.o \ misc.o bseq.o sketch.o sdust.o index.o chain.o align.o hit.o map.o format.o PROG= minimap2 PROG_EXTRA= sdust minimap2-lite @@ -33,9 +33,6 @@ libminimap2.a:$(OBJS) sdust:sdust.c kalloc.o kalloc.h kdq.h kvec.h kseq.h sdust.h $(CC) -D_SDUST_MAIN $(CFLAGS) $< kalloc.o -o $@ -lz -ksw2_extd2_noins_sse.o:ksw2_extd2_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_NO_LONG_INS $(INCLUDES) $< -o $@ - clean: rm -fr gmon.out *.o a.out $(PROG) $(PROG_EXTRA) *~ *.a *.dSYM session* @@ -53,6 +50,7 @@ hit.o: mmpriv.h minimap.h bseq.h kalloc.h index.o: kthread.h bseq.h minimap.h mmpriv.h kvec.h kalloc.h khash.h kalloc.o: kalloc.h ksw2_extd2_sse.o: ksw2.h kalloc.h +ksw2_exts2_sse.o: ksw2.h kalloc.h ksw2_extz2_sse.o: ksw2.h kalloc.h ksw2_ll_sse.o: ksw2.h kalloc.h main.o: bseq.h minimap.h mmpriv.h diff --git a/align.c b/align.c index a348e59..7d1a1c6 100644 --- a/align.c +++ b/align.c @@ -136,7 +136,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr); fputc('\n', stderr); } if (opt->flag & MM_F_SPLICE) - ksw_extd2_noins_sse(km, 
qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, 0, -1, opt->zdrop, flag, ez); + ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->zdrop, flag, ez); else if (opt->q == opt->q2 && opt->e == opt->e2) ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, opt->zdrop, flag, ez); else diff --git a/ksw2.h b/ksw2.h index 5f885eb..7e94391 100644 --- a/ksw2.h +++ b/ksw2.h @@ -53,8 +53,8 @@ void ksw_extd(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int flag, ksw_extz_t *ez); -void ksw_extd2_noins_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, - int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez); +void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t gapo, int8_t gape, int8_t gapo2, int zdrop, int flag, ksw_extz_t *ez); void ksw_extf2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez); diff --git a/ksw2_extd2_sse.c b/ksw2_extd2_sse.c index 532159b..c7f5843 100644 --- a/ksw2_extd2_sse.c +++ b/ksw2_extd2_sse.c @@ -10,13 +10,8 @@ #include #endif -#if !defined(KSW_NO_LONG_INS) void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez) -#else -void ksw_extd2_noins_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, - int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez) -#endif { #define __dp_code_block1 \ z = 
_mm_load_si128(&s[t]); \ @@ -177,9 +172,7 @@ void ksw_extd2_noins_sse(void *km, int qlen, const uint8_t *query, int tlen, con z = _mm_max_epi8(z, a); z = _mm_max_epi8(z, b); z = _mm_max_epi8(z, a2); - #ifndef KSW_NO_LONG_INS z = _mm_max_epi8(z, b2); - #endif z = _mm_min_epi8(z, sc_mch_); __dp_code_block2; // save u[] and v[]; update a, b, a2 and b2 _mm_store_si128(&x[t], _mm_sub_epi8(_mm_max_epi8(a, zero_), qe_)); @@ -204,10 +197,8 @@ void ksw_extd2_noins_sse(void *km, int qlen, const uint8_t *query, int tlen, con _mm_store_si128(&y[t], _mm_sub_epi8(_mm_and_si128(tmp, b), qe_)); tmp = _mm_cmpgt_epi8(a2, zero_); _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_and_si128(tmp, a2), qe2_)); - #ifndef KSW_NO_LONG_INS tmp = _mm_cmpgt_epi8(b2, zero_); _mm_store_si128(&y2[t], _mm_sub_epi8(_mm_and_si128(tmp, b2), qe2_)); - #endif #endif } } else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment @@ -223,10 +214,8 @@ void ksw_extd2_noins_sse(void *km, int qlen, const uint8_t *query, int tlen, con z = _mm_max_epi8(z, b); d = _mm_blendv_epi8(d, _mm_set1_epi8(3), _mm_cmpgt_epi8(a2, z)); // d = a2 > z? 3 : d z = _mm_max_epi8(z, a2); - #ifndef KSW_NO_LONG_INS d = _mm_blendv_epi8(d, _mm_set1_epi8(4), _mm_cmpgt_epi8(b2, z)); // d = a2 > z? 
3 : d z = _mm_max_epi8(z, b2); - #endif z = _mm_min_epi8(z, sc_mch_); #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8() tmp = _mm_cmpgt_epi8(a, z); @@ -238,11 +227,9 @@ void ksw_extd2_noins_sse(void *km, int qlen, const uint8_t *query, int tlen, con tmp = _mm_cmpgt_epi8(a2, z); d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(3))); z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2)); - #ifndef KSW_NO_LONG_INS tmp = _mm_cmpgt_epi8(b2, z); d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(4))); z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b2)); - #endif tmp = _mm_cmplt_epi8(sc_mch_, z); z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z)); #endif @@ -256,11 +243,9 @@ void ksw_extd2_noins_sse(void *km, int qlen, const uint8_t *query, int tlen, con tmp = _mm_cmpgt_epi8(a2, zero_); _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_and_si128(tmp, a2), qe2_)); d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0 - #ifndef KSW_NO_LONG_INS tmp = _mm_cmpgt_epi8(b2, zero_); _mm_store_si128(&y2[t], _mm_sub_epi8(_mm_and_si128(tmp, b2), qe2_)); d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x40))); // d = b > 0? 1<<6 : 0 - #endif _mm_store_si128(&pr[t], d); } } else { // gap right-alignment @@ -276,10 +261,8 @@ void ksw_extd2_noins_sse(void *km, int qlen, const uint8_t *query, int tlen, con z = _mm_max_epi8(z, b); d = _mm_blendv_epi8(_mm_set1_epi8(3), d, _mm_cmpgt_epi8(z, a2)); // d = z > a2? d : 3 z = _mm_max_epi8(z, a2); - #ifndef KSW_NO_LONG_INS d = _mm_blendv_epi8(_mm_set1_epi8(4), d, _mm_cmpgt_epi8(z, b2)); // d = z > b2? 
d : 4 z = _mm_max_epi8(z, b2); - #endif z = _mm_min_epi8(z, sc_mch_); #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8() tmp = _mm_cmpgt_epi8(z, a); @@ -291,11 +274,9 @@ void ksw_extd2_noins_sse(void *km, int qlen, const uint8_t *query, int tlen, con tmp = _mm_cmpgt_epi8(z, a2); d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(3))); z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a2)); - #ifndef KSW_NO_LONG_INS tmp = _mm_cmpgt_epi8(z, b2); d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(4))); z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b2)); - #endif tmp = _mm_cmplt_epi8(sc_mch_, z); z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z)); #endif @@ -309,11 +290,9 @@ void ksw_extd2_noins_sse(void *km, int qlen, const uint8_t *query, int tlen, con tmp = _mm_cmpgt_epi8(zero_, a2); _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_andnot_si128(tmp, a2), qe2_)); d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0 - #ifndef KSW_NO_LONG_INS tmp = _mm_cmpgt_epi8(zero_, b2); _mm_store_si128(&y2[t], _mm_sub_epi8(_mm_andnot_si128(tmp, b2), qe2_)); d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x40))); // d = b > 0? 
1<<6 : 0 - #endif _mm_store_si128(&pr[t], d); } } diff --git a/ksw2_exts2_sse.c b/ksw2_exts2_sse.c new file mode 100644 index 0000000..5fb5e97 --- /dev/null +++ b/ksw2_exts2_sse.c @@ -0,0 +1,327 @@ +#include +#include +#include +#include "ksw2.h" + +#ifdef __SSE2__ +#include + +#ifdef __SSE4_1__ +#include +#endif + +void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int8_t q2, int zdrop, int flag, ksw_extz_t *ez) +{ +#define __dp_code_block1 \ + z = _mm_load_si128(&s[t]); \ + xt1 = _mm_load_si128(&x[t]); /* xt1 <- x[r-1][t..t+15] */ \ + tmp = _mm_srli_si128(xt1, 15); /* tmp <- x[r-1][t+15] */ \ + xt1 = _mm_or_si128(_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \ + x1_ = tmp; \ + vt1 = _mm_load_si128(&v[t]); /* vt1 <- v[r-1][t..t+15] */ \ + tmp = _mm_srli_si128(vt1, 15); /* tmp <- v[r-1][t+15] */ \ + vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \ + v1_ = tmp; \ + a = _mm_add_epi8(xt1, vt1); /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \ + ut = _mm_load_si128(&u[t]); /* ut <- u[t..t+15] */ \ + b = _mm_add_epi8(_mm_load_si128(&y[t]), ut); /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */ \ + x2t1= _mm_load_si128(&x2[t]); \ + tmp = _mm_srli_si128(x2t1, 15); \ + x2t1= _mm_or_si128(_mm_slli_si128(x2t1, 1), x21_); \ + x21_= tmp; \ + a2= _mm_add_epi8(x2t1, vt1); + +#define __dp_code_block2 \ + _mm_store_si128(&u[t], _mm_sub_epi8(z, vt1)); /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \ + _mm_store_si128(&v[t], _mm_sub_epi8(z, ut)); /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \ + tmp = _mm_sub_epi8(z, q_); \ + a = _mm_sub_epi8(a, tmp); \ + b = _mm_sub_epi8(b, tmp); \ + a2= _mm_sub_epi8(a2, _mm_sub_epi8(z, q2_)); + + int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, max_sc, min_sc, long_thres, long_diff; + int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX); + int32_t 
*H = 0, H0 = 0, last_H0_t = 0; + uint8_t *qr, *sf, *mem, *mem2 = 0; + __m128i q_, q2_, qe_, zero_, sc_mch_, sc_mis_, m1_; + __m128i *u, *v, *x, *y, *x2, *s, *p = 0; + + ksw_reset_extz(ez); + if (m <= 1 || qlen <= 0 || tlen <= 0 || q2 <= q + e) return; + + zero_ = _mm_set1_epi8(0); + q_ = _mm_set1_epi8(q); + q2_ = _mm_set1_epi8(q2); + qe_ = _mm_set1_epi8(q + e); + sc_mch_ = _mm_set1_epi8(mat[0]); + sc_mis_ = _mm_set1_epi8(mat[1]); + m1_ = _mm_set1_epi8(m - 1); // wildcard + + tlen_ = (tlen + 15) / 16; + n_col_ = ((qlen < tlen? qlen : tlen) + 15) / 16 + 1; + qlen_ = (qlen + 15) / 16; + for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) { + max_sc = max_sc > mat[t]? max_sc : mat[t]; + min_sc = min_sc < mat[t]? min_sc : mat[t]; + } + if (-min_sc > 2 * (q + e)) return; // otherwise, we won't see any mismatches + + long_thres = (q2 - q) / e - 1; + if (q2 > q + e + long_thres * e) + ++long_thres; + long_diff = long_thres * e - (q2 - q); + + mem = (uint8_t*)kcalloc(km, tlen_ * 7 + qlen_ + 1, 16); + u = (__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned + v = u + tlen_, x = v + tlen_, y = x + tlen_, x2 = y + tlen_; + s = x2 + tlen_, sf = (uint8_t*)(s + tlen_), qr = sf + tlen_ * 16; + memset(u, -q - e, tlen_ * 16); + memset(v, -q - e, tlen_ * 16); + memset(x, -q - e, tlen_ * 16); + memset(y, -q - e, tlen_ * 16); + memset(x2, -q2, tlen_ * 16); + if (!approx_max) { + H = (int32_t*)kmalloc(km, tlen_ * 16 * 4); + for (t = 0; t < tlen_ * 16; ++t) H[t] = KSW_NEG_INF; + } + if (with_cigar) { + mem2 = (uint8_t*)kmalloc(km, ((qlen + tlen - 1) * n_col_ + 1) * 16); + p = (__m128i*)(((size_t)mem2 + 15) >> 4 << 4); + off = (int*)kmalloc(km, (qlen + tlen - 1) * sizeof(int) * 2); + off_end = off + qlen + tlen - 1; + } + + for (t = 0; t < qlen; ++t) qr[t] = query[qlen - 1 - t]; + memcpy(sf, target, tlen); + + for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) { + int st = 0, en = tlen - 1, st0, en0, st_, en_; + int8_t x1, x21, v1; + uint8_t *qrr = qr + (qlen 
- 1 - r); + int8_t *u8 = (int8_t*)u, *v8 = (int8_t*)v; + __m128i x1_, x21_, v1_; + // find the boundaries + if (st < r - qlen + 1) st = r - qlen + 1; + if (en > r) en = r; + st0 = st, en0 = en; + st = st / 16 * 16, en = (en + 16) / 16 * 16 - 1; + // set boundary conditions + x1 = -q - e, x21 = -q2; + v1 = r == 0? -q - e : r < long_thres? -e : r == long_thres? long_diff : 0; + if (en >= r) { + ((int8_t*)y)[r] = -q - e; + u8[r] = r == 0? -q - e : r < long_thres? -e : r == long_thres? long_diff : 0; + } + // loop fission: set scores first + if (!(flag & KSW_EZ_GENERIC_SC)) { + for (t = st0; t <= en0; t += 16) { + __m128i sq, st, tmp, mask; + sq = _mm_loadu_si128((__m128i*)&sf[t]); + st = _mm_loadu_si128((__m128i*)&qrr[t]); + mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_)); + tmp = _mm_cmpeq_epi8(sq, st); +#ifdef __SSE4_1__ + tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp); +#else + tmp = _mm_or_si128(_mm_andnot_si128(tmp, sc_mis_), _mm_and_si128(tmp, sc_mch_)); +#endif + tmp = _mm_andnot_si128(mask, tmp); + _mm_storeu_si128((__m128i*)((int8_t*)s + t), tmp); + } + } else { + for (t = st0; t <= en0; ++t) + ((uint8_t*)s)[t] = mat[sf[t] * m + qrr[t]]; + } + // core loop + x1_ = _mm_cvtsi32_si128((uint8_t)x1); + x21_ = _mm_cvtsi32_si128((uint8_t)x21); + v1_ = _mm_cvtsi32_si128((uint8_t)v1); + st_ = st / 16, en_ = en / 16; + assert(en_ - st_ + 1 <= n_col_); + if (!with_cigar) { // score only + for (t = st_; t <= en_; ++t) { + __m128i z, a, b, a2, xt1, x2t1, vt1, ut, tmp; + __dp_code_block1; +#ifdef __SSE4_1__ + z = _mm_max_epi8(z, a); + z = _mm_max_epi8(z, b); + z = _mm_max_epi8(z, a2); + z = _mm_min_epi8(z, sc_mch_); + __dp_code_block2; // save u[] and v[]; update a, b and a2 + _mm_store_si128(&x[t], _mm_sub_epi8(_mm_max_epi8(a, zero_), qe_)); + _mm_store_si128(&y[t], _mm_sub_epi8(_mm_max_epi8(b, zero_), qe_)); + _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_max_epi8(a2, zero_), q2_)); +#else + tmp = _mm_cmpgt_epi8(a, z); + z = 
_mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a)); + tmp = _mm_cmpgt_epi8(b, z); + z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b)); + tmp = _mm_cmpgt_epi8(a2, z); + z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2)); + tmp = _mm_cmplt_epi8(sc_mch_, z); + z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z)); + __dp_code_block2; + tmp = _mm_cmpgt_epi8(a, zero_); + _mm_store_si128(&x[t], _mm_sub_epi8(_mm_and_si128(tmp, a), qe_)); + tmp = _mm_cmpgt_epi8(b, zero_); + _mm_store_si128(&y[t], _mm_sub_epi8(_mm_and_si128(tmp, b), qe_)); + tmp = _mm_cmpgt_epi8(a2, zero_); + _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_and_si128(tmp, a2), q2_)); +#endif + } + } else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment + __m128i *pr = p + r * n_col_ - st_; + off[r] = st, off_end[r] = en; + for (t = st_; t <= en_; ++t) { + __m128i d, z, a, b, a2, xt1, x2t1, vt1, ut, tmp; + __dp_code_block1; +#ifdef __SSE4_1__ + d = _mm_and_si128(_mm_cmpgt_epi8(a, z), _mm_set1_epi8(1)); // d = a > z? 1 : 0 + z = _mm_max_epi8(z, a); + d = _mm_blendv_epi8(d, _mm_set1_epi8(2), _mm_cmpgt_epi8(b, z)); // d = b > z? 2 : d + z = _mm_max_epi8(z, b); + d = _mm_blendv_epi8(d, _mm_set1_epi8(3), _mm_cmpgt_epi8(a2, z)); // d = a2 > z? 
3 : d + z = _mm_max_epi8(z, a2); + z = _mm_min_epi8(z, sc_mch_); +#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8() + tmp = _mm_cmpgt_epi8(a, z); + d = _mm_and_si128(tmp, _mm_set1_epi8(1)); + z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a)); + tmp = _mm_cmpgt_epi8(b, z); + d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(2))); + z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b)); + tmp = _mm_cmpgt_epi8(a2, z); + d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(3))); + z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2)); + tmp = _mm_cmplt_epi8(sc_mch_, z); + z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z)); +#endif + __dp_code_block2; + tmp = _mm_cmpgt_epi8(a, zero_); + _mm_store_si128(&x[t], _mm_sub_epi8(_mm_and_si128(tmp, a), qe_)); + d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0 + tmp = _mm_cmpgt_epi8(b, zero_); + _mm_store_si128(&y[t], _mm_sub_epi8(_mm_and_si128(tmp, b), qe_)); + d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0 + tmp = _mm_cmpgt_epi8(a2, zero_); + _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_and_si128(tmp, a2), q2_)); + d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0 + _mm_store_si128(&pr[t], d); + } + } else { // gap right-alignment + __m128i *pr = p + r * n_col_ - st_; + off[r] = st, off_end[r] = en; + for (t = st_; t <= en_; ++t) { + __m128i d, z, a, b, a2, xt1, x2t1, vt1, ut, tmp; + __dp_code_block1; +#ifdef __SSE4_1__ + d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), _mm_set1_epi8(1)); // d = z > a? 0 : 1 + z = _mm_max_epi8(z, a); + d = _mm_blendv_epi8(_mm_set1_epi8(2), d, _mm_cmpgt_epi8(z, b)); // d = z > b? d : 2 + z = _mm_max_epi8(z, b); + d = _mm_blendv_epi8(_mm_set1_epi8(3), d, _mm_cmpgt_epi8(z, a2)); // d = z > a2? 
d : 3 + z = _mm_max_epi8(z, a2); + z = _mm_min_epi8(z, sc_mch_); +#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8() + tmp = _mm_cmpgt_epi8(z, a); + d = _mm_andnot_si128(tmp, _mm_set1_epi8(1)); + z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a)); + tmp = _mm_cmpgt_epi8(z, b); + d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(2))); + z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b)); + tmp = _mm_cmpgt_epi8(z, a2); + d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(3))); + z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a2)); + tmp = _mm_cmplt_epi8(sc_mch_, z); + z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z)); +#endif + __dp_code_block2; + tmp = _mm_cmpgt_epi8(zero_, a); + _mm_store_si128(&x[t], _mm_sub_epi8(_mm_andnot_si128(tmp, a), qe_)); + d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0 + tmp = _mm_cmpgt_epi8(zero_, b); + _mm_store_si128(&y[t], _mm_sub_epi8(_mm_andnot_si128(tmp, b), qe_)); + d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0 + tmp = _mm_cmpgt_epi8(zero_, a2); + _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_andnot_si128(tmp, a2), q2_)); + d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0 + _mm_store_si128(&pr[t], d); + } + } + if (!approx_max) { // find the exact max with a 32-bit score array + int32_t max_H, max_t; + // compute H[], max_H and max_t + if (r > 0) { + int32_t HH[4], tt[4], en1 = st0 + (en0 - st0) / 4 * 4, i; + __m128i max_H_, max_t_; + max_H = H[en0] = en0 > 0? 
H[en0-1] + u8[en0] : H[en0] + v8[en0]; // special casing the last element + max_t = en0; + max_H_ = _mm_set1_epi32(max_H); + max_t_ = _mm_set1_epi32(max_t); + for (t = st0; t < en1; t += 4) { // this implements: H[t]+=v8[t]-qe; if(H[t]>max_H) max_H=H[t],max_t=t; + __m128i H1, tmp, t_; + H1 = _mm_loadu_si128((__m128i*)&H[t]); + t_ = _mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]); + H1 = _mm_add_epi32(H1, t_); + _mm_storeu_si128((__m128i*)&H[t], H1); + t_ = _mm_set1_epi32(t); + tmp = _mm_cmpgt_epi32(H1, max_H_); +#ifdef __SSE4_1__ + max_H_ = _mm_blendv_epi8(max_H_, H1, tmp); + max_t_ = _mm_blendv_epi8(max_t_, t_, tmp); +#else + max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_)); + max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_)); +#endif + } + _mm_storeu_si128((__m128i*)HH, max_H_); + _mm_storeu_si128((__m128i*)tt, max_t_); + for (i = 0; i < 4; ++i) + if (max_H < HH[i]) max_H = HH[i], max_t = tt[i] + i; + for (; t < en0; ++t) { // for the rest of values that haven't been computed with SSE + H[t] += (int32_t)v8[t]; + if (H[t] > max_H) + max_H = H[t], max_t = t; + } + } else H[0] = v8[0] - qe, max_H = H[0], max_t = 0; // special casing r==0 + // update ez + if (en0 == tlen - 1 && H[en0] > ez->mte) + ez->mte = H[en0], ez->mte_q = r - en; + if (r - st0 == qlen - 1 && H[st0] > ez->mqe) + ez->mqe = H[st0], ez->mqe_t = st0; + if (ksw_apply_zdrop(ez, 1, max_H, r, max_t, zdrop, 0)) break; + if (r == qlen + tlen - 2 && en0 == tlen - 1) + ez->score = H[tlen - 1]; + } else { // find approximate max; Z-drop might be inaccurate, too. 
+ if (r > 0) { + if (last_H0_t >= st0 && last_H0_t <= en0 && last_H0_t + 1 >= st0 && last_H0_t + 1 <= en0) { + int32_t d0 = v8[last_H0_t]; + int32_t d1 = u8[last_H0_t + 1]; + if (d0 > d1) H0 += d0; + else H0 += d1, ++last_H0_t; + } else if (last_H0_t >= st0 && last_H0_t <= en0) { + H0 += v8[last_H0_t]; + } else { + ++last_H0_t, H0 += u8[last_H0_t]; + } + } else H0 = v8[0] - qe, last_H0_t = 0; + if ((flag & KSW_EZ_APPROX_DROP) && ksw_apply_zdrop(ez, 1, H0, r, last_H0_t, zdrop, 0)) break; + if (r == qlen + tlen - 2 && en0 == tlen - 1) + ez->score = H0; + } + last_st = st, last_en = en; + //for (t = st0; t <= en0; ++t) printf("(%d,%d)\t(%d,%d,%d,%d)\t%d\n", r, t, ((int8_t*)u)[t], ((int8_t*)v)[t], ((int8_t*)x)[t], ((int8_t*)y)[t], H[t]); // for debugging + } + kfree(km, mem); + if (!approx_max) kfree(km, H); + if (with_cigar) { // backtrack + int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR); + if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY)) + ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar); + else if (ez->max_t >= 0 && ez->max_q >= 0) + ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar); + kfree(km, mem2); kfree(km, off); + } +} +#endif // __SSE2__ From 17c19e5819b3b417c6ef44c48653eaca71b34028 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Aug 2017 19:06:59 -0400 Subject: [PATCH 19/39] exts2 working --- ksw2_exts2_sse.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/ksw2_exts2_sse.c b/ksw2_exts2_sse.c index 5fb5e97..80895d4 100644 --- a/ksw2_exts2_sse.c +++ b/ksw2_exts2_sse.c @@ -97,9 +97,8 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) { int st = 0, en = tlen - 1, st0, en0, st_, en_; - int8_t x1, x21, v1; + int8_t x1, x21, v1, *u8 = (int8_t*)u, *v8 = (int8_t*)v; uint8_t *qrr = 
qr + (qlen - 1 - r); - int8_t *u8 = (int8_t*)u, *v8 = (int8_t*)v; __m128i x1_, x21_, v1_; // find the boundaries if (st < r - qlen + 1) st = r - qlen + 1; @@ -107,8 +106,17 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin st0 = st, en0 = en; st = st / 16 * 16, en = (en + 16) / 16 * 16 - 1; // set boundary conditions - x1 = -q - e, x21 = -q2; - v1 = r == 0? -q - e : r < long_thres? -e : r == long_thres? long_diff : 0; + if (st > 0) { + if (st - 1 >= last_st && st - 1 <= last_en) { + x1 = ((int8_t*)x)[st - 1], x21 = ((int8_t*)x2)[st - 1], v1 = v8[st - 1]; // (r-1,s-1) calculated in the last round + } else { + x1 = -q - e, x21 = -q2; + v1 = -q - e; + } + } else { + x1 = -q - e, x21 = -q2; + v1 = r == 0? -q - e : r < long_thres? -e : r == long_thres? long_diff : 0; + } if (en >= r) { ((int8_t*)y)[r] = -q - e; u8[r] = r == 0? -q - e : r < long_thres? -e : r == long_thres? long_diff : 0; From bcfe00d2ad54178e736fe08758e24cee8689e03c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Aug 2017 19:09:33 -0400 Subject: [PATCH 20/39] minor formatting changes --- ksw2_exts2_sse.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/ksw2_exts2_sse.c b/ksw2_exts2_sse.c index 80895d4..54d484d 100644 --- a/ksw2_exts2_sse.c +++ b/ksw2_exts2_sse.c @@ -76,10 +76,7 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin u = (__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned v = u + tlen_, x = v + tlen_, y = x + tlen_, x2 = y + tlen_; s = x2 + tlen_, sf = (uint8_t*)(s + tlen_), qr = sf + tlen_ * 16; - memset(u, -q - e, tlen_ * 16); - memset(v, -q - e, tlen_ * 16); - memset(x, -q - e, tlen_ * 16); - memset(y, -q - e, tlen_ * 16); + memset(u, -q - e, tlen_ * 16 * 4); // this set u, v, x, y (because they are in the same array) memset(x2, -q2, tlen_ * 16); if (!approx_max) { H = (int32_t*)kmalloc(km, tlen_ * 16 * 4); @@ -107,12 +104,9 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t 
*query, int tlen, const uin st = st / 16 * 16, en = (en + 16) / 16 * 16 - 1; // set boundary conditions if (st > 0) { - if (st - 1 >= last_st && st - 1 <= last_en) { + if (st - 1 >= last_st && st - 1 <= last_en) x1 = ((int8_t*)x)[st - 1], x21 = ((int8_t*)x2)[st - 1], v1 = v8[st - 1]; // (r-1,s-1) calculated in the last round - } else { - x1 = -q - e, x21 = -q2; - v1 = -q - e; - } + else x1 = -q - e, x21 = -q2, v1 = -q - e; } else { x1 = -q - e, x21 = -q2; v1 = r == 0? -q - e : r < long_thres? -e : r == long_thres? long_diff : 0; From 43506edbc5305118027abc5b267bd85604b6279b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Aug 2017 23:10:14 -0400 Subject: [PATCH 21/39] backup: preliminary boundary alignment --- align.c | 2 +- ksw2.h | 4 +- ksw2_exts2_sse.c | 100 ++++++++++++++++++++++++++++++----------------- main.c | 1 + minimap.h | 1 + 5 files changed, 70 insertions(+), 38 deletions(-) diff --git a/align.c b/align.c index 7d1a1c6..4f86ace 100644 --- a/align.c +++ b/align.c @@ -136,7 +136,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr); fputc('\n', stderr); } if (opt->flag & MM_F_SPLICE) - ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->zdrop, flag, ez); + ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, opt->zdrop, flag|KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV, ez); else if (opt->q == opt->q2 && opt->e == opt->e2) ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, opt->zdrop, flag, ez); else diff --git a/ksw2.h b/ksw2.h index 7e94391..3811628 100644 --- a/ksw2.h +++ b/ksw2.h @@ -12,6 +12,8 @@ #define KSW_EZ_APPROX_DROP 0x10 // approximate Z-drop; faster with sse #define KSW_EZ_EXTZ_ONLY 0x40 // only perform extension #define KSW_EZ_REV_CIGAR 0x80 // reverse CIGAR in the output +#define KSW_EZ_SPLICE_FOR 0x100 +#define KSW_EZ_SPLICE_REV 0x200 #ifdef __cplusplus extern "C" { @@ -54,7 
+56,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int flag, ksw_extz_t *ez); void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, - int8_t gapo, int8_t gape, int8_t gapo2, int zdrop, int flag, ksw_extz_t *ez); + int8_t gapo, int8_t gape, int8_t gapo2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez); void ksw_extf2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez); diff --git a/ksw2_exts2_sse.c b/ksw2_exts2_sse.c index 54d484d..0694fb3 100644 --- a/ksw2_exts2_sse.c +++ b/ksw2_exts2_sse.c @@ -11,7 +11,7 @@ #endif void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, - int8_t q, int8_t e, int8_t q2, int zdrop, int flag, ksw_extz_t *ez) + int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez) { #define __dp_code_block1 \ z = _mm_load_si128(&s[t]); \ @@ -30,7 +30,8 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin tmp = _mm_srli_si128(x2t1, 15); \ x2t1= _mm_or_si128(_mm_slli_si128(x2t1, 1), x21_); \ x21_= tmp; \ - a2= _mm_add_epi8(x2t1, vt1); + a2 = _mm_add_epi8(x2t1, vt1); \ + a2a = _mm_add_epi8(a2, _mm_load_si128(&acceptor[t])); #define __dp_code_block2 \ _mm_store_si128(&u[t], _mm_sub_epi8(z, vt1)); /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \ @@ -45,7 +46,7 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin int32_t *H = 0, H0 = 0, last_H0_t = 0; uint8_t *qr, *sf, *mem, *mem2 = 0; __m128i q_, q2_, qe_, zero_, sc_mch_, sc_mis_, m1_; - __m128i *u, *v, *x, *y, *x2, *s, *p = 0; + __m128i *u, *v, *x, *y, *x2, *s, *p = 0, *donor, *acceptor; ksw_reset_extz(ez); if (m <= 1 || qlen <= 0 || tlen <= 0 || q2 <= q + e) return; @@ -72,10 
+73,11 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin ++long_thres; long_diff = long_thres * e - (q2 - q); - mem = (uint8_t*)kcalloc(km, tlen_ * 7 + qlen_ + 1, 16); + mem = (uint8_t*)kcalloc(km, tlen_ * 9 + qlen_ + 1, 16); u = (__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned v = u + tlen_, x = v + tlen_, y = x + tlen_, x2 = y + tlen_; - s = x2 + tlen_, sf = (uint8_t*)(s + tlen_), qr = sf + tlen_ * 16; + donor = x2 + tlen_, acceptor = donor + tlen_; + s = acceptor + tlen_, sf = (uint8_t*)(s + tlen_), qr = sf + tlen_ * 16; memset(u, -q - e, tlen_ * 16 * 4); // this set u, v, x, y (because they are in the same array) memset(x2, -q2, tlen_ * 16); if (!approx_max) { @@ -92,6 +94,24 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin for (t = 0; t < qlen; ++t) qr[t] = query[qlen - 1 - t]; memcpy(sf, target, tlen); + // set the donor and acceptor arrays. TODO: this assumes 0/1/2/3 encoding! + if (flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) { + memset(donor, -noncan, tlen_ * 16); + for (t = 0; t < tlen - 2; ++t) { + int is_can = 0; // is a canonical site + if ((flag & KSW_EZ_SPLICE_FOR) && target[t+1] == 2 && target[t+2] == 3) is_can = 1; + if ((flag & KSW_EZ_SPLICE_REV) && target[t+1] == 1 && target[t+2] == 3) is_can = 1; + if (is_can) ((int8_t*)donor)[t] = 0; + } + memset(acceptor, -noncan, tlen_ * 16); + for (t = 2; t < tlen; ++t) { + int is_can = 0; + if ((flag & KSW_EZ_SPLICE_FOR) && target[t-1] == 0 && target[t] == 2) is_can = 1; + if ((flag & KSW_EZ_SPLICE_REV) && target[t-1] == 0 && target[t] == 1) is_can = 1; + if (is_can) ((int8_t*)acceptor)[t] = 0; + } + } + for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) { int st = 0, en = tlen - 1, st0, en0, st_, en_; int8_t x1, x21, v1, *u8 = (int8_t*)u, *v8 = (int8_t*)v; @@ -143,49 +163,48 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin assert(en_ - st_ + 1 <= n_col_); if (!with_cigar) { // 
score only for (t = st_; t <= en_; ++t) { - __m128i z, a, b, a2, xt1, x2t1, vt1, ut, tmp; + __m128i z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp; __dp_code_block1; #ifdef __SSE4_1__ z = _mm_max_epi8(z, a); z = _mm_max_epi8(z, b); - z = _mm_max_epi8(z, a2); - z = _mm_min_epi8(z, sc_mch_); + z = _mm_max_epi8(z, a2a); __dp_code_block2; // save u[] and v[]; update a, b and a2 _mm_store_si128(&x[t], _mm_sub_epi8(_mm_max_epi8(a, zero_), qe_)); _mm_store_si128(&y[t], _mm_sub_epi8(_mm_max_epi8(b, zero_), qe_)); - _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_max_epi8(a2, zero_), q2_)); + tmp = _mm_load_si128(&donor[t]); + _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_max_epi8(a2, tmp), q2_)); #else tmp = _mm_cmpgt_epi8(a, z); z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a)); tmp = _mm_cmpgt_epi8(b, z); z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b)); - tmp = _mm_cmpgt_epi8(a2, z); - z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2)); - tmp = _mm_cmplt_epi8(sc_mch_, z); - z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z)); + tmp = _mm_cmpgt_epi8(a2a, z); + z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2a)); __dp_code_block2; tmp = _mm_cmpgt_epi8(a, zero_); _mm_store_si128(&x[t], _mm_sub_epi8(_mm_and_si128(tmp, a), qe_)); tmp = _mm_cmpgt_epi8(b, zero_); _mm_store_si128(&y[t], _mm_sub_epi8(_mm_and_si128(tmp, b), qe_)); - tmp = _mm_cmpgt_epi8(a2, zero_); - _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_and_si128(tmp, a2), q2_)); + tmp = _mm_load_si128(&donor[t]); // TODO: check if this is correct + tmp = _mm_cmpgt_epi8(a2, tmp); + tmp = _mm_or_si128(_mm_andnot_si128(tmp, tmp), _mm_and_si128(tmp, a2)); + _mm_store_si128(&x2[t], _mm_sub_epi8(tmp, q2_)); #endif } } else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment __m128i *pr = p + r * n_col_ - st_; off[r] = st, off_end[r] = en; for (t = st_; t <= en_; ++t) { - __m128i d, z, a, b, a2, xt1, x2t1, vt1, ut, tmp; + __m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, 
ut, tmp, tmp2; __dp_code_block1; #ifdef __SSE4_1__ d = _mm_and_si128(_mm_cmpgt_epi8(a, z), _mm_set1_epi8(1)); // d = a > z? 1 : 0 z = _mm_max_epi8(z, a); d = _mm_blendv_epi8(d, _mm_set1_epi8(2), _mm_cmpgt_epi8(b, z)); // d = b > z? 2 : d z = _mm_max_epi8(z, b); - d = _mm_blendv_epi8(d, _mm_set1_epi8(3), _mm_cmpgt_epi8(a2, z)); // d = a2 > z? 3 : d - z = _mm_max_epi8(z, a2); - z = _mm_min_epi8(z, sc_mch_); + d = _mm_blendv_epi8(d, _mm_set1_epi8(3), _mm_cmpgt_epi8(a2a, z)); // d = a2 > z? 3 : d + z = _mm_max_epi8(z, a2a); #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8() tmp = _mm_cmpgt_epi8(a, z); d = _mm_and_si128(tmp, _mm_set1_epi8(1)); @@ -193,11 +212,9 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin tmp = _mm_cmpgt_epi8(b, z); d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(2))); z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b)); - tmp = _mm_cmpgt_epi8(a2, z); + tmp = _mm_cmpgt_epi8(a2a, z); d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(3))); - z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2)); - tmp = _mm_cmplt_epi8(sc_mch_, z); - z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z)); + z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2a)); #endif __dp_code_block2; tmp = _mm_cmpgt_epi8(a, zero_); @@ -206,25 +223,31 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin tmp = _mm_cmpgt_epi8(b, zero_); _mm_store_si128(&y[t], _mm_sub_epi8(_mm_and_si128(tmp, b), qe_)); d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0 - tmp = _mm_cmpgt_epi8(a2, zero_); - _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_and_si128(tmp, a2), q2_)); - d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 
1<<5 : 0 + + tmp2 = _mm_load_si128(&donor[t]); + tmp = _mm_cmpgt_epi8(a2, tmp2); +#ifdef __SSE4_1__ + tmp2 = _mm_max_epi8(a2, tmp2); +#else + tmp2 = _mm_or_si128(_mm_andnot_si128(tmp, tmp2), _mm_and_si128(tmp, a2)); +#endif + _mm_store_si128(&x2[t], _mm_sub_epi8(tmp2, q2_)); + d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x20))); _mm_store_si128(&pr[t], d); } } else { // gap right-alignment __m128i *pr = p + r * n_col_ - st_; off[r] = st, off_end[r] = en; for (t = st_; t <= en_; ++t) { - __m128i d, z, a, b, a2, xt1, x2t1, vt1, ut, tmp; + __m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2; __dp_code_block1; #ifdef __SSE4_1__ d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), _mm_set1_epi8(1)); // d = z > a? 0 : 1 z = _mm_max_epi8(z, a); d = _mm_blendv_epi8(_mm_set1_epi8(2), d, _mm_cmpgt_epi8(z, b)); // d = z > b? d : 2 z = _mm_max_epi8(z, b); - d = _mm_blendv_epi8(_mm_set1_epi8(3), d, _mm_cmpgt_epi8(z, a2)); // d = z > a2? d : 3 - z = _mm_max_epi8(z, a2); - z = _mm_min_epi8(z, sc_mch_); + d = _mm_blendv_epi8(_mm_set1_epi8(3), d, _mm_cmpgt_epi8(z, a2a)); // d = z > a2? 
d : 3 + z = _mm_max_epi8(z, a2a); #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8() tmp = _mm_cmpgt_epi8(z, a); d = _mm_andnot_si128(tmp, _mm_set1_epi8(1)); @@ -232,11 +255,9 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin tmp = _mm_cmpgt_epi8(z, b); d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(2))); z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b)); - tmp = _mm_cmpgt_epi8(z, a2); + tmp = _mm_cmpgt_epi8(z, a2a); d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(3))); - z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a2)); - tmp = _mm_cmplt_epi8(sc_mch_, z); - z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z)); + z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a2a)); #endif __dp_code_block2; tmp = _mm_cmpgt_epi8(zero_, a); @@ -245,8 +266,15 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin tmp = _mm_cmpgt_epi8(zero_, b); _mm_store_si128(&y[t], _mm_sub_epi8(_mm_andnot_si128(tmp, b), qe_)); d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0 - tmp = _mm_cmpgt_epi8(zero_, a2); - _mm_store_si128(&x2[t], _mm_sub_epi8(_mm_andnot_si128(tmp, a2), q2_)); + + tmp2 = _mm_load_si128(&donor[t]); + tmp = _mm_cmpgt_epi8(tmp2, a2); +#ifdef __SSE4_1__ + tmp2 = _mm_max_epi8(tmp2, a2); +#else + tmp2 = _mm_or_si128(_mm_andnot_si128(tmp, a2), _mm_and_si128(tmp, tmp2)); +#endif + _mm_store_si128(&x2[t], _mm_sub_epi8(tmp2, q2_)); d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 
1<<5 : 0 _mm_store_si128(&pr[t], d); } diff --git a/main.c b/main.c index 4802fbb..5722433 100644 --- a/main.c +++ b/main.c @@ -142,6 +142,7 @@ int main(int argc, char *argv[]) opt.flag |= MM_F_SPLICE; opt.max_gap = 2000, opt.max_gap_ref = opt.bw = 200000; opt.a = 1, opt.b = 2, opt.q = 2, opt.e = 1, opt.q2 = 32, opt.e2 = 0; + opt.noncan = 4; opt.zdrop = 200; } else { fprintf(stderr, "[E::%s] unknown preset '%s'\n", __func__, optarg); diff --git a/minimap.h b/minimap.h index 6e0569b..c145919 100644 --- a/minimap.h +++ b/minimap.h @@ -94,6 +94,7 @@ typedef struct { int min_join_flank_sc; int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties + int noncan; int zdrop; int min_dp_max; int min_ksw_len; From 28f86688ab962359d7c983abdd6a377fa3ace66b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Aug 2017 23:48:43 -0400 Subject: [PATCH 22/39] r295: gap closure from the middle of non-HPC k This WILL slightly affect the result of genomic mapping, but hopefully in the good direction. 
--- align.c | 4 ++-- main.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/align.c b/align.c index 4f86ace..cca69cf 100644 --- a/align.c +++ b/align.c @@ -164,8 +164,8 @@ static inline void mm_adjust_minier(const mm_idx_t *mi, uint8_t *const qseq0[2], c = mm_get_hplen_back(mi, a->x<<1>>33, (int32_t)a->x); *r = (int32_t)a->x + 1 - c; } else { - *r = (int32_t)a->x + 1; - *q = (int32_t)a->y + 1; + *r = (int32_t)a->x - (mi->k>>1); + *q = (int32_t)a->y - (mi->k>>1); } } diff --git a/main.c b/main.c index 5722433..42dafd1 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r290-dirty" +#define MM_VERSION "2.0-r295-dirty" void liftrlimit() { From b5f5929bf9923f7ef69add35b059e9525e456d96 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 13 Aug 2017 21:37:51 -0400 Subject: [PATCH 23/39] r296: expose splicing related options to CLI --- align.c | 15 ++++++++++----- main.c | 20 ++++++++++++++++---- minimap.h | 20 +++++++++++--------- minimap2.1 | 27 +++++++++++++++++---------- 4 files changed, 54 insertions(+), 28 deletions(-) diff --git a/align.c b/align.c index cca69cf..b4c00b1 100644 --- a/align.c +++ b/align.c @@ -136,7 +136,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr); fputc('\n', stderr); } if (opt->flag & MM_F_SPLICE) - ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, opt->zdrop, flag|KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV, ez); + ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, opt->zdrop, flag, ez); else if (opt->q == opt->q2 && opt->e == opt->e2) ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, opt->zdrop, flag, ez); else @@ -249,7 +249,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int { int32_t rid = a[r->as].x<<1>>33, rev = a[r->as].x>>63, as1, cnt1; 
uint8_t *tseq, *qseq; - int32_t i, l, bw, dropped = 0, rs0, re0, qs0, qe0; + int32_t i, l, bw, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0; int32_t rs, re, qs, qe; int32_t rs1, qs1, re1, qe1; int8_t mat[25]; @@ -266,6 +266,11 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int mm_adjust_minier(mi, qseq0, &a[as1], &rs, &qs); mm_adjust_minier(mi, qseq0, &a[as1 + cnt1 - 1], &re, &qe); + if (opt->flag & MM_F_SPLICE) { + if (opt->flag & MM_F_SPLICE_FOR) extra_flag |= rev? KSW_EZ_SPLICE_REV : KSW_EZ_SPLICE_FOR; + if (opt->flag & MM_F_SPLICE_REV) extra_flag |= rev? KSW_EZ_SPLICE_FOR : KSW_EZ_SPLICE_REV; + } + // compute rs0 and qs0 if (r->split && as1 > 0) { mm_adjust_minier(mi, qseq0, &a[as1-1], &rs0, &qs0); @@ -298,7 +303,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int mm_idx_getseq(mi, rid, rs0, rs, tseq); mm_seq_rev(qs - qs0, qseq); mm_seq_rev(rs - rs0, tseq); - mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, mat, bw, KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez); + mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, mat, bw, extra_flag|KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez); if (ez->n_cigar > 0) { mm_append_cigar(r, ez->n_cigar, ez->cigar); r->p->dp_score += ez->max; @@ -320,7 +325,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int bw1 = qe - qs > re - rs? 
qe - qs : re - rs; qseq = &qseq0[rev][qs]; mm_idx_getseq(mi, rid, rs, re, tseq); - mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, KSW_EZ_APPROX_MAX, ez); + mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag|KSW_EZ_APPROX_MAX, ez); if (mm_check_zdrop(qseq, tseq, ez->n_cigar, ez->cigar, mat, opt->q, opt->e, opt->zdrop)) mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, 0, ez); if (ez->n_cigar > 0) @@ -345,7 +350,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int if (!dropped && qe < qe0 && re < re0) { // right extension qseq = &qseq0[rev][qe]; mm_idx_getseq(mi, rid, re, re0, tseq); - mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, mat, bw, KSW_EZ_EXTZ_ONLY, ez); + mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, mat, bw, extra_flag|KSW_EZ_EXTZ_ONLY, ez); if (ez->n_cigar > 0) { mm_append_cigar(r, ez->n_cigar, ez->cigar); r->p->dp_score += ez->max; diff --git a/main.c b/main.c index 42dafd1..f387f2d 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r295-dirty" +#define MM_VERSION "2.0-r296-dirty" void liftrlimit() { @@ -32,6 +32,7 @@ static struct option long_options[] = { { "min-dp-len", required_argument, 0, 0 }, { "print-aln-seq", no_argument, 0, 0 }, { "splice", no_argument, 0, 0 }, + { "cost-non-gt-ag", required_argument, 0, 0 }, { "max-intron-len", required_argument, 0, 'G' }, { "version", no_argument, 0, 'V' }, { "min-count", required_argument, 0, 'n' }, @@ -67,7 +68,7 @@ int main(int argc, char *argv[]) mm_realtime0 = realtime(); mm_mapopt_init(&opt); - while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Q", long_options, &long_idx)) >= 0) { + while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:", long_options, &long_idx)) >= 0) { if (c == 'w') w = atoi(optarg), idx_par_set = 1; else if (c == 'k') k = atoi(optarg), 
idx_par_set = 1; else if (c == 'H') is_hpc = 1, idx_par_set = 1; @@ -105,9 +106,19 @@ int main(int argc, char *argv[]) else if (c == 0 && long_idx == 8) opt.min_ksw_len = atoi(optarg); // --min-dp-len else if (c == 0 && long_idx == 9) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ; // --print-aln-seq else if (c == 0 && long_idx ==10) opt.flag |= MM_F_SPLICE; // --splice + else if (c == 0 && long_idx ==11) opt.noncan = atoi(optarg); // --cost-non-gt-ag else if (c == 'V') { puts(MM_VERSION); return 0; + } else if (c == 'u') { + if (*optarg == 'b') opt.flag |= MM_F_SPLICE_FOR|MM_F_SPLICE_REV; + else if (*optarg == 'f') opt.flag |= MM_F_SPLICE_FOR, opt.flag &= ~MM_F_SPLICE_REV; + else if (*optarg == 'r') opt.flag |= MM_F_SPLICE_REV, opt.flag &= ~MM_F_SPLICE_FOR; + else if (*optarg == 'n') opt.flag &= ~(MM_F_SPLICE_FOR|MM_F_SPLICE_REV); + else { + fprintf(stderr, "[E::%s] unrecognized cDNA direction\n", __func__); + return 1; + } } else if (c == 'O') { opt.q = opt.q2 = strtol(optarg, &s, 10); if (*s == ',') opt.q2 = strtol(s + 1, &s, 10); @@ -139,7 +150,7 @@ int main(int argc, char *argv[]) opt.min_dp_max = 200; } else if (strcmp(optarg, "splice") == 0 || strcmp(optarg, "cdna") == 0) { k = 15, w = 5; - opt.flag |= MM_F_SPLICE; + opt.flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV; opt.max_gap = 2000, opt.max_gap_ref = opt.bw = 200000; opt.a = 1, opt.b = 2, opt.q = 2, opt.e = 1, opt.q2 = 32, opt.e2 = 0; opt.noncan = 4; @@ -181,6 +192,7 @@ int main(int argc, char *argv[]) fprintf(stderr, " -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2); fprintf(stderr, " -z INT Z-drop score [%d]\n", opt.zdrop); fprintf(stderr, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max); + fprintf(stderr, " -u CHAR how to find GT-AG. 
f:transcript strand, b:both strands, n:don't match GT-AG [n]\n"); fprintf(stderr, " Input/Output:\n"); fprintf(stderr, " -Q ignore base quality in the input\n"); fprintf(stderr, " -a output in the SAM format (PAF by default)\n"); @@ -198,7 +210,7 @@ int main(int argc, char *argv[]) fprintf(stderr, " asm10: -k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10%% div.)\n"); fprintf(stderr, " ava-pb: -Hk19 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (PacBio read overlap)\n"); fprintf(stderr, " ava-ont: -k15 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (ONT read overlap)\n"); - fprintf(stderr, " splice: -k15 -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -z200 (long-read spliced aln)\n"); + fprintf(stderr, " splice: long-read spliced alignment (see minimap2.1 for details)\n"); fprintf(stderr, "\nSee `man ./minimap2.1' for detailed description of command-line options.\n"); return 1; } diff --git a/minimap.h b/minimap.h index c145919..bce2448 100644 --- a/minimap.h +++ b/minimap.h @@ -5,16 +5,18 @@ #include #include -#define MM_IDX_DEF_B 14 +#define MM_IDX_DEF_B 14 -#define MM_F_NO_SELF 0x01 -#define MM_F_AVA 0x02 -#define MM_F_CIGAR 0x04 -#define MM_F_OUT_SAM 0x08 -#define MM_F_NO_QUAL 0x10 -#define MM_F_OUT_CG 0x20 -#define MM_F_OUT_CS 0x40 -#define MM_F_SPLICE 0x80 +#define MM_F_NO_SELF 0x001 +#define MM_F_AVA 0x002 +#define MM_F_CIGAR 0x004 +#define MM_F_OUT_SAM 0x008 +#define MM_F_NO_QUAL 0x010 +#define MM_F_OUT_CG 0x020 +#define MM_F_OUT_CS 0x040 +#define MM_F_SPLICE 0x080 +#define MM_F_SPLICE_FOR 0x100 +#define MM_F_SPLICE_REV 0x200 #define MM_IDX_MAGIC "MMI\2" diff --git a/minimap2.1 b/minimap2.1 index ee8efb5..5b8aee3 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "12 August 2017" "minimap2-2.0-r290-dirty" "Bioinformatics tools" +.TH minimap2 1 "13 August 2017" "minimap2-2.0-r296-dirty" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences @@ -205,6 
+205,18 @@ the contiguity of the alignment at the cost of poor alignment in the middle Minimal peak DP alignment score to output [40]. The peak score is computed from the final CIGAR. It is the score of the max scoring segment in the alignment and may be different from the total alignment score. +.TP +.BI -u \ CHAR +How to find canonical splicing sites GT-AG - +.BR f : +transcript strand; +.BR b : +both strands; +.BR n : +no attempt to match GT-AG [n] +.TP +.BI --cost-non-gt-ag \ INT +Cost of non-canonical splicing sites [0]. .SS Input/output options .TP 10 .B -Q @@ -302,11 +314,10 @@ is that this preset is not using HPC minimizers. .B splice Long-read spliced alignment .RB ( -k15 -.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -.BR -z200 ). -As of now, minimap2 only finds approximate exon boundaries. The true boundaries -are usually within 10bp around the reported positions. In the splice mode, -1) long deletions are taken as introns and represented as the +.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -z200 -ub --cost-non-gt-ag +.BR 4 ). +In the splice mode, 1) long deletions are taken as introns and represented as +the .RB ` N ' CIGAR operator; 2) long insertions are disabled; 3) deletion and insertion gap costs are different during chaining; 4) the computation of the @@ -387,10 +398,6 @@ Minimap2 does not work well with Illumina short reads as of now. * Minimap2 requires SSE2 instructions to compile. It is possible to add non-SSE2 support, but it would make minimap2 slower by several times. -.TP -* -In the splice mode, minimap2 is unable to find the precise exon boundaries. -The true bounraries are usually within 10bp around the reported locations. .SH SEE ALSO .PP miniasm(1), minimap(1), bwa(1). 
From 2cde8d257cf81ac6f9622c80283ea5560c2a5dcb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 17 Aug 2017 06:02:44 -0400 Subject: [PATCH 24/39] r297: bidirectional RNA alignment --- align.c | 45 +++++++++++++++++++++++++++++++++++++++------ format.c | 6 +++++- main.c | 5 +++-- map.c | 2 ++ minimap.h | 4 +++- mmpriv.h | 3 ++- 6 files changed, 54 insertions(+), 11 deletions(-) diff --git a/align.c b/align.c index b4c00b1..25744e9 100644 --- a/align.c +++ b/align.c @@ -64,7 +64,7 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int q_intron) { uint32_t k, l, toff = 0, qoff = 0; - int32_t s = 0, max = 0, min_intron_len; + int32_t s = 0, max = 0, min_intron_len, n_gtag = 0, n_ctac = 0; min_intron_len = mm_min_intron_len(q, e, q_intron); if (p == 0) return; for (k = 0; k < p->n_cigar; ++k) { @@ -96,10 +96,19 @@ static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *t p->n_ambi += n_ambi, p->n_diff += len - n_ambi; s -= q + e * len; if (s < 0) s = 0; - } else toff += len, p->blen += len; + } else { // intron + uint8_t b[4]; + b[0] = tseq[toff], b[1] = tseq[toff+1]; + b[2] = tseq[toff+len-2], b[3] = tseq[toff+len-1]; + if (memcmp(b, "\2\3\0\2", 4) == 0) ++n_gtag; + else if (memcmp(b, "\1\3\0\1", 4) == 0) ++n_ctac; + toff += len, p->blen += len; + } } } p->dp_max = max; + if (n_gtag > n_ctac) p->trans_strand = 1; + else if (n_gtag < n_ctac) p->trans_strand = 2; } static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) // TODO: this calls the libc realloc() @@ -245,7 +254,7 @@ static void mm_fix_bad_ends(const mm_reg1_t *r, const mm128_t *a, int bw, int32_ } } -static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], mm_reg1_t *r, mm_reg1_t *r2, mm128_t *a, ksw_extz_t *ez) +static void mm_align1(void *km, const mm_mapopt_t 
*opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], mm_reg1_t *r, mm_reg1_t *r2, mm128_t *a, ksw_extz_t *ez, int splice_flag) { int32_t rid = a[r->as].x<<1>>33, rev = a[r->as].x>>63, as1, cnt1; uint8_t *tseq, *qseq; @@ -267,8 +276,9 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int mm_adjust_minier(mi, qseq0, &a[as1 + cnt1 - 1], &re, &qe); if (opt->flag & MM_F_SPLICE) { - if (opt->flag & MM_F_SPLICE_FOR) extra_flag |= rev? KSW_EZ_SPLICE_REV : KSW_EZ_SPLICE_FOR; - if (opt->flag & MM_F_SPLICE_REV) extra_flag |= rev? KSW_EZ_SPLICE_FOR : KSW_EZ_SPLICE_REV; + if (splice_flag & MM_F_SPLICE_FOR) extra_flag |= rev? KSW_EZ_SPLICE_REV : KSW_EZ_SPLICE_FOR; + if (splice_flag & MM_F_SPLICE_REV) extra_flag |= rev? KSW_EZ_SPLICE_FOR : KSW_EZ_SPLICE_REV; + if (splice_flag & MM_F_SPLICE_BOTH) extra_flag |= KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV; } // compute rs0 and qs0 @@ -368,6 +378,8 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int if (r->p) { mm_idx_getseq(mi, rid, rs1, re1, tseq); mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e, (opt->flag&MM_F_SPLICE)? 
opt->q2 : 0); + if (rev && r->p->trans_strand) + r->p->trans_strand ^= 3; // flip to the read strand } kfree(km, tseq); @@ -450,7 +462,28 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m memset(&ez, 0, sizeof(ksw_extz_t)); for (i = 0; i < n_regs; ++i) { mm_reg1_t r2; - mm_align1(km, opt, mi, qlen, qseq0, ®s[i], &r2, a, &ez); + if ((opt->flag&MM_F_SPLICE) && (opt->flag&MM_F_SPLICE_FOR) && (opt->flag&MM_F_SPLICE_REV)) { + mm_reg1_t s[2], s2[2]; + int which, trans_strand; + s[0] = s[1] = regs[i]; + mm_align1(km, opt, mi, qlen, qseq0, &s[0], &s2[0], a, &ez, MM_F_SPLICE_FOR); + mm_align1(km, opt, mi, qlen, qseq0, &s[1], &s2[1], a, &ez, MM_F_SPLICE_REV); + if (s[0].p->dp_score > s[1].p->dp_score) which = 0, trans_strand = 1; + else if (s[0].p->dp_score < s[1].p->dp_score) which = 1, trans_strand = 2; + else trans_strand = 3, which = (qlen + s[0].p->dp_score) & 1; // randomly choose a strand, effectively + if (which == 0) { + regs[i] = s[0], r2 = s2[0]; + free(s[1].p); + } else { + regs[i] = s[1], r2 = s2[1]; + free(s[0].p); + } + regs[i].p->trans_strand = trans_strand; + } else { + mm_align1(km, opt, mi, qlen, qseq0, ®s[i], &r2, a, &ez, opt->flag); + if ((opt->flag&MM_F_SPLICE) && !(opt->flag&MM_F_SPLICE_BOTH)) + regs[i].p->trans_strand = opt->flag&MM_F_SPLICE_FOR? 
1 : 2; + } if (r2.cnt > 0) regs = mm_insert_reg(&r2, i, &n_regs, regs); if (i > 0 && mm_align1_inv(km, opt, mi, qlen, qseq0, ®s[i-1], ®s[i], &r2, &ez)) { regs = mm_insert_reg(&r2, i, &n_regs, regs); diff --git a/format.c b/format.c index 02cf3be..144005a 100644 --- a/format.c +++ b/format.c @@ -119,7 +119,11 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r) mm_sprintf_lite(s, "\ttp:A:%c\tcm:i:%d\ts1:i:%d", type, r->cnt, r->score); if (r->parent == r->id) mm_sprintf_lite(s, "\ts2:i:%d", r->subsc); if (r->split) mm_sprintf_lite(s, "\tzd:i:%d", r->split); - if (r->p) mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi); + if (r->p) { + mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi); + if (r->p->trans_strand == 1 || r->p->trans_strand == 2) + mm_sprintf_lite(s, "\tts:A:%c", "?+-?"[r->p->trans_strand]); + } } void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int intron_thres) diff --git a/main.c b/main.c index f387f2d..6a084fb 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r296-dirty" +#define MM_VERSION "2.0-r297-dirty" void liftrlimit() { @@ -112,6 +112,7 @@ int main(int argc, char *argv[]) return 0; } else if (c == 'u') { if (*optarg == 'b') opt.flag |= MM_F_SPLICE_FOR|MM_F_SPLICE_REV; + else if (*optarg == 'B') opt.flag |= MM_F_SPLICE_BOTH; else if (*optarg == 'f') opt.flag |= MM_F_SPLICE_FOR, opt.flag &= ~MM_F_SPLICE_REV; else if (*optarg == 'r') opt.flag |= MM_F_SPLICE_REV, opt.flag &= ~MM_F_SPLICE_FOR; else if (*optarg == 'n') opt.flag &= ~(MM_F_SPLICE_FOR|MM_F_SPLICE_REV); @@ -153,7 +154,7 @@ int main(int argc, char *argv[]) opt.flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV; opt.max_gap = 2000, opt.max_gap_ref = opt.bw = 200000; opt.a = 1, opt.b = 2, opt.q = 2, opt.e = 1, 
opt.q2 = 32, opt.e2 = 0; - opt.noncan = 4; + opt.noncan = 5; opt.zdrop = 200; } else { fprintf(stderr, "[E::%s] unknown preset '%s'\n", __func__, optarg); diff --git a/map.c b/map.c index f4ea657..f8654b2 100644 --- a/map.c +++ b/map.c @@ -38,6 +38,8 @@ void mm_mapopt_init(mm_mapopt_t *opt) void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi) { + if (opt->flag & MM_F_SPLICE_BOTH) + opt->flag &= ~(MM_F_SPLICE_FOR|MM_F_SPLICE_REV); opt->max_occ = mm_idx_cal_max_occ(mi, opt->max_occ_frac); opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac); if (mm_verbose >= 3) diff --git a/minimap.h b/minimap.h index bce2448..c09b846 100644 --- a/minimap.h +++ b/minimap.h @@ -17,6 +17,7 @@ #define MM_F_SPLICE 0x080 #define MM_F_SPLICE_FOR 0x100 #define MM_F_SPLICE_REV 0x200 +#define MM_F_SPLICE_BOTH 0x400 #define MM_IDX_MAGIC "MMI\2" @@ -58,7 +59,8 @@ typedef struct { uint32_t capacity; int32_t dp_score, dp_max, dp_max2; uint32_t blen; - uint32_t n_diff, n_ambi; + uint32_t n_diff; + uint32_t n_ambi:30, trans_strand:2; uint32_t n_cigar; uint32_t cigar[]; } mm_extra_t; diff --git a/mmpriv.h b/mmpriv.h index 6f69467..83cfcf4 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -62,7 +62,8 @@ void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc); static inline int32_t mm_min_intron_len(int32_t q, int32_t e, int32_t q_intron) { - return q_intron > q? (int)((float)(q_intron - q) / e + .999) : INT32_MAX; + int32_t x = q_intron > q? (int)((float)(q_intron - q) / e + .999) : INT32_MAX; + return x > 4? 
x : 4; } #endif From bbb37d95f207108acff6784b257171fdecfeb797 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 17 Aug 2017 23:34:09 +0800 Subject: [PATCH 25/39] support inserting RG lines --- format.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ main.c | 10 +++++---- map.c | 6 +---- minimap2.1 | 11 ++++++--- mmpriv.h | 2 ++ 5 files changed, 83 insertions(+), 12 deletions(-) diff --git a/format.c b/format.c index 02cf3be..f11a711 100644 --- a/format.c +++ b/format.c @@ -6,6 +6,8 @@ #include "kalloc.h" #include "mmpriv.h" +static char *mm_rg_line, mm_rg_id[256]; + static inline void str_enlarge(kstring_t *s, int l) { if (s->l + l + 1 > s->m) { @@ -56,6 +58,60 @@ static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...) s->s[s->l] = 0; } +static inline char *mm_escape(char *s) +{ + char *p, *q; + for (p = q = s; *p; ++p) { + if (*p == '\\') { + ++p; + if (*p == 't') *q++ = '\t'; + else if (*p == 'n') *q++ = '\n'; + else if (*p == 'r') *q++ = '\r'; + else if (*p == '\\') *q++ = '\\'; + } else *q++ = *p; + } + *q = '\0'; + return s; +} + +void mm_set_rg(const char *s) +{ + char *p, *q, *r, *rg_line = 0; + memset(mm_rg_id, 0, 256); + if (mm_rg_line) { + free(mm_rg_line); + mm_rg_line = 0; + } + if (s == 0) return; + if (strstr(s, "@RG") != s) { + if (mm_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__); + goto err_set_rg; + } + if (strstr(s, "\t") != NULL) { + if (mm_verbose >= 1) fprintf(stderr, "[E::%s] the read group line contained literal characters -- replace with escaped tabs: \\t\n", __func__); + goto err_set_rg; + } + rg_line = strdup(s); + mm_escape(rg_line); + if ((p = strstr(rg_line, "\tID:")) == 0) { + if (mm_verbose >= 1) fprintf(stderr, "[E::%s] no ID within the read group line\n", __func__); + goto err_set_rg; + } + p += 4; + for (q = p; *q && *q != '\t' && *q != '\n'; ++q); + if (q - p + 1 > 256) { + if (mm_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", 
__func__); + goto err_set_rg; + } + for (q = p, r = mm_rg_id; *q && *q != '\t' && *q != '\n'; ++q) + *r++ = *q; + mm_rg_line = rg_line; + return; + +err_set_rg: + free(rg_line); +} + static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r) { extern unsigned char seq_nt4_table[256]; @@ -158,6 +214,14 @@ static char comp_tab[] = { 'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127 }; +void sam_write_sam_header(const mm_idx_t *idx) +{ + uint32_t i; + for (i = 0; i < idx->n_seq; ++i) + printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len); + if (mm_rg_line) puts(mm_rg_line); +} + static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp) { if (rev) { @@ -214,6 +278,8 @@ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m else mm_sprintf_lite(s, "*"); } write_tags(s, r); + if (mm_rg_line && mm_rg_id[0]) + mm_sprintf_lite(s, "\tRG:Z:%s", mm_rg_id); if (r->parent == r->id && r->p && n_regs > 1 && regs && r >= regs && r - regs < n_regs) { // supplementary aln may exist int i, n_sa = 0; // n_sa: number of SA fields for (i = 0; i < n_regs; ++i) diff --git a/main.c b/main.c index f387f2d..49910c3 100644 --- a/main.c +++ b/main.c @@ -68,7 +68,7 @@ int main(int argc, char *argv[]) mm_realtime0 = realtime(); mm_mapopt_init(&opt); - while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:", long_options, &long_idx)) >= 0) { + while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:", long_options, &long_idx)) >= 0) { if (c == 'w') w = atoi(optarg), idx_par_set = 1; else if (c == 'k') k = atoi(optarg), idx_par_set = 1; else if (c == 'H') is_hpc = 1, idx_par_set = 1; @@ -96,6 +96,7 @@ int main(int argc, char *argv[]) else if (c == 's') opt.min_dp_max = atoi(optarg); else if (c == 'I') batch_size = mm_parse_num(optarg); else if (c == 'K') minibatch_size = 
(int)mm_parse_num(optarg); + else if (c == 'R') mm_set_rg(optarg); // WARNING: this modifies global variables in format.c else if (c == 0 && long_idx == 0) bucket_bits = atoi(optarg); // --bucket-bits else if (c == 0 && long_idx == 2) keep_name = 0; // --int-rname else if (c == 0 && long_idx == 3) mm_dbg_flag |= MM_DBG_NO_KALLOC; // --no-kalloc @@ -194,14 +195,15 @@ int main(int argc, char *argv[]) fprintf(stderr, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max); fprintf(stderr, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n"); fprintf(stderr, " Input/Output:\n"); - fprintf(stderr, " -Q ignore base quality in the input\n"); fprintf(stderr, " -a output in the SAM format (PAF by default)\n"); + fprintf(stderr, " -Q don't output base quality in SAM\n"); + fprintf(stderr, " -R STR SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n"); fprintf(stderr, " -c output CIGAR in PAF\n"); - fprintf(stderr, " -S output the cs tag in PAF\n"); + fprintf(stderr, " -S output the cs tag in PAF (cs encodes both query and ref sequences)\n"); fprintf(stderr, " -t INT number of threads [%d]\n", n_threads); fprintf(stderr, " -K NUM minibatch size [200M]\n"); // fprintf(stderr, " -v INT verbose level [%d]\n", mm_verbose); - fprintf(stderr, " -V show version number\n"); + fprintf(stderr, " --version show version number\n"); fprintf(stderr, " Preset:\n"); fprintf(stderr, " -x STR preset (recommended to be applied before other options) []\n"); fprintf(stderr, " map10k/map-pb: -Hk19 (PacBio/ONT vs reference mapping)\n"); diff --git a/map.c b/map.c index f4ea657..7b41748 100644 --- a/map.c +++ b/map.c @@ -386,11 +386,7 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int if (pl.fp == 0) return -1; pl.opt = opt, pl.mi = idx; pl.n_threads = n_threads, pl.mini_batch_size = mini_batch_size; - if (opt->flag & MM_F_OUT_SAM) { - uint32_t i; - for (i = 0; i < idx->n_seq; ++i) - 
printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len); - } + if (opt->flag & MM_F_OUT_SAM) sam_write_sam_header(idx); kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3); free(pl.str.s); mm_bseq_close(pl.fp); diff --git a/minimap2.1 b/minimap2.1 index 5b8aee3..d2d2530 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -219,13 +219,18 @@ no attempt to match GT-AG [n] Cost of non-canonical splicing sites [0]. .SS Input/output options .TP 10 -.B -Q -Ignore base quality in the input file. -.TP .B -a Generate CIGAR and output alignments in the SAM format. Minimap2 outputs in PAF by default. .TP +.B -Q +Ignore base quality in the input file. +.TP +.BI -R \ STR +SAM read group line in a format like +.B @RG\\\\tID:foo\\\\tSM:bar +[]. +.TP .B -c Generate CIGAR. In PAF, the CIGAR is written to the `cg' custom tag. .TP diff --git a/mmpriv.h b/mmpriv.h index 6f69467..b4b2c90 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -40,6 +40,8 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end); void radix_sort_64(uint64_t *beg, uint64_t *end); uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); +void mm_set_rg(const char *s); +void sam_write_sam_header(const mm_idx_t *idx); void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int intron_thres); void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs, int intron_thres); int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km); From 81cff972080d24124c7c96411c5bab5b6ae37606 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 17 Aug 2017 23:38:31 +0800 Subject: [PATCH 26/39] r299: support -h: output to stdout; return 0 --- main.c | 106 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/main.c b/main.c index 49910c3..25a8eb7 
100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r296-dirty" +#define MM_VERSION "2.0-r299-dirty" void liftrlimit() { @@ -33,6 +33,7 @@ static struct option long_options[] = { { "print-aln-seq", no_argument, 0, 0 }, { "splice", no_argument, 0, 0 }, { "cost-non-gt-ag", required_argument, 0, 0 }, + { "help", no_argument, 0, 'h' }, { "max-intron-len", required_argument, 0, 'G' }, { "version", no_argument, 0, 'V' }, { "min-count", required_argument, 0, 'n' }, @@ -62,13 +63,13 @@ int main(int argc, char *argv[]) uint64_t batch_size = 4000000000ULL; mm_bseq_file_t *fp = 0; char *fnw = 0, *s; - FILE *fpr = 0, *fpw = 0; + FILE *fpr = 0, *fpw = 0, *fp_help = stderr; liftrlimit(); mm_realtime0 = realtime(); mm_mapopt_init(&opt); - while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:", long_options, &long_idx)) >= 0) { + while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:h", long_options, &long_idx)) >= 0) { if (c == 'w') w = atoi(optarg), idx_par_set = 1; else if (c == 'k') k = atoi(optarg), idx_par_set = 1; else if (c == 'H') is_hpc = 1, idx_par_set = 1; @@ -97,6 +98,7 @@ int main(int argc, char *argv[]) else if (c == 'I') batch_size = mm_parse_num(optarg); else if (c == 'K') minibatch_size = (int)mm_parse_num(optarg); else if (c == 'R') mm_set_rg(optarg); // WARNING: this modifies global variables in format.c + else if (c == 'h') fp_help = stdout; else if (c == 0 && long_idx == 0) bucket_bits = atoi(optarg); // --bucket-bits else if (c == 0 && long_idx == 2) keep_name = 0; // --int-rname else if (c == 0 && long_idx == 3) mm_dbg_flag |= MM_DBG_NO_KALLOC; // --no-kalloc @@ -166,55 +168,55 @@ int main(int argc, char *argv[]) if ((opt.flag & MM_F_SPLICE) && max_intron_len > 0) opt.max_gap_ref = opt.bw = max_intron_len; - if (argc == optind) { - fprintf(stderr, "Usage: minimap2 [options] | [query.fa] [...]\n"); - 
fprintf(stderr, "Options:\n"); - fprintf(stderr, " Indexing:\n"); - fprintf(stderr, " -H use homopolymer-compressed k-mer\n"); - fprintf(stderr, " -k INT k-mer size (no larger than 28) [%d]\n", k); - fprintf(stderr, " -w INT minizer window size [{-k}*2/3]\n"); - fprintf(stderr, " -I NUM split index for every ~NUM input bases [4G]\n"); - fprintf(stderr, " -d FILE dump index to FILE []\n"); - fprintf(stderr, " Mapping:\n"); - fprintf(stderr, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%g]\n", opt.mid_occ_frac); - fprintf(stderr, " -g INT stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap); - fprintf(stderr, " -r INT bandwidth used in chaining and DP-based alignment [%d]\n", opt.bw); - fprintf(stderr, " -n INT minimal number of minimizers on a chain [%d]\n", opt.min_cnt); - fprintf(stderr, " -m INT minimal chaining score (matching bases minus log gap penalty) [%d]\n", opt.min_chain_score); -// fprintf(stderr, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); // TODO: this option is never used; might be buggy - fprintf(stderr, " -X skip self and dual mappings (for the all-vs-all mode)\n"); - fprintf(stderr, " -p FLOAT min secondary-to-primary score ratio [%g]\n", opt.pri_ratio); - fprintf(stderr, " -N INT retain at most INT secondary alignments [%d]\n", opt.best_n); - fprintf(stderr, " -G NUM max intron length (only effective following -x splice) [200k]\n"); - fprintf(stderr, " Alignment:\n"); - fprintf(stderr, " -A INT matching score [%d]\n", opt.a); - fprintf(stderr, " -B INT mismatch penalty [%d]\n", opt.b); - fprintf(stderr, " -O INT[,INT] gap open penalty [%d,%d]\n", opt.q, opt.q2); - fprintf(stderr, " -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2); - fprintf(stderr, " -z INT Z-drop score [%d]\n", opt.zdrop); - fprintf(stderr, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max); - fprintf(stderr, " -u CHAR how to find 
GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n"); - fprintf(stderr, " Input/Output:\n"); - fprintf(stderr, " -a output in the SAM format (PAF by default)\n"); - fprintf(stderr, " -Q don't output base quality in SAM\n"); - fprintf(stderr, " -R STR SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n"); - fprintf(stderr, " -c output CIGAR in PAF\n"); - fprintf(stderr, " -S output the cs tag in PAF (cs encodes both query and ref sequences)\n"); - fprintf(stderr, " -t INT number of threads [%d]\n", n_threads); - fprintf(stderr, " -K NUM minibatch size [200M]\n"); -// fprintf(stderr, " -v INT verbose level [%d]\n", mm_verbose); - fprintf(stderr, " --version show version number\n"); - fprintf(stderr, " Preset:\n"); - fprintf(stderr, " -x STR preset (recommended to be applied before other options) []\n"); - fprintf(stderr, " map10k/map-pb: -Hk19 (PacBio/ONT vs reference mapping)\n"); - fprintf(stderr, " map-ont: -k15 (slightly more sensitive than 'map10k' for ONT vs reference)\n"); - fprintf(stderr, " asm5: -k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 (asm to ref mapping; break at 5%% div.)\n"); - fprintf(stderr, " asm10: -k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10%% div.)\n"); - fprintf(stderr, " ava-pb: -Hk19 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (PacBio read overlap)\n"); - fprintf(stderr, " ava-ont: -k15 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (ONT read overlap)\n"); - fprintf(stderr, " splice: long-read spliced alignment (see minimap2.1 for details)\n"); - fprintf(stderr, "\nSee `man ./minimap2.1' for detailed description of command-line options.\n"); - return 1; + if (argc == optind || fp_help == stdout) { + fprintf(fp_help, "Usage: minimap2 [options] | [query.fa] [...]\n"); + fprintf(fp_help, "Options:\n"); + fprintf(fp_help, " Indexing:\n"); + fprintf(fp_help, " -H use homopolymer-compressed k-mer\n"); + fprintf(fp_help, " -k INT k-mer size (no larger than 28) [%d]\n", k); + 
fprintf(fp_help, " -w INT minizer window size [{-k}*2/3]\n"); + fprintf(fp_help, " -I NUM split index for every ~NUM input bases [4G]\n"); + fprintf(fp_help, " -d FILE dump index to FILE []\n"); + fprintf(fp_help, " Mapping:\n"); + fprintf(fp_help, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%g]\n", opt.mid_occ_frac); + fprintf(fp_help, " -g INT stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap); + fprintf(fp_help, " -r INT bandwidth used in chaining and DP-based alignment [%d]\n", opt.bw); + fprintf(fp_help, " -n INT minimal number of minimizers on a chain [%d]\n", opt.min_cnt); + fprintf(fp_help, " -m INT minimal chaining score (matching bases minus log gap penalty) [%d]\n", opt.min_chain_score); +// fprintf(fp_help, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); // TODO: this option is never used; might be buggy + fprintf(fp_help, " -X skip self and dual mappings (for the all-vs-all mode)\n"); + fprintf(fp_help, " -p FLOAT min secondary-to-primary score ratio [%g]\n", opt.pri_ratio); + fprintf(fp_help, " -N INT retain at most INT secondary alignments [%d]\n", opt.best_n); + fprintf(fp_help, " -G NUM max intron length (only effective following -x splice) [200k]\n"); + fprintf(fp_help, " Alignment:\n"); + fprintf(fp_help, " -A INT matching score [%d]\n", opt.a); + fprintf(fp_help, " -B INT mismatch penalty [%d]\n", opt.b); + fprintf(fp_help, " -O INT[,INT] gap open penalty [%d,%d]\n", opt.q, opt.q2); + fprintf(fp_help, " -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2); + fprintf(fp_help, " -z INT Z-drop score [%d]\n", opt.zdrop); + fprintf(fp_help, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max); + fprintf(fp_help, " -u CHAR how to find GT-AG. 
f:transcript strand, b:both strands, n:don't match GT-AG [n]\n"); + fprintf(fp_help, " Input/Output:\n"); + fprintf(fp_help, " -a output in the SAM format (PAF by default)\n"); + fprintf(fp_help, " -Q don't output base quality in SAM\n"); + fprintf(fp_help, " -R STR SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n"); + fprintf(fp_help, " -c output CIGAR in PAF\n"); + fprintf(fp_help, " -S output the cs tag in PAF (cs encodes both query and ref sequences)\n"); + fprintf(fp_help, " -t INT number of threads [%d]\n", n_threads); + fprintf(fp_help, " -K NUM minibatch size [200M]\n"); +// fprintf(fp_help, " -v INT verbose level [%d]\n", mm_verbose); + fprintf(fp_help, " --version show version number\n"); + fprintf(fp_help, " Preset:\n"); + fprintf(fp_help, " -x STR preset (recommended to be applied before other options) []\n"); + fprintf(fp_help, " map10k/map-pb: -Hk19 (PacBio/ONT vs reference mapping)\n"); + fprintf(fp_help, " map-ont: -k15 (slightly more sensitive than 'map10k' for ONT vs reference)\n"); + fprintf(fp_help, " asm5: -k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 (asm to ref mapping; break at 5%% div.)\n"); + fprintf(fp_help, " asm10: -k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10%% div.)\n"); + fprintf(fp_help, " ava-pb: -Hk19 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (PacBio read overlap)\n"); + fprintf(fp_help, " ava-ont: -k15 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (ONT read overlap)\n"); + fprintf(fp_help, " splice: long-read spliced alignment (see minimap2.1 for details)\n"); + fprintf(fp_help, "\nSee `man ./minimap2.1' for detailed description of command-line options.\n"); + return fp_help == stdout? 
0 : 1; } is_idx = mm_idx_is_idx(argv[optind]); From 993a2bb5210fca4e21a5a87d7d282c395d5b0cc1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 18 Aug 2017 15:31:15 +0800 Subject: [PATCH 27/39] r301: separate introns from deletions When an intron is adjacent to a deletion, the old code count both as introns, which lead to an inaccurate exon boundary. --- align.c | 41 +++++++++++++++++++---------------------- format.c | 20 ++++++-------------- ksw2.h | 6 ++++-- ksw2_extd2_sse.c | 4 ++-- ksw2_exts2_sse.c | 4 ++-- ksw2_extz2_sse.c | 4 ++-- main.c | 2 +- map.c | 9 +++------ mmpriv.h | 10 ++-------- 9 files changed, 41 insertions(+), 59 deletions(-) diff --git a/align.c b/align.c index 25744e9..ea7bd29 100644 --- a/align.c +++ b/align.c @@ -53,7 +53,7 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c } else if (op == 1) { score -= q + e * len, j += len; if (test_zdrop_aux(score, i, j, &max, &max_i, &max_j, e, zdrop)) return 1; - } else if (op == 2) { + } else if (op == 2 || op == 3) { score -= q + e * len, i += len; if (test_zdrop_aux(score, i, j, &max, &max_i, &max_j, e, zdrop)) return 1; } @@ -64,12 +64,11 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int q_intron) { uint32_t k, l, toff = 0, qoff = 0; - int32_t s = 0, max = 0, min_intron_len, n_gtag = 0, n_ctac = 0; - min_intron_len = mm_min_intron_len(q, e, q_intron); + int32_t s = 0, max = 0, n_gtag = 0, n_ctac = 0; if (p == 0) return; for (k = 0; k < p->n_cigar; ++k) { uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4; - if (op == 0) { + if (op == 0) { // match/mismatch for (l = 0; l < len; ++l) { int cq = qseq[qoff + l], ct = tseq[toff + l]; if (ct > 3 || cq > 3) ++p->n_ambi; @@ -79,7 +78,7 @@ static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *t else max = max > s? 
max : s; } toff += len, qoff += len, p->blen += len; - } else if (op == 1) { + } else if (op == 1) { // insertion int n_ambi = 0; for (l = 0; l < len; ++l) if (qseq[qoff + l] > 3) ++n_ambi; @@ -87,23 +86,21 @@ static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *t p->n_ambi += n_ambi, p->n_diff += len - n_ambi; s -= q + e * len; if (s < 0) s = 0; - } else if (op == 2) { + } else if (op == 2) { // deletion int n_ambi = 0; - if (len < min_intron_len) { - for (l = 0; l < len; ++l) - if (tseq[toff + l] > 3) ++n_ambi; - toff += len, p->blen += len; - p->n_ambi += n_ambi, p->n_diff += len - n_ambi; - s -= q + e * len; - if (s < 0) s = 0; - } else { // intron - uint8_t b[4]; - b[0] = tseq[toff], b[1] = tseq[toff+1]; - b[2] = tseq[toff+len-2], b[3] = tseq[toff+len-1]; - if (memcmp(b, "\2\3\0\2", 4) == 0) ++n_gtag; - else if (memcmp(b, "\1\3\0\1", 4) == 0) ++n_ctac; - toff += len, p->blen += len; - } + for (l = 0; l < len; ++l) + if (tseq[toff + l] > 3) ++n_ambi; + toff += len, p->blen += len; + p->n_ambi += n_ambi, p->n_diff += len - n_ambi; + s -= q + e * len; + if (s < 0) s = 0; + } else if (op == 3) { // intron + uint8_t b[4]; + b[0] = tseq[toff], b[1] = tseq[toff+1]; + b[2] = tseq[toff+len-2], b[3] = tseq[toff+len-1]; + if (memcmp(b, "\2\3\0\2", 4) == 0) ++n_gtag; + else if (memcmp(b, "\1\3\0\1", 4) == 0) ++n_ctac; + toff += len, p->blen += len; } } p->dp_max = max; @@ -337,7 +334,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int mm_idx_getseq(mi, rid, rs, re, tseq); mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag|KSW_EZ_APPROX_MAX, ez); if (mm_check_zdrop(qseq, tseq, ez->n_cigar, ez->cigar, mat, opt->q, opt->e, opt->zdrop)) - mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, 0, ez); + mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag, ez); if (ez->n_cigar > 0) mm_append_cigar(r, ez->n_cigar, ez->cigar); if (ez->zdropped) { // truncated by Z-drop; 
TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle. diff --git a/format.c b/format.c index 2d3af37..cf81291 100644 --- a/format.c +++ b/format.c @@ -182,7 +182,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r) } } -void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int intron_thres) +void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag) { s->l = 0; mm_sprintf_lite(s, "%s\t%d\t%d\t%d\t%c\t", t->name, t->l_seq, r->qs, r->qe, "+-"[r->rev]); @@ -196,12 +196,8 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m if (r->p && (opt_flag & MM_F_OUT_CG)) { uint32_t k; mm_sprintf_lite(s, "\tcg:Z:"); - for (k = 0; k < r->p->n_cigar; ++k) { - int op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4; - if (intron_thres > 0 && op == 2 && len >= intron_thres) - mm_sprintf_lite(s, "%dN", len); - else mm_sprintf_lite(s, "%d%c", len, "MID"[op]); - } + for (k = 0; k < r->p->n_cigar; ++k) + mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]); } if (r->p && (opt_flag & MM_F_OUT_CS)) write_cs(km, s, mi, t, r); @@ -239,7 +235,7 @@ static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp) } else str_copy(s, seq, seq + l); } -void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs, int intron_thres) +void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs) { int flag = 0; s->l = 0; @@ -258,12 +254,8 @@ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m uint32_t k, clip_len = r->rev? t->l_seq - r->qe : r->qs; int clip_char = (flag&0x800)? 
'H' : 'S'; if (clip_len) mm_sprintf_lite(s, "%d%c", clip_len, clip_char); - for (k = 0; k < r->p->n_cigar; ++k) { - int op = r->p->cigar[k]&0xf, len = r->p->cigar[k]>>4; - if (intron_thres > 0 && op == 2 && len >= intron_thres) - mm_sprintf_lite(s, "%dN", len); - else mm_sprintf_lite(s, "%d%c", len, "MID"[op]); - } + for (k = 0; k < r->p->n_cigar; ++k) + mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]); clip_len = r->rev? r->qs : t->l_seq - r->qe; if (clip_len) mm_sprintf_lite(s, "%d%c", clip_len, clip_char); } else mm_sprintf_lite(s, "*"); diff --git a/ksw2.h b/ksw2.h index 3811628..5e43970 100644 --- a/ksw2.h +++ b/ksw2.h @@ -111,7 +111,8 @@ static inline uint32_t *ksw_push_cigar(void *km, int *n_cigar, int *m_cigar, uin // bit 0-2: which type gets the max - 0 for H, 1 for E, 2 for F, 3 for \tilde{E} and 4 for \tilde{F} // bit 3/0x08: 1 if a continuation on the E state (bit 5/0x20 for a continuation on \tilde{E}) // bit 4/0x10: 1 if a continuation on the F state (bit 6/0x40 for a continuation on \tilde{F}) -static inline void ksw_backtrack(void *km, int is_rot, int is_rev, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0, int *m_cigar_, int *n_cigar_, uint32_t **cigar_) +static inline void ksw_backtrack(void *km, int is_rot, int is_rev, int with_N, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0, + int *m_cigar_, int *n_cigar_, uint32_t **cigar_) { // p[] - lower 3 bits: which type gets the max; bit int n_cigar = 0, m_cigar = *m_cigar_, i = i0, j = j0, r, state = 0; uint32_t *cigar = *cigar_, tmp; @@ -132,7 +133,8 @@ static inline void ksw_backtrack(void *km, int is_rot, int is_rev, const uint8_t if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure if (force_state >= 0) state = force_state; if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match - else if (state == 
1 || state == 3) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion + else if (state == 1 || (state == 3 && !with_N)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion + else if (state == 3 && with_N) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 3, 1), --i; // intron else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion } if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, i + 1); // first deletion diff --git a/ksw2_extd2_sse.c b/ksw2_extd2_sse.c index c7f5843..56cd8cd 100644 --- a/ksw2_extd2_sse.c +++ b/ksw2_extd2_sse.c @@ -365,9 +365,9 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin if (with_cigar) { // backtrack int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR); if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY)) - ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar); + ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar); else if (ez->max_t >= 0 && ez->max_q >= 0) - ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar); + ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar); kfree(km, mem2); kfree(km, off); } } diff --git a/ksw2_exts2_sse.c b/ksw2_exts2_sse.c index 0694fb3..6decd2d 100644 --- a/ksw2_exts2_sse.c +++ b/ksw2_exts2_sse.c @@ -348,9 +348,9 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin if (with_cigar) { // backtrack int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR); if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY)) - ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar); + ksw_backtrack(km, 1, rev_cigar, 1, 
(uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar); else if (ez->max_t >= 0 && ez->max_q >= 0) - ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar); + ksw_backtrack(km, 1, rev_cigar, 1, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar); kfree(km, mem2); kfree(km, off); } } diff --git a/ksw2_extz2_sse.c b/ksw2_extz2_sse.c index 04669b9..f21f184 100644 --- a/ksw2_extz2_sse.c +++ b/ksw2_extz2_sse.c @@ -278,9 +278,9 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin if (with_cigar) { // backtrack int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR); if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY)) - ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar); + ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar); else if (ez->max_t >= 0 && ez->max_q >= 0) - ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar); + ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar); kfree(km, mem2); kfree(km, off); } } diff --git a/main.c b/main.c index d526a44..0735b91 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r297-dirty" +#define MM_VERSION "2.0-r301-dirty" void liftrlimit() { diff --git a/map.c b/map.c index 630c78a..5af4936 100644 --- a/map.c +++ b/map.c @@ -346,24 +346,21 @@ static void *worker_pipeline(void *shared, int step, void *in) void *km = 0; step_t *s = (step_t*)in; const mm_idx_t *mi = p->mi; - int intron_thres = -1; for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]); free(s->buf); - if 
(p->opt->flag & MM_F_SPLICE) - intron_thres = mm_min_intron_len(p->opt->q, p->opt->e, p->opt->q2); if ((p->opt->flag & MM_F_OUT_CS) && !(mm_dbg_flag & MM_DBG_NO_KALLOC)) km = km_init(); for (i = 0; i < s->n_seq; ++i) { mm_bseq1_t *t = &s->seq[i]; for (j = 0; j < s->n_reg[i]; ++j) { mm_reg1_t *r = &s->reg[i][j]; if (p->opt->flag & MM_F_OUT_SAM) - mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i], intron_thres); + mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i]); else - mm_write_paf(&p->str, mi, t, r, km, p->opt->flag, intron_thres); + mm_write_paf(&p->str, mi, t, r, km, p->opt->flag); puts(p->str.s); } if (s->n_reg[i] == 0 && (p->opt->flag & MM_F_OUT_SAM)) { - mm_write_sam(&p->str, 0, t, 0, 0, 0, 0); + mm_write_sam(&p->str, 0, t, 0, 0, 0); puts(p->str.s); } for (j = 0; j < s->n_reg[i]; ++j) free(s->reg[i][j].p); diff --git a/mmpriv.h b/mmpriv.h index becf5e4..874f9e2 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -42,8 +42,8 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); void mm_set_rg(const char *s); void sam_write_sam_header(const mm_idx_t *idx); -void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int intron_thres); -void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs, int intron_thres); +void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag); +void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs); int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km); mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a); @@ -62,10 +62,4 @@ void mm_set_mapq(int n_regs, mm_reg1_t 
*regs, int min_chain_sc); } #endif -static inline int32_t mm_min_intron_len(int32_t q, int32_t e, int32_t q_intron) -{ - int32_t x = q_intron > q? (int)((float)(q_intron - q) / e + .999) : INT32_MAX; - return x > 4? x : 4; -} - #endif From 8058c85b72582a35f2b5868fc56f1af15d5bd6ba Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 18 Aug 2017 23:33:57 +0800 Subject: [PATCH 28/39] moved appendix to methods; detailed splice aln --- tex/minimap2.bib | 16 +++ tex/minimap2.tex | 311 +++++++++++++++++++++++++---------------------- 2 files changed, 180 insertions(+), 147 deletions(-) diff --git a/tex/minimap2.bib b/tex/minimap2.bib index 75ace65..3ca3e85 100644 --- a/tex/minimap2.bib +++ b/tex/minimap2.bib @@ -179,3 +179,19 @@ note = {doi:10.1101/130633}, publisher = {Cold Spring Harbor Labs Journals}, journal = {bioRxiv}} + +@article{Gotoh:1982aa, + Author = {Gotoh, O}, + Journal = {J Mol Biol}, + Pages = {705-8}, + Title = {An improved algorithm for matching biological sequences}, + Volume = {162}, + Year = {1982}} + +@article{Altschul:1986aa, + Author = {Altschul, S F and Erickson, B W}, + Journal = {Bull Math Biol}, + Pages = {603-16}, + Title = {Optimal sequence alignment using affine gap costs}, + Volume = {48}, + Year = {1986}} diff --git a/tex/minimap2.tex b/tex/minimap2.tex index 57f8a37..f66aa58 100644 --- a/tex/minimap2.tex +++ b/tex/minimap2.tex @@ -65,21 +65,21 @@ and the ability to produce detailed alignment. \subsection{Chaining} -%\subsubsection{Chaining} +\subsubsection{Chaining} An \emph{anchor} is a 3-tuple $(x,y,w)$, indicating interval $[x-w+1,x]$ on the reference matching interval $[y-w+1,y]$ on the query. Given a list of anchors sorted by ending reference position $x$, let $f(i)$ be the maximal chaining score up to the $i$-th anchor in the list. 
$f(i)$ can be calculated with dynamic programming (DP): \begin{equation}\label{eq:chain} -f(i)=\max\big\{\max_{i>j\ge 1} \{ f(j)+d(j,i)-\beta_c(j,i) \},w_i\big\} +f(i)=\max\big\{\max_{i>j\ge 1} \{ f(j)+\alpha(j,i)-\beta(j,i) \},w_i\big\} \end{equation} -where $d(j,i)=\min\big\{\min\{y_i-y_j,x_i-x_j\},w_i\big\}$ is the number of -matching bases between the two anchors. $\beta_c(j,i)>0$ is the gap cost. It +where $\alpha(j,i)=\min\big\{\min\{y_i-y_j,x_i-x_j\},w_i\big\}$ is the number of +matching bases between the two anchors. $\beta(j,i)>0$ is the gap cost. It equals $\infty$ if $y_j\ge y_i$ or $\max\{y_i-y_j,x_i-x_j\}>G$ (i.e. the distance between two anchors is too large); otherwise \begin{equation}\label{eq:chain-gap} -\beta_c(j,i)=\gamma_c\big((y_i-y_j)-(x_i-x_j)\big) +\beta(j,i)=\gamma_c\big((y_i-y_j)-(x_i-x_j)\big) \end{equation} In implementation, a gap of length $l$ costs $\gamma_c(l)=0.01\cdot \bar{w}\cdot |l|+0.5\log_2|l|$, where $\bar{w}$ is the average seed length. For $m$ anchors, directly computing all $f(\cdot)$ with @@ -96,15 +96,15 @@ score after up to $h$ iterations. This approach reduces the average time to $O(h\cdot m)$. In practice, we can almost always find the optimal chain with $h=50$; even if the heuristic fails, the optimal chain is often close. -%\subsubsection{Backtracking} -For backtracking, let $P(i)$ be the index of the best predecessor of anchor -$i$. It equals 0 if $f(i)=w_i$ or $\argmax_j\{f(j)+\eta(j,i)-\gamma(j,i)\}$ -otherwise. For each anchor $i$ in the descending order of $f(i)$, we apply -$P(\cdot)$ repeatedly to find its predecessor and mark each visited $i$ as -`used', until $P(i)=0$ or we reach an already `used' $i$. This way we find all -chains with no anchors used in more than one chains. +\subsubsection{Backtracking} +Let $P(i)$ be the index of the best predecessor of anchor $i$. It equals 0 if +$f(i)=w_i$ or $\argmax_j\{f(j)+\eta(j,i)-\gamma(j,i)\}$ otherwise. 
For each +anchor $i$ in the descending order of $f(i)$, we apply $P(\cdot)$ repeatedly to +find its predecessor and mark each visited $i$ as `used', until $P(i)=0$ or we +reach an already `used' $i$. This way we find all chains with no anchors used +in more than one chains. -%\subsubsection{Identifying primary chains} +\subsubsection{Identifying primary chains} In the absence of copy number changes, each query segment should not be mapped to two places in the reference. However, chains found at the previous step may have significant or complete overlaps due to repeats in the reference. @@ -117,32 +117,134 @@ add the chain to $Q$. In the end, $Q$ contains all the primary chains. We did not choose a more sophisticated data structure (e.g. range tree or k-d tree) because this step is not the performance bottleneck. -\subsection{Alignment} +\subsection{Aligning genomic DNA} -Minimap2 performs global alignment between adjacent anchors in a chain. It -adopted difference-based formulation to derive -alignment~\citep{Wu:1996aa,Suzuki:2016}. When combined with SSE vectorization, -this formulation has two advantages. First, because each score in the DP matrix -is bounded by the gap cost and the maximum matching score, we can usually -achieve 16-way SSE vectorization regardless of the peak score of the -alignment. Second, filling the DP matrix along the diagonal, we can simplify -banded alignment, which is critical to performance. In practice, our -implementation is three times as fast as Parasail's 4-way -vectorization~\citep{Daily:2016aa} for global alignment. -Without banding, our implementation is slower than Edlib~\citep{Sosic:2017aa}, -but with a 1000bp band, it is considerably faster. When performing global -alignment between anchors, we expect the alignment to stay close to the -diagonal of the DP matrix. Banding is often applicable. 
+\subsubsection{Alignment with 2-piece affine gap cost} -Minimap2 uses a 2-piece affine gap cost~\citep{Gotoh:1990aa}: +Minimap2 performs DP-based global alignment between adjacent anchors in a +chain. It uses a 2-piece affine gap cost~\citep{Gotoh:1990aa}: \begin{equation}\label{eq:2-piece} \gamma_a(l)=\min\{q+|l|\cdot e,\tilde{q}+|l|\cdot\tilde{e}\} \end{equation} -On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, this -cost function is concave. It applies cost $q+|l|\cdot e$ to gaps shorter than -$\lceil(\tilde{q}-q)/(e-\tilde{e})\rceil$ and applies -$\tilde{q}+|l|\cdot\tilde{e}$ to longer gaps. This scheme helps to recover -longer insertions and deletions~(INDEL). +Without losing generality, we always assume $q+e<\tilde{q}+\tilde{e}$. +On the condition that $e>\tilde{e}$, it applies cost $q+|l|\cdot e$ to gaps +shorter than $\lceil(\tilde{q}-q)/(e-\tilde{e})\rceil$ and applies +$\tilde{q}+|l|\cdot\tilde{e}$ to longer gaps. This scheme helps to recover +longer insertions and deletions~(INDELs). + +The equation to compute the optimal alignment under $\gamma_a(\cdot)$ is +\begin{equation}\label{eq:ae86} +\left\{\begin{array}{l} +H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij},\tilde{F}_{ij}\}\\ +E_{i+1,j}= \max\{H_{ij}-q,E_{ij}\}-e\\ +F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\ +\tilde{E}_{i+1,j}= \max\{H_{ij}-\tilde{q},\tilde{E}_{ij}\}-\tilde{e}\\ +\tilde{F}_{i,j+1}= \max\{H_{ij}-\tilde{q},\tilde{F}_{ij}\}-\tilde{e} +\end{array}\right. +\end{equation} +where $s(i,j)$ is the score between the $i$-th reference base and $j$-th query +base. It is a natural extension to the algorithm under affine gap +cost~\citep{Gotoh:1982aa,Altschul:1986aa}. + +\subsubsection{Suzuki's formulation} + +To efficiently align long sequences, minimap2 did not directly use +Eq.~(\ref{eq:ae86}) for alignment. It instead adopted a difference-based +formulation first proposed by \citet{Wu:1996aa} and later adapted by +\citet{Suzuki:2016} for affine gap cost. 
In case of 2-piece affine gap cost in +Eq.~(\ref{eq:2-piece}), define +\[ +\left\{\begin{array}{ll} +u_{ij}\triangleq H_{ij}-H_{i-1,j} & v_{ij}\triangleq H_{ij}-H_{i,j-1} \\ +x_{ij}\triangleq E_{i+1,j}-H_{ij} & \tilde{x}_{ij}\triangleq \tilde{E}_{i+1,j}-\tilde{H}_{ij} \\ +y_{ij}\triangleq F_{i,j+1}-H_{ij} & \tilde{y}_{ij}\triangleq \tilde{F}_{i,j+1}-\tilde{H}_{ij} +\end{array}\right. +\] +We can transform Eq.~(\ref{eq:ae86}) to +\begin{equation}\label{eq:suzuki} +\left\{\begin{array}{lll} +z_{ij}&=&\max\{s(i,j),x_{i-1,j}+v_{i-1,j},y_{i,j-1}+u_{i,j-1},\\ +&&\tilde{x}_{i-1,j}+v_{i-1,j},\tilde{y}_{i,j-1}+u_{i,j-1}\}\\ +u_{ij}&=&z_{ij}-v_{i-1,j}\\ +v_{ij}&=&z_{ij}-u_{i,j-1}\\ +x_{ij}&=&\max\{0,x_{i-1,j}+v_{i-1,j}-z_{ij}+q\}-q-e\\ +y_{ij}&=&\max\{0,y_{i,j-1}+u_{i,j-1}-z_{ij}+q\}-q-e\\ +\tilde{x}_{ij}&=&\max\{0,\tilde{x}_{i-1,j}+v_{i-1,j}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e}\\ +\tilde{y}_{ij}&=&\max\{0,\tilde{y}_{i,j-1}+u_{i,j-1}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e} +\end{array}\right. +\end{equation} +where $z_{ij}$ is a temporary variable that does not need to be stored. +This is Suzuki's formulation for 2-piece affine gap cost. An important property +of this formulation is that all values are bounded. To see that, +\[ +x_{ij}=E_{i+1,j}-H_{ij}=\max\{-q,E_{ij}-H_{ij}\}-e +\] +With $E_{ij}\le H_{ij}$, we have +\[ +-q-e\le x_{ij}\le\max\{-q,0\}-e=-e +\] +and similar inequations for $y_{ij}$, $\tilde{x}_{ij}$ and $\tilde{y}_{ij}$. +In addition, +\[ +u_{ij}=z_{ij}-v_{i-1,j}\ge\max\{x_{i-1,j},\tilde{x}_{i-1,j}\}\ge-q-e +\] +As the maximum value of $z_{ij}=H_{ij}-H_{i-1,j-1}$ is $M$, the maximal +matching score, we can derive +\[ +u_{ij}\le M-v_{i-1,j}\le M+q+e +\] +In conclusion, in Eq.~(\ref{eq:suzuki}), $x$ and $y$ are bounded by $[-q-e,-e]$, +$\tilde{x}$ and $\tilde{y}$ by $[-\tilde{q}-\tilde{e},-\tilde{e}]$, and $u$ and +$v$ by $[-q-e,M+q+e]$. 
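When $-128\le-q-e<M+q+e\le127$, each of them can be stored as a 8-bit +integer. This enables 16-way SSE vectorization regardless of the peak score of +the alignment. + +For a more efficient SSE implementation, we transform the row-column coordinate +to diagonal-anti-diagonal coordinate by letting $r\gets i+j$ and $t\gets i$. +Eq.~(\ref{eq:suzuki}) becomes: +\begin{equation*} +\left\{\begin{array}{lll} +z_{rt}&=&\max\{s(t,r-t),x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}+u_{r-1,t},\\ +&&\tilde{x}_{r-1,t-1}+v_{r-1,t-1},\tilde{y}_{r-1,t}+u_{r-1,t}\}\\ +u_{rt}&=&z_{rt}-v_{r-1,t-1}\\ +v_{rt}&=&z_{rt}-u_{r-1,t}\\ +x_{rt}&=&\max\{0,x_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+q\}-q-e\\ +y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\}-q-e\\ +\tilde{x}_{rt}&=&\max\{0,\tilde{x}_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+\tilde{q}\}-\tilde{q}-\tilde{e}\\ +\tilde{y}_{rt}&=&\max\{0,\tilde{y}_{r-1,t}+u_{r-1,t}-z_{rt}+\tilde{q}\}-\tilde{q}-\tilde{e} +\end{array}\right. +\end{equation*} +In this formulation, cells with the same diagonal index $r$ are independent of +each other. This allows us to fully vectorize the computation of all cells on +the same anti-diagonal in one inner loop. + +On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the boundary +condition of this equation in the diagonal-anti-diagonal coordinate is +\[ +\left\{\begin{array}{l} +x_{r-1,-1}=y_{r-1,r}=-q-e\\ +\tilde{x}_{r-1,-1}=\tilde{y}_{r-1,r}=-\tilde{q}-\tilde{e}\\ +u_{r-1,r}=v_{r-1,-1}=\eta(r)\\ +\end{array}\right. +\] +where +\[ +\eta(r)=\left\{\begin{array}{ll} +-q-e & (r=0) \\ +-e & (r<\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\ +r\cdot(e-\tilde{e})-(\tilde{q}-q)-\tilde{e} & (r=\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\ +-\tilde{e} & (r>\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) +\end{array}\right. +\] + +In practice, our 16-way vectorized implementation of global alignment is three +times as fast as Parasail's 4-way vectorization~\citep{Daily:2016aa}. Without +banding, our implementation is slower than Edlib~\citep{Sosic:2017aa}, but with +a 1000bp band, it is considerably faster. When performing global alignment +between anchors, we expect the alignment to stay close to the diagonal of the +DP matrix. Banding is applicable most of time. + +\subsubsection{The Z-drop heuristic} With global alignment, minimap2 may force to align unrelated sequences between two adjacent anchors. To avoid such an artifact, we compute accumulative @@ -164,7 +266,7 @@ alignment between the two subsequences involved in the global alignment, but this time with the one subsequence reverse complemented. This additional alignment step may identify short inversions that are missed during chaining. -\subsection{Spliced alignment} +\subsection{Aligning spliced sequences} The algorithm described above can be adapted to spliced alignment. In this mode, the chaining gap cost distinguishes insertions to and deletions from the @@ -183,10 +285,33 @@ q+l\cdot e & (l>0) \\ \end{array}\right. \] In alignment, a deletion no shorter than $\lceil(\tilde{q}-q)/e\rceil$ is -regarded as an intron, which pays no cost to gap extensions. 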
With these -modifications, minimap2 can retain multiple reasonably long introns in one -alignment, rather than fragment the alignment into local hits, which often -leads to the loss of small exons especially given noisy reads. +regarded as an intron, which pays no cost to gap extensions. + +To pinpoint precise exon boundaries, minimap2 penalizes non-canonical splicing +with the following equation +\begin{equation}\label{eq:splice} +\left\{\begin{array}{l} +H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij}-a_i\}\\ +E_{i+1,j}= \max\{H_{ij}-q,E_{ij}\}-e\\ +F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\ +\tilde{E}_{i+1,j}= \max\{H_{ij}-d_i-\tilde{q},\tilde{E}_{ij}\}\\ +\end{array}\right. +\end{equation} +Let $T$ be the reference sequence. $d_i$ is the cost of a non-canonical donor +site, which takes 0 if $T[i+1,i+2]={\tt GT}$, or a postive number $p$ +otherwise. Similarly, $a_i$ is the cost of a non-canonical acceptor site, which +takes 0 if $T[i-1,i]={\tt AG}$, or $p$ otherwise. When the read strand relative +to the underlying transcript is unknown, minimap2 aligns each chain twice, first +assuming ${\tt GT}$--${\tt AG}$ as the splice signal and then assuming ${\tt +CT}$--${\tt AC}$, the reverse complement of ${\tt GT}$--${\tt AG}$, as the +splice signal. The alignment with a higher score is taken as the final +alignment. This procedure also infers the relative strand of reads spanning +canonical splicing sites. + +In the spliced alignment mode, minimap2 further increases the density of +minimizers and disables banded alignment. Together with the two-round DP-based +alignment, spliced alignment is several times slower than DNA sequence +alignment. \end{methods} @@ -258,112 +383,4 @@ issues. \bibliography{minimap2} -\appendix - -\begin{methods} -\section*{Appendix} -A 2-piece gap cost function is -\[ -\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\} -\] -Without losing generality, we assume $q+e\le\tilde{q}+\tilde{e}$. 
The equation -to compute the optimal alignment under such a gap cost is~\citep{Gotoh:1990aa} -\begin{equation}\label{eq:ae86} -\left\{\begin{array}{l} -H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij},\tilde{F}_{ij}\}\\ -E_{i+1,j}= \max\{H_{ij}-q,E_{ij}\}-e\\ -F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\ -\tilde{E}_{i+1,j}= \max\{H_{ij}-\tilde{q},\tilde{E}_{ij}\}-\tilde{e}\\ -\tilde{F}_{i,j+1}= \max\{H_{ij}-\tilde{q},\tilde{F}_{ij}\}-\tilde{e} -\end{array}\right. -\end{equation} -where $s(i,j)$ is the score between the $i$-th reference base and $j$-th query -base. If we define -\[ -\left\{\begin{array}{ll} -u_{ij}\triangleq H_{ij}-H_{i-1,j} & v_{ij}\triangleq H_{ij}-H_{i,j-1} \\ -x_{ij}\triangleq E_{i+1,j}-H_{ij} & \tilde{x}_{ij}\triangleq \tilde{E}_{i+1,j}-\tilde{H}_{ij} \\ -y_{ij}\triangleq F_{i,j+1}-H_{ij} & \tilde{y}_{ij}\triangleq \tilde{F}_{i,j+1}-\tilde{H}_{ij} -\end{array}\right. -\] -we can transform Eq.~(\ref{eq:ae86}) to -\begin{equation}\label{eq:suzuki} -\left\{\begin{array}{lll} -z_{ij}&=&\max\{s(i,j),x_{i-1,j}+v_{i-1,j},y_{i,j-1}+u_{i,j-1},\\ -&&\tilde{x}_{i-1,j}+v_{i-1,j},\tilde{y}_{i,j-1}+u_{i,j-1}\}\\ -u_{ij}&=&z_{ij}-v_{i-1,j}\\ -v_{ij}&=&z_{ij}-u_{i,j-1}\\ -x_{ij}&=&\max\{0,x_{i-1,j}+v_{i-1,j}-z_{ij}+q\}-q-e\\ -y_{ij}&=&\max\{0,y_{i,j-1}+u_{i,j-1}-z_{ij}+q\}-q-e\\ -\tilde{x}_{ij}&=&\max\{0,\tilde{x}_{i-1,j}+v_{i-1,j}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e}\\ -\tilde{y}_{ij}&=&\max\{0,\tilde{y}_{i,j-1}+u_{i,j-1}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e} -\end{array}\right. -\end{equation} -where $z_{ij}$ is a temporary variable that does not need to be stored. - -All values in Eq.~(\ref{eq:suzuki}) are bounded. To see that, -\[ -x_{ij}=E_{i+1,j}-H_{ij}=\max\{-q,E_{ij}-H_{ij}\}-e -\] -With $E_{ij}\le H_{ij}$, we have -\[ --q-e\le x_{ij}\le\max\{-q,0\}-e=-e -\] -and similar inequations for $y_{ij}$, $\tilde{x}_{ij}$ and $\tilde{y}_{ij}$. 
-In addition, -\[ -u_{ij}=z_{ij}-v_{i-1,j}\ge\max\{x_{i-1,j},\tilde{x}_{i-1,j}\}\ge-q-e -\] -As the maximum value of $z_{ij}=H_{ij}-H_{i-1,j-1}$ is $M$, the maximal -matching score, we can derive -\[ -u_{ij}\le M-v_{i-1,j}\le M+q+e -\] -In conclusion, $x$ and $y$ are bounded by $[-q-e,-e]$, $\tilde{x}$ and $\tilde{y}$ by -$[-\tilde{q}-\tilde{e},-\tilde{e}]$, and $u$ and $v$ by $[-q-e,M+q+e]$. When -matching score and gap cost are small, each of them can be stored as a 8-bit -integer. This enables 16-way SSE vectorization regardless of the peak score of -the alignment. - -For a more efficient SSE implementation, we transform the row-column coordinate -to diagonal-anti-diagonal coordinate by letting $r\gets i+j$ and $t\gets i$. -Eq.~(\ref{eq:suzuki}) becomes: -\begin{equation*} -\left\{\begin{array}{lll} -z_{rt}&=&\max\{s(t,r-t),x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}+u_{r-1,t},\\ -&&\tilde{x}_{r-1,t-1}+v_{r-1,t-1},\tilde{y}_{r-1,t}+u_{r-1,t}\}\\ -u_{rt}&=&z_{rt}-v_{r-1,t-1}\\ -v_{rt}&=&z_{rt}-u_{r-1,t}\\ -x_{rt}&=&\max\{0,x_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+q\}-q-e\\ -y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\}-q-e\\ -\tilde{x}_{rt}&=&\max\{0,\tilde{x}_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+\tilde{q}\}-\tilde{q}-\tilde{e}\\ -\tilde{y}_{rt}&=&\max\{0,\tilde{y}_{r-1,t}+u_{r-1,t}-z_{rt}+\tilde{q}\}-\tilde{q}-\tilde{e} -\end{array}\right. -\end{equation*} -In this formulation, cells with the same diagonal index $r$ are independent of -each other. This allows us to fully vectorize the computation of all cells on -the same anti-diagonal in one inner loop. - -On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the boundary -condition of this equation in the diagonal-anti-diagonal coordinate is -\[ -\left\{\begin{array}{l} -x_{r-1,-1}=y_{r-1,r}=-q-e\\ -\tilde{x}_{r-1,-1}=\tilde{y}_{r-1,r}=-\tilde{q}-\tilde{e}\\ -u_{r-1,r}=v_{r-1,-1}=\eta(r)\\ -\end{array}\right. 
-\] -where -\[ -\eta(r)=\left\{\begin{array}{ll} --q-e & (r=0) \\ --e & (r<\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\ -r\cdot(e-\tilde{e})-(\tilde{q}-q)-\tilde{e} & (r=\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\ --\tilde{e} & (r>\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) -\end{array}\right. -\] - -\citet{Suzuki:2016} first derived a similar set of equations under affine gap -cost but with different notations. -\end{methods} \end{document} From 5ed2faf2701242bda1e896f374763fb035c12526 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 19 Aug 2017 16:21:49 +0800 Subject: [PATCH 29/39] minor tuning to the matching rules --- misc/exon-eval.js | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/misc/exon-eval.js b/misc/exon-eval.js index e6a9871..eb963e4 100644 --- a/misc/exon-eval.js +++ b/misc/exon-eval.js @@ -123,7 +123,7 @@ Interval.find_ovlp = function(a, st, en) * Main function * *****************/ -var c, l_fuzzy = 10, min_ov_ratio = 0.95, print_ovlp = false, print_err_only = false, first_only = false; +var c, l_fuzzy = 0, min_ov_ratio = 0.95, print_ovlp = false, print_err_only = false, first_only = false; while ((c = getopt(arguments, "l:r:ep1")) != null) { if (c == 'l') l_fuzzy = parseInt(getopt.arg); else if (c == 'r') min_ov_ratio = parseFloat(getopt.arg); @@ -168,7 +168,7 @@ var n_ext_hit = 0, n_int_hit = 0, n_sgl_hit = 0; file = new File(arguments[getopt.ind+1]); var last_qname = null; -var re_cigar = /(\d+)([MIDNSH])/g; +var re_cigar = /(\d+)([MIDNSHX=])/g; while (file.readline(buf) >= 0) { var m, t = buf.toString().split("\t"); if (t[0].charAt(0) == '@') continue; @@ -190,7 +190,7 @@ while (file.readline(buf) >= 0) { exon.push([exon_st, en]); en += len; exon_st = en; - } else if (op == 'M' || op == 'D') en += len; + } else if (op == 'M' || op == 'X' || op == '=' || op == 'D') en += len; } exon.push([exon_st, en]); n_exon += exon.length; @@ -212,22 +212,19 @@ while (file.readline(buf) >= 
0) { var l1 = o[j][1] - o[j][0]; var min = l0 < l1? l0 : l1; var ov_ratio = ol / min; - if (ov_ratio >= min_ov_ratio) { - var st_diff = exon[i][0] - o[j][0]; - var en_diff = exon[i][1] - o[j][1]; - if (st_diff < 0) st_diff = -st_diff; - if (en_diff < 0) en_diff = -en_diff; - if (i == 0 && exon.length == 1) { - ++n_sgl_hit, hit = true; - } else if (i == 0) { - if (en_diff <= l_fuzzy) ++n_ext_hit, hit = true; - } else if (i == exon.length - 1) { - if (st_diff <= l_fuzzy) ++n_ext_hit, hit = true; - } else { - //if (en_diff <= l_fuzzy && st_diff <= l_fuzzy && ol / span >= min_ov_ratio) - if (en_diff + st_diff <= l_fuzzy || ol / span >= min_ov_ratio) - ++n_int_hit, hit = true; - } + var st_diff = exon[i][0] - o[j][0]; + var en_diff = exon[i][1] - o[j][1]; + if (st_diff < 0) st_diff = -st_diff; + if (en_diff < 0) en_diff = -en_diff; + if (i == 0 && exon.length == 1) { + if (ov_ratio >= min_ov_ratio) ++n_sgl_hit, hit = true; + } else if (i == 0) { + if (en_diff <= l_fuzzy) ++n_ext_hit, hit = true; + } else if (i == exon.length - 1) { + if (st_diff <= l_fuzzy) ++n_ext_hit, hit = true; + } else { + if (en_diff + st_diff <= l_fuzzy) + ++n_int_hit, hit = true; } if (hit) break; } From a4c41f64a518eb8655e408b25469bc5860879525 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 20 Aug 2017 21:54:35 +0800 Subject: [PATCH 30/39] the final version to be replaced by intron-eval --- misc/exon-eval.js | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/misc/exon-eval.js b/misc/exon-eval.js index eb963e4..2deb61c 100644 --- a/misc/exon-eval.js +++ b/misc/exon-eval.js @@ -123,10 +123,11 @@ Interval.find_ovlp = function(a, st, en) * Main function * *****************/ -var c, l_fuzzy = 0, min_ov_ratio = 0.95, print_ovlp = false, print_err_only = false, first_only = false; -while ((c = getopt(arguments, "l:r:ep1")) != null) { +var c, l_fuzzy = 0, min_ov_ratio = 0.95, min_span_ratio = 0.9, print_ovlp = false, print_err_only = false, first_only = 
false; +while ((c = getopt(arguments, "l:r:s:ep1")) != null) { if (c == 'l') l_fuzzy = parseInt(getopt.arg); else if (c == 'r') min_ov_ratio = parseFloat(getopt.arg); + else if (c == 's') min_span_ratio = parseFloat(getopt.arg); else if (c == 'p') print_ovlp = true; else if (c == 'e') print_err_only = print_ovlp = true; else if (c == '1') first_only = true; @@ -163,7 +164,7 @@ for (var chr in anno) { Interval.index_end(e); } -var n_novel = 0, n_partial = 0, n_unmapped = 0, n_mapped = 0, n_exon = 0, n_pri = 0; +var n_novel = 0, n_int_novel = 0, n_ext_novel = 0, n_sgl_novel = 0, n_partial = 0, n_unmapped = 0, n_mapped = 0, n_exon = 0, n_int_exon = 0, n_pri = 0, n_sgl_exon = 0, n_ext_exon = 0; var n_ext_hit = 0, n_int_hit = 0, n_sgl_hit = 0; file = new File(arguments[getopt.ind+1]); @@ -194,9 +195,15 @@ while (file.readline(buf) >= 0) { } exon.push([exon_st, en]); n_exon += exon.length; + n_int_exon += exon.length > 2? exon.length - 2 : 0; + n_sgl_exon += exon.length == 1? 1 : 0; + n_ext_exon += exon.length > 1? 2 : 0; var chr = anno[t[2]]; if (chr == null) { n_novel += exon.length; + n_int_novel += exon.length > 2? exon.length - 2 : 0; + n_sgl_novel += exon.length == 1? 1 : 0; + n_ext_novel += exon.length > 1? 
2 : 0; } else { for (var i = 0; i < exon.length; ++i) { var o = Interval.find_ovlp(chr, exon[i][0], exon[i][1]); @@ -219,11 +226,11 @@ while (file.readline(buf) >= 0) { if (i == 0 && exon.length == 1) { if (ov_ratio >= min_ov_ratio) ++n_sgl_hit, hit = true; } else if (i == 0) { - if (en_diff <= l_fuzzy) ++n_ext_hit, hit = true; + if (ov_ratio >= min_ov_ratio && en_diff <= l_fuzzy) ++n_ext_hit, hit = true; } else if (i == exon.length - 1) { - if (st_diff <= l_fuzzy) ++n_ext_hit, hit = true; + if (ov_ratio >= min_ov_ratio && st_diff <= l_fuzzy) ++n_ext_hit, hit = true; } else { - if (en_diff + st_diff <= l_fuzzy) + if (en_diff + st_diff <= l_fuzzy && ol / span >= min_span_ratio) ++n_int_hit, hit = true; } if (hit) break; @@ -241,6 +248,9 @@ while (file.readline(buf) >= 0) { } } else { ++n_novel; + if (i > 0 && i < exon.length - 1) ++n_int_novel; + if (exon.length > 1 && (i == 0 || i == exon.length - 1)) ++n_ext_novel; + if (exon.length == 1) ++n_sgl_novel; if (print_ovlp) print('N', t[0], i+1, t[2], exon[i][0], exon[i][1]); } @@ -259,4 +269,7 @@ if (!print_ovlp) { print("Number of mapped exons: " + n_exon); print("Number of novel exons: " + n_novel); print("Number of correct exons: " + (n_ext_hit + n_int_hit + n_sgl_hit) + " (" + ((n_ext_hit + n_int_hit + n_sgl_hit) / n_exon * 100).toFixed(2) + "%)"); + print("Internal exons: " + n_int_novel + ", " + n_int_hit + " / " + n_int_exon + " = " + (n_int_hit / n_int_exon * 100).toFixed(2) + "%"); + print("External exons: " + n_ext_novel + ", " + n_ext_hit + " / " + n_ext_exon + " = " + (n_ext_hit / n_ext_exon * 100).toFixed(2) + "%"); + print("Singleton exons: " + n_sgl_novel + ", " + n_sgl_hit + " / " + n_sgl_exon + " = " + (n_sgl_hit / n_sgl_exon * 100).toFixed(2) + "%"); } From ce859dbe1c138a3a371c1daaf9808fc2c8a1a475 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 20 Aug 2017 21:54:59 +0800 Subject: [PATCH 31/39] evaluate introns this simplies the metrics --- misc/{exon-eval.js => intron-eval.js} | 125 
++++++++++++-------------- 1 file changed, 58 insertions(+), 67 deletions(-) rename misc/{exon-eval.js => intron-eval.js} (61%) diff --git a/misc/exon-eval.js b/misc/intron-eval.js similarity index 61% rename from misc/exon-eval.js rename to misc/intron-eval.js index 2deb61c..3505355 100644 --- a/misc/exon-eval.js +++ b/misc/intron-eval.js @@ -123,24 +123,21 @@ Interval.find_ovlp = function(a, st, en) * Main function * *****************/ -var c, l_fuzzy = 0, min_ov_ratio = 0.95, min_span_ratio = 0.9, print_ovlp = false, print_err_only = false, first_only = false; -while ((c = getopt(arguments, "l:r:s:ep1")) != null) { +var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false; +while ((c = getopt(arguments, "l:ep")) != null) { if (c == 'l') l_fuzzy = parseInt(getopt.arg); - else if (c == 'r') min_ov_ratio = parseFloat(getopt.arg); - else if (c == 's') min_span_ratio = parseFloat(getopt.arg); - else if (c == 'p') print_ovlp = true; else if (c == 'e') print_err_only = print_ovlp = true; - else if (c == '1') first_only = true; + else if (c == 'p') print_ovlp = true; } if (arguments.length - getopt.ind < 2) { - print("Usage: k8 cdna-eval.js [options] <gene.gtf> <aln.sam>"); + print("Usage: k8 intron-eval.js [options] <gene.gtf> <aln.sam>"); exit(1); } var file, buf = new Bytes(); -var anno = {}; +var tr = {}; file = new File(arguments[getopt.ind]); while (file.readline(buf) >= 0) { var m, t = buf.toString().split("\t"); @@ -148,13 +145,31 @@ while (file.readline(buf) >= 0) { if (t[2] != 'exon') continue; var st = parseInt(t[3]) - 1; var en = parseInt(t[4]); - if (anno[t[0]] == null) anno[t[0]] = []; - anno[t[0]].push([st, en]); + if ((m = /transcript_id "(\S+)"/.exec(t[8])) == null) continue; + var tid = m[1]; + if (tr[tid] == null) tr[tid] = [t[0], t[6], 0, 0, []]; + tr[tid][4].push([st, en]); } file.close(); +var anno = {}; +for (var tid in tr) { + var t = tr[tid]; + Interval.sort(t[4]); + t[2] = t[4][0][0]; + t[3] = t[4][t[4].length - 1][1]; + if (anno[t[0]] == null) anno[t[0]] = 
[]; + var s = t[4]; + for (var i = 0; i < s.length - 1; ++i) { + if (s[i][1] >= s[i+1][0]) throw Error("ERROR: wrong annotation!"); + anno[t[0]].push([s[i][1], s[i+1][0]]); + } +} +tr = null; + for (var chr in anno) { var e = anno[chr]; + if (e.length == 0) continue; Interval.sort(e); var k = 0; for (var i = 1; i < e.length; ++i) // dedup @@ -164,14 +179,15 @@ for (var chr in anno) { Interval.index_end(e); } -var n_novel = 0, n_int_novel = 0, n_ext_novel = 0, n_sgl_novel = 0, n_partial = 0, n_unmapped = 0, n_mapped = 0, n_exon = 0, n_int_exon = 0, n_pri = 0, n_sgl_exon = 0, n_ext_exon = 0; -var n_ext_hit = 0, n_int_hit = 0, n_sgl_hit = 0; +var n_pri = 0, n_unmapped = 0, n_mapped = 0; +var n_sgl = 0, n_splice = 0, n_splice_hit = 0, n_splice_novel = 0; file = new File(arguments[getopt.ind+1]); var last_qname = null; var re_cigar = /(\d+)([MIDNSHX=])/g; while (file.readline(buf) >= 0) { var m, t = buf.toString().split("\t"); + if (t[0].charAt(0) == '@') continue; var flag = parseInt(t[1]); if (flag&0x100) continue; @@ -183,56 +199,34 @@ while (file.readline(buf) >= 0) { ++n_pri; if (last_qname != t[0]) ++n_mapped; } - var st = parseInt(t[3]) - 1, en = st, exon_st = st; - var exon = []; + + var pos = parseInt(t[3]) - 1, intron = []; while ((m = re_cigar.exec(t[5])) != null) { var len = parseInt(m[1]), op = m[2]; if (op == 'N') { - exon.push([exon_st, en]); - en += len; - exon_st = en; - } else if (op == 'M' || op == 'X' || op == '=' || op == 'D') en += len; + intron.push([pos, pos + len]); + pos += len; + } else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len; } - exon.push([exon_st, en]); - n_exon += exon.length; - n_int_exon += exon.length > 2? exon.length - 2 : 0; - n_sgl_exon += exon.length == 1? 1 : 0; - n_ext_exon += exon.length > 1? 2 : 0; + if (intron.length == 0) { + ++n_sgl; + continue; + } + n_splice += intron.length; + var chr = anno[t[2]]; - if (chr == null) { - n_novel += exon.length; - n_int_novel += exon.length > 2? 
exon.length - 2 : 0; - n_sgl_novel += exon.length == 1? 1 : 0; - n_ext_novel += exon.length > 1? 2 : 0; - } else { - for (var i = 0; i < exon.length; ++i) { - var o = Interval.find_ovlp(chr, exon[i][0], exon[i][1]); + if (chr != null) { + for (var i = 0; i < intron.length; ++i) { + var o = Interval.find_ovlp(chr, intron[i][0], intron[i][1]); if (o.length > 0) { var hit = false; for (var j = 0; j < o.length; ++j) { - var min_st = exon[i][0] < o[j][0]? exon[i][0] : o[j][0]; - var max_st = exon[i][0] > o[j][0]? exon[i][0] : o[j][0]; - var min_en = exon[i][1] < o[j][1]? exon[i][1] : o[j][1]; - var max_en = exon[i][1] > o[j][1]? exon[i][1] : o[j][1]; - var ol = min_en - max_st, span = max_en - min_st; - var l0 = exon[i][1] - exon[i][0]; - var l1 = o[j][1] - o[j][0]; - var min = l0 < l1? l0 : l1; - var ov_ratio = ol / min; - var st_diff = exon[i][0] - o[j][0]; - var en_diff = exon[i][1] - o[j][1]; + var st_diff = intron[i][0] - o[j][0]; + var en_diff = intron[i][1] - o[j][1]; if (st_diff < 0) st_diff = -st_diff; if (en_diff < 0) en_diff = -en_diff; - if (i == 0 && exon.length == 1) { - if (ov_ratio >= min_ov_ratio) ++n_sgl_hit, hit = true; - } else if (i == 0) { - if (ov_ratio >= min_ov_ratio && en_diff <= l_fuzzy) ++n_ext_hit, hit = true; - } else if (i == exon.length - 1) { - if (ov_ratio >= min_ov_ratio && st_diff <= l_fuzzy) ++n_ext_hit, hit = true; - } else { - if (en_diff + st_diff <= l_fuzzy && ol / span >= min_span_ratio) - ++n_int_hit, hit = true; - } + if (st_diff <= l_fuzzy && en_diff <= l_fuzzy) + ++n_splice_hit, hit = true; if (hit) break; } if (print_ovlp) { @@ -244,17 +238,16 @@ while (file.readline(buf) >= 0) { x += '(' + o[j][0] + "," + o[j][1] + ')'; } x += ']'; - print(type, t[0], i+1, t[2], exon[i][0], exon[i][1], x); + print(type, t[0], i+1, t[2], intron[i][0], intron[i][1], x); } } else { - ++n_novel; - if (i > 0 && i < exon.length - 1) ++n_int_novel; - if (exon.length > 1 && (i == 0 || i == exon.length - 1)) ++n_ext_novel; - if (exon.length == 1) 
++n_sgl_novel; + ++n_splice_novel; if (print_ovlp) - print('N', t[0], i+1, t[2], exon[i][0], exon[i][1]); + print('N', t[0], i+1, t[2], intron[i][0], intron[i][1]); } } + } else { + n_splice_novel += intron.length; } last_qname = t[0]; } @@ -263,13 +256,11 @@ file.close(); buf.destroy(); if (!print_ovlp) { - print("Number of unmapped reads: " + n_unmapped); - print("Number of mapped reads: " + n_mapped); - print("Number of primary alignments: " + n_pri); - print("Number of mapped exons: " + n_exon); - print("Number of novel exons: " + n_novel); - print("Number of correct exons: " + (n_ext_hit + n_int_hit + n_sgl_hit) + " (" + ((n_ext_hit + n_int_hit + n_sgl_hit) / n_exon * 100).toFixed(2) + "%)"); - print("Internal exons: " + n_int_novel + ", " + n_int_hit + " / " + n_int_exon + " = " + (n_int_hit / n_int_exon * 100).toFixed(2) + "%"); - print("External exons: " + n_ext_novel + ", " + n_ext_hit + " / " + n_ext_exon + " = " + (n_ext_hit / n_ext_exon * 100).toFixed(2) + "%"); - print("Singleton exons: " + n_sgl_novel + ", " + n_sgl_hit + " / " + n_sgl_exon + " = " + (n_sgl_hit / n_sgl_exon * 100).toFixed(2) + "%"); + print("# unmapped reads: " + n_unmapped); + print("# mapped reads: " + n_mapped); + print("# primary alignments: " + n_pri); + print("# singletons: " + n_sgl); + print("# predicted introns: " + n_splice); + print("# non-overlapping introns: " + n_splice_novel); + print("# correct introns: " + n_splice_hit + " (" + (n_splice_hit / n_splice * 100).toFixed(2) + "%)"); } From 19e4b2aab0bbc01c646af1ca8ea37229e87447c6 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 20 Aug 2017 22:10:03 +0800 Subject: [PATCH 32/39] backup --- tex/minimap2.bib | 32 +++++++++++++++++++++++++++++ tex/minimap2.tex | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/tex/minimap2.bib b/tex/minimap2.bib index 3ca3e85..53fd4e2 100644 --- a/tex/minimap2.bib +++ b/tex/minimap2.bib @@ -195,3 +195,35 @@ Title = {Optimal sequence alignment 
using affine gap costs}, Volume = {48}, Year = {1986}} + +@article{Wu:2005vn, + Author = {Wu, Thomas D and Watanabe, Colin K}, + Journal = {Bioinformatics}, + Pages = {1859-75}, + Title = {{GMAP}: a genomic mapping and alignment program for {mRNA} and {EST} sequences}, + Volume = {21}, + Year = {2005}} + +@article{Iwata:2012aa, + Author = {Iwata, Hiroaki and Gotoh, Osamu}, + Journal = {Nucleic Acids Res}, + Pages = {e161}, + Title = {Benchmarking spliced alignment programs including {Spaln2}, an extended version of {Spaln} that incorporates additional species-specific features}, + Volume = {40}, + Year = {2012}} + +@article{Dobin:2013kx, + Author = {Dobin, Alexander and others}, + Journal = {Bioinformatics}, + Pages = {15-21}, + Title = {{STAR}: ultrafast universal {RNA-seq} aligner}, + Volume = {29}, + Year = {2013}} + +@article{Byrne:2017aa, + Author = {Byrne, Ashley and others}, + Journal = {Nat Commun}, + Pages = {16027}, + Title = {Nanopore long-read {RNAseq} reveals widespread transcriptional variation among the surface receptors of individual {B} cells}, + Volume = {8}, + Year = {2017}} diff --git a/tex/minimap2.tex b/tex/minimap2.tex index f66aa58..1081763 100644 --- a/tex/minimap2.tex +++ b/tex/minimap2.tex @@ -316,6 +316,9 @@ alignment. \end{methods} \section{Results} + +\subsection{Aligning genomic reads} + \begin{figure}[!tb] \centering \includegraphics[width=.5\textwidth]{roc-color.pdf} @@ -358,6 +361,56 @@ $\ge$100bp INDELs in IGV~\citep{Robinson:2011aa} and can confirm the observation by~\citet{Sedlazeck169557} that BWA-MEM often breaks them into shorter gaps. Minimap2 does not have this issue. 
+\subsection{Aligning spliced reads} + +\begin{table}[!tb] +\processtable{Exon-level evaluation of 2D ONT reads from mouse} +{\footnotesize\label{tab:exon} +\begin{tabular}{p{3.1cm}rrrr} +\toprule +& GMAP & minimap2 & SpAln & STAR\\ +\midrule +Run time (CPU min) & 631 & 15.5 & 2\,076 & 33.9 \\ +Peak RAM (GByte) & 8.9 & 14.5 & 3.2 & 29.2\vspace{1em}\\ +\# aligned reads & 103\,669 & 103\,917 & 103\,711 & 26\,479\\ +\# chimeric alignments & 1\,904 & 1\,671 & 0 & 0\\ +\# non-spliced alignments & 15\,854 & 14\,483 & 17\,033 & 10\,545\vspace{1em}\\ +\# aligned introns & 692\,275 & 694\,237 & 692\,945 & 78\,603 \\ +\# novel introns & 11\,239 & 3\,217 & 8\,550 & 1\,214 \\ +\% exact introns & 83.8\% & 91.8\% & 87.9\% & 55.2\% \\ +\% approx. introns & 91.8\% & 96.5\% & 92.5\% & 82.4\% \\ +\botrule +\end{tabular} +}{Reads (AC:SRR5286960) were mapped to the primary assembly of mouse genome +GRCm38 with the following tools and command options: minimap2 (`-ax splice'); +GMAP (`-n 0 --min-intronlength 30 --cross-species'); SpAln (`-Q7 -LS -S3'); +STARlong (according to +\href{http://bit.ly/star-pb}{http://bit.ly/star-pb}). The alignments were +compared to the EnsEMBL gene annotation, release 89. A predicted intron +is \emph{novel} if it has no overlaps with any annotated introns. An intron +is \emph{exact} if it is identical to an annotated intron. An intron is +\emph{approximate} if both of its 5'- and 3'-end are within 10bp around an +annotated intron.} +\end{table} + +We evaluated minimap2 along with GMAP~(v2017-06-20; \citealp{Wu:2005vn}), +SpAln~(v2.3.1; \citealp{Iwata:2012aa}) and STAR~(v2.5.3a; +\citealp{Dobin:2013kx}) on real RNA-seq reads~\citep{Byrne:2017aa}. +In general, minimap2 is more consistent with existing annotations +(Table~\ref{tab:exon}). It finds more annotated spliced exons and predicts +fewer novel exons. Most novel exons identified by GMAP and SpAln are +very short, partly because the two aligners implement special routines to +identify micro-exons. 
It should be possible to optimize GMAP and SpAln on this +data set to reduce such errors. On run time, minimap2 is over 40 times faster +than GMAP and SpAln. While STAR is close to minimap2 in speed, it does not work +well with noisy reads. + +We have also run aligners on the SIRV spkie-in control data (AC:SRR5286959; +\citealp{Byrne:2017aa}) where the truth is know. Minimap2 is still the most +accurate. 91.9\% of internal exons in the minimap2 alignment are exact. +The percentage increases to 97.4\% if we allow up to 10bp around the splicing +boundaries. The difference between the two percentage is mostly caused by + \section{Discussions} Minialign and minimap2 are fast because a) with chaining, they can quickly From b8bdac7e646c4ace94e4ba1d6e669f7f6dba8924 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 22 Aug 2017 17:45:01 +0800 Subject: [PATCH 33/39] backup --- tex/minimap2.bib | 24 ++++++++ tex/minimap2.tex | 156 ++++++++++++++++++++++++++++------------------- 2 files changed, 116 insertions(+), 64 deletions(-) diff --git a/tex/minimap2.bib b/tex/minimap2.bib index 53fd4e2..eb49c9c 100644 --- a/tex/minimap2.bib +++ b/tex/minimap2.bib @@ -227,3 +227,27 @@ Title = {Nanopore long-read {RNAseq} reveals widespread transcriptional variation among the surface receptors of individual {B} cells}, Volume = {8}, Year = {2017}} + +@article{Roberts:2004fv, + Author = {Roberts, Michael and others}, + Journal = {Bioinformatics}, + Pages = {3363-9}, + Title = {Reducing storage requirements for biological sequence comparison}, + Volume = {20}, + Year = {2004}} + +@article{Zhang:2006aa, + Author = {Zhang, Miao and Gish, Warren}, + Journal = {Bioinformatics}, + Pages = {13-20}, + Title = {Improved spliced alignment from an information theoretic approach}, + Volume = {22}, + Year = {2006}} + +@article{Li:2007aa, + Author = {Li, Heng and others}, + Journal = {BMC Bioinformatics}, + Pages = {349}, + Title = {A cross-species alignment tool {(CAT)}}, + Volume = {8}, + Year = 
{2007}} diff --git a/tex/minimap2.tex b/tex/minimap2.tex index 1081763..7481206 100644 --- a/tex/minimap2.tex +++ b/tex/minimap2.tex @@ -20,7 +20,7 @@ \begin{document} \firstpage{1} -\title[Long DNA sequence alignment with minimap2]{Minimap2: fast pairwise alignment for long DNA sequences} +\title[Aligning long nucleotide sequences with minimap2]{Minimap2: fast pairwise alignment for long nucleotide sequences} \author[Li]{Heng Li} \address{Broad Institute, 415 Main Street, Cambridge, MA 02142, USA} @@ -28,11 +28,14 @@ \begin{abstract} \section{Summary:} Minimap2 is a general-purpose mapper to align long noisy DNA -sequences against a large reference database. It targets query sequences of -1kb--100Mb in length with per-base divergence typically below 25\%. Minimap2 is -$\sim$30 times faster than many mainstream long-read aligners and achieves -higher accuracy on simulated data. It also employs concave gap cost and rescues -inversions for improved alignment around potential structural variations. +or mRNA sequences against a large reference database. It targets query +sequences of 1kb--100Mb in length with per-base divergence typically below +25\%. For DNA sequence reads, minimap2 is $\sim$30 times faster than many +mainstream long-read aligners and achieves higher accuracy on simulated data. +It also employs concave gap cost and rescues inversions for improved alignment +around potential structural variations. For real long RNA-seq reads, minimap2 +is $\sim$40 times faster than peers and produces alignment more consistent with +existing gene annotations. \section{Availability and implementation:} \href{https://github.com/lh3/minimap2}{https://github.com/lh3/minimap2} @@ -53,15 +56,26 @@ easier to map than 100bp reads because we can more effectively skip repetitive regions, which are often the bottleneck of short-read alignment. We confirmed our speculation by achieving approximate mapping 50 times faster than BWA-MEM~\citep{Li:2016aa}. 
\citet{Suzuki:2016} extended our work with a fast -and novel algorithm on generating detailed alignment, which in turn inspired us -to develop minimap2 towards higher accuracy and more practical functionality. +and novel algorithm on generating base-level alignment, which in turn inspired +us to develop minimap2 towards higher accuracy and more practical +functionality. \begin{methods} \section{Methods} -Minimap2 is the successor of minimap~\citep{Li:2016aa}. It uses similar -indexing and seeding algorithms, and furthers it with more accurate chaining -and the ability to produce detailed alignment. +Minimap2 follows a typical seed-chain-align procedure as is used by most +full-genome aligners. It collects minimizers~\citep{Roberts:2004fv} of the +reference sequences and indexes them in a hash table. Then for each query +sequence, minimap2 takes query minimizers as \emph{seeds}, finds matches to the +reference, and identifies sets of colinear seeds, which are called +\emph{chains}. If base-level alignment is requested, minimap2 applies dynamic +programming (DP) to extend from the ends of chains and to close unseeded +regions between adjacent seeds in chains. + +Minimap2 uses indexing and seeding algorithms similar to +minimap~\citep{Li:2016aa}, and furthers the predecessor with more accurate +chaining, the ability to produce base-level alignment and the support of +spliced alignment. \subsection{Chaining} @@ -69,8 +83,7 @@ and the ability to produce detailed alignment. An \emph{anchor} is a 3-tuple $(x,y,w)$, indicating interval $[x-w+1,x]$ on the reference matching interval $[y-w+1,y]$ on the query. Given a list of anchors sorted by ending reference position $x$, let $f(i)$ be the maximal chaining -score up to the $i$-th anchor in the list. $f(i)$ can be calculated with -dynamic programming (DP): +score up to the $i$-th anchor in the list. 
$f(i)$ can be calculated with DP: \begin{equation}\label{eq:chain} f(i)=\max\big\{\max_{i>j\ge 1} \{ f(j)+\alpha(j,i)-\beta(j,i) \},w_i\big\} \end{equation} @@ -91,7 +104,7 @@ accelerate chaining. We note that if anchor $i$ is chained to $j$, chaining $i$ to a predecessor of $j$ is likely to yield a lower score. When evaluating Eq.~(\ref{eq:chain}), -we start from anchor $i-1$ and stop the evaluation if we cannot find a better +we start from anchor $i-1$ and stop the process if we cannot find a better score after up to $h$ iterations. This approach reduces the average time to $O(h\cdot m)$. In practice, we can almost always find the optimal chain with $h=50$; even if the heuristic fails, the optimal chain is often close. @@ -143,13 +156,13 @@ F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\ \end{array}\right. \end{equation} where $s(i,j)$ is the score between the $i$-th reference base and $j$-th query -base. It is a natural extension to the algorithm under affine gap -cost~\citep{Gotoh:1982aa,Altschul:1986aa}. +base. Eq.~(\ref{eq:ae86}) is a natural extension to the equation under affine +gap cost~\citep{Gotoh:1982aa,Altschul:1986aa}. \subsubsection{Suzuki's formulation} -To efficiently align long sequences, minimap2 did not directly use -Eq.~(\ref{eq:ae86}) for alignment. It instead adopted a difference-based +To efficiently align long sequences, minimap2 did not directly implement +Eq.~(\ref{eq:ae86}). It instead adopted a difference-based formulation first proposed by \citet{Wu:1996aa} and later adapted by \citet{Suzuki:2016} for affine gap cost. In case of 2-piece affine gap cost in Eq.~(\ref{eq:2-piece}), define @@ -200,7 +213,7 @@ a 8-bit integer. This enables 16-way SSE vectorization regardless of the peak score of the alignment. For a more efficient SSE implementation, we transform the row-column coordinate -to diagonal-anti-diagonal coordinate by letting $r\gets i+j$ and $t\gets i$. +to the diagonal-antidiagonal coordinate by letting $r\gets i+j$ and $t\gets i$. 
Eq.~(\ref{eq:suzuki}) becomes: \begin{equation*} \left\{\begin{array}{lll} @@ -219,7 +232,7 @@ each other. This allows us to fully vectorize the computation of all cells on the same anti-diagonal in one inner loop. On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the boundary -condition of this equation in the diagonal-anti-diagonal coordinate is +condition of this equation in the diagonal-antidiagonal coordinate is \[ \left\{\begin{array}{l} x_{r-1,-1}=y_{r-1,r}=-q-e\\ @@ -285,28 +298,37 @@ q+l\cdot e & (l>0) \\ \end{array}\right. \] In alignment, a deletion no shorter than $\lceil(\tilde{q}-q)/e\rceil$ is -regarded as an intron, which pays no cost to gap extensions. - -To pinpoint precise exon boundaries, minimap2 penalizes non-canonical splicing -with the following equation +regarded as an intron, which pays no cost to gap extensions. Minimap2 further +introduces reference-dependent cost to penalize non-canonical splicing: \begin{equation}\label{eq:splice} \left\{\begin{array}{l} -H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij}-a_i\}\\ +H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij}-a(i)\}\\ E_{i+1,j}= \max\{H_{ij}-q,E_{ij}\}-e\\ F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\ -\tilde{E}_{i+1,j}= \max\{H_{ij}-d_i-\tilde{q},\tilde{E}_{ij}\}\\ +\tilde{E}_{i+1,j}= \max\{H_{ij}-d(i)-\tilde{q},\tilde{E}_{ij}\}\\ \end{array}\right. \end{equation} -Let $T$ be the reference sequence. $d_i$ is the cost of a non-canonical donor +Let $T$ be the reference sequence. $d(i)$ is the cost of a non-canonical donor site, which takes 0 if $T[i+1,i+2]={\tt GT}$, or a postive number $p$ -otherwise. Similarly, $a_i$ is the cost of a non-canonical acceptor site, which -takes 0 if $T[i-1,i]={\tt AG}$, or $p$ otherwise. 
When the read strand relative -to the underlying transcript is unknown, minimap2 aligns each chain twice, first -assuming ${\tt GT}$--${\tt AG}$ as the splice signal and then assuming ${\tt -CT}$--${\tt AC}$, the reverse complement of ${\tt GT}$--${\tt AG}$, as the -splice signal. The alignment with a higher score is taken as the final -alignment. This procedure also infers the relative strand of reads spanning -canonical splicing sites. +otherwise. Similarly, $a(i)$ is the cost of a non-canonical acceptor site, which +takes 0 if $T[i-1,i]={\tt AG}$, or $p$ otherwise. Eq.~(\ref{eq:splice}) is +almost the same as the equation used by EXALIN~\citep{Zhang:2006aa} and +CAT~\citep{Li:2007aa} except that we allow insertions immediately followed by +deletions and vice versa; in addition, we use Suzuki's diagonal formulation in +actual implementation. + +%Given that $d_i$ and $a_i$ +%are a function of the reference sequence, it is possible to incorporate +%splicing signals with more sophisticated models, such as positional weight +%matrices. We have not tried this approach. + +If RNA-seq reads are not sequenced from stranded libraries, the read strand +relative to the underlying transcript is unknown. By default, minimap2 aligns +each chain twice, first assuming ${\tt GT}$--${\tt AG}$ as the splicing signal +and then assuming ${\tt CT}$--${\tt AC}$, the reverse complement of ${\tt +GT}$--${\tt AG}$, as the splicing signal. The alignment with a higher score is +taken as the final alignment. This procedure also infers the relative strand of +reads that span canonical splicing sites. In the spliced alignment mode, minimap2 further increases the density of minimizers and disables banded alignment. Together with the two-round DP-based @@ -363,9 +385,21 @@ shorter gaps. Minimap2 does not have this issue. \subsection{Aligning spliced reads} +We first evaluated minimap2 on SIRV control data~(AC:SRR5286959; +\citealp{Byrne:2017aa}) where the truth is known. 
Minimap2 predicted 59\,916 +introns, 93.0\% of which are precise. We examined wrongly predicted introns and +found the majority were caused by clustered splicing signals (e.g. two adjacent +${\tt GT}$ sites). When INDEL sequencing errors are frequent, it is difficult +to found precise splicing sites in this case. If we allow up to 10bp distance +from true splicing sites, 98.4\% of aligned introns are approximately correct. +Given this observation, we might be able to improve boundary detection by +initializing $d(\cdot)$ and $a(\cdot)$ in Eq.~(\ref{eq:splice}) with +position-specific scoring matrices or more sophisticated models. We have +not tried this approach. + \begin{table}[!tb] -\processtable{Exon-level evaluation of 2D ONT reads from mouse} -{\footnotesize\label{tab:exon} +\processtable{Evaluation of splicing accuracy on 2D ONT reads} +{\footnotesize\label{tab:intron} \begin{tabular}{p{3.1cm}rrrr} \toprule & GMAP & minimap2 & SpAln & STAR\\ @@ -381,35 +415,28 @@ Peak RAM (GByte) & 8.9 & 14.5 & 3.2 & 29.2\vspace{1em}\\ \% approx. introns & 91.8\% & 96.5\% & 92.5\% & 82.4\% \\ \botrule \end{tabular} -}{Reads (AC:SRR5286960) were mapped to the primary assembly of mouse genome -GRCm38 with the following tools and command options: minimap2 (`-ax splice'); -GMAP (`-n 0 --min-intronlength 30 --cross-species'); SpAln (`-Q7 -LS -S3'); -STARlong (according to +}{Mouse reads (AC:SRR5286960) were mapped to the primary assembly of mouse +genome GRCm38 with the following tools and command options: minimap2 (`-ax +splice'); GMAP (`-n 0 --min-intronlength 30 --cross-species'); SpAln (`-Q7 -LS +-S3'); STARlong (according to \href{http://bit.ly/star-pb}{http://bit.ly/star-pb}). The alignments were compared to the EnsEMBL gene annotation, release 89. A predicted intron is \emph{novel} if it has no overlaps with any annotated introns. An intron is \emph{exact} if it is identical to an annotated intron. 
An intron is -\emph{approximate} if both of its 5'- and 3'-end are within 10bp around an -annotated intron.} +\emph{approximate} if both its 5'- and 3'-end are within 10bp around the ends +of an annotated intron.} \end{table} -We evaluated minimap2 along with GMAP~(v2017-06-20; \citealp{Wu:2005vn}), -SpAln~(v2.3.1; \citealp{Iwata:2012aa}) and STAR~(v2.5.3a; -\citealp{Dobin:2013kx}) on real RNA-seq reads~\citep{Byrne:2017aa}. -In general, minimap2 is more consistent with existing annotations -(Table~\ref{tab:exon}). It finds more annotated spliced exons and predicts -fewer novel exons. Most novel exons identified by GMAP and SpAln are -very short, partly because the two aligners implement special routines to -identify micro-exons. It should be possible to optimize GMAP and SpAln on this -data set to reduce such errors. On run time, minimap2 is over 40 times faster -than GMAP and SpAln. While STAR is close to minimap2 in speed, it does not work -well with noisy reads. - -We have also run aligners on the SIRV spkie-in control data (AC:SRR5286959; -\citealp{Byrne:2017aa}) where the truth is know. Minimap2 is still the most -accurate. 91.9\% of internal exons in the minimap2 alignment are exact. -The percentage increases to 97.4\% if we allow up to 10bp around the splicing -boundaries. The difference between the two percentage is mostly caused by +We next aligned real mouse reads~\citep{Byrne:2017aa} with GMAP~(v2017-06-20; +\citealp{Wu:2005vn}), minimap2, SpAln~(v2.3.1; \citealp{Iwata:2012aa}) and +STAR~(v2.5.3a; \citealp{Dobin:2013kx}). In general, minimap2 is more +consistent with existing annotations (Table~\ref{tab:intron}): it finds +more splicing with a higher percentage being exactly or approximately correct. +We noted that GMAP and SpAln have not been optimized for noisy reads. We have +tried different settings, but their developers should be able to improve the +accuracy further. On run time, minimap2 is over 40 times faster than GMAP and +SpAln. 
While STAR is close to minimap2 in speed, it does not work well with +noisy reads. \section{Discussions} @@ -421,11 +448,12 @@ further accelerate minimap2 with a few other tweaks such as adaptive banding~\citep{Suzuki130633} or incremental banding. In addition to reference-based read mapping, minimap2 inherits minimap's -ability to search against huge multi-species databases and to find read +functionality to search against huge multi-species databases and to find read overlaps. On a few test data sets, minimap2 appears to yield slightly better -miniasm assembly. Minimap2 can also align closely related genomes, though it -would benefit from more thorough evaluations. Genome alignment is an intricate -topic. +miniasm assembly~\citep{Li:2016aa}. Minimap2 can also align closely related +genomes or different assemblies of the same species. However, full-genome +alignment is an intricate research topic. More thorough evaluations would be +necessary to justify the use of minimap2 for such applications. 
\section*{Acknowledgements} We owe a debt of gratitude to Hajime Suzuki for releasing his masterpiece and From 240f6caaff8848320d00fb55103c1a26ea2ac3cc Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 24 Aug 2017 22:05:14 +0800 Subject: [PATCH 34/39] backup manuscript --- misc/mapstat.js | 2 +- tex/minimap2.bib | 8 ++ tex/minimap2.tex | 199 ++++++++++++++++++++++++++++------------------- 3 files changed, 129 insertions(+), 80 deletions(-) diff --git a/misc/mapstat.js b/misc/mapstat.js index 2f80bb4..98e42c3 100644 --- a/misc/mapstat.js +++ b/misc/mapstat.js @@ -94,7 +94,7 @@ while (file.readline(buf) >= 0) { ori_qlen = parseInt(t[1]); } else { // SAM var flag = parseInt(t[1]); - if (flag & 4) continue; + if ((flag & 4) || t[2] == '*' || t[5] == '*') continue; if (flag & 0x100) { ++n_2nd; continue; diff --git a/tex/minimap2.bib b/tex/minimap2.bib index eb49c9c..f6f03ec 100644 --- a/tex/minimap2.bib +++ b/tex/minimap2.bib @@ -251,3 +251,11 @@ Title = {A cross-species alignment tool {(CAT)}}, Volume = {8}, Year = {2007}} + +@article{Farrar:2007hs, + Author = {Farrar, Michael}, + Journal = {Bioinformatics}, + Pages = {156-61}, + Title = {{Striped Smith-Waterman speeds database searches six times over other SIMD implementations}}, + Volume = {23}, + Year = {2007}} diff --git a/tex/minimap2.tex b/tex/minimap2.tex index 7481206..9036a27 100644 --- a/tex/minimap2.tex +++ b/tex/minimap2.tex @@ -49,16 +49,24 @@ Single Molecule Real-Time (SMRT) sequencing technology and Oxford Nanopore technologies (ONT) produce reads over 10kbp in length at an error rate $\sim$15\%. Several aligners have been developed for such data~\citep{Chaisson:2012aa,Li:2013aa,Liu:2016ab,Sovic:2016aa,Liu:2017aa,Lin:2017aa,Sedlazeck169557}. -They are usually five times as slow as mainstream short-read -aligners~\citep{Langmead:2012fk,Li:2013aa}. 
We speculated there could be -substantial room for speedup on the thought that 10kb long sequences should be -easier to map than 100bp reads because we can more effectively skip repetitive -regions, which are often the bottleneck of short-read alignment. We confirmed -our speculation by achieving approximate mapping 50 times faster than -BWA-MEM~\citep{Li:2016aa}. \citet{Suzuki:2016} extended our work with a fast -and novel algorithm on generating base-level alignment, which in turn inspired -us to develop minimap2 towards higher accuracy and more practical -functionality. +Most of them were five times as slow as mainstream short-read +aligners~\citep{Langmead:2012fk,Li:2013aa} in terms of the number of bases +mapped per second. We speculated there could be substantial room for speedup on +the thought that 10kb long sequences should be easier to map than 100bp reads +because we can more effectively skip repetitive regions, which are often the +bottleneck of short-read alignment. We confirmed our speculation by achieving +approximate mapping 50 times faster than BWA-MEM~\citep{Li:2016aa}. +\citet{Suzuki:2016} extended our work with a fast and novel algorithm on +generating base-level alignment, which in turn inspired us to develop minimap2 +towards higher accuracy and more practical functionality. + +Both SMRT and ONT have been applied to sequence spliced mRNAs (RNA-seq). While +traditional mRNA aligners work~\citep{Wu:2005vn,Iwata:2012aa}, they are not +optimized for long noisy sequence reads and are tens of times slower than +dedicated long-read aligners. When developing minimap2 initially for aligning +genomic DNA only, we realized minor modifications could make it competitive for +aligning mRNAs as well. Minimap2 is the first RNA-seq aligner specifically +designed for long noisy reads. \begin{methods} \section{Methods} @@ -83,7 +91,8 @@ spliced alignment. 
An \emph{anchor} is a 3-tuple $(x,y,w)$, indicating interval $[x-w+1,x]$ on the reference matching interval $[y-w+1,y]$ on the query. Given a list of anchors sorted by ending reference position $x$, let $f(i)$ be the maximal chaining -score up to the $i$-th anchor in the list. $f(i)$ can be calculated with DP: +score up to the $i$-th anchor in the list. $f(i)$ can be calculated with +dynamic programming: \begin{equation}\label{eq:chain} f(i)=\max\big\{\max_{i>j\ge 1} \{ f(j)+\alpha(j,i)-\beta(j,i) \},w_i\big\} \end{equation} @@ -161,11 +170,15 @@ gap cost~\citep{Gotoh:1982aa,Altschul:1986aa}. \subsubsection{Suzuki's formulation} -To efficiently align long sequences, minimap2 did not directly implement -Eq.~(\ref{eq:ae86}). It instead adopted a difference-based -formulation first proposed by \citet{Wu:1996aa} and later adapted by -\citet{Suzuki:2016} for affine gap cost. In case of 2-piece affine gap cost in -Eq.~(\ref{eq:2-piece}), define +When we allow gaps longer than several hundred base pairs, nucleotide-level +alignment is much slower than chaining. SSE acceleration is critical to the +performance of minimap2. Traditional SSE implementations~\citep{Farrar:2007hs} +based on Eq.~(\ref{eq:ae86}) can achieve 16-way parallelization for short +sequences, but only 4-way parallelization when the peak alignment score reaches +32767. Long sequence alignment may exceed this threshold. Inspired by +\citet{Wu:1996aa} and the following work, \citet{Suzuki:2016} proposed a +difference-based formulation that lifted this limitation. In case of 2-piece +gap cost, define \[ \left\{\begin{array}{ll} u_{ij}\triangleq H_{ij}-H_{i-1,j} & v_{ij}\triangleq H_{ij}-H_{i,j-1} \\ @@ -186,9 +199,9 @@ y_{ij}&=&\max\{0,y_{i,j-1}+u_{i,j-1}-z_{ij}+q\}-q-e\\ \tilde{y}_{ij}&=&\max\{0,\tilde{y}_{i,j-1}+u_{i,j-1}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e} \end{array}\right. \end{equation} -where $z_{ij}$ is a temporary variable that does not need to be stored. 
-This is Suzuki's formulation for 2-piece affine gap cost. An important property -of this formulation is that all values are bounded. To see that, +where $z_{ij}$ is a temporary variable that does not need to be stored. An +important property of Eq.~(\ref{eq:suzuki}) is that all values are bounded. To +see that, \[ x_{ij}=E_{i+1,j}-H_{ij}=\max\{-q,E_{ij}-H_{ij}\}-e \] @@ -229,10 +242,11 @@ y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\}-q-e\\ \end{equation*} In this formulation, cells with the same diagonal index $r$ are independent of each other. This allows us to fully vectorize the computation of all cells on -the same anti-diagonal in one inner loop. +the same anti-diagonal in one inner loop. It also simplifies banded alignment, +which would be difficult with striped vectorization~\citep{Farrar:2007hs}. On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the boundary -condition of this equation in the diagonal-antidiagonal coordinate is +condition of the equation above is \[ \left\{\begin{array}{l} x_{r-1,-1}=y_{r-1,r}=-q-e\\ @@ -249,6 +263,7 @@ r\cdot(e-\tilde{e})-(\tilde{q}-q)-\tilde{e} & (r=\lceil\frac{\tilde{q}-q}{e-\til -\tilde{e} & (r>\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \end{array}\right. \] +These can be derived from the initial conditions of Eq.~(\ref{eq:ae86}). In practice, our 16-way vectorized implementation of global alignment is three times as fast as Parasail's 4-way vectorization~\citep{Daily:2016aa}. Without @@ -298,8 +313,10 @@ q+l\cdot e & (l>0) \\ \end{array}\right. \] In alignment, a deletion no shorter than $\lceil(\tilde{q}-q)/e\rceil$ is -regarded as an intron, which pays no cost to gap extensions. Minimap2 further -introduces reference-dependent cost to penalize non-canonical splicing: +regarded as an intron, which pays no cost to gap extensions. 
+ +To pinpoint precise splicing junctions, minimap2 introduces reference-dependent +cost to penalize non-canonical splicing: \begin{equation}\label{eq:splice} \left\{\begin{array}{l} H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij}-a(i)\}\\ @@ -312,10 +329,9 @@ Let $T$ be the reference sequence. $d(i)$ is the cost of a non-canonical donor site, which takes 0 if $T[i+1,i+2]={\tt GT}$, or a postive number $p$ otherwise. Similarly, $a(i)$ is the cost of a non-canonical acceptor site, which takes 0 if $T[i-1,i]={\tt AG}$, or $p$ otherwise. Eq.~(\ref{eq:splice}) is -almost the same as the equation used by EXALIN~\citep{Zhang:2006aa} and -CAT~\citep{Li:2007aa} except that we allow insertions immediately followed by -deletions and vice versa; in addition, we use Suzuki's diagonal formulation in -actual implementation. +almost equivalent to the equation used by EXALIN~\citep{Zhang:2006aa} except +that we allow insertions immediately followed by deletions and vice versa; in +addition, we use Suzuki's diagonal formulation in actual implementation. %Given that $d_i$ and $a_i$ %are a function of the reference sequence, it is possible to incorporate @@ -345,60 +361,66 @@ alignment. \centering \includegraphics[width=.5\textwidth]{roc-color.pdf} \caption{Evaluation on simulated SMRT reads aligned against human genome -GRCh38. (a) ROC-like curve. (b) Accumulative mapping error rate as a function -of mapping quality. 33,088 $\ge$1000bp reads were simulated using -pbsim~\citep{Ono:2013aa} with error profile sampled from file -`m131017\_060208\_42213\_*.1.*' downloaded at -\href{http://bit.ly/chm1p5c3}{http://bit.ly/chm1p5c3}. The N50 read length is -11,628. A read is considered correctly mapped if the true position overlaps +GRCh38. (a) ROC-like curve. Alignments are sorted by mapping quality in the +descending order. For each mapping quality threshold, the fraction of +alignments with mapping quality above the threshold and their error rate +are plotted. 
(b) Accumulative mapping error rate as a function of mapping +quality. 33,088 $\ge$1000bp reads were simulated using pbsim~\citep{Ono:2013aa} +with error profile sampled from file `m131017\_060208\_42213\_*.1.*' downloaded +at \href{http://bit.ly/chm1p5c3}{http://bit.ly/chm1p5c3}. The N50 read length +is 11,628. A read is considered correctly mapped if the true position overlaps with the best mapping position by 10\% of the read length. All aligners were -run under the default setting for SMRT reads.}\label{fig:eval} +run under the default setting for SMRT reads. Kart outputted all alignments at +mapping quality 60, so is not shown in the figure. It mapped nearly all reads +with 4.1\% of alignments being wrong, less accurate than others.}\label{fig:eval} \end{figure} As a sanity check, we evaluated minimap2 on simulated human reads along with -BLASR~\citep{Chaisson:2012aa}, -BWA-MEM~\citep{Li:2013aa}, -GraphMap~\citep{Sovic:2016aa}, -minialign~\citep{Suzuki:2016} and -NGMLR~\citep{Sedlazeck169557}. We excluded rHAT~\citep{Liu:2016ab}, -LAMSA~\citep{Liu:2017aa} and Kart~\citep{Lin:2017aa} because they either +BLASR~(v1.MC.rc64; \citealp{Chaisson:2012aa}), +BWA-MEM~(v0.7.15; \citealp{Li:2013aa}), +GraphMap~(v0.5.2; \citealp{Sovic:2016aa}), +Kart~(v2.2.5; \citealp{Lin:2017aa}), +minialign~(v0.5.3; \citealp{Suzuki:2016}) and +NGMLR~(v0.2.5; \citealp{Sedlazeck169557}). We excluded rHAT~\citep{Liu:2016ab} +and LAMSA~\citep{Liu:2017aa} because they either crashed or produced malformatted output. In this evaluation, Minimap2 has higher power to distinguish unique and repetitive hits, and achieves overall higher mapping accuracy (Fig.~\ref{fig:eval}a). It is still the most accurate -even if we skip DP-based alignment (data not shown), suggesting chaining alone +even if we skip DP-based alignment (data not shown), confirming chaining alone is sufficient to achieve high accuracy for approximate mapping. 
Minimap2 and NGMLR provide better mapping quality estimate: they rarely give repetitive hits high mapping quality (Fig.~\ref{fig:eval}b). Apparently, other aligners may occasionally miss close suboptimal hits and be overconfident in wrong mappings. -On run time, minialign is slightly faster than minimap2. They are over 30 times -faster than the rest. Minimap2 consumed 6.1GB memory at the peak, more than -BWA-MEM but less than others. +On run time, minialign is slightly faster than minimap2 and Kart. They are over +30 times faster than the rest. Minimap2 consumed 6.1GB memory at the peak, +more than BWA-MEM but less than others. On real human SMRT reads, the relative performance and sensitivity of -these aligners are broadly similar to those on simulated data. We are unable to -provide a good estimate of mapping error rate due to the lack of the truth. On -ONT ultra-long human reads~\citep{Jain128835}, BWA-MEM failed. Minialign and -minimap2 are over 70 times faster than others. We have also examined tens of -$\ge$100bp INDELs in IGV~\citep{Robinson:2011aa} and can confirm the -observation by~\citet{Sedlazeck169557} that BWA-MEM often breaks them into -shorter gaps. Minimap2 does not have this issue. +these aligners are broadly similar to the metrics on simulated data. We are +unable to provide a good estimate of mapping error rate due to the lack of the +truth. On ONT $\sim$100kb human reads~\citep{Jain128835}, BWA-MEM failed. +Kart, minialign and minimap2 are over 70 times faster than others. We have also +examined tens of $\ge$100bp INDELs in IGV~\citep{Robinson:2011aa} and can +confirm the observation by~\citet{Sedlazeck169557} that BWA-MEM often breaks +them into shorter gaps. The issue is much alleviated with minimap2, thanks +to the 2-piece affine gap cost. 
\subsection{Aligning spliced reads} -We first evaluated minimap2 on SIRV control data~(AC:SRR5286959; +We evaluated minimap2 on SIRV control data~(AC:SRR5286959; \citealp{Byrne:2017aa}) where the truth is known. Minimap2 predicted 59\,916 -introns, 93.0\% of which are precise. We examined wrongly predicted introns and -found the majority were caused by clustered splicing signals (e.g. two adjacent -${\tt GT}$ sites). When INDEL sequencing errors are frequent, it is difficult -to found precise splicing sites in this case. If we allow up to 10bp distance -from true splicing sites, 98.4\% of aligned introns are approximately correct. -Given this observation, we might be able to improve boundary detection by -initializing $d(\cdot)$ and $a(\cdot)$ in Eq.~(\ref{eq:splice}) with -position-specific scoring matrices or more sophisticated models. We have -not tried this approach. +introns from 11\,017 reads. 93.0\% of splice junctions are precise. We examined +wrongly predicted junctions and found the majority were caused by clustered +splicing signals (e.g. two adjacent ${\tt GT}$ sites). When INDEL sequencing +errors are frequent, it is difficult to find precise splicing sites in this +case. If we allow up to 10bp distance from true splicing sites, 98.4\% of +aligned introns are approximately correct. Given this observation, we might be +able to improve boundary detection by initializing $d(\cdot)$ and $a(\cdot)$ in +Eq.~(\ref{eq:splice}) with position-specific scoring matrices or more +sophisticated models. We have not tried this approach. \begin{table}[!tb] -\processtable{Evaluation of splicing accuracy on 2D ONT reads} +\processtable{Evaluation of junction accuracy on 2D ONT reads} {\footnotesize\label{tab:intron} \begin{tabular}{p{3.1cm}rrrr} \toprule @@ -431,26 +453,45 @@ We next aligned real mouse reads~\citep{Byrne:2017aa} with GMAP~(v2017-06-20; \citealp{Wu:2005vn}), minimap2, SpAln~(v2.3.1; \citealp{Iwata:2012aa}) and STAR~(v2.5.3a; \citealp{Dobin:2013kx}). 
In general, minimap2 is more consistent with existing annotations (Table~\ref{tab:intron}): it finds -more splicing with a higher percentage being exactly or approximately correct. -We noted that GMAP and SpAln have not been optimized for noisy reads. We have -tried different settings, but their developers should be able to improve the -accuracy further. On run time, minimap2 is over 40 times faster than GMAP and -SpAln. While STAR is close to minimap2 in speed, it does not work well with -noisy reads. +more junctions with a higher percentage being exactly or approximately correct. +Minimap2 is over 40 times faster than GMAP and SpAln. While STAR is close to +minimap2 in speed, it does not work well with noisy reads. We have also +evaluated spliced aligners on public Iso-Seq data (human Alzheimer brain +from \href{http://bit.ly/isoseqpub}{http://bit.ly/isoseqpub}). The observation +is similar: minimap2 is faster at higher junction accuracy. -\section{Discussions} +We noted that GMAP and SpAln have not been optimized for noisy reads. We are +showing the best setting we have experimented, but their developers should be +able to improve their accuracy further. -Minialign and minimap2 are fast because a) with chaining, they can quickly -filter out most false seed hits~\citep{Li:2016aa} and reduce unsuccessful but -costly DP-based alignments; b) they implemented so far the fastest DP-based -alignment algorithm for long sequences~\citep{Suzuki:2016}. It is possible to -further accelerate minimap2 with a few other tweaks such as adaptive -banding~\citep{Suzuki130633} or incremental banding. 
+%\begin{table}[!tb] +%\processtable{Evaluation of junction accuracy on SMRT Iso-Seq reads} +%{\footnotesize +%\begin{tabular}{lrrrr} +%\toprule +%& GMAP & minimap2 & SpAln & STAR\\ +%\midrule +%Run time (CPU min) & & 243 & 2\,352 & 1\,647 \\ +%\# aligned reads & & 1\,123\,025 & 1\,094\,092 & 682\,452\\ +%\# chimeric alignments & & 33\,091 & 0 & 0\\ +%\# non-spliced alignments & & 339\,081 & 291\,447 & 272\,536\vspace{1em}\\ +%\# aligned introns & & 9\,071\,755 & 9\,208\,564 & 3\,029\,121 \\ +%\# novel introns & & 42\,773 & 82\,230 & 17\,791 \\ +%\% exact introns & & 94.9\% & 91.7\% & 84.7\% \\ +%\% approx. introns&& 96.9\% & 93.4\% & 93.8\% \\ +%\botrule +%\end{tabular} +%}{} +%\end{table} -In addition to reference-based read mapping, minimap2 inherits minimap's -functionality to search against huge multi-species databases and to find read -overlaps. On a few test data sets, minimap2 appears to yield slightly better -miniasm assembly~\citep{Li:2016aa}. Minimap2 can also align closely related + +\section{Conclusion} + +Minimap2 is a fast, accurate and versatile aligner for long nucleotide +sequences. In addition to reference-based read mapping, minimap2 inherits +minimap's functionality to search against huge multi-species databases and to +find read overlaps. On a few test data sets, minimap2 appears to yield slightly +better miniasm assembly~\citep{Li:2016aa}. Minimap2 can also align similar genomes or different assemblies of the same species. However, full-genome alignment is an intricate research topic. More thorough evaluations would be necessary to justify the use of minimap2 for such applications. 
From 0fe1a224abc4a4cba2911ae65d3e1aa8ade6b988 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 25 Aug 2017 10:35:58 +0800 Subject: [PATCH 35/39] r309: improved SAM header output --- align.c | 6 +++--- bseq.c | 4 +--- format.c | 46 +++++++++++++++++++++++++++------------------- index.c | 2 +- main.c | 26 ++++++++++++++++---------- map.c | 3 ++- minimap.h | 1 + minimap2.1 | 11 ++++++++--- mmpriv.h | 4 ++-- 9 files changed, 61 insertions(+), 42 deletions(-) diff --git a/align.c b/align.c index ea7bd29..84a7336 100644 --- a/align.c +++ b/align.c @@ -61,7 +61,7 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c return 0; } -static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int q_intron) +static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e) { uint32_t k, l, toff = 0, qoff = 0; int32_t s = 0, max = 0, n_gtag = 0, n_ctac = 0; @@ -374,7 +374,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int assert(re1 - rs1 <= re0 - rs0); if (r->p) { mm_idx_getseq(mi, rid, rs1, re1, tseq); - mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e, (opt->flag&MM_F_SPLICE)? opt->q2 : 0); + mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e); if (rev && r->p->trans_strand) r->p->trans_strand ^= 3; // flip to the read strand } @@ -417,7 +417,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i if (ez->n_cigar == 0) goto end_align1_inv; // should never be here mm_append_cigar(r_inv, ez->n_cigar, ez->cigar); r_inv->p->dp_score = ez->max; - mm_update_extra(r_inv->p, qseq + q_off, tseq + t_off, mat, opt->q, opt->e, (opt->flag&MM_F_SPLICE)? 
opt->q2 : 0); + mm_update_extra(r_inv->p, qseq + q_off, tseq + t_off, mat, opt->q, opt->e); r_inv->id = -1; r_inv->parent = MM_PARENT_UNSET; r_inv->inv = 1; diff --git a/bseq.c b/bseq.c index e3e576f..30d285a 100644 --- a/bseq.c +++ b/bseq.c @@ -8,7 +8,6 @@ KSEQ_INIT(gzFile, gzread) struct mm_bseq_file_s { - int is_eof; gzFile fp; kseq_t *ks; }; @@ -53,12 +52,11 @@ mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int chunk_size, int with_qual, int size += seqs[n++].l_seq; if (size >= chunk_size) break; } - if (size < chunk_size) fp->is_eof = 1; *n_ = n; return seqs; } int mm_bseq_eof(mm_bseq_file_t *fp) { - return fp->is_eof; + return ks_eof(fp->ks->f); } diff --git a/format.c b/format.c index cf81291..949c0cd 100644 --- a/format.c +++ b/format.c @@ -6,7 +6,7 @@ #include "kalloc.h" #include "mmpriv.h" -static char *mm_rg_line, mm_rg_id[256]; +static char mm_rg_id[256]; static inline void str_enlarge(kstring_t *s, int l) { @@ -58,15 +58,13 @@ static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...) 
s->s[s->l] = 0; } -static inline char *mm_escape(char *s) +static char *mm_escape(char *s) { char *p, *q; for (p = q = s; *p; ++p) { if (*p == '\\') { ++p; if (*p == 't') *q++ = '\t'; - else if (*p == 'n') *q++ = '\n'; - else if (*p == 'r') *q++ = '\r'; else if (*p == '\\') *q++ = '\\'; } else *q++ = *p; } @@ -74,44 +72,56 @@ static inline char *mm_escape(char *s) return s; } -void mm_set_rg(const char *s) +static void sam_write_rg_line(kstring_t *str, const char *s) { char *p, *q, *r, *rg_line = 0; memset(mm_rg_id, 0, 256); - if (mm_rg_line) { - free(mm_rg_line); - mm_rg_line = 0; - } if (s == 0) return; if (strstr(s, "@RG") != s) { - if (mm_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__); + if (mm_verbose >= 1) fprintf(stderr, "[ERROR] the read group line is not started with @RG\n"); goto err_set_rg; } if (strstr(s, "\t") != NULL) { - if (mm_verbose >= 1) fprintf(stderr, "[E::%s] the read group line contained literal characters -- replace with escaped tabs: \\t\n", __func__); + if (mm_verbose >= 1) fprintf(stderr, "[ERROR] the read group line contained literal characters -- replace with escaped tabs: \\t\n"); goto err_set_rg; } rg_line = strdup(s); mm_escape(rg_line); if ((p = strstr(rg_line, "\tID:")) == 0) { - if (mm_verbose >= 1) fprintf(stderr, "[E::%s] no ID within the read group line\n", __func__); + if (mm_verbose >= 1) fprintf(stderr, "[ERROR] no ID within the read group line\n"); goto err_set_rg; } p += 4; for (q = p; *q && *q != '\t' && *q != '\n'; ++q); if (q - p + 1 > 256) { - if (mm_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__); + if (mm_verbose >= 1) fprintf(stderr, "[ERROR] @RG:ID is longer than 255 characters\n"); goto err_set_rg; } for (q = p, r = mm_rg_id; *q && *q != '\t' && *q != '\n'; ++q) *r++ = *q; - mm_rg_line = rg_line; - return; + mm_sprintf_lite(str, "%s\n", rg_line); err_set_rg: free(rg_line); } +void mm_write_sam_hdr_no_SQ(const char *rg, 
const char *ver, int argc, char *argv[]) +{ + kstring_t str = {0,0,0}; + sam_write_rg_line(&str, rg); + mm_sprintf_lite(&str, "@PG\tID:minimap2\tPN:minimap2"); + if (ver) mm_sprintf_lite(&str, "\tVN:%s", ver); + if (argc > 1) { + int i; + mm_sprintf_lite(&str, "\tCL:minimap2"); + for (i = 1; i < argc; ++i) + mm_sprintf_lite(&str, " %s", argv[i]); + } + mm_sprintf_lite(&str, "\n"); + fputs(str.s, stdout); + free(str.s); +} + static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r) { extern unsigned char seq_nt4_table[256]; @@ -214,12 +224,11 @@ static char comp_tab[] = { 'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127 }; -void sam_write_sam_header(const mm_idx_t *idx) +void mm_write_sam_SQ(const mm_idx_t *idx) { uint32_t i; for (i = 0; i < idx->n_seq; ++i) printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len); - if (mm_rg_line) puts(mm_rg_line); } static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp) @@ -274,8 +283,7 @@ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m else mm_sprintf_lite(s, "*"); } write_tags(s, r); - if (mm_rg_line && mm_rg_id[0]) - mm_sprintf_lite(s, "\tRG:Z:%s", mm_rg_id); + if (mm_rg_id[0]) mm_sprintf_lite(s, "\tRG:Z:%s", mm_rg_id); if (r->parent == r->id && r->p && n_regs > 1 && regs && r >= regs && r - regs < n_regs) { // supplementary aln may exist int i, n_sa = 0; // n_sa: number of SA fields for (i = 0; i < n_regs; ++i) diff --git a/index.c b/index.c index c139ec5..dc58231 100644 --- a/index.c +++ b/index.c @@ -285,12 +285,12 @@ static void *worker_pipeline(void *shared, int step, void *in) mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int is_hpc, int mini_batch_size, int n_threads, uint64_t batch_size, int keep_name) { pipeline_t pl; + if (fp == 0 || mm_bseq_eof(fp)) return 0; memset(&pl, 0, sizeof(pipeline_t)); pl.mini_batch_size = mini_batch_size < batch_size? 
mini_batch_size : batch_size; pl.keep_name = keep_name; pl.batch_size = batch_size; pl.fp = fp; - if (pl.fp == 0) return 0; pl.mi = mm_idx_init(w, k, b, is_hpc); kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3); diff --git a/main.c b/main.c index 0735b91..b040520 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r301-dirty" +#define MM_VERSION "2.0-r309-dirty" void liftrlimit() { @@ -33,6 +33,7 @@ static struct option long_options[] = { { "print-aln-seq", no_argument, 0, 0 }, { "splice", no_argument, 0, 0 }, { "cost-non-gt-ag", required_argument, 0, 0 }, + { "no-sam-sq", no_argument, 0, 0 }, { "help", no_argument, 0, 'h' }, { "max-intron-len", required_argument, 0, 'G' }, { "version", no_argument, 0, 'V' }, @@ -58,11 +59,11 @@ static inline int64_t mm_parse_num(const char *str) int main(int argc, char *argv[]) { mm_mapopt_t opt; - int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0, max_intron_len = 0; + int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0, max_intron_len = 0, n_idx_part = 0; int minibatch_size = 200000000; uint64_t batch_size = 4000000000ULL; mm_bseq_file_t *fp = 0; - char *fnw = 0, *s; + char *fnw = 0, *rg = 0, *s; FILE *fpr = 0, *fpw = 0, *fp_help = stderr; liftrlimit(); @@ -97,7 +98,7 @@ int main(int argc, char *argv[]) else if (c == 's') opt.min_dp_max = atoi(optarg); else if (c == 'I') batch_size = mm_parse_num(optarg); else if (c == 'K') minibatch_size = (int)mm_parse_num(optarg); - else if (c == 'R') mm_set_rg(optarg); // WARNING: this modifies global variables in format.c + else if (c == 'R') rg = optarg; else if (c == 'h') fp_help = stdout; else if (c == 0 && long_idx == 0) bucket_bits = atoi(optarg); // --bucket-bits else if (c == 0 && long_idx == 2) keep_name = 0; // --int-rname @@ -110,6 +111,7 @@ int 
main(int argc, char *argv[]) else if (c == 0 && long_idx == 9) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ; // --print-aln-seq else if (c == 0 && long_idx ==10) opt.flag |= MM_F_SPLICE; // --splice else if (c == 0 && long_idx ==11) opt.noncan = atoi(optarg); // --cost-non-gt-ag + else if (c == 0 && long_idx ==12) opt.flag |= MM_F_NO_SAM_SQ; // --no-sam-sq else if (c == 'V') { puts(MM_VERSION); return 0; @@ -222,27 +224,31 @@ int main(int argc, char *argv[]) is_idx = mm_idx_is_idx(argv[optind]); if (is_idx < 0) { - fprintf(stderr, "[E::%s] failed to open file '%s'\n", __func__, argv[optind]); + fprintf(stderr, "[ERROR] failed to open file '%s'\n", argv[optind]); return 1; } if (!is_idx && fnw == 0 && argc - optind < 2) { - fprintf(stderr, "[E::%s] missing input: please specify a query file or option -d\n", __func__); + fprintf(stderr, "[ERROR] missing input: please specify a query file to map or option -d to keep the index\n"); return 1; } if (is_idx) fpr = fopen(argv[optind], "rb"); else fp = mm_bseq_open(argv[optind]); if (fnw) fpw = fopen(fnw, "wb"); + if (opt.flag & MM_F_OUT_SAM) + mm_write_sam_hdr_no_SQ(rg, MM_VERSION, argc, argv); for (;;) { - mm_idx_t *mi = 0; + mm_idx_t *mi; if (fpr) { mi = mm_idx_load(fpr); if (idx_par_set && mm_verbose >= 2 && (mi->k != k || mi->w != w || mi->is_hpc != is_hpc)) - fprintf(stderr, "[W::%s::%.3f*%.2f] Indexing parameters on the command line (-k/-w/-H) overridden by parameters in the prebuilt index.\n", - __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0)); - } else if (!mm_bseq_eof(fp)) { + fprintf(stderr, "[WARNING] \033[1;31mIndexing parameters on the command line (-k/-w/-H) overridden by parameters in the prebuilt index.\033[0m\n"); + } else { mi = mm_idx_gen(fp, w, k, bucket_bits, is_hpc, minibatch_size, n_threads, batch_size, keep_name); } if (mi == 0) break; + ++n_idx_part; + if (mm_verbose >= 2 && n_idx_part > 1 && (opt.flag&MM_F_OUT_SAM) && !(opt.flag&MM_F_NO_SAM_SQ)) + 
fprintf(stderr, "[WARNING] \033[1;31mSAM output is malformated due to internal @SQ lines. Please add option --no-sam-sq or filter afterwards.\033[0m\n"); if (mm_verbose >= 3) fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq); diff --git a/map.c b/map.c index 5af4936..f531efa 100644 --- a/map.c +++ b/map.c @@ -385,7 +385,8 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int if (pl.fp == 0) return -1; pl.opt = opt, pl.mi = idx; pl.n_threads = n_threads, pl.mini_batch_size = mini_batch_size; - if (opt->flag & MM_F_OUT_SAM) sam_write_sam_header(idx); + if ((opt->flag & MM_F_OUT_SAM) && !(opt->flag & MM_F_NO_SAM_SQ)) + mm_write_sam_SQ(idx); kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3); free(pl.str.s); mm_bseq_close(pl.fp); diff --git a/minimap.h b/minimap.h index c09b846..e345915 100644 --- a/minimap.h +++ b/minimap.h @@ -18,6 +18,7 @@ #define MM_F_SPLICE_FOR 0x100 #define MM_F_SPLICE_REV 0x200 #define MM_F_SPLICE_BOTH 0x400 +#define MM_F_NO_SAM_SQ 0x800 #define MM_IDX_MAGIC "MMI\2" diff --git a/minimap2.1 b/minimap2.1 index d2d2530..d5535c0 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "13 August 2017" "minimap2-2.0-r296-dirty" "Bioinformatics tools" +.TH minimap2 1 "25 August 2017" "minimap2-2.0-r309-dirty" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences @@ -255,8 +255,13 @@ and use .BR -K500m . .TP -.B -V +.B --version Print version number to stdout +.TP +.B --no-sam-sq +Don't output SAM @SQ header lines. Use this option if the index consists of +multiple parts; otherwise the SAM output is malformatted due to internal @SQ +lines. .SS Preset options .TP 10 .BI -x \ STR @@ -320,7 +325,7 @@ is that this preset is not using HPC minimizers. 
Long-read spliced alignment .RB ( -k15 .B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -z200 -ub --cost-non-gt-ag -.BR 4 ). +.BR 5 ). In the splice mode, 1) long deletions are taken as introns and represented as the .RB ` N ' diff --git a/mmpriv.h b/mmpriv.h index 874f9e2..3c40c28 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -40,8 +40,8 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end); void radix_sort_64(uint64_t *beg, uint64_t *end); uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); -void mm_set_rg(const char *s); -void sam_write_sam_header(const mm_idx_t *idx); +void mm_write_sam_SQ(const mm_idx_t *idx); +void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *argv[]); void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag); void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs); int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km); From 5cbd77665158e5da95086d51fedf33faaaa26943 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 25 Aug 2017 13:17:56 +0800 Subject: [PATCH 36/39] test data for spliced alignment --- test/q2.fa | 2 ++ test/t2.fa | 2 ++ 2 files changed, 4 insertions(+) create mode 100644 test/q2.fa create mode 100644 test/t2.fa diff --git a/test/q2.fa b/test/q2.fa new file mode 100644 index 0000000..99d63a2 --- /dev/null +++ b/test/q2.fa @@ -0,0 +1,2 @@ +>q2 +GGACATCCCGATGGTGCAGTCCTACCTGTACGAAAGGAC diff --git a/test/t2.fa b/test/t2.fa new file mode 100644 index 0000000..8c1d30b --- /dev/null +++ b/test/t2.fa @@ -0,0 +1,2 @@ +>t2 +GGACATCCCGATGGTGCAGgtGCTATTAAAGGTTCGTTTGTTCAACGATTAAagTCCTACCTGTACGAAAGGAC From bf8246f8729e45be1f6a8d8cdcecb4979a67eb76 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 25 Aug 2017 13:35:55 +0800 Subject: [PATCH 37/39] Release minimap2-2.1-r311 --- NEWS.md 
| 25 +++++++++++++++++++++++++ main.c | 2 +- minimap2.1 | 2 +- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index b8bdd62..5bc0d76 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,28 @@ +Release 2.1-r311 (25 August 2017) +--------------------------------- + +This release adds spliced alignment for long noisy RNA-seq reads. On a SMRT +Iso-Seq and an Oxford Nanopore data set, minimap2 appears to outperform +traditional mRNA aligners. For DNA alignment, this release gives almost +identical output to v2.0. Other changes include: + + * Added option `-R` to set the read group header line in SAM. + + * Optionally output the `cs:Z` tag in PAF to encode both the query and the + reference sequences in the alignment. + + * Fixed an issue where DP alignment uses excessive memory. + +The minimap2 technical report has been updated with more details and the +evaluation of spliced alignment: + + * Li, H. (2017). Minimap2: fast pairwise alignment for long nucleotide + sequences. [arXiv:1708.01492v2](https://arxiv.org/abs/1708.01492v2). 
+ +(2.1: 25 August 2017, r311) + + + Release 2.0-r275 (8 August 2017) -------------------------------- diff --git a/main.c b/main.c index b040520..7199a07 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r309-dirty" +#define MM_VERSION "2.1-r311" void liftrlimit() { diff --git a/minimap2.1 b/minimap2.1 index d5535c0..61a0bdb 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "25 August 2017" "minimap2-2.0-r309-dirty" "Bioinformatics tools" +.TH minimap2 1 "25 August 2017" "minimap2-2.1-r311" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences From 5f96d851a8f5f7e57c06efaa4f3f4b46c1d853ba Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 25 Aug 2017 14:11:54 +0800 Subject: [PATCH 38/39] added spliced alignment example --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 21f69f0..50bdc94 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,8 @@ cd minimap2 && make ./minimap2 -ax map10k MT-human.mmi test/MT-orang.fa > test.sam # long-read overlap (no test data) ./minimap2 -x ava-pb your-reads.fa your-reads.fa > overlaps.paf +# spliced alignment (no test data) +./minimap2 -ax splice ref.fa rna-seq-reads.fa > spliced.sam # man page man ./minimap2.1 ``` From 26416136860b0b8b607b77ade54c94d2080e5891 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 25 Aug 2017 17:56:16 +0800 Subject: [PATCH 39/39] various minor improvements --- tex/minimap2.tex | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/tex/minimap2.tex b/tex/minimap2.tex index 9036a27..35090fb 100644 --- a/tex/minimap2.tex +++ b/tex/minimap2.tex @@ -177,8 +177,8 @@ based on Eq.~(\ref{eq:ae86}) can achieve 16-way parallelization for short sequences, but only 4-way parallelization when the peak alignment score reaches 32767. 
Long sequence alignment may exceed this threshold. Inspired by \citet{Wu:1996aa} and the following work, \citet{Suzuki:2016} proposed a -difference-based formulation that lifted this limitation. In case of 2-piece -gap cost, define +difference-based formulation that lifted this limitation. +In case of 2-piece gap cost, define \[ \left\{\begin{array}{ll} u_{ij}\triangleq H_{ij}-H_{i-1,j} & v_{ij}\triangleq H_{ij}-H_{i,j-1} \\ @@ -199,9 +199,10 @@ y_{ij}&=&\max\{0,y_{i,j-1}+u_{i,j-1}-z_{ij}+q\}-q-e\\ \tilde{y}_{ij}&=&\max\{0,\tilde{y}_{i,j-1}+u_{i,j-1}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e} \end{array}\right. \end{equation} -where $z_{ij}$ is a temporary variable that does not need to be stored. An -important property of Eq.~(\ref{eq:suzuki}) is that all values are bounded. To -see that, +where $z_{ij}$ is a temporary variable that does not need to be stored. + +An important property of Eq.~(\ref{eq:suzuki}) is that all values are bounded +by scoring parameters. To see that, \[ x_{ij}=E_{i+1,j}-H_{ij}=\max\{-q,E_{ij}-H_{ij}\}-e \] @@ -245,8 +246,8 @@ each other. This allows us to fully vectorize the computation of all cells on the same anti-diagonal in one inner loop. It also simplifies banded alignment, which would be difficult with striped vectorization~\citep{Farrar:2007hs}. -On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the boundary -condition of the equation above is +On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the initial +values in the diagonal-antidiagonal formulation are \[ \left\{\begin{array}{l} x_{r-1,-1}=y_{r-1,r}=-q-e\\ @@ -263,7 +264,7 @@ r\cdot(e-\tilde{e})-(\tilde{q}-q)-\tilde{e} & (r=\lceil\frac{\tilde{q}-q}{e-\til -\tilde{e} & (r>\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \end{array}\right. \] -These can be derived from the initial conditions of Eq.~(\ref{eq:ae86}). +These can be derived from the initial values for Eq.~(\ref{eq:ae86}). 
In practice, our 16-way vectorized implementation of global alignment is three times as fast as Parasail's 4-way vectorization~\citep{Daily:2016aa}. Without @@ -285,9 +286,9 @@ $j'<j$, such that S(i',j')-S(i,j)>Z+e\cdot|(i-i')-(j-j')| \] where $e$ is the gap extension cost and $Z$ is an arbitrary threshold. -This strategy is similar to X-drop employed in BLAST~\citep{Altschul:1997vn}. -However, unlike X-drop, it would not break the alignment in the presence of a -single long gap. +This strategy is first used in BWA-MEM. It is similar to X-drop employed in +BLAST~\citep{Altschul:1997vn}, but unlike X-drop, it would not break the +alignment in the presence of a single long gap. When minimap2 breaks a global alignment between two anchors, it performs local alignment between the two subsequences involved in the global alignment, but @@ -326,7 +327,7 @@ F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\ \end{array}\right. \end{equation} Let $T$ be the reference sequence. $d(i)$ is the cost of a non-canonical donor -site, which takes 0 if $T[i+1,i+2]={\tt GT}$, or a postive number $p$ +site, which takes 0 if $T[i+1,i+2]={\tt GT}$, or a positive number $p$ otherwise. Similarly, $a(i)$ is the cost of a non-canonical acceptor site, which takes 0 if $T[i-1,i]={\tt AG}$, or $p$ otherwise. Eq.~(\ref{eq:splice}) is almost equivalent to the equation used by EXALIN~\citep{Zhang:2006aa} except @@ -361,18 +362,18 @@ alignment. \centering \includegraphics[width=.5\textwidth]{roc-color.pdf} \caption{Evaluation on simulated SMRT reads aligned against human genome -GRCh38. (a) ROC-like curve. Alignments are sorted by mapping quality in the -descending order. For each mapping quality threshold, the fraction of -alignments with mapping quality above the threshold and their error rate -are plotted. (b) Accumulative mapping error rate as a function of mapping -quality. 33,088 $\ge$1000bp reads were simulated using pbsim~\citep{Ono:2013aa} +GRCh38. 
33,088 $\ge$1000bp reads were simulated using pbsim~\citep{Ono:2013aa} with error profile sampled from file `m131017\_060208\_42213\_*.1.*' downloaded at \href{http://bit.ly/chm1p5c3}{http://bit.ly/chm1p5c3}. The N50 read length is 11,628. A read is considered correctly mapped if the true position overlaps with the best mapping position by 10\% of the read length. All aligners were -run under the default setting for SMRT reads. Kart outputted all alignments at -mapping quality 60, so is not shown in the figure. It mapped nearly all reads -with 4.1\% of alignments being wrong, less accurate than others.}\label{fig:eval} +run under the default setting for SMRT reads. (a) ROC-like curve. Alignments +are sorted by mapping quality in the descending order. For each mapping quality +threshold, the fraction of alignments with mapping quality above the threshold +and their error rate are plotted. Kart outputted all alignments at mapping +quality 60, so is not shown in the figure. It mapped nearly all reads with +4.1\% of alignments being wrong, less accurate than others. (b) Accumulative +mapping error rate as a function of mapping quality.}\label{fig:eval} \end{figure} As a sanity check, we evaluated minimap2 on simulated human reads along with @@ -383,7 +384,7 @@ Kart~(v2.2.5; \citealp{Lin:2017aa}), minialign~(v0.5.3; \citealp{Suzuki:2016}) and NGMLR~(v0.2.5; \citealp{Sedlazeck169557}). We excluded rHAT~\citep{Liu:2016ab} and LAMSA~\citep{Liu:2017aa} because they either -crashed or produced malformatted output. In this evaluation, Minimap2 has +crashed or produced malformatted output. In this evaluation, minimap2 has higher power to distinguish unique and repetitive hits, and achieves overall higher mapping accuracy (Fig.~\ref{fig:eval}a). It is still the most accurate even if we skip DP-based alignment (data not shown), confirming chaining alone @@ -500,8 +501,8 @@ necessary to justify the use of minimap2 for such applications. 
We owe a debt of gratitude to Hajime Suzuki for releasing his masterpiece and insightful notes before formal publication. We thank M. Schatz, P. Rescheneder and F. Sedlazeck for pointing out the limitation of BWA-MEM. We are also -grateful to early minimap2 testers who have greatly helped to fix various -issues. +grateful to early minimap2 testers who have greatly helped to suggest features +and to fix various issues. \bibliography{minimap2}