diff --git a/Makefile b/Makefile
index 31eb857..f7892e8 100644
--- a/Makefile
+++ b/Makefile
@@ -2,8 +2,8 @@ CC=			gcc
 CFLAGS=		-g -Wall -O2 -Wc++-compat
 CPPFLAGS=	-DHAVE_KALLOC
 INCLUDES=	-I.
-OBJS=		kthread.o kalloc.o ksw2_extz2_sse.o ksw2_extd2_sse.o ksw2_ll_sse.o misc.o bseq.o \
-			sketch.o sdust.o index.o chain.o align.o hit.o map.o format.o
+OBJS=		kthread.o kalloc.o ksw2_extz2_sse.o ksw2_extd2_sse.o ksw2_exts2_sse.o ksw2_ll_sse.o \
+			misc.o bseq.o sketch.o sdust.o index.o chain.o align.o hit.o map.o format.o
 PROG=		minimap2
 PROG_EXTRA=	sdust minimap2-lite
 LIBS=		-lm -lz -lpthread
@@ -45,11 +45,12 @@ align.o: minimap.h mmpriv.h bseq.h ksw2.h kalloc.h
 bseq.o: bseq.h kseq.h
 chain.o: minimap.h mmpriv.h bseq.h kalloc.h
 example.o: minimap.h kseq.h
-format.o: mmpriv.h minimap.h bseq.h
+format.o: kalloc.h mmpriv.h minimap.h bseq.h
 hit.o: mmpriv.h minimap.h bseq.h kalloc.h
 index.o: kthread.h bseq.h minimap.h mmpriv.h kvec.h kalloc.h khash.h
 kalloc.o: kalloc.h
 ksw2_extd2_sse.o: ksw2.h kalloc.h
+ksw2_exts2_sse.o: ksw2.h kalloc.h
 ksw2_extz2_sse.o: ksw2.h kalloc.h
 ksw2_ll_sse.o: ksw2.h kalloc.h
 main.o: bseq.h minimap.h mmpriv.h
diff --git a/NEWS.md b/NEWS.md
index eb0c522..5bc0d76 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,41 @@
+Release 2.1-r311 (25 August 2017)
+---------------------------------
+
+This release adds spliced alignment for long noisy RNA-seq reads. On a SMRT
+Iso-Seq and a Oxford Nanopore data sets, minimap2 appears to outperform
+traditional mRNA aligners. For DNA alignment, this release gives almost
+identical output to v2.0. Other changes include:
+
+ * Added option `-R` to set the read group header line in SAM.
+
+ * Optionally output the `cs:Z` tag in PAF to encode both the query and the
+   reference sequences in the alignment.
+
+ * Fixed an issue where DP alignment uses excessive memory.
+
+The minimap2 technical report has been updated with more details and the
+evaluation of spliced alignment:
+
+ * Li, H. (2017). Minimap2: fast pairwise alignment for long nucleotide
+   sequences. [arXiv:1708.01492v2](https://arxiv.org/abs/1708.01492v2).
+
+(2.1: 25 August 2017, r311)
+
+
+
+Release 2.0-r275 (8 August 2017)
+--------------------------------
+
+This release is identical to version 2.0rc1, except the version number. It is
+described and evaluated in the following technical report:
+
+ * Li, H. (2017). Minimap2: fast pairwise alignment for long DNA sequences.
+   [arXiv:1708.01492v1](https://arxiv.org/abs/1708.01492v1).
+
+(2.0: 8 August 2017, r275)
+
+
+
 Release 2.0rc1-r232 (30 July 2017)
 ----------------------------------
 
diff --git a/README.md b/README.md
index 482ca9d..50bdc94 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,4 @@
+[![Build Status](https://travis-ci.org/lh3/minimap2.svg?branch=master)](https://travis-ci.org/lh3/minimap2)
 ## Getting Started
 ```sh
 git clone https://github.com/lh3/minimap2
@@ -9,6 +10,8 @@ cd minimap2 && make
 ./minimap2 -ax map10k MT-human.mmi test/MT-orang.fa > test.sam
 # long-read overlap (no test data)
 ./minimap2 -x ava-pb your-reads.fa your-reads.fa > overlaps.paf
+# spliced alignment (no test data)
+./minimap2 -ax splice ref.fa rna-seq-reads.fa > spliced.sam
 # man page
 man ./minimap2.1
 ```
diff --git a/align.c b/align.c
index 595f0b1..84a7336 100644
--- a/align.c
+++ b/align.c
@@ -53,7 +53,7 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c
 		} else if (op == 1) {
 			score -= q + e * len, j += len;
 			if (test_zdrop_aux(score, i, j, &max, &max_i, &max_j, e, zdrop)) return 1;
-		} else if (op == 2) {
+		} else if (op == 2 || op == 3) {
 			score -= q + e * len, i += len;
 			if (test_zdrop_aux(score, i, j, &max, &max_i, &max_j, e, zdrop)) return 1;
 		}
@@ -64,11 +64,11 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c
 static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e)
 {
 	uint32_t k, l, toff = 0, qoff = 0;
-	int32_t s = 0, max = 0;
+	int32_t s = 0, max = 0, n_gtag = 0, n_ctac = 0;
 	if (p == 0) return;
 	for (k = 0; k < p->n_cigar; ++k) {
 		uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
-		if (op == 0) {
+		if (op == 0) { // match/mismatch
 			for (l = 0; l < len; ++l) {
 				int cq = qseq[qoff + l], ct = tseq[toff + l];
 				if (ct > 3 || cq > 3) ++p->n_ambi;
@@ -78,7 +78,7 @@ static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *t
 				else max = max > s? max : s;
 			}
 			toff += len, qoff += len, p->blen += len;
-		} else if (op == 1) {
+		} else if (op == 1) { // insertion
 			int n_ambi = 0;
 			for (l = 0; l < len; ++l)
 				if (qseq[qoff + l] > 3) ++n_ambi;
@@ -86,7 +86,7 @@ static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *t
 			p->n_ambi += n_ambi, p->n_diff += len - n_ambi;
 			s -= q + e * len;
 			if (s < 0) s = 0;
-		} else if (op == 2) {
+		} else if (op == 2) { // deletion
 			int n_ambi = 0;
 			for (l = 0; l < len; ++l)
 				if (tseq[toff + l] > 3) ++n_ambi;
@@ -94,9 +94,18 @@ static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *t
 			p->n_ambi += n_ambi, p->n_diff += len - n_ambi;
 			s -= q + e * len;
 			if (s < 0) s = 0;
+		} else if (op == 3) { // intron
+			uint8_t b[4];
+			b[0] = tseq[toff], b[1] = tseq[toff+1];
+			b[2] = tseq[toff+len-2], b[3] = tseq[toff+len-1];
+			if (memcmp(b, "\2\3\0\2", 4) == 0) ++n_gtag;
+			else if (memcmp(b, "\1\3\0\1", 4) == 0) ++n_ctac;
+			toff += len, p->blen += len;
 		}
 	}
 	p->dp_max = max;
+	if (n_gtag > n_ctac) p->trans_strand = 1;
+	else if (n_gtag < n_ctac) p->trans_strand = 2;
 }
 
 static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) // TODO: this calls the libc realloc()
@@ -132,7 +141,9 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
 		for (i = 0; i < tlen; ++i) fputc("ACGTN"[tseq[i]], stderr); fputc('\n', stderr);
 		for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr); fputc('\n', stderr);
 	}
-	if (opt->q == opt->q2 && opt->e == opt->e2)
+	if (opt->flag & MM_F_SPLICE)
+		ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, opt->zdrop, flag, ez);
+	else if (opt->q == opt->q2 && opt->e == opt->e2)
 		ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, opt->zdrop, flag, ez);
 	else
 		ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, w, opt->zdrop, flag, ez);
@@ -159,8 +170,8 @@ static inline void mm_adjust_minier(const mm_idx_t *mi, uint8_t *const qseq0[2],
 		c = mm_get_hplen_back(mi, a->x<<1>>33, (int32_t)a->x);
 		*r = (int32_t)a->x + 1 - c;
 	} else {
-		*r = (int32_t)a->x + 1;
-		*q = (int32_t)a->y + 1;
+		*r = (int32_t)a->x - (mi->k>>1);
+		*q = (int32_t)a->y - (mi->k>>1);
 	}
 }
 
@@ -240,11 +251,11 @@ static void mm_fix_bad_ends(const mm_reg1_t *r, const mm128_t *a, int bw, int32_
 	}
 }
 
-static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], mm_reg1_t *r, mm_reg1_t *r2, mm128_t *a, ksw_extz_t *ez)
+static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], mm_reg1_t *r, mm_reg1_t *r2, mm128_t *a, ksw_extz_t *ez, int splice_flag)
 {
 	int32_t rid = a[r->as].x<<1>>33, rev = a[r->as].x>>63, as1, cnt1;
 	uint8_t *tseq, *qseq;
-	int32_t i, l, bw, dropped = 0, rs0, re0, qs0, qe0;
+	int32_t i, l, bw, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0;
 	int32_t rs, re, qs, qe;
 	int32_t rs1, qs1, re1, qe1;
 	int8_t mat[25];
@@ -254,11 +265,19 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
 	bw = (int)(opt->bw * 1.5 + 1.);
 
 	r2->cnt = 0;
-	mm_fix_bad_ends(r, a, opt->bw, &as1, &cnt1);
+	if (!(opt->flag & MM_F_SPLICE))
+		mm_fix_bad_ends(r, a, opt->bw, &as1, &cnt1);
+	else as1 = r->as, cnt1 = r->cnt;
 	mm_filter_bad_seeds(km, as1, cnt1, a, 10, 40, opt->max_gap>>1, 10);
 	mm_adjust_minier(mi, qseq0, &a[as1], &rs, &qs);
 	mm_adjust_minier(mi, qseq0, &a[as1 + cnt1 - 1], &re, &qe);
 
+	if (opt->flag & MM_F_SPLICE) {
+		if (splice_flag & MM_F_SPLICE_FOR) extra_flag |= rev? KSW_EZ_SPLICE_REV : KSW_EZ_SPLICE_FOR;
+		if (splice_flag & MM_F_SPLICE_REV) extra_flag |= rev? KSW_EZ_SPLICE_FOR : KSW_EZ_SPLICE_REV;
+		if (splice_flag & MM_F_SPLICE_BOTH) extra_flag |= KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV;
+	}
+
 	// compute rs0 and qs0
 	if (r->split && as1 > 0) {
 		mm_adjust_minier(mi, qseq0, &a[as1-1], &rs0, &qs0);
@@ -291,7 +310,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
 		mm_idx_getseq(mi, rid, rs0, rs, tseq);
 		mm_seq_rev(qs - qs0, qseq);
 		mm_seq_rev(rs - rs0, tseq);
-		mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, mat, bw, KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez);
+		mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, mat, bw, extra_flag|KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez);
 		if (ez->n_cigar > 0) {
 			mm_append_cigar(r, ez->n_cigar, ez->cigar);
 			r->p->dp_score += ez->max;
@@ -313,9 +332,9 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
 				bw1 = qe - qs > re - rs? qe - qs : re - rs;
 			qseq = &qseq0[rev][qs];
 			mm_idx_getseq(mi, rid, rs, re, tseq);
-			mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, KSW_EZ_APPROX_MAX, ez);
+			mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag|KSW_EZ_APPROX_MAX, ez);
 			if (mm_check_zdrop(qseq, tseq, ez->n_cigar, ez->cigar, mat, opt->q, opt->e, opt->zdrop))
-				mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, 0, ez);
+				mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag, ez);
 			if (ez->n_cigar > 0)
 				mm_append_cigar(r, ez->n_cigar, ez->cigar);
 			if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle.
@@ -338,7 +357,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
 	if (!dropped && qe < qe0 && re < re0) { // right extension
 		qseq = &qseq0[rev][qe];
 		mm_idx_getseq(mi, rid, re, re0, tseq);
-		mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, mat, bw, KSW_EZ_EXTZ_ONLY, ez);
+		mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, mat, bw, extra_flag|KSW_EZ_EXTZ_ONLY, ez);
 		if (ez->n_cigar > 0) {
 			mm_append_cigar(r, ez->n_cigar, ez->cigar);
 			r->p->dp_score += ez->max;
@@ -356,6 +375,8 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
 	if (r->p) {
 		mm_idx_getseq(mi, rid, rs1, re1, tseq);
 		mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e);
+		if (rev && r->p->trans_strand)
+			r->p->trans_strand ^= 3; // flip to the read strand
 	}
 
 	kfree(km, tseq);
@@ -438,7 +459,28 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m
 	memset(&ez, 0, sizeof(ksw_extz_t));
 	for (i = 0; i < n_regs; ++i) {
 		mm_reg1_t r2;
-		mm_align1(km, opt, mi, qlen, qseq0, &regs[i], &r2, a, &ez);
+		if ((opt->flag&MM_F_SPLICE) && (opt->flag&MM_F_SPLICE_FOR) && (opt->flag&MM_F_SPLICE_REV)) {
+			mm_reg1_t s[2], s2[2];
+			int which, trans_strand;
+			s[0] = s[1] = regs[i];
+			mm_align1(km, opt, mi, qlen, qseq0, &s[0], &s2[0], a, &ez, MM_F_SPLICE_FOR);
+			mm_align1(km, opt, mi, qlen, qseq0, &s[1], &s2[1], a, &ez, MM_F_SPLICE_REV);
+			if (s[0].p->dp_score > s[1].p->dp_score) which = 0, trans_strand = 1;
+			else if (s[0].p->dp_score < s[1].p->dp_score) which = 1, trans_strand = 2;
+			else trans_strand = 3, which = (qlen + s[0].p->dp_score) & 1; // randomly choose a strand, effectively
+			if (which == 0) {
+				regs[i] = s[0], r2 = s2[0];
+				free(s[1].p);
+			} else {
+				regs[i] = s[1], r2 = s2[1];
+				free(s[0].p);
+			}
+			regs[i].p->trans_strand = trans_strand;
+		} else {
+			mm_align1(km, opt, mi, qlen, qseq0, &regs[i], &r2, a, &ez, opt->flag);
+			if ((opt->flag&MM_F_SPLICE) && !(opt->flag&MM_F_SPLICE_BOTH))
+				regs[i].p->trans_strand = opt->flag&MM_F_SPLICE_FOR? 1 : 2;
+		}
 		if (r2.cnt > 0) regs = mm_insert_reg(&r2, i, &n_regs, regs);
 		if (i > 0 && mm_align1_inv(km, opt, mi, qlen, qseq0, &regs[i-1], &regs[i], &r2, &ez)) {
 			regs = mm_insert_reg(&r2, i, &n_regs, regs);
diff --git a/bseq.c b/bseq.c
index e3e576f..30d285a 100644
--- a/bseq.c
+++ b/bseq.c
@@ -8,7 +8,6 @@
 KSEQ_INIT(gzFile, gzread)
 
 struct mm_bseq_file_s {
-	int is_eof;
 	gzFile fp;
 	kseq_t *ks;
 };
@@ -53,12 +52,11 @@ mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int chunk_size, int with_qual, int
 		size += seqs[n++].l_seq;
 		if (size >= chunk_size) break;
 	}
-	if (size < chunk_size) fp->is_eof = 1;
 	*n_ = n;
 	return seqs;
 }
 
 int mm_bseq_eof(mm_bseq_file_t *fp)
 {
-	return fp->is_eof;
+	return ks_eof(fp->ks->f);
 }
diff --git a/chain.c b/chain.c
index 1c72631..37c920a 100644
--- a/chain.c
+++ b/chain.c
@@ -19,7 +19,7 @@ static inline int ilog2_32(uint32_t v)
 	return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v];
 }
 
-int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int64_t n, mm128_t *a, uint64_t **_u, void *km)
+int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km)
 { // TODO: make sure this works when n has more than 32 bits
 	int32_t st = 0, k, *f, *p, *t, *v, n_u, n_v;
 	int64_t i, j;
@@ -42,17 +42,23 @@ int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int
 		uint64_t ri = a[i].x;
 		int32_t qi = (int32_t)a[i].y, q_span = a[i].y>>32&0xff; // NB: only 8 bits of span is used!!!
 		int32_t max_f = q_span, max_j = -1, n_skip = 0, min_d, max_f_past = -INT32_MAX;
-		while (st < i && ri - a[st].x > max_dist) ++st;
+		while (st < i && ri - a[st].x > max_dist_x) ++st;
 		for (j = i - 1; j >= st; --j) {
 			int64_t dr = ri - a[j].x;
 			int32_t dq = qi - (int32_t)a[j].y, dd, sc;
-			if (dr == 0 || dq <= 0 || dq > max_dist) continue;
+			if (dr == 0 || dq <= 0 || dq > max_dist_y) continue;
 			dd = dr > dq? dr - dq : dq - dr;
 			if (dd > bw) continue;
 			max_f_past = max_f_past > f[j]? max_f_past : f[j];
 			min_d = dq < dr? dq : dr;
 			sc = min_d > q_span? q_span : dq < dr? dq : dr;
-			sc -= (int)(dd * .01 * avg_qspan) + (ilog2_32(dd)>>1);
+			if (is_cdna) {
+				int c_log, c_lin;
+				c_lin = (int)(dd * .01 * avg_qspan);
+				c_log = ilog2_32(dd);
+				if (dr > dq) sc -= c_lin < c_log? c_lin : c_log;
+				else sc -= c_lin + (c_log>>1);
+			} else sc -= (int)(dd * .01 * avg_qspan) + (ilog2_32(dd)>>1);
 			sc += f[j];
 			if (sc > max_f) {
 				max_f = sc, max_j = j;
diff --git a/format.c b/format.c
index fd51c00..949c0cd 100644
--- a/format.c
+++ b/format.c
@@ -6,6 +6,8 @@
 #include "kalloc.h"
 #include "mmpriv.h"
 
+static char mm_rg_id[256];
+
 static inline void str_enlarge(kstring_t *s, int l)
 {
 	if (s->l + l + 1 > s->m) {
@@ -56,6 +58,70 @@ static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...)
 	s->s[s->l] = 0;
 }
 
+static char *mm_escape(char *s)
+{
+	char *p, *q;
+	for (p = q = s; *p; ++p) {
+		if (*p == '\\') {
+			++p;
+			if (*p == 't') *q++ = '\t';
+			else if (*p == '\\') *q++ = '\\';
+		} else *q++ = *p;
+	}
+	*q = '\0';
+	return s;
+}
+
+static void sam_write_rg_line(kstring_t *str, const char *s)
+{
+	char *p, *q, *r, *rg_line = 0;
+	memset(mm_rg_id, 0, 256);
+	if (s == 0) return;
+	if (strstr(s, "@RG") != s) {
+		if (mm_verbose >= 1) fprintf(stderr, "[ERROR] the read group line is not started with @RG\n");
+		goto err_set_rg;
+	}
+	if (strstr(s, "\t") != NULL) {
+		if (mm_verbose >= 1) fprintf(stderr, "[ERROR] the read group line contained literal <tab> characters -- replace with escaped tabs: \\t\n");
+		goto err_set_rg;
+	}
+	rg_line = strdup(s);
+	mm_escape(rg_line);
+	if ((p = strstr(rg_line, "\tID:")) == 0) {
+		if (mm_verbose >= 1) fprintf(stderr, "[ERROR] no ID within the read group line\n");
+		goto err_set_rg;
+	}
+	p += 4;
+	for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
+	if (q - p + 1 > 256) {
+		if (mm_verbose >= 1) fprintf(stderr, "[ERROR] @RG:ID is longer than 255 characters\n");
+		goto err_set_rg;
+	}
+	for (q = p, r = mm_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
+		*r++ = *q;
+	mm_sprintf_lite(str, "%s\n", rg_line);
+
+err_set_rg:
+	free(rg_line);
+}
+
+void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *argv[])
+{
+	kstring_t str = {0,0,0};
+	sam_write_rg_line(&str, rg);
+	mm_sprintf_lite(&str, "@PG\tID:minimap2\tPN:minimap2");
+	if (ver) mm_sprintf_lite(&str, "\tVN:%s", ver);
+	if (argc > 1) {
+		int i;
+		mm_sprintf_lite(&str, "\tCL:minimap2");
+		for (i = 1; i < argc; ++i)
+			mm_sprintf_lite(&str, " %s", argv[i]);
+	}
+	mm_sprintf_lite(&str, "\n");
+	fputs(str.s, stdout);
+	free(str.s);
+}
+
 static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r)
 {
 	extern unsigned char seq_nt4_table[256];
@@ -119,7 +185,11 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
 	mm_sprintf_lite(s, "\ttp:A:%c\tcm:i:%d\ts1:i:%d", type, r->cnt, r->score);
 	if (r->parent == r->id) mm_sprintf_lite(s, "\ts2:i:%d", r->subsc);
 	if (r->split) mm_sprintf_lite(s, "\tzd:i:%d", r->split);
-	if (r->p) mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi);
+	if (r->p) {
+		mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi);
+		if (r->p->trans_strand == 1 || r->p->trans_strand == 2)
+			mm_sprintf_lite(s, "\tts:A:%c", "?+-?"[r->p->trans_strand]);
+	}
 }
 
 void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag)
@@ -137,7 +207,7 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
 		uint32_t k;
 		mm_sprintf_lite(s, "\tcg:Z:");
 		for (k = 0; k < r->p->n_cigar; ++k)
-			mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MID"[r->p->cigar[k]&0xf]);
+			mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]);
 	}
 	if (r->p && (opt_flag & MM_F_OUT_CS))
 		write_cs(km, s, mi, t, r);
@@ -154,6 +224,13 @@ static char comp_tab[] = {
 	'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127
 };
 
+void mm_write_sam_SQ(const mm_idx_t *idx)
+{
+	uint32_t i;
+	for (i = 0; i < idx->n_seq; ++i)
+		printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len);
+}
+
 static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp)
 {
 	if (rev) {
@@ -187,7 +264,7 @@ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
 			int clip_char = (flag&0x800)? 'H' : 'S';
 			if (clip_len) mm_sprintf_lite(s, "%d%c", clip_len, clip_char);
 			for (k = 0; k < r->p->n_cigar; ++k)
-				mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MID"[r->p->cigar[k]&0xf]);
+				mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]);
 			clip_len = r->rev? r->qs : t->l_seq - r->qe;
 			if (clip_len) mm_sprintf_lite(s, "%d%c", clip_len, clip_char);
 		} else mm_sprintf_lite(s, "*");
@@ -206,6 +283,7 @@ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
 			else mm_sprintf_lite(s, "*");
 		}
 		write_tags(s, r);
+		if (mm_rg_id[0]) mm_sprintf_lite(s, "\tRG:Z:%s", mm_rg_id);
 		if (r->parent == r->id && r->p && n_regs > 1 && regs && r >= regs && r - regs < n_regs) { // supplementary aln may exist
 			int i, n_sa = 0; // n_sa: number of SA fields
 			for (i = 0; i < n_regs; ++i)
diff --git a/index.c b/index.c
index c139ec5..dc58231 100644
--- a/index.c
+++ b/index.c
@@ -285,12 +285,12 @@ static void *worker_pipeline(void *shared, int step, void *in)
 mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int is_hpc, int mini_batch_size, int n_threads, uint64_t batch_size, int keep_name)
 {
 	pipeline_t pl;
+	if (fp == 0 || mm_bseq_eof(fp)) return 0;
 	memset(&pl, 0, sizeof(pipeline_t));
 	pl.mini_batch_size = mini_batch_size < batch_size? mini_batch_size : batch_size;
 	pl.keep_name = keep_name;
 	pl.batch_size = batch_size;
 	pl.fp = fp;
-	if (pl.fp == 0) return 0;
 	pl.mi = mm_idx_init(w, k, b, is_hpc);
 
 	kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3);
diff --git a/ksw2.h b/ksw2.h
index b7774a8..5e43970 100644
--- a/ksw2.h
+++ b/ksw2.h
@@ -12,6 +12,8 @@
 #define KSW_EZ_APPROX_DROP 0x10 // approximate Z-drop; faster with sse
 #define KSW_EZ_EXTZ_ONLY   0x40 // only perform extension
 #define KSW_EZ_REV_CIGAR   0x80 // reverse CIGAR in the output
+#define KSW_EZ_SPLICE_FOR  0x100
+#define KSW_EZ_SPLICE_REV  0x200
 
 #ifdef __cplusplus
 extern "C" {
@@ -53,6 +55,9 @@ void ksw_extd(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t
 void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
 				   int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int flag, ksw_extz_t *ez);
 
+void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
+				   int8_t gapo, int8_t gape, int8_t gapo2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez);
+
 void ksw_extf2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez);
 
 /**
@@ -106,7 +111,8 @@ static inline uint32_t *ksw_push_cigar(void *km, int *n_cigar, int *m_cigar, uin
 //   bit 0-2: which type gets the max - 0 for H, 1 for E, 2 for F, 3 for \tilde{E} and 4 for \tilde{F}
 //   bit 3/0x08: 1 if a continuation on the E state (bit 5/0x20 for a continuation on \tilde{E})
 //   bit 4/0x10: 1 if a continuation on the F state (bit 6/0x40 for a continuation on \tilde{F})
-static inline void ksw_backtrack(void *km, int is_rot, int is_rev, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0, int *m_cigar_, int *n_cigar_, uint32_t **cigar_)
+static inline void ksw_backtrack(void *km, int is_rot, int is_rev, int with_N, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0,
+								 int *m_cigar_, int *n_cigar_, uint32_t **cigar_)
 { // p[] - lower 3 bits: which type gets the max; bit
 	int n_cigar = 0, m_cigar = *m_cigar_, i = i0, j = j0, r, state = 0;
 	uint32_t *cigar = *cigar_, tmp;
@@ -127,7 +133,8 @@ static inline void ksw_backtrack(void *km, int is_rot, int is_rev, const uint8_t
 		if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure
 		if (force_state >= 0) state = force_state;
 		if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match
-		else if (state == 1 || state == 3) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion
+		else if (state == 1 || (state == 3 && !with_N)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion
+		else if (state == 3 && with_N) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 3, 1), --i; // intron
 		else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion
 	}
 	if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, i + 1); // first deletion
diff --git a/ksw2_extd2_sse.c b/ksw2_extd2_sse.c
index e0f5337..56cd8cd 100644
--- a/ksw2_extd2_sse.c
+++ b/ksw2_extd2_sse.c
@@ -1,5 +1,6 @@
 #include <string.h>
 #include <stdio.h>
+#include <assert.h>
 #include "ksw2.h"
 
 #ifdef __SSE2__
@@ -66,7 +67,8 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
 	if (w < 0) w = tlen > qlen? tlen : qlen;
 	wl = wr = w;
 	tlen_ = (tlen + 15) / 16;
-	n_col_ = ((w + 1 < tlen? (w + 1 < qlen? w + 1 : qlen): tlen) + 15) / 16 + 1;
+	n_col_ = qlen < tlen? qlen : tlen;
+	n_col_ = ((n_col_ < w + 1? n_col_ : w + 1) + 15) / 16 + 1;
 	qlen_ = (qlen + 15) / 16;
 	for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) {
 		max_sc = max_sc > mat[t]? max_sc : mat[t];
@@ -161,6 +163,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
 		x21_ = _mm_cvtsi32_si128((uint8_t)x21);
 		v1_  = _mm_cvtsi32_si128((uint8_t)v1);
 		st_ = st / 16, en_ = en / 16;
+		assert(en_ - st_ + 1 <= n_col_);
 		if (!with_cigar) { // score only
 			for (t = st_; t <= en_; ++t) {
 				__m128i z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp;
@@ -362,9 +365,9 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
 	if (with_cigar) { // backtrack
 		int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
 		if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY))
-			ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
+			ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
 		else if (ez->max_t >= 0 && ez->max_q >= 0)
-			ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
+			ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
 		kfree(km, mem2); kfree(km, off);
 	}
 }
diff --git a/ksw2_exts2_sse.c b/ksw2_exts2_sse.c
new file mode 100644
index 0000000..6decd2d
--- /dev/null
+++ b/ksw2_exts2_sse.c
@@ -0,0 +1,357 @@
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include "ksw2.h"
+
+#ifdef __SSE2__
+#include <emmintrin.h>
+
+#ifdef __SSE4_1__
+#include <smmintrin.h>
+#endif
+
+void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
+				   int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez)
+{
+#define __dp_code_block1 \
+	z = _mm_load_si128(&s[t]); \
+	xt1 = _mm_load_si128(&x[t]);                     /* xt1 <- x[r-1][t..t+15] */ \
+	tmp = _mm_srli_si128(xt1, 15);                   /* tmp <- x[r-1][t+15] */ \
+	xt1 = _mm_or_si128(_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \
+	x1_ = tmp; \
+	vt1 = _mm_load_si128(&v[t]);                     /* vt1 <- v[r-1][t..t+15] */ \
+	tmp = _mm_srli_si128(vt1, 15);                   /* tmp <- v[r-1][t+15] */ \
+	vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \
+	v1_ = tmp; \
+	a = _mm_add_epi8(xt1, vt1);                      /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \
+	ut = _mm_load_si128(&u[t]);                      /* ut <- u[t..t+15] */ \
+	b = _mm_add_epi8(_mm_load_si128(&y[t]), ut);     /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */ \
+	x2t1= _mm_load_si128(&x2[t]); \
+	tmp = _mm_srli_si128(x2t1, 15); \
+	x2t1= _mm_or_si128(_mm_slli_si128(x2t1, 1), x21_); \
+	x21_= tmp; \
+	a2  = _mm_add_epi8(x2t1, vt1); \
+	a2a = _mm_add_epi8(a2, _mm_load_si128(&acceptor[t]));
+
+#define __dp_code_block2 \
+	_mm_store_si128(&u[t], _mm_sub_epi8(z, vt1));    /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \
+	_mm_store_si128(&v[t], _mm_sub_epi8(z, ut));     /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \
+	tmp = _mm_sub_epi8(z, q_); \
+	a = _mm_sub_epi8(a, tmp); \
+	b = _mm_sub_epi8(b, tmp); \
+	a2= _mm_sub_epi8(a2, _mm_sub_epi8(z, q2_));
+
+	int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, max_sc, min_sc, long_thres, long_diff;
+	int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX);
+	int32_t *H = 0, H0 = 0, last_H0_t = 0;
+	uint8_t *qr, *sf, *mem, *mem2 = 0;
+	__m128i q_, q2_, qe_, zero_, sc_mch_, sc_mis_, m1_;
+	__m128i *u, *v, *x, *y, *x2, *s, *p = 0, *donor, *acceptor;
+
+	ksw_reset_extz(ez);
+	if (m <= 1 || qlen <= 0 || tlen <= 0 || q2 <= q + e) return;
+
+	zero_   = _mm_set1_epi8(0);
+	q_      = _mm_set1_epi8(q);
+	q2_     = _mm_set1_epi8(q2);
+	qe_     = _mm_set1_epi8(q + e);
+	sc_mch_ = _mm_set1_epi8(mat[0]);
+	sc_mis_ = _mm_set1_epi8(mat[1]);
+	m1_     = _mm_set1_epi8(m - 1); // wildcard
+
+	tlen_ = (tlen + 15) / 16;
+	n_col_ = ((qlen < tlen? qlen : tlen) + 15) / 16 + 1;
+	qlen_ = (qlen + 15) / 16;
+	for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) {
+		max_sc = max_sc > mat[t]? max_sc : mat[t];
+		min_sc = min_sc < mat[t]? min_sc : mat[t];
+	}
+	if (-min_sc > 2 * (q + e)) return; // otherwise, we won't see any mismatches
+
+	long_thres = (q2 - q) / e - 1;
+	if (q2 > q + e + long_thres * e)
+		++long_thres;
+	long_diff = long_thres * e - (q2 - q);
+
+	mem = (uint8_t*)kcalloc(km, tlen_ * 9 + qlen_ + 1, 16);
+	u = (__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned
+	v = u + tlen_, x = v + tlen_, y = x + tlen_, x2 = y + tlen_;
+	donor = x2 + tlen_, acceptor = donor + tlen_;
+	s = acceptor + tlen_, sf = (uint8_t*)(s + tlen_), qr = sf + tlen_ * 16;
+	memset(u,  -q - e,  tlen_ * 16 * 4); // this set u, v, x, y (because they are in the same array)
+	memset(x2, -q2,     tlen_ * 16);
+	if (!approx_max) {
+		H = (int32_t*)kmalloc(km, tlen_ * 16 * 4);
+		for (t = 0; t < tlen_ * 16; ++t) H[t] = KSW_NEG_INF;
+	}
+	if (with_cigar) {
+		mem2 = (uint8_t*)kmalloc(km, ((qlen + tlen - 1) * n_col_ + 1) * 16);
+		p = (__m128i*)(((size_t)mem2 + 15) >> 4 << 4);
+		off = (int*)kmalloc(km, (qlen + tlen - 1) * sizeof(int) * 2);
+		off_end = off + qlen + tlen - 1;
+	}
+
+	for (t = 0; t < qlen; ++t) qr[t] = query[qlen - 1 - t];
+	memcpy(sf, target, tlen);
+
+	// set the donor and acceptor arrays. TODO: this assumes 0/1/2/3 encoding!
+	if (flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) {
+		memset(donor, -noncan, tlen_ * 16);
+		for (t = 0; t < tlen - 2; ++t) {
+			int is_can = 0; // is a canonical site
+			if ((flag & KSW_EZ_SPLICE_FOR) && target[t+1] == 2 && target[t+2] == 3) is_can = 1;
+			if ((flag & KSW_EZ_SPLICE_REV) && target[t+1] == 1 && target[t+2] == 3) is_can = 1;
+			if (is_can) ((int8_t*)donor)[t] = 0;
+		}
+		memset(acceptor, -noncan, tlen_ * 16);
+		for (t = 2; t < tlen; ++t) {
+			int is_can = 0;
+			if ((flag & KSW_EZ_SPLICE_FOR) && target[t-1] == 0 && target[t] == 2) is_can = 1;
+			if ((flag & KSW_EZ_SPLICE_REV) && target[t-1] == 0 && target[t] == 1) is_can = 1;
+			if (is_can) ((int8_t*)acceptor)[t] = 0;
+		}
+	}
+
+	for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) {
+		int st = 0, en = tlen - 1, st0, en0, st_, en_;
+		int8_t x1, x21, v1, *u8 = (int8_t*)u, *v8 = (int8_t*)v;
+		uint8_t *qrr = qr + (qlen - 1 - r);
+		__m128i x1_, x21_, v1_;
+		// find the boundaries
+		if (st < r - qlen + 1) st = r - qlen + 1;
+		if (en > r) en = r;
+		st0 = st, en0 = en;
+		st = st / 16 * 16, en = (en + 16) / 16 * 16 - 1;
+		// set boundary conditions
+		if (st > 0) {
+			if (st - 1 >= last_st && st - 1 <= last_en)
+				x1 = ((int8_t*)x)[st - 1], x21 = ((int8_t*)x2)[st - 1], v1 = v8[st - 1]; // (r-1,s-1) calculated in the last round
+			else x1 = -q - e, x21 = -q2, v1 = -q - e;
+		} else {
+			x1 = -q - e, x21 = -q2;
+			v1 = r == 0? -q - e : r < long_thres? -e : r == long_thres? long_diff : 0;
+		}
+		if (en >= r) {
+			((int8_t*)y)[r] = -q - e;
+			u8[r] = r == 0? -q - e : r < long_thres? -e : r == long_thres? long_diff : 0;
+		}
+		// loop fission: set scores first
+		if (!(flag & KSW_EZ_GENERIC_SC)) {
+			for (t = st0; t <= en0; t += 16) {
+				__m128i sq, st, tmp, mask;
+				sq = _mm_loadu_si128((__m128i*)&sf[t]);
+				st = _mm_loadu_si128((__m128i*)&qrr[t]);
+				mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
+				tmp = _mm_cmpeq_epi8(sq, st);
+#ifdef __SSE4_1__
+				tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
+#else
+				tmp = _mm_or_si128(_mm_andnot_si128(tmp, sc_mis_), _mm_and_si128(tmp, sc_mch_));
+#endif
+				tmp = _mm_andnot_si128(mask, tmp);
+				_mm_storeu_si128((__m128i*)((int8_t*)s + t), tmp);
+			}
+		} else {
+			for (t = st0; t <= en0; ++t)
+				((uint8_t*)s)[t] = mat[sf[t] * m + qrr[t]];
+		}
+		// core loop
+		x1_  = _mm_cvtsi32_si128((uint8_t)x1);
+		x21_ = _mm_cvtsi32_si128((uint8_t)x21);
+		v1_  = _mm_cvtsi32_si128((uint8_t)v1);
+		st_ = st / 16, en_ = en / 16;
+		assert(en_ - st_ + 1 <= n_col_);
+		if (!with_cigar) { // score only
+			for (t = st_; t <= en_; ++t) {
+				__m128i z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp;
+				__dp_code_block1;
+#ifdef __SSE4_1__
+				z = _mm_max_epi8(z, a);
+				z = _mm_max_epi8(z, b);
+				z = _mm_max_epi8(z, a2a);
+				__dp_code_block2; // save u[] and v[]; update a, b and a2
+				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_max_epi8(a,  zero_), qe_));
+				_mm_store_si128(&y[t],  _mm_sub_epi8(_mm_max_epi8(b,  zero_), qe_));
+				tmp = _mm_load_si128(&donor[t]);
+				_mm_store_si128(&x2[t], _mm_sub_epi8(_mm_max_epi8(a2, tmp), q2_));
+#else
+				tmp = _mm_cmpgt_epi8(a,  z);
+				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
+				tmp = _mm_cmpgt_epi8(b,  z);
+				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
+				tmp = _mm_cmpgt_epi8(a2a, z);
+				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2a));
+				__dp_code_block2;
+				tmp = _mm_cmpgt_epi8(a, zero_);
+				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_and_si128(tmp, a),  qe_));
+				tmp = _mm_cmpgt_epi8(b, zero_);
+				_mm_store_si128(&y[t],  _mm_sub_epi8(_mm_and_si128(tmp, b),  qe_));
+				tmp = _mm_load_si128(&donor[t]); // TODO: check if this is correct
+				tmp = _mm_cmpgt_epi8(a2, tmp);
+				tmp = _mm_or_si128(_mm_andnot_si128(tmp, tmp), _mm_and_si128(tmp, a2));
+				_mm_store_si128(&x2[t], _mm_sub_epi8(tmp, q2_));
+#endif
+			}
+		} else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
+			__m128i *pr = p + r * n_col_ - st_;
+			off[r] = st, off_end[r] = en;
+			for (t = st_; t <= en_; ++t) {
+				__m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2;
+				__dp_code_block1;
+#ifdef __SSE4_1__
+				d = _mm_and_si128(_mm_cmpgt_epi8(a, z), _mm_set1_epi8(1));       // d = a  > z? 1 : 0
+				z = _mm_max_epi8(z, a);
+				d = _mm_blendv_epi8(d, _mm_set1_epi8(2), _mm_cmpgt_epi8(b,  z)); // d = b  > z? 2 : d
+				z = _mm_max_epi8(z, b);
+				d = _mm_blendv_epi8(d, _mm_set1_epi8(3), _mm_cmpgt_epi8(a2a, z)); // d = a2 > z? 3 : d
+				z = _mm_max_epi8(z, a2a);
+#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
+				tmp = _mm_cmpgt_epi8(a,  z);
+				d = _mm_and_si128(tmp, _mm_set1_epi8(1));
+				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
+				tmp = _mm_cmpgt_epi8(b,  z);
+				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(2)));
+				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
+				tmp = _mm_cmpgt_epi8(a2a, z);
+				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(3)));
+				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2a));
+#endif
+				__dp_code_block2;
+				tmp = _mm_cmpgt_epi8(a, zero_);
+				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_and_si128(tmp, a),  qe_));
+				d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0
+				tmp = _mm_cmpgt_epi8(b, zero_);
+				_mm_store_si128(&y[t],  _mm_sub_epi8(_mm_and_si128(tmp, b),  qe_));
+				d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0
+
+				tmp2 = _mm_load_si128(&donor[t]);
+				tmp = _mm_cmpgt_epi8(a2, tmp2);
+#ifdef __SSE4_1__
+				tmp2 = _mm_max_epi8(a2, tmp2);
+#else
+				tmp2 = _mm_or_si128(_mm_andnot_si128(tmp, tmp2), _mm_and_si128(tmp, a2));
+#endif
+				_mm_store_si128(&x2[t], _mm_sub_epi8(tmp2, q2_));
+				d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x20)));
+				_mm_store_si128(&pr[t], d);
+			}
+		} else { // gap right-alignment
+			__m128i *pr = p + r * n_col_ - st_;
+			off[r] = st, off_end[r] = en;
+			for (t = st_; t <= en_; ++t) {
+				__m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2;
+				__dp_code_block1;
+#ifdef __SSE4_1__
+				d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), _mm_set1_epi8(1));    // d = z > a?  0 : 1
+				z = _mm_max_epi8(z, a);
+				d = _mm_blendv_epi8(_mm_set1_epi8(2), d, _mm_cmpgt_epi8(z, b));  // d = z > b?  d : 2
+				z = _mm_max_epi8(z, b);
+				d = _mm_blendv_epi8(_mm_set1_epi8(3), d, _mm_cmpgt_epi8(z, a2a)); // d = z > a2? d : 3
+				z = _mm_max_epi8(z, a2a);
+#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
+				tmp = _mm_cmpgt_epi8(z, a);
+				d = _mm_andnot_si128(tmp, _mm_set1_epi8(1));
+				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a));
+				tmp = _mm_cmpgt_epi8(z, b);
+				d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(2)));
+				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b));
+				tmp = _mm_cmpgt_epi8(z, a2a);
+				d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(3)));
+				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a2a));
+#endif
+				__dp_code_block2;
+				tmp = _mm_cmpgt_epi8(zero_, a);
+				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_andnot_si128(tmp, a),  qe_));
+				d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0
+				tmp = _mm_cmpgt_epi8(zero_, b);
+				_mm_store_si128(&y[t],  _mm_sub_epi8(_mm_andnot_si128(tmp, b),  qe_));
+				d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0
+
+				tmp2 = _mm_load_si128(&donor[t]);
+				tmp = _mm_cmpgt_epi8(tmp2, a2);
+#ifdef __SSE4_1__
+				tmp2 = _mm_max_epi8(tmp2, a2);
+#else
+				tmp2 = _mm_or_si128(_mm_andnot_si128(tmp, a2), _mm_and_si128(tmp, tmp2));
+#endif
+				_mm_store_si128(&x2[t], _mm_sub_epi8(tmp2, q2_));
+				d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0
+				_mm_store_si128(&pr[t], d);
+			}
+		}
+		if (!approx_max) { // find the exact max with a 32-bit score array
+			int32_t max_H, max_t;
+			// compute H[], max_H and max_t
+			if (r > 0) {
+				int32_t HH[4], tt[4], en1 = st0 + (en0 - st0) / 4 * 4, i;
+				__m128i max_H_, max_t_;
+				max_H = H[en0] = en0 > 0? H[en0-1] + u8[en0] : H[en0] + v8[en0]; // special casing the last element
+				max_t = en0;
+				max_H_ = _mm_set1_epi32(max_H);
+				max_t_ = _mm_set1_epi32(max_t);
+				for (t = st0; t < en1; t += 4) { // this implements: H[t]+=v8[t]-qe; if(H[t]>max_H) max_H=H[t],max_t=t;
+					__m128i H1, tmp, t_;
+					H1 = _mm_loadu_si128((__m128i*)&H[t]);
+					t_ = _mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]);
+					H1 = _mm_add_epi32(H1, t_);
+					_mm_storeu_si128((__m128i*)&H[t], H1);
+					t_ = _mm_set1_epi32(t);
+					tmp = _mm_cmpgt_epi32(H1, max_H_);
+#ifdef __SSE4_1__
+					max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
+					max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
+#else
+					max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
+					max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
+#endif
+				}
+				_mm_storeu_si128((__m128i*)HH, max_H_);
+				_mm_storeu_si128((__m128i*)tt, max_t_);
+				for (i = 0; i < 4; ++i)
+					if (max_H < HH[i]) max_H = HH[i], max_t = tt[i] + i;
+				for (; t < en0; ++t) { // for the rest of values that haven't been computed with SSE
+					H[t] += (int32_t)v8[t];
+					if (H[t] > max_H)
+						max_H = H[t], max_t = t;
+				}
+			} else H[0] = v8[0] - qe, max_H = H[0], max_t = 0; // special casing r==0
+			// update ez
+			if (en0 == tlen - 1 && H[en0] > ez->mte)
+				ez->mte = H[en0], ez->mte_q = r - en;
+			if (r - st0 == qlen - 1 && H[st0] > ez->mqe)
+				ez->mqe = H[st0], ez->mqe_t = st0;
+			if (ksw_apply_zdrop(ez, 1, max_H, r, max_t, zdrop, 0)) break;
+			if (r == qlen + tlen - 2 && en0 == tlen - 1)
+				ez->score = H[tlen - 1];
+		} else { // find approximate max; Z-drop might be inaccurate, too.
+			if (r > 0) {
+				if (last_H0_t >= st0 && last_H0_t <= en0 && last_H0_t + 1 >= st0 && last_H0_t + 1 <= en0) {
+					int32_t d0 = v8[last_H0_t];
+					int32_t d1 = u8[last_H0_t + 1];
+					if (d0 > d1) H0 += d0;
+					else H0 += d1, ++last_H0_t;
+				} else if (last_H0_t >= st0 && last_H0_t <= en0) {
+					H0 += v8[last_H0_t];
+				} else {
+					++last_H0_t, H0 += u8[last_H0_t];
+				}
+			} else H0 = v8[0] - qe, last_H0_t = 0;
+			if ((flag & KSW_EZ_APPROX_DROP) && ksw_apply_zdrop(ez, 1, H0, r, last_H0_t, zdrop, 0)) break;
+			if (r == qlen + tlen - 2 && en0 == tlen - 1)
+				ez->score = H0;
+		}
+		last_st = st, last_en = en;
+		//for (t = st0; t <= en0; ++t) printf("(%d,%d)\t(%d,%d,%d,%d)\t%d\n", r, t, ((int8_t*)u)[t], ((int8_t*)v)[t], ((int8_t*)x)[t], ((int8_t*)y)[t], H[t]); // for debugging
+	}
+	kfree(km, mem);
+	if (!approx_max) kfree(km, H);
+	if (with_cigar) { // backtrack
+		int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
+		if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY))
+			ksw_backtrack(km, 1, rev_cigar, 1, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
+		else if (ez->max_t >= 0 && ez->max_q >= 0)
+			ksw_backtrack(km, 1, rev_cigar, 1, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
+		kfree(km, mem2); kfree(km, off);
+	}
+}
+#endif // __SSE2__
diff --git a/ksw2_extz2_sse.c b/ksw2_extz2_sse.c
index d665c6f..f21f184 100644
--- a/ksw2_extz2_sse.c
+++ b/ksw2_extz2_sse.c
@@ -1,4 +1,5 @@
 #include <string.h>
+#include <assert.h>
 #include "ksw2.h"
 
 #ifdef __SSE2__
@@ -58,7 +59,8 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
 	if (w < 0) w = tlen > qlen? tlen : qlen;
 	wl = wr = w;
 	tlen_ = (tlen + 15) / 16;
-	n_col_ = ((w + 1 < tlen? (w + 1 < qlen? w + 1 : qlen): tlen) + 15) / 16 + 1;
+	n_col_ = qlen < tlen? qlen : tlen;
+	n_col_ = ((n_col_ < w + 1? n_col_ : w + 1) + 15) / 16 + 1;
 	qlen_ = (qlen + 15) / 16;
 	for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) {
 		max_sc = max_sc > mat[t]? max_sc : mat[t];
@@ -130,6 +132,7 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
 		x1_ = _mm_cvtsi32_si128(x1);
 		v1_ = _mm_cvtsi32_si128(v1);
 		st_ = st / 16, en_ = en / 16;
+		assert(en_ - st_ + 1 <= n_col_);
 		if (!with_cigar) { // score only
 			for (t = st_; t <= en_; ++t) {
 				__m128i z, a, b, xt1, vt1, ut, tmp;
@@ -275,9 +278,9 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
 	if (with_cigar) { // backtrack
 		int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
 		if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY))
-			ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
+			ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
 		else if (ez->max_t >= 0 && ez->max_q >= 0)
-			ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
+			ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
 		kfree(km, mem2); kfree(km, off);
 	}
 }
diff --git a/main.c b/main.c
index 994e977..d67b61c 100644
--- a/main.c
+++ b/main.c
@@ -8,7 +8,7 @@
 #include "minimap.h"
 #include "mmpriv.h"
 
-#define MM_VERSION "2.0rc1-r271-dirty"
+#define MM_VERSION "2.1-r311"
 
 void liftrlimit()
 {
@@ -31,6 +31,11 @@ static struct option long_options[] = {
 	{ "max-chain-skip", required_argument, 0, 0 },
 	{ "min-dp-len",     required_argument, 0, 0 },
 	{ "print-aln-seq",  no_argument,       0, 0 },
+	{ "splice",         no_argument,       0, 0 },
+	{ "cost-non-gt-ag", required_argument, 0, 0 },
+	{ "no-sam-sq",      no_argument,       0, 0 },
+	{ "help",           no_argument,       0, 'h' },
+	{ "max-intron-len", required_argument, 0, 'G' },
 	{ "version",        no_argument,       0, 'V' },
 	{ "min-count",      required_argument, 0, 'n' },
 	{ "min-chain-score",required_argument, 0, 'm' },
@@ -40,30 +45,42 @@ static struct option long_options[] = {
 	{ 0, 0, 0, 0}
 };
 
+static inline int64_t mm_parse_num(const char *str)
+{
+	double x;
+	char *p;
+	x = strtod(optarg, &p);
+	if (*p == 'G' || *p == 'g') x *= 1e9;
+	else if (*p == 'M' || *p == 'm') x *= 1e6;
+	else if (*p == 'K' || *p == 'k') x *= 1e3;
+	return (int64_t)(x + .499);
+}
+
 int main(int argc, char *argv[])
 {
 	mm_mapopt_t opt;
-	int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0;
+	int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0, max_intron_len = 0, n_idx_part = 0;
 	int minibatch_size = 200000000;
 	uint64_t batch_size = 4000000000ULL;
 	mm_bseq_file_t *fp = 0;
-	char *fnw = 0, *s;
-	FILE *fpr = 0, *fpw = 0;
+	char *fnw = 0, *rg = 0, *s;
+	FILE *fpr = 0, *fpw = 0, *fp_help = stderr;
 
 	liftrlimit();
 	mm_realtime0 = realtime();
 	mm_mapopt_init(&opt);
 
-	while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Q", long_options, &long_idx)) >= 0) {
+	while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:h", long_options, &long_idx)) >= 0) {
 		if (c == 'w') w = atoi(optarg), idx_par_set = 1;
 		else if (c == 'k') k = atoi(optarg), idx_par_set = 1;
 		else if (c == 'H') is_hpc = 1, idx_par_set = 1;
 		else if (c == 'd') fnw = optarg; // the above are indexing related options, except -I
-		else if (c == 'r') opt.bw = atoi(optarg);
+		else if (c == 'r') opt.bw = (int)mm_parse_num(optarg);
 		else if (c == 'f') opt.mid_occ_frac = atof(optarg);
 		else if (c == 't') n_threads = atoi(optarg);
 		else if (c == 'v') mm_verbose = atoi(optarg);
-		else if (c == 'g') opt.max_gap = atoi(optarg);
+		else if (c == 'g') opt.max_gap = (int)mm_parse_num(optarg);
+		else if (c == 'G') max_intron_len = (int)mm_parse_num(optarg);
 		else if (c == 'N') opt.best_n = atoi(optarg);
 		else if (c == 'p') opt.pri_ratio = atof(optarg);
 		else if (c == 'M') opt.mask_level = atof(optarg);
@@ -79,6 +96,10 @@ int main(int argc, char *argv[])
 		else if (c == 'B') opt.b = atoi(optarg);
 		else if (c == 'z') opt.zdrop = atoi(optarg);
 		else if (c == 's') opt.min_dp_max = atoi(optarg);
+		else if (c == 'I') batch_size = mm_parse_num(optarg);
+		else if (c == 'K') minibatch_size = (int)mm_parse_num(optarg);
+		else if (c == 'R') rg = optarg;
+		else if (c == 'h') fp_help = stdout;
 		else if (c == 0 && long_idx == 0) bucket_bits = atoi(optarg); // --bucket-bits
 		else if (c == 0 && long_idx == 2) keep_name = 0; // --int-rname
 		else if (c == 0 && long_idx == 3) mm_dbg_flag |= MM_DBG_NO_KALLOC; // --no-kalloc
@@ -88,24 +109,28 @@ int main(int argc, char *argv[])
 		else if (c == 0 && long_idx == 7) opt.max_chain_skip = atoi(optarg); // --max-chain-skip
 		else if (c == 0 && long_idx == 8) opt.min_ksw_len = atoi(optarg); // --min-dp-len
 		else if (c == 0 && long_idx == 9) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ; // --print-aln-seq
+		else if (c == 0 && long_idx ==10) opt.flag |= MM_F_SPLICE; // --splice
+		else if (c == 0 && long_idx ==11) opt.noncan = atoi(optarg); // --cost-non-gt-ag
+		else if (c == 0 && long_idx ==12) opt.flag |= MM_F_NO_SAM_SQ; // --no-sam-sq
 		else if (c == 'V') {
 			puts(MM_VERSION);
 			return 0;
+		} else if (c == 'u') {
+			if (*optarg == 'b') opt.flag |= MM_F_SPLICE_FOR|MM_F_SPLICE_REV;
+			else if (*optarg == 'B') opt.flag |= MM_F_SPLICE_BOTH;
+			else if (*optarg == 'f') opt.flag |= MM_F_SPLICE_FOR, opt.flag &= ~MM_F_SPLICE_REV;
+			else if (*optarg == 'r') opt.flag |= MM_F_SPLICE_REV, opt.flag &= ~MM_F_SPLICE_FOR;
+			else if (*optarg == 'n') opt.flag &= ~(MM_F_SPLICE_FOR|MM_F_SPLICE_REV);
+			else {
+				fprintf(stderr, "[E::%s] unrecognized cDNA direction\n", __func__);
+				return 1;
+			}
 		} else if (c == 'O') {
 			opt.q = opt.q2 = strtol(optarg, &s, 10);
 			if (*s == ',') opt.q2 = strtol(s + 1, &s, 10);
 		} else if (c == 'E') {
 			opt.e = opt.e2 = strtol(optarg, &s, 10);
 			if (*s == ',') opt.e2 = strtol(s + 1, &s, 10);
-		} else if (c == 'I' || c == 'K') {
-			double x;
-			char *p;
-			x = strtod(optarg, &p);
-			if (*p == 'G' || *p == 'g') x *= 1e9;
-			else if (*p == 'M' || *p == 'm') x *= 1e6;
-			else if (*p == 'K' || *p == 'k') x *= 1e3;
-			if (c == 'I') batch_size = (uint64_t)(x + .499);
-			else minibatch_size = (uint64_t)(x + .499);
 		} else if (c == 'x') {
 			if (strcmp(optarg, "ava-ont") == 0) {
 				opt.flag |= MM_F_AVA | MM_F_NO_SELF;
@@ -139,6 +164,13 @@ int main(int argc, char *argv[])
 				opt.min_dp_max = 50;
 				opt.best_n = 10;
 				opt.bw = 100;
+			} else if (strcmp(optarg, "splice") == 0 || strcmp(optarg, "cdna") == 0) {
+				k = 15, w = 5;
+				opt.flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV;
+				opt.max_gap = 2000, opt.max_gap_ref = opt.bw = 200000;
+				opt.a = 1, opt.b = 2, opt.q = 2, opt.e = 1, opt.q2 = 32, opt.e2 = 0;
+				opt.noncan = 5;
+				opt.zdrop = 200;
 			} else {
 				fprintf(stderr, "[E::%s] unknown preset '%s'\n", __func__, optarg);
 				return 1;
@@ -146,73 +178,87 @@ int main(int argc, char *argv[])
 		}
 	}
 	if (w < 0) w = (int)(.6666667 * k + .499);
+	if ((opt.flag & MM_F_SPLICE) && max_intron_len > 0)
+		opt.max_gap_ref = opt.bw = max_intron_len;
 
-	if (argc == optind) {
-		fprintf(stderr, "Usage: minimap2 [options] <target.fa>|<target.idx> [query.fa] [...]\n");
-		fprintf(stderr, "Options:\n");
-		fprintf(stderr, "  Indexing:\n");
-		fprintf(stderr, "    -H           use homopolymer-compressed k-mer\n");
-		fprintf(stderr, "    -k INT       k-mer size (no larger than 28) [%d]\n", k);
-		fprintf(stderr, "    -w INT       minizer window size [{-k}*2/3]\n");
-		fprintf(stderr, "    -I NUM       split index for every ~NUM input bases [4G]\n");
-		fprintf(stderr, "    -d FILE      dump index to FILE []\n");
-		fprintf(stderr, "  Mapping:\n");
-		fprintf(stderr, "    -f FLOAT     filter out top FLOAT fraction of repetitive minimizers [%g]\n", opt.mid_occ_frac);
-		fprintf(stderr, "    -g INT       stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap);
-		fprintf(stderr, "    -r INT       bandwidth used in chaining and DP-based alignment [%d]\n", opt.bw);
-		fprintf(stderr, "    -n INT       minimal number of minimizers on a chain [%d]\n", opt.min_cnt);
-		fprintf(stderr, "    -m INT       minimal chaining score (matching bases minus log gap penalty) [%d]\n", opt.min_chain_score);
-//		fprintf(stderr, "    -T INT       SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); // TODO: this option is never used; might be buggy
-		fprintf(stderr, "    -X           skip self and dual mappings (for the all-vs-all mode)\n");
-		fprintf(stderr, "    -p FLOAT     min secondary-to-primary score ratio [%g]\n", opt.pri_ratio);
-		fprintf(stderr, "    -N INT       retain at most INT secondary alignments [%d]\n", opt.best_n);
-		fprintf(stderr, "  Alignment:\n");
-		fprintf(stderr, "    -A INT       matching score [%d]\n", opt.a);
-		fprintf(stderr, "    -B INT       mismatch penalty [%d]\n", opt.b);
-		fprintf(stderr, "    -O INT[,INT] gap open penalty [%d,%d]\n", opt.q, opt.q2);
-		fprintf(stderr, "    -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2);
-		fprintf(stderr, "    -z INT       Z-drop score [%d]\n", opt.zdrop);
-		fprintf(stderr, "    -s INT       minimal peak DP alignment score [%d]\n", opt.min_dp_max);
-		fprintf(stderr, "  Input/Output:\n");
-		fprintf(stderr, "    -Q           ignore base quality in the input\n");
-		fprintf(stderr, "    -a           output in the SAM format (PAF by default)\n");
-		fprintf(stderr, "    -c           output CIGAR in PAF\n");
-		fprintf(stderr, "    -S           output the cs tag in PAF\n");
-		fprintf(stderr, "    -t INT       number of threads [%d]\n", n_threads);
-		fprintf(stderr, "    -K NUM       minibatch size [200M]\n");
-//		fprintf(stderr, "    -v INT       verbose level [%d]\n", mm_verbose);
-		fprintf(stderr, "    -V           show version number\n");
-		fprintf(stderr, "  Preset:\n");
-		fprintf(stderr, "    -x STR       preset (recommended to be applied before other options) []\n");
-		fprintf(stderr, "                 map10k/map-pb: -Hk19 (PacBio/ONT vs reference mapping)\n");
-		fprintf(stderr, "                 map-ont: -k15 (slightly more sensitive than 'map10k' for ONT vs reference)\n");
-		fprintf(stderr, "                 asm5: -k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 (asm to ref mapping; break at 5%% div.)\n");
-		fprintf(stderr, "                 asm10: -k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10%% div.)\n");
-		fprintf(stderr, "                 ava-pb: -Hk19 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (PacBio read overlap)\n");
-		fprintf(stderr, "                 ava-ont: -k15 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (ONT read overlap)\n");
-		fprintf(stderr, "\nSee `man ./minimap2.1' for detailed description of command-line options.\n");
-		return 1;
+	if (argc == optind || fp_help == stdout) {
+		fprintf(fp_help, "Usage: minimap2 [options] <target.fa>|<target.idx> [query.fa] [...]\n");
+		fprintf(fp_help, "Options:\n");
+		fprintf(fp_help, "  Indexing:\n");
+		fprintf(fp_help, "    -H           use homopolymer-compressed k-mer\n");
+		fprintf(fp_help, "    -k INT       k-mer size (no larger than 28) [%d]\n", k);
+		fprintf(fp_help, "    -w INT       minizer window size [{-k}*2/3]\n");
+		fprintf(fp_help, "    -I NUM       split index for every ~NUM input bases [4G]\n");
+		fprintf(fp_help, "    -d FILE      dump index to FILE []\n");
+		fprintf(fp_help, "  Mapping:\n");
+		fprintf(fp_help, "    -f FLOAT     filter out top FLOAT fraction of repetitive minimizers [%g]\n", opt.mid_occ_frac);
+		fprintf(fp_help, "    -g INT       stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap);
+		fprintf(fp_help, "    -r INT       bandwidth used in chaining and DP-based alignment [%d]\n", opt.bw);
+		fprintf(fp_help, "    -n INT       minimal number of minimizers on a chain [%d]\n", opt.min_cnt);
+		fprintf(fp_help, "    -m INT       minimal chaining score (matching bases minus log gap penalty) [%d]\n", opt.min_chain_score);
+//		fprintf(fp_help, "    -T INT       SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); // TODO: this option is never used; might be buggy
+		fprintf(fp_help, "    -X           skip self and dual mappings (for the all-vs-all mode)\n");
+		fprintf(fp_help, "    -p FLOAT     min secondary-to-primary score ratio [%g]\n", opt.pri_ratio);
+		fprintf(fp_help, "    -N INT       retain at most INT secondary alignments [%d]\n", opt.best_n);
+		fprintf(fp_help, "    -G NUM       max intron length (only effective following -x splice) [200k]\n");
+		fprintf(fp_help, "  Alignment:\n");
+		fprintf(fp_help, "    -A INT       matching score [%d]\n", opt.a);
+		fprintf(fp_help, "    -B INT       mismatch penalty [%d]\n", opt.b);
+		fprintf(fp_help, "    -O INT[,INT] gap open penalty [%d,%d]\n", opt.q, opt.q2);
+		fprintf(fp_help, "    -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2);
+		fprintf(fp_help, "    -z INT       Z-drop score [%d]\n", opt.zdrop);
+		fprintf(fp_help, "    -s INT       minimal peak DP alignment score [%d]\n", opt.min_dp_max);
+		fprintf(fp_help, "    -u CHAR      how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n");
+		fprintf(fp_help, "  Input/Output:\n");
+		fprintf(fp_help, "    -a           output in the SAM format (PAF by default)\n");
+		fprintf(fp_help, "    -Q           don't output base quality in SAM\n");
+		fprintf(fp_help, "    -R STR       SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n");
+		fprintf(fp_help, "    -c           output CIGAR in PAF\n");
+		fprintf(fp_help, "    -S           output the cs tag in PAF (cs encodes both query and ref sequences)\n");
+		fprintf(fp_help, "    -t INT       number of threads [%d]\n", n_threads);
+		fprintf(fp_help, "    -K NUM       minibatch size [200M]\n");
+//		fprintf(fp_help, "    -v INT       verbose level [%d]\n", mm_verbose);
+		fprintf(fp_help, "    --version    show version number\n");
+		fprintf(fp_help, "  Preset:\n");
+		fprintf(fp_help, "    -x STR       preset (recommended to be applied before other options) []\n");
+		fprintf(fp_help, "                 map10k/map-pb: -Hk19 (PacBio/ONT vs reference mapping)\n");
+		fprintf(fp_help, "                 map-ont: -k15 (slightly more sensitive than 'map10k' for ONT vs reference)\n");
+		fprintf(fp_help, "                 asm5: -k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 (asm to ref mapping; break at 5%% div.)\n");
+		fprintf(fp_help, "                 asm10: -k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10%% div.)\n");
+		fprintf(fp_help, "                 ava-pb: -Hk19 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (PacBio read overlap)\n");
+		fprintf(fp_help, "                 ava-ont: -k15 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (ONT read overlap)\n");
+		fprintf(fp_help, "                 splice: long-read spliced alignment (see minimap2.1 for details)\n");
+		fprintf(fp_help, "\nSee `man ./minimap2.1' for detailed description of command-line options.\n");
+		return fp_help == stdout? 0 : 1;
 	}
 
 	is_idx = mm_idx_is_idx(argv[optind]);
 	if (is_idx < 0) {
-		fprintf(stderr, "[E::%s] failed to open file '%s'\n", __func__, argv[optind]);
+		fprintf(stderr, "[ERROR] failed to open file '%s'\n", argv[optind]);
+		return 1;
+	}
+	if (!is_idx && fnw == 0 && argc - optind < 2) {
+		fprintf(stderr, "[ERROR] missing input: please specify a query file to map or option -d to keep the index\n");
 		return 1;
 	}
 	if (is_idx) fpr = fopen(argv[optind], "rb");
 	else fp = mm_bseq_open(argv[optind]);
 	if (fnw) fpw = fopen(fnw, "wb");
+	if (opt.flag & MM_F_OUT_SAM)
+		mm_write_sam_hdr_no_SQ(rg, MM_VERSION, argc, argv);
 	for (;;) {
-		mm_idx_t *mi = 0;
+		mm_idx_t *mi;
 		if (fpr) {
 			mi = mm_idx_load(fpr);
 			if (idx_par_set && mm_verbose >= 2 && (mi->k != k || mi->w != w || mi->is_hpc != is_hpc))
-				fprintf(stderr, "[W::%s::%.3f*%.2f] Indexing parameters on the command line (-k/-w/-H) overridden by parameters in the prebuilt index.\n",
-						__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
-		} else if (!mm_bseq_eof(fp)) {
+				fprintf(stderr, "[WARNING] \033[1;31mIndexing parameters on the command line (-k/-w/-H) overridden by parameters in the prebuilt index.\033[0m\n");
+		} else {
 			mi = mm_idx_gen(fp, w, k, bucket_bits, is_hpc, minibatch_size, n_threads, batch_size, keep_name);
 		}
 		if (mi == 0) break;
+		++n_idx_part;
+		if (mm_verbose >= 2 && n_idx_part > 1 && (opt.flag&MM_F_OUT_SAM) && !(opt.flag&MM_F_NO_SAM_SQ))
+			fprintf(stderr, "[WARNING] \033[1;31mSAM output is malformated due to internal @SQ lines. Please add option --no-sam-sq or filter afterwards.\033[0m\n");
 		if (mm_verbose >= 3)
 			fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n",
 					__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq);
diff --git a/map.c b/map.c
index 4ea111e..f531efa 100644
--- a/map.c
+++ b/map.c
@@ -19,6 +19,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
 	opt->min_chain_score = 40;
 	opt->bw = 500;
 	opt->max_gap = 5000;
+	opt->max_gap_ref = -1;
 	opt->max_chain_skip = 25;
 
 	opt->mask_level = 0.5f;
@@ -37,6 +38,8 @@ void mm_mapopt_init(mm_mapopt_t *opt)
 
 void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
 {
+	if (opt->flag & MM_F_SPLICE_BOTH)
+		opt->flag &= ~(MM_F_SPLICE_FOR|MM_F_SPLICE_REV);
 	opt->max_occ = mm_idx_cal_max_occ(mi, opt->max_occ_frac);
 	opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac);
 	if (mm_verbose >= 3)
@@ -167,7 +170,7 @@ void mm_pair_thin(mm_tbuf_t *b, int radius, mm_match_t *m1, mm_match_t *m2)
 #endif
 mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, uint32_t m_st, uint32_t m_en, const char *qname, int qlen, const char *seq, int *n_regs)
 {
-	int i, n = m_en - m_st, j, n_u;
+	int i, n = m_en - m_st, j, n_u, max_gap_ref;
 	int64_t n_a;
 	uint64_t *u;
 	mm_match_t *m;
@@ -243,7 +246,8 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b,
 			fprintf(stderr, "SD\t%s\t%d\t%c\t%d\t%d\t%d\n", mi->seq[a[i].x<<1>>33].name, (int32_t)a[i].x, "+-"[a[i].x>>63], (int32_t)a[i].y, (int32_t)(a[i].y>>32&0xff),
 					i == 0? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x));
 
-	n_u = mm_chain_dp(opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, n_a, a, &u, b->km);
+	max_gap_ref = opt->max_gap_ref >= 0? opt->max_gap_ref : opt->max_gap;
+	n_u = mm_chain_dp(max_gap_ref, opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, !!(opt->flag&MM_F_SPLICE), n_a, a, &u, b->km);
 	regs = mm_gen_regs(b->km, qlen, n_u, u, a);
 	*n_regs = n_u;
 
@@ -256,7 +260,8 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b,
 	if (!(opt->flag & MM_F_AVA)) { // don't choose primary mapping(s) for read overlap
 		mm_set_parent(b->km, opt->mask_level, *n_regs, regs);
 		mm_select_sub(b->km, opt->mask_level, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
-		mm_join_long(b->km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle
+		if (!(opt->flag & MM_F_SPLICE))
+			mm_join_long(b->km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle
 	}
 	if (opt->flag & MM_F_CIGAR) {
 		regs = mm_align_skeleton(b->km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs()
@@ -348,8 +353,10 @@ static void *worker_pipeline(void *shared, int step, void *in)
 			mm_bseq1_t *t = &s->seq[i];
 			for (j = 0; j < s->n_reg[i]; ++j) {
 				mm_reg1_t *r = &s->reg[i][j];
-				if (p->opt->flag & MM_F_OUT_SAM) mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i]);
-				else mm_write_paf(&p->str, mi, t, r, km, p->opt->flag);
+				if (p->opt->flag & MM_F_OUT_SAM)
+					mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i]);
+				else
+					mm_write_paf(&p->str, mi, t, r, km, p->opt->flag);
 				puts(p->str.s);
 			}
 			if (s->n_reg[i] == 0 && (p->opt->flag & MM_F_OUT_SAM)) {
@@ -378,11 +385,8 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int
 	if (pl.fp == 0) return -1;
 	pl.opt = opt, pl.mi = idx;
 	pl.n_threads = n_threads, pl.mini_batch_size = mini_batch_size;
-	if (opt->flag & MM_F_OUT_SAM) {
-		uint32_t i;
-		for (i = 0; i < idx->n_seq; ++i)
-			printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len);
-	}
+	if ((opt->flag & MM_F_OUT_SAM) && !(opt->flag & MM_F_NO_SAM_SQ))
+		mm_write_sam_SQ(idx);
 	kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3);
 	free(pl.str.s);
 	mm_bseq_close(pl.fp);
diff --git a/minimap.h b/minimap.h
index 076017f..e345915 100644
--- a/minimap.h
+++ b/minimap.h
@@ -5,15 +5,20 @@
 #include <stdio.h>
 #include <sys/types.h>
 
-#define MM_IDX_DEF_B    14
+#define MM_IDX_DEF_B     14
 
-#define MM_F_NO_SELF   0x01
-#define MM_F_AVA       0x02
-#define MM_F_CIGAR     0x04
-#define MM_F_OUT_SAM   0x08
-#define MM_F_NO_QUAL   0x10
-#define MM_F_OUT_CG    0x20
-#define MM_F_OUT_CS    0x40
+#define MM_F_NO_SELF     0x001
+#define MM_F_AVA         0x002
+#define MM_F_CIGAR       0x004
+#define MM_F_OUT_SAM     0x008
+#define MM_F_NO_QUAL     0x010
+#define MM_F_OUT_CG      0x020
+#define MM_F_OUT_CS      0x040
+#define MM_F_SPLICE      0x080
+#define MM_F_SPLICE_FOR  0x100
+#define MM_F_SPLICE_REV  0x200
+#define MM_F_SPLICE_BOTH 0x400
+#define MM_F_NO_SAM_SQ   0x800
 
 #define MM_IDX_MAGIC   "MMI\2"
 
@@ -55,7 +60,8 @@ typedef struct {
 	uint32_t capacity;
 	int32_t dp_score, dp_max, dp_max2;
 	uint32_t blen;
-	uint32_t n_diff, n_ambi;
+	uint32_t n_diff;
+	uint32_t n_ambi:30, trans_strand:2;
 	uint32_t n_cigar;
 	uint32_t cigar[];
 } mm_extra_t;
@@ -80,7 +86,7 @@ typedef struct {
 	int flag;    // see MM_F_* macros
 
 	int bw;  // bandwidth
-	int max_gap; // break a chain if there are no minimizers in a max_gap window
+	int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window
 	int max_chain_skip;
 	int min_cnt;
 	int min_chain_score;
@@ -93,6 +99,7 @@ typedef struct {
 	int min_join_flank_sc;
 
 	int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
+	int noncan;
 	int zdrop;
 	int min_dp_max;
 	int min_ksw_len;
diff --git a/minimap2.1 b/minimap2.1
index 0e63a11..61a0bdb 100644
--- a/minimap2.1
+++ b/minimap2.1
@@ -1,4 +1,4 @@
-.TH minimap2 1 "30 July 2017" "minimap2-2.0rc1-r232" "Bioinformatics tools"
+.TH minimap2 1 "25 August 2017" "minimap2-2.1-r311" "Bioinformatics tools"
 .SH NAME
 .PP
 minimap2 - mapping and alignment between collections of DNA sequences
@@ -162,6 +162,12 @@ secondary alignments [5]. This option has no effect when
 .B -X
 is applied.
 .TP
+.BI -G \ NUM
+Maximal intron length in the splice mode [200k]. This option also changes the
+bandwidth to
+.IR NUM .
+Increasing this option slows down spliced alignment.
+.TP
 .BI --max-chain-skip \ INT
 A heuristics that stops chaining early [50]. Minimap2 uses dynamic programming
 for chaining. The time complexity is quadratic in the number of seeds. This
@@ -199,15 +205,32 @@ the contiguity of the alignment at the cost of poor alignment in the middle
 Minimal peak DP alignment score to output [40]. The peak score is computed from
 the final CIGAR. It is the score of the max scoring segment in the alignment
 and may be different from the total alignment score.
+.TP
+.BI -u \ CHAR
+How to find canonical splicing sites GT-AG -
+.BR f :
+transcript strand;
+.BR b :
+both strands;
+.BR n :
+no attempt to match GT-AG [n]
+.TP
+.BI --cost-non-gt-ag \ INT
+Cost of non-canonical splicing sites [0].
 .SS Input/output options
 .TP 10
-.B -Q
-Ignore base quality in the input file.
-.TP
 .B -a
 Generate CIGAR and output alignments in the SAM format. Minimap2 outputs in PAF
 by default.
 .TP
+.B -Q
+Ignore base quality in the input file.
+.TP
+.BI -R \ STR
+SAM read group line in a format like
+.B @RG\\\\tID:foo\\\\tSM:bar
+[].
+.TP
 .B -c
 Generate CIGAR. In PAF, the CIGAR is written to the `cg' custom tag.
 .TP
@@ -232,8 +255,13 @@ and
 use
 .BR -K500m .
 .TP
-.B -V
+.B --version
 Print version number to stdout
+.TP
+.B --no-sam-hdr
+Don't output SAM header lines. Use this option if the index consists of
+multiple parts; otherwise the SAM output is malformated due to internal header
+lines.
 .SS Preset options
 .TP 10
 .BI -x \ STR
@@ -247,37 +275,64 @@ are:
 .RS
 .TP 8
 .B map-pb
-PacBio/Oxford Nanopore read to reference mapping (-Hk19)
+PacBio/Oxford Nanopore read to reference mapping
+.RB ( -Hk19 )
 .TP
 .B map10k
 The same as
 .B map-pb
-(-Hk19)
+.RB ( -Hk19 )
 .TP
 .B map-ont
-Slightly more sensitive for Oxford Nanopore to reference mapping (-k15). For
-PacBio reads, HPC minimizers consistently leads to faster performance and more
-sensitive results in comparison to normal minimizers. For Oxford Nanopore data,
-normal minimizers are better, though not much. The effectiveness of HPC is
-determined by the sequencing error mode.
+Slightly more sensitive for Oxford Nanopore to reference mapping
+.RB ( -k15 ).
+For PacBio reads, HPC minimizers consistently leads to faster performance and
+more sensitive results in comparison to normal minimizers. For Oxford Nanopore
+data, normal minimizers are better, though not much. The effectiveness of HPC
+is determined by the sequencing error mode.
 .TP
 .B asm5
-Long assembly to reference mapping (-k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200).
+Long assembly to reference mapping
+.RB ( -k19
+.B -w19 -A1 -B19 -O39,81 -E3,1 -s200
+.BR -z200 ).
 Typically, the alignment will not extend to regions with 5% or higher sequence
 divergence. Only use this preset if the average divergence is far below 5%.
 .TP
 .B asm10
-Long assembly to reference mapping (-k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200). Up
-to 10% sequence divergence.
-.TP 8
+Long assembly to reference mapping
+.RB ( -k19
+.B -w19 -A1 -B9 -O16,41 -E2,1 -s200
+.BR -z200 ).
+Up to 10% sequence divergence.
+.TP
 .B ava-pb
-PacBio all-vs-all overlap mapping (-Hk19 -w5 -Xp0 -m100 -K500m -g10000 --max-chain-skip 25)
-.TP 8
+PacBio all-vs-all overlap mapping
+.RB ( -Hk19
+.B -w5 -Xp0 -m100 -K500m -g10000 --max-chain-skip
+.BR 25 ).
+.TP
 .B ava-ont
-Oxford Nanopore all-vs-all overlap mapping (-k15 -w5 -Xp0 -m100 -K500m -g10000
---max-chain-skip 25). Similarly, the major difference from
+Oxford Nanopore all-vs-all overlap mapping
+.RB ( -k15
+.B -w5 -Xp0 -m100 -K500m -g10000 --max-chain-skip
+.BR 25 ).
+Similarly, the major difference from
 .B ava-pb
 is that this preset is not using HPC minimizers.
+.TP
+.B splice
+Long-read spliced alignment
+.RB ( -k15
+.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -z200 -ub --cost-non-gt-ag
+.BR 5 ).
+In the splice mode, 1) long deletions are taken as introns and represented as
+the
+.RB ` N '
+CIGAR operator; 2) long insertions are disabled; 3) deletion and insertion gap
+costs are different during chaining; 4) the computation of the
+.RB ` ms '
+tag ignores introns to demote hits to pseudogenes.
 .RE
 .SS Miscellaneous options
 .TP 10
diff --git a/misc/intron-eval.js b/misc/intron-eval.js
new file mode 100644
index 0000000..3505355
--- /dev/null
+++ b/misc/intron-eval.js
@@ -0,0 +1,266 @@
+/*******************************
+ * Command line option parsing *
+ *******************************/
+
+var getopt = function(args, ostr) {
+	var oli; // option letter list index
+	if (typeof(getopt.place) == 'undefined')
+		getopt.ind = 0, getopt.arg = null, getopt.place = -1;
+	if (getopt.place == -1) { // update scanning pointer
+		if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
+			getopt.place = -1;
+			return null;
+		}
+		if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
+			++getopt.ind;
+			getopt.place = -1;
+			return null;
+		}
+	}
+	var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
+	if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
+		if (optopt == '-') return null; //  if the user didn't specify '-' as an option, assume it means null.
+		if (getopt.place < 0) ++getopt.ind;
+		return '?';
+	}
+	if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
+		getopt.arg = null;
+		if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
+	} else { // need an argument
+		if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
+			getopt.arg = args[getopt.ind].substr(getopt.place);
+		else if (args.length <= ++getopt.ind) { // no arg
+			getopt.place = -1;
+			if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
+			return '?';
+		} else getopt.arg = args[getopt.ind]; // white space
+		getopt.place = -1;
+		++getopt.ind;
+	}
+	return optopt;
+}
+
+/***********************
+ * Interval operations *
+ ***********************/
+
+Interval = {};
+
+Interval.sort = function(a)
+{
+	if (typeof a[0] == 'number')
+		a.sort(function(x, y) { return x - y });
+	else a.sort(function(x, y) { return x[0] != y[0]? x[0] - y[0] : x[1] - y[1] });
+}
+
+Interval.merge = function(a, sorted)
+{
+	if (typeof sorted == 'undefined') sorted = true;
+	if (!sorted) Interval.sort(a);
+	var k = 0;
+	for (var i = 1; i < a.length; ++i) {
+		if (a[k][1] >= a[i][0])
+			a[k][1] = a[k][1] > a[i][1]? a[k][1] : a[i][1];
+		else a[++k] = a[i].slice(0);
+	}
+	a.length = k + 1;
+}
+
+Interval.index_end = function(a, sorted)
+{
+	if (a.length == 0) return;
+	if (typeof sorted == 'undefined') sorted = true;
+	if (!sorted) Interval.sort(a);
+	a[0].push(0);
+	var k = 0, k_en = a[0][1];
+	for (var i = 1; i < a.length; ++i) {
+		if (k_en <= a[i][0]) {
+			for (++k; k < i; ++k)
+				if (a[k][1] > a[i][0])
+					break;
+			k_en = a[k][1];
+		}
+		a[i].push(k);
+	}
+}
+
+Interval.find_intv = function(a, x)
+{
+	var left = -1, right = a.length;
+	if (typeof a[0] == 'number') {
+		while (right - left > 1) {
+			var mid = left + ((right - left) >> 1);
+			if (a[mid] > x) right = mid;
+			else if (a[mid] < x) left = mid;
+			else return mid;
+		}
+	} else {
+		while (right - left > 1) {
+			var mid = left + ((right - left) >> 1);
+			if (a[mid][0] > x) right = mid;
+			else if (a[mid][0] < x) left = mid;
+			else return mid;
+		}
+	}
+	return left;
+}
+
+Interval.find_ovlp = function(a, st, en)
+{
+	if (a.length == 0 || st >= en) return [];
+	var l = Interval.find_intv(a, st);
+	var k = l < 0? 0 : a[l][a[l].length - 1];
+	var b = [];
+	for (var i = k; i < a.length; ++i) {
+		if (a[i][0] >= en) break;
+		else if (st < a[i][1])
+			b.push(a[i]);
+	}
+	return b;
+}
+
+/*****************
+ * Main function *
+ *****************/
+
+var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false;
+while ((c = getopt(arguments, "l:ep")) != null) {
+	if (c == 'l') l_fuzzy = parseInt(getopt.arg);
+	else if (c == 'e') print_err_only = print_ovlp = true;
+	else if (c == 'p') print_ovlp = true;
+}
+
+if (arguments.length - getopt.ind < 2) {
+	print("Usage: k8 intron-eval.js [options] <gene.gtf> <aln.sam>");
+	exit(1);
+}
+
+var file, buf = new Bytes();
+
+var tr = {};
+file = new File(arguments[getopt.ind]);
+while (file.readline(buf) >= 0) {
+	var m, t = buf.toString().split("\t");
+	if (t[0].charAt(0) == '#') continue;
+	if (t[2] != 'exon') continue;
+	var st = parseInt(t[3]) - 1;
+	var en = parseInt(t[4]);
+	if ((m = /transcript_id "(\S+)"/.exec(t[8])) == null) continue;
+	var tid = m[1];
+	if (tr[tid] == null) tr[tid] = [t[0], t[6], 0, 0, []];
+	tr[tid][4].push([st, en]);
+}
+file.close();
+
+var anno = {};
+for (var tid in tr) {
+	var t = tr[tid];
+	Interval.sort(t[4]);
+	t[2] = t[4][0][0];
+	t[3] = t[4][t[4].length - 1][1];
+	if (anno[t[0]] == null) anno[t[0]] = [];
+	var s = t[4];
+	for (var i = 0; i < s.length - 1; ++i) {
+		if (s[i][1] >= s[i+1][0]) throw Error("ERROR: wrong annotation!");
+		anno[t[0]].push([s[i][1], s[i+1][0]]);
+	}
+}
+tr = null;
+
+for (var chr in anno) {
+	var e = anno[chr];
+	if (e.length == 0) continue;
+	Interval.sort(e);
+	var k = 0;
+	for (var i = 1; i < e.length; ++i) // dedup
+		if (e[i][0] != e[k][0] || e[i][1] != e[k][1])
+			e[++k] = e[i].slice(0);
+	e.length = k + 1;
+	Interval.index_end(e);
+}
+
+var n_pri = 0, n_unmapped = 0, n_mapped = 0;
+var n_sgl = 0, n_splice = 0, n_splice_hit = 0, n_splice_novel = 0;
+
+file = new File(arguments[getopt.ind+1]);
+var last_qname = null;
+var re_cigar = /(\d+)([MIDNSHX=])/g;
+while (file.readline(buf) >= 0) {
+	var m, t = buf.toString().split("\t");
+
+	if (t[0].charAt(0) == '@') continue;
+	var flag = parseInt(t[1]);
+	if (flag&0x100) continue;
+	if (first_only && last_qname == t[0]) continue;
+	if (t[2] == '*') {
+		++n_unmapped;
+		continue;
+	} else {
+		++n_pri;
+		if (last_qname != t[0]) ++n_mapped;
+	}
+
+	var pos = parseInt(t[3]) - 1, intron = [];
+	while ((m = re_cigar.exec(t[5])) != null) {
+		var len = parseInt(m[1]), op = m[2];
+		if (op == 'N') {
+			intron.push([pos, pos + len]);
+			pos += len;
+		} else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len;
+	}
+	if (intron.length == 0) {
+		++n_sgl;
+		continue;
+	}
+	n_splice += intron.length;
+
+	var chr = anno[t[2]];
+	if (chr != null) {
+		for (var i = 0; i < intron.length; ++i) {
+			var o = Interval.find_ovlp(chr, intron[i][0], intron[i][1]);
+			if (o.length > 0) {
+				var hit = false;
+				for (var j = 0; j < o.length; ++j) {
+					var st_diff = intron[i][0] - o[j][0];
+					var en_diff = intron[i][1] - o[j][1];
+					if (st_diff < 0) st_diff = -st_diff;
+					if (en_diff < 0) en_diff = -en_diff;
+					if (st_diff <= l_fuzzy && en_diff <= l_fuzzy)
+						++n_splice_hit, hit = true;
+					if (hit) break;
+				}
+				if (print_ovlp) {
+					var type = hit? 'C' : 'P';
+					if (hit && print_err_only) continue;
+					var x = '[';
+					for (var j = 0; j < o.length; ++j) {
+						if (j) x += ', ';
+						x += '(' + o[j][0] + "," + o[j][1] + ')';
+					}
+					x += ']';
+					print(type, t[0], i+1, t[2], intron[i][0], intron[i][1], x);
+				}
+			} else {
+				++n_splice_novel;
+				if (print_ovlp)
+					print('N', t[0], i+1, t[2], intron[i][0], intron[i][1]);
+			}
+		}
+	} else {
+		n_splice_novel += intron.length;
+	}
+	last_qname = t[0];
+}
+file.close();
+
+buf.destroy();
+
+if (!print_ovlp) {
+	print("# unmapped reads: " + n_unmapped);
+	print("# mapped reads: " + n_mapped);
+	print("# primary alignments: " + n_pri);
+	print("# singletons: " + n_sgl);
+	print("# predicted introns: " + n_splice);
+	print("# non-overlapping introns: " + n_splice_novel);
+	print("# correct introns: " + n_splice_hit + " (" + (n_splice_hit / n_splice * 100).toFixed(2) + "%)");
+}
diff --git a/misc/mapstat.js b/misc/mapstat.js
index 2f80bb4..98e42c3 100644
--- a/misc/mapstat.js
+++ b/misc/mapstat.js
@@ -94,7 +94,7 @@ while (file.readline(buf) >= 0) {
 			ori_qlen = parseInt(t[1]);
 		} else { // SAM
 			var flag = parseInt(t[1]);
-			if (flag & 4) continue;
+			if ((flag & 4) || t[2] == '*' || t[5] == '*') continue;
 			if (flag & 0x100) {
 				++n_2nd;
 				continue;
diff --git a/mmpriv.h b/mmpriv.h
index 32b3fd1..3c40c28 100644
--- a/mmpriv.h
+++ b/mmpriv.h
@@ -40,9 +40,11 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end);
 void radix_sort_64(uint64_t *beg, uint64_t *end);
 uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
 
+void mm_write_sam_SQ(const mm_idx_t *idx);
+void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *argv[]);
 void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
 void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);
-int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int64_t n, mm128_t *a, uint64_t **_u, void *km);
+int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km);
 mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
 
 mm_reg1_t *mm_gen_regs(void *km, int qlen, int n_u, uint64_t *u, mm128_t *a);
diff --git a/test/q2.fa b/test/q2.fa
new file mode 100644
index 0000000..99d63a2
--- /dev/null
+++ b/test/q2.fa
@@ -0,0 +1,2 @@
+>q2
+GGACATCCCGATGGTGCAGTCCTACCTGTACGAAAGGAC
diff --git a/test/t2.fa b/test/t2.fa
new file mode 100644
index 0000000..8c1d30b
--- /dev/null
+++ b/test/t2.fa
@@ -0,0 +1,2 @@
+>t2
+GGACATCCCGATGGTGCAGgtGCTATTAAAGGTTCGTTTGTTCAACGATTAAagTCCTACCTGTACGAAAGGAC
diff --git a/tex/minimap2.bib b/tex/minimap2.bib
index 75ace65..f6f03ec 100644
--- a/tex/minimap2.bib
+++ b/tex/minimap2.bib
@@ -179,3 +179,83 @@
 	note = {doi:10.1101/130633},
 	publisher = {Cold Spring Harbor Labs Journals},
 	journal = {bioRxiv}}
+
+@article{Gotoh:1982aa,
+	Author = {Gotoh, O},
+	Journal = {J Mol Biol},
+	Pages = {705-8},
+	Title = {An improved algorithm for matching biological sequences},
+	Volume = {162},
+	Year = {1982}}
+
+@article{Altschul:1986aa,
+	Author = {Altschul, S F and Erickson, B W},
+	Journal = {Bull Math Biol},
+	Pages = {603-16},
+	Title = {Optimal sequence alignment using affine gap costs},
+	Volume = {48},
+	Year = {1986}}
+
+@article{Wu:2005vn,
+	Author = {Wu, Thomas D and Watanabe, Colin K},
+	Journal = {Bioinformatics},
+	Pages = {1859-75},
+	Title = {{GMAP}: a genomic mapping and alignment program for {mRNA} and {EST} sequences},
+	Volume = {21},
+	Year = {2005}}
+
+@article{Iwata:2012aa,
+	Author = {Iwata, Hiroaki and Gotoh, Osamu},
+	Journal = {Nucleic Acids Res},
+	Pages = {e161},
+	Title = {Benchmarking spliced alignment programs including {Spaln2}, an extended version of {Spaln} that incorporates additional species-specific features},
+	Volume = {40},
+	Year = {2012}}
+
+@article{Dobin:2013kx,
+	Author = {Dobin, Alexander and others},
+	Journal = {Bioinformatics},
+	Pages = {15-21},
+	Title = {{STAR}: ultrafast universal {RNA-seq} aligner},
+	Volume = {29},
+	Year = {2013}}
+
+@article{Byrne:2017aa,
+	Author = {Byrne, Ashley and others},
+	Journal = {Nat Commun},
+	Pages = {16027},
+	Title = {Nanopore long-read {RNAseq} reveals widespread transcriptional variation among the surface receptors of individual {B} cells},
+	Volume = {8},
+	Year = {2017}}
+
+@article{Roberts:2004fv,
+	Author = {Roberts, Michael and others},
+	Journal = {Bioinformatics},
+	Pages = {3363-9},
+	Title = {Reducing storage requirements for biological sequence comparison},
+	Volume = {20},
+	Year = {2004}}
+
+@article{Zhang:2006aa,
+	Author = {Zhang, Miao and Gish, Warren},
+	Journal = {Bioinformatics},
+	Pages = {13-20},
+	Title = {Improved spliced alignment from an information theoretic approach},
+	Volume = {22},
+	Year = {2006}}
+
+@article{Li:2007aa,
+	Author = {Li, Heng and others},
+	Journal = {BMC Bioinformatics},
+	Pages = {349},
+	Title = {A cross-species alignment tool {(CAT)}},
+	Volume = {8},
+	Year = {2007}}
+
+@article{Farrar:2007hs,
+	Author = {Farrar, Michael},
+	Journal = {Bioinformatics},
+	Pages = {156-61},
+	Title = {{Striped Smith-Waterman speeds database searches six times over other SIMD implementations}},
+	Volume = {23},
+	Year = {2007}}
diff --git a/tex/minimap2.tex b/tex/minimap2.tex
index cd20bf7..35090fb 100644
--- a/tex/minimap2.tex
+++ b/tex/minimap2.tex
@@ -20,7 +20,7 @@
 \begin{document}
 \firstpage{1}
 
-\title[Long DNA sequence alignment with minimap2]{Minimap2: fast pairwise alignment for long DNA sequences}
+\title[Aligning long nucleotide sequences with minimap2]{Minimap2: fast pairwise alignment for long nucleotide sequences}
 \author[Li]{Heng Li}
 \address{Broad Institute, 415 Main Street, Cambridge, MA 02142, USA}
 
@@ -28,11 +28,14 @@
 
 \begin{abstract}
 \section{Summary:} Minimap2 is a general-purpose mapper to align long noisy DNA
-sequences against a large reference database. It targets query sequences of
-1kb--100Mb in length with per-base divergence typically below 25\%. Minimap2 is
-$\sim$30 times faster than many mainstream long-read aligners and achieves
-higher accuracy on simulated data. It also employs concave gap cost and rescues
-inversions for improved alignment around potential structural variations.
+or mRNA sequences against a large reference database. It targets query
+sequences of 1kb--100Mb in length with per-base divergence typically below
+25\%. For DNA sequence reads, minimap2 is $\sim$30 times faster than many
+mainstream long-read aligners and achieves higher accuracy on simulated data.
+It also employs concave gap cost and rescues inversions for improved alignment
+around potential structural variations. For real long RNA-seq reads, minimap2
+is $\sim$40 times faster than peers and produces alignment more consistent with
+existing gene annotations.
 
 \section{Availability and implementation:}
 \href{https://github.com/lh3/minimap2}{https://github.com/lh3/minimap2}
@@ -46,43 +49,62 @@ Single Molecule Real-Time (SMRT) sequencing technology and Oxford Nanopore
 technologies (ONT) produce reads over 10kbp in length at an error rate
 $\sim$15\%. Several aligners have been developed for such
 data~\citep{Chaisson:2012aa,Li:2013aa,Liu:2016ab,Sovic:2016aa,Liu:2017aa,Lin:2017aa,Sedlazeck169557}.
-They are usually five times as slow as mainstream short-read
-aligners~\citep{Langmead:2012fk,Li:2013aa}. We speculated there could be
-substantial room for speedup on the thought that 10kb long sequences should be
-easier to map than 100bp reads because we can more effectively skip repetitive
-regions, which are often the bottleneck of short-read alignment. We confirmed
-our speculation by achieving approximate mapping 50 times faster than
-BWA-MEM~\citep{Li:2016aa}.  \citet{Suzuki:2016} extended our work with a fast
-and novel algorithm on generating detailed alignment, which in turn inspired us
-to develop minimap2 towards higher accuracy and more practical functionality.
+Most of them were five times as slow as mainstream short-read
+aligners~\citep{Langmead:2012fk,Li:2013aa} in terms of the number of bases
+mapped per second. We speculated there could be substantial room for speedup on
+the thought that 10kb long sequences should be easier to map than 100bp reads
+because we can more effectively skip repetitive regions, which are often the
+bottleneck of short-read alignment. We confirmed our speculation by achieving
+approximate mapping 50 times faster than BWA-MEM~\citep{Li:2016aa}.
+\citet{Suzuki:2016} extended our work with a fast and novel algorithm on
+generating base-level alignment, which in turn inspired us to develop minimap2
+towards higher accuracy and more practical functionality.
+
+Both SMRT and ONT have been applied to sequence spliced mRNAs (RNA-seq). While
+traditional mRNA aligners work~\citep{Wu:2005vn,Iwata:2012aa}, they are not
+optimized for long noisy sequence reads and are tens of times slower than
+dedicated long-read aligners. When developing minimap2 initially for aligning
+genomic DNA only, we realized minor modifications could make it competitive for
+aligning mRNAs as well. Minimap2 is a first RNA-seq aligner specifically
+designed for long noisy reads.
 
 \begin{methods}
 \section{Methods}
 
-Minimap2 is the successor of minimap~\citep{Li:2016aa}. It uses similar
-indexing and seeding algorithms, and furthers it with more accurate chaining
-and the ability to produce detailed alignment.
+Minimap2 follows a typical seed-chain-align procedure as is used by most
+full-genome aligners. It collects minimizers~\citep{Roberts:2004fv} of the
+reference sequences and indexes them in a hash table. Then for each query
+sequence, minimap2 takes query minimizers as \emph{seeds}, finds matches to the
+reference, and identifies sets of colinear seeds, which are called
+\emph{chains}. If base-level alignment is requested, minimap2 applies dynamic
+programming (DP) to extend from the ends of chains and to close unseeded
+regions between adjacent seeds in chains.
+
+Minimap2 uses indexing and seeding algorithms similar to
+minimap~\citep{Li:2016aa}, and furthers the predecessor with more accurate
+chaining, the ability to produce base-level alignment and the support of
+spliced alignment.
 
 \subsection{Chaining}
 
-%\subsubsection{Chaining}
+\subsubsection{Chaining}
 An \emph{anchor} is a 3-tuple $(x,y,w)$, indicating interval $[x-w+1,x]$ on the
 reference matching interval $[y-w+1,y]$ on the query. Given a list of anchors
 sorted by ending reference position $x$, let $f(i)$ be the maximal chaining
 score up to the $i$-th anchor in the list. $f(i)$ can be calculated with
-dynamic programming (DP):
+dynamic programming:
 \begin{equation}\label{eq:chain}
-f(i)=\max\big\{\max_{i>j\ge 1} \{ f(j)+d(j,i)-\gamma(j,i) \},w_i\big\}
+f(i)=\max\big\{\max_{i>j\ge 1} \{ f(j)+\alpha(j,i)-\beta(j,i) \},w_i\big\}
 \end{equation}
-where $d(j,i)=\min\big\{\min\{y_i-y_j,x_i-x_j\},w_i\big\}$ is the number of
-matching bases between the two anchors. $\gamma(j,i)>0$ is the gap cost. It
+where $\alpha(j,i)=\min\big\{\min\{y_i-y_j,x_i-x_j\},w_i\big\}$ is the number of
+matching bases between the two anchors. $\beta(j,i)>0$ is the gap cost. It
 equals $\infty$ if $y_j\ge y_i$ or $\max\{y_i-y_j,x_i-x_j\}>G$ (i.e. the
 distance between two anchors is too large); otherwise
-\[
-\gamma(j,i)=\gamma'(\max\{y_i-y_j,x_i-x_j\}-\min\{y_i-y_j,x_i-x_j\})
-\]
-In implementation, a gap of length $l$ costs $\gamma'(l)=\alpha\cdot
-l+\beta\log_2(l)$. For $m$ anchors, directly computing all $f(\cdot)$ with
+\begin{equation}\label{eq:chain-gap}
+\beta(j,i)=\gamma_c\big((y_i-y_j)-(x_i-x_j)\big)
+\end{equation}
+In implementation, a gap of length $l$ costs $\gamma_c(l)=0.01\cdot \bar{w}\cdot
+|l|+0.5\log_2|l|$, where $\bar{w}$ is the average seed length. For $m$ anchors, directly computing all $f(\cdot)$ with
 Eq.~(\ref{eq:chain}) takes $O(m^2)$ time. Although theoretically faster
 chaining algorithms exist~\citep{Abouelhoda:2005aa}, they
 are inapplicable to generic gap cost, complex to implement and usually
@@ -91,20 +113,20 @@ accelerate chaining.
 
 We note that if anchor $i$ is chained to $j$, chaining $i$ to a predecessor
 of $j$ is likely to yield a lower score. When evaluating Eq.~(\ref{eq:chain}),
-we start from anchor $i-1$ and stop the evaluation if we cannot find a better
+we start from anchor $i-1$ and stop the process if we cannot find a better
 score after up to $h$ iterations. This approach reduces the average time to
 $O(h\cdot m)$. In practice, we can almost always find the optimal chain with
 $h=50$; even if the heuristic fails, the optimal chain is often close.
 
-%\subsubsection{Backtracking}
-For backtracking, let $P(i)$ be the index of the best predecessor of anchor
-$i$. It equals 0 if $f(i)=w_i$ or $\argmax_j\{f(j)+\eta(j,i)-\gamma(j,i)\}$
-otherwise. For each anchor $i$ in the descending order of $f(i)$, we apply
-$P(\cdot)$ repeatedly to find its predecessor and mark each visited $i$ as
-`used', until $P(i)=0$ or we reach an already `used' $i$. This way we find all
-chains with no anchors used in more than one chains.
+\subsubsection{Backtracking}
+Let $P(i)$ be the index of the best predecessor of anchor $i$. It equals 0 if
+$f(i)=w_i$ or $\argmax_j\{f(j)+\eta(j,i)-\gamma(j,i)\}$ otherwise. For each
+anchor $i$ in the descending order of $f(i)$, we apply $P(\cdot)$ repeatedly to
+find its predecessor and mark each visited $i$ as `used', until $P(i)=0$ or we
+reach an already `used' $i$. This way we find all chains with no anchors used
+in more than one chains.
 
-%\subsubsection{Identifying primary chains}
+\subsubsection{Identifying primary chains}
 In the absence of copy number changes, each query segment should not be mapped
 to two places in the reference. However, chains found at the previous step may
 have significant or complete overlaps due to repeats in the reference.
@@ -117,132 +139,22 @@ add the chain to $Q$. In the end, $Q$ contains all the primary chains. We did
 not choose a more sophisticated data structure (e.g. range tree or k-d tree)
 because this step is not the performance bottleneck.
 
-\subsection{Alignment}
+\subsection{Aligning genomic DNA}
 
-Minimap2 performs global alignment between adjacent anchors in a chain. It
-adopted difference-based formulation to derive
-alignment~\citep{Wu:1996aa,Suzuki:2016}. When combined with SSE vectorization,
-this formulation has two advantages. First, because each score in the DP matrix
-is bounded by the gap cost and the maximum matching score, we can usually
-achieve 16-way SSE vectorization regardless of the peak score of the
-alignment. Second, filling the DP matrix along the diagonal, we can simplify
-banded alignment, which is critical to performance. In practice, our
-implementation is three times as fast as Parasail's 4-way
-vectorization~\citep{Daily:2016aa} for global alignment.
-Without banding, our implementation is slower than Edlib~\citep{Sosic:2017aa},
-but with a 1000bp band, it is considerably faster. When performing global
-alignment between anchors, we expect the alignment to stay close to the
-diagonal of the DP matrix. Banding is often applicable.
+\subsubsection{Alignment with 2-piece affine gap cost}
 
-Minimap2 uses a 2-piece affine gap cost
-$\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\}$.
-On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, this
-cost function is concave. It applies cost $q+l\cdot e$ to gaps shorter than
-$\lceil(\tilde{q}-q)/(e-\tilde{e})\rceil$ and applies
-$\tilde{q}+l\cdot\tilde{e}$ to longer gaps. This scheme helps to recover
-longer insertions and deletions~(INDEL; \citealp{Gotoh:1990aa}).
+Minimap2 performs DP-based global alignment between adjacent anchors in a
+chain. It uses a 2-piece affine gap cost~\citep{Gotoh:1990aa}:
+\begin{equation}\label{eq:2-piece}
+\gamma_a(l)=\min\{q+|l|\cdot e,\tilde{q}+|l|\cdot\tilde{e}\}
+\end{equation}
+Without losing generality, we always assume $q+e<\tilde{q}+\tilde{e}$.
+On the condition that $e>\tilde{e}$, it applies cost $q+|l|\cdot e$ to gaps
+shorter than $\lceil(\tilde{q}-q)/(e-\tilde{e})\rceil$ and applies
+$\tilde{q}+|l|\cdot\tilde{e}$ to longer gaps.  This scheme helps to recover
+longer insertions and deletions~(INDELs).
 
-With global alignment, minimap2 may force to align unrelated sequences between
-two adjacent anchors. To avoid such an artifact, we compute accumulative
-alignment score along the alignment path and break the alignment where the
-score drops too fast in the diagonal direction. More precisely, let $S(i,j)$ be
-the alignment score along the alignment path ending at cell $(i,j)$ in the DP
-matrix. We break the alignment if there exist $(i',j')$ and $(i,j)$, $i'<i$ and
-$j'<j$, such that
-\[
-S(i',j')-S(i,j)>Z+e\cdot(\max\{i-i',j-j'\}-\min\{i-i',j-j'\})
-\]
-where $e$ is the gap extension cost and $Z$ is an arbitrary threshold.
-This strategy is similar to X-drop employed in BLAST~\citep{Altschul:1997vn}.
-However, unlike X-drop, it would not break the alignment in the presence of a
-single long gap.
-
-When minimap2 breaks a global alignment between two anchors, it performs local
-alignment between the two subsequences involved in the global alignment, but
-this time with the one subsequence reverse complemented. This additional
-alignment step may identify short inversions that are missed during chaining.
-
-\end{methods}
-
-\section{Results}
-\begin{figure}[!tb]
-\centering
-\includegraphics[width=.5\textwidth]{roc-color.pdf}
-\caption{Evaluation on simulated SMRT reads aligned against human genome
-GRCh38. (a) ROC-like curve. (b) Accumulative mapping error rate as a function
-of mapping quality. 33,088 $\ge$1000bp reads were simulated using
-pbsim~\citep{Ono:2013aa} with error profile sampled from file
-`m131017\_060208\_42213\_*.1.*' downloaded at
-\href{http://bit.ly/chm1p5c3}{http://bit.ly/chm1p5c3}. The N50 read length is
-11,628. A read is considered correctly mapped if the true position overlaps
-with the best mapping position by 10\% of the read length. All aligners were
-run under the default setting for SMRT reads.}\label{fig:eval}
-\end{figure}
-
-As a sanity check, we evaluated minimap2 on simulated human reads along with
-BLASR~\citep{Chaisson:2012aa}, 
-BWA-MEM~\citep{Li:2013aa},
-GraphMap~\citep{Sovic:2016aa},
-minialign~\citep{Suzuki:2016} and
-NGMLR~\citep{Sedlazeck169557}. We excluded rHAT~\citep{Liu:2016ab},
-LAMSA~\citep{Liu:2017aa} and Kart~\citep{Lin:2017aa} because they either
-crashed or produced malformatted output. In this evaluation, Minimap2 has
-higher power to distinguish unique and repetitive hits, and achieves overall
-higher mapping accuracy (Fig.~\ref{fig:eval}a). It is still the most accurate
-even if we skip DP-based alignment (data not shown), suggesting chaining alone
-is sufficient to achieve high accuracy for approximate mapping. Minimap2 and
-NGMLR provide better mapping quality estimate: they rarely give repetitive hits
-high mapping quality (Fig.~\ref{fig:eval}b).  Apparently, other aligners may
-occasionally miss close suboptimal hits and be overconfident in wrong mappings.
-On run time, minialign is slightly faster than minimap2. They are over 30 times
-faster than the rest.  Minimap2 consumed 6.1GB memory at the peak, more than
-BWA-MEM but less than others.
-
-On real human SMRT reads, the relative performance and sensitivity of
-these aligners are broadly similar to those on simulated data. We are unable to
-provide a good estimate of mapping error rate due to the lack of the truth.  On
-ONT ultra-long human reads~\citep{Jain128835}, BWA-MEM failed. Minialign and
-minimap2 are over 70 times faster than others. We have also examined tens of
-$\ge$100bp INDELs in IGV~\citep{Robinson:2011aa} and can confirm the
-observation by~\citet{Sedlazeck169557} that BWA-MEM often breaks them into
-shorter gaps. Minimap2 does not have this issue.
-
-\section{Discussions}
-
-Minialign and minimap2 are fast because a) with chaining, they can quickly
-filter out most false seed hits~\citep{Li:2016aa} and reduce unsuccessful but
-costly DP-based alignments; b) they implemented so far the fastest DP-based
-alignment algorithm for long sequences~\citep{Suzuki:2016}. It is possible to
-further accelerate minimap2 with a few other tweaks such as adaptive
-banding~\citep{Suzuki130633} or incremental banding.
-
-In addition to reference-based read mapping, minimap2 inherits minimap's
-ability to search against huge multi-species databases and to find read
-overlaps. On a few test data sets, minimap2 appears to yield slightly better
-miniasm assembly. Minimap2 can also align closely related genomes, though it
-would benefit from more thorough evaluations. Genome alignment is an intricate
-topic.
-
-\section*{Acknowledgements}
-We owe a debt of gratitude to Hajime Suzuki for releasing his masterpiece and
-insightful notes before formal publication. We thank M. Schatz, P. Rescheneder
-and F.  Sedlazeck for pointing out the limitation of BWA-MEM. We are also
-grateful to early minimap2 testers who have greatly helped to fix various
-issues.
-
-\bibliography{minimap2}
-
-\pagebreak
-\appendix
-
-\begin{methods}
-\section*{Appendix}
-A 2-piece gap cost function is
-\[
-\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\}
-\]
-Without losing generality, we assume $q+e\le\tilde{q}+\tilde{e}$. The equation
-to compute the optimal alignment under such a gap cost is~\citep{Gotoh:1990aa}
+The equation to compute the optimal alignment under $\gamma_a(\cdot)$ is
 \begin{equation}\label{eq:ae86}
 \left\{\begin{array}{l}
 H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij},\tilde{F}_{ij}\}\\
@@ -253,7 +165,20 @@ F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\
 \end{array}\right.
 \end{equation}
 where $s(i,j)$ is the score between the $i$-th reference base and $j$-th query
-base. If we define
+base. Eq.~(\ref{eq:ae86}) is a natural extension to the equation under affine
+gap cost~\citep{Gotoh:1982aa,Altschul:1986aa}.
+
+\subsubsection{Suzuki's formulation}
+
+When we allow gaps longer than several hundred base pairs, nucleotide-level
+alignment is much slower than chaining. SSE acceleration is critical to the
+performance of minimap2. Traditional SSE implementations~\citep{Farrar:2007hs}
+based on Eq.~(\ref{eq:ae86}) can achieve 16-way parallelization for short
+sequences, but only 4-way parallelization when the peak alignment score reaches
+32767. Long sequence alignment may exceed this threshold. Inspired by
+\citet{Wu:1996aa} and the following work, \citet{Suzuki:2016} proposed a
+difference-based formulation that lifted this limitation.
+In case of 2-piece gap cost, define
 \[
 \left\{\begin{array}{ll}
 u_{ij}\triangleq H_{ij}-H_{i-1,j} & v_{ij}\triangleq H_{ij}-H_{i,j-1} \\
@@ -261,7 +186,7 @@ x_{ij}\triangleq E_{i+1,j}-H_{ij} & \tilde{x}_{ij}\triangleq \tilde{E}_{i+1,j}-\
 y_{ij}\triangleq F_{i,j+1}-H_{ij} & \tilde{y}_{ij}\triangleq \tilde{F}_{i,j+1}-\tilde{H}_{ij}
 \end{array}\right.
 \]
-we can transform Eq.~(\ref{eq:ae86}) to
+We can transform Eq.~(\ref{eq:ae86}) to
 \begin{equation}\label{eq:suzuki}
 \left\{\begin{array}{lll}
 z_{ij}&=&\max\{s(i,j),x_{i-1,j}+v_{i-1,j},y_{i,j-1}+u_{i,j-1},\\
@@ -276,7 +201,8 @@ y_{ij}&=&\max\{0,y_{i,j-1}+u_{i,j-1}-z_{ij}+q\}-q-e\\
 \end{equation}
 where $z_{ij}$ is a temporary variable that does not need to be stored.
 
-All values in Eq.~(\ref{eq:suzuki}) are bounded. To see that,
+An important property of Eq.~(\ref{eq:suzuki}) is that all values are bounded
+by scoring parameters. To see that,
 \[
 x_{ij}=E_{i+1,j}-H_{ij}=\max\{-q,E_{ij}-H_{ij}\}-e
 \]
@@ -294,19 +220,19 @@ matching score, we can derive
 \[
 u_{ij}\le M-v_{i-1,j}\le M+q+e
 \]
-In conclusion, $x$ and $y$ are bounded by $[-q-e,-e]$, $\tilde{x}$ and $\tilde{y}$ by
-$[-\tilde{q}-\tilde{e},-\tilde{e}]$, and $u$ and $v$ by $[-q-e,M+q+e]$. When
-matching score and gap cost are small, each of them can be stored as a 8-bit
-integer. This enables 16-way SSE vectorization regardless of the peak score of
-the alignment.
+In conclusion, in Eq.~(\ref{eq:suzuki}), $x$ and $y$ are bounded by $[-q-e,-e]$,
+$\tilde{x}$ and $\tilde{y}$ by $[-\tilde{q}-\tilde{e},-\tilde{e}]$, and $u$ and
+$v$ by $[-q-e,M+q+e]$. When $-128\le-q-e<M+q+e\le127$, each of them can be stored as
+a 8-bit integer. This enables 16-way SSE vectorization regardless of the peak
+score of the alignment.
 
 For a more efficient SSE implementation, we transform the row-column coordinate
-to diagonal-anti-diagonal coordinate by letting $r\gets i+j$ and $t\gets i$.
+to the diagonal-antidiagonal coordinate by letting $r\gets i+j$ and $t\gets i$.
 Eq.~(\ref{eq:suzuki}) becomes:
 \begin{equation*}
 \left\{\begin{array}{lll}
-z_{rt}&=&\max\{s(t,r-t),x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}+u_{r-1,t},\\
-&&\tilde{x}_{r-1,t-1}+v_{r-1,t-1},\tilde{y}_{r-1,t}+u_{r-1,t}\}\\
+z_{rt}&=&\max\{s(t,r-t),x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}\\
+&&+u_{r-1,t},\tilde{x}_{r-1,t-1}+v_{r-1,t-1},\tilde{y}_{r-1,t}+u_{r-1,t}\}\\
 u_{rt}&=&z_{rt}-v_{r-1,t-1}\\
 v_{rt}&=&z_{rt}-u_{r-1,t}\\
 x_{rt}&=&\max\{0,x_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+q\}-q-e\\
@@ -317,10 +243,11 @@ y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\}-q-e\\
 \end{equation*}
 In this formulation, cells with the same diagonal index $r$ are independent of
 each other. This allows us to fully vectorize the computation of all cells on
-the same anti-diagonal in one inner loop.
+the same anti-diagonal in one inner loop. It also simplifies banded alignment,
+which would be difficult with striped vectorization~\citep{Farrar:2007hs}.
 
-On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the boundary
-condition of this equation in the diagonal-anti-diagonal coordinate is
+On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the initial
+values in the diagonal-antidiagonal formuation is
 \[
 \left\{\begin{array}{l}
 x_{r-1,-1}=y_{r-1,r}=-q-e\\
@@ -337,8 +264,246 @@ r\cdot(e-\tilde{e})-(\tilde{q}-q)-\tilde{e} & (r=\lceil\frac{\tilde{q}-q}{e-\til
 -\tilde{e} & (r>\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil)
 \end{array}\right.
 \]
+These can be derived from the initial values for Eq.~(\ref{eq:ae86}).
+
+In practice, our 16-way vectorized implementation of global alignment is three
+times as fast as Parasail's 4-way vectorization~\citep{Daily:2016aa}.  Without
+banding, our implementation is slower than Edlib~\citep{Sosic:2017aa}, but with
+a 1000bp band, it is considerably faster. When performing global alignment
+between anchors, we expect the alignment to stay close to the diagonal of the
+DP matrix. Banding is applicable most of time.
+
+\subsubsection{The Z-drop heuristic}
+
+With global alignment, minimap2 may force to align unrelated sequences between
+two adjacent anchors. To avoid such an artifact, we compute accumulative
+alignment score along the alignment path and break the alignment where the
+score drops too fast in the diagonal direction. More precisely, let $S(i,j)$ be
+the alignment score along the alignment path ending at cell $(i,j)$ in the DP
+matrix. We break the alignment if there exist $(i',j')$ and $(i,j)$, $i'<i$ and
+$j'<j$, such that
+\[
+S(i',j')-S(i,j)>Z+e\cdot|(i-i')-(j-j')|
+\]
+where $e$ is the gap extension cost and $Z$ is an arbitrary threshold.
+This strategy is first used in BWA-MEM. It is similar to X-drop employed in
+BLAST~\citep{Altschul:1997vn}, but unlike X-drop, it would not break the
+alignment in the presence of a single long gap.
+
+When minimap2 breaks a global alignment between two anchors, it performs local
+alignment between the two subsequences involved in the global alignment, but
+this time with the one subsequence reverse complemented. This additional
+alignment step may identify short inversions that are missed during chaining.
+
+\subsection{Aligning spliced sequences}
+
+The algorithm described above can be adapted to spliced alignment. In this
+mode, the chaining gap cost distinguishes insertions to and deletions from the
+reference: $\gamma_c(l)$ in Eq.~(\ref{eq:chain-gap}) takes the form of
+\[
+\gamma_c(l)=\left\{\begin{array}{ll}
+0.01\cdot\bar{w}\cdot l+0.5\log_2 l & (l>0) \\
+\min\{0.01\cdot\bar{w}\cdot|l|,\log_2|l|\} & (l<0)
+\end{array}\right.
+\]
+Similarly, the gap cost function used for DP-based alignment is changed to
+\[
+\gamma_a(l)=\left\{\begin{array}{ll}
+q+l\cdot e & (l>0) \\
+\min\{q+|l|\cdot e,\tilde{q}\} & (l<0)
+\end{array}\right.
+\]
+In alignment, a deletion no shorter than $\lceil(\tilde{q}-q)/e\rceil$ is
+regarded as an intron, which pays no cost to gap extensions.
+
+To pinpoint precise splicing junctions, minimap2 introduces reference-dependent
+cost to penalize non-canonical splicing:
+\begin{equation}\label{eq:splice}
+\left\{\begin{array}{l}
+H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij}-a(i)\}\\
+E_{i+1,j}= \max\{H_{ij}-q,E_{ij}\}-e\\
+F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\
+\tilde{E}_{i+1,j}= \max\{H_{ij}-d(i)-\tilde{q},\tilde{E}_{ij}\}\\
+\end{array}\right.
+\end{equation}
+Let $T$ be the reference sequence. $d(i)$ is the cost of a non-canonical donor
+site, which takes 0 if $T[i+1,i+2]={\tt GT}$, or a positive number $p$
+otherwise. Similarly, $a(i)$ is the cost of a non-canonical acceptor site, which
+takes 0 if $T[i-1,i]={\tt AG}$, or $p$ otherwise. Eq.~(\ref{eq:splice}) is
+almost equivalent to the equation used by EXALIN~\citep{Zhang:2006aa} except
+that we allow insertions immediately followed by deletions and vice versa; in
+addition, we use Suzuki's diagonal formulation in actual implementation.
+
+%Given that $d_i$ and $a_i$
+%are a function of the reference sequence, it is possible to incorporate
+%splicing signals with more sophisticated models, such as positional weight
+%matrices. We have not tried this approach.
+
+If RNA-seq reads are not sequenced from stranded libraries, the read strand
+relative to the underlying transcript is unknown. By default, minimap2 aligns
+each chain twice, first assuming ${\tt GT}$--${\tt AG}$ as the splicing signal
+and then assuming ${\tt CT}$--${\tt AC}$, the reverse complement of ${\tt
+GT}$--${\tt AG}$, as the splicing signal. The alignment with a higher score is
+taken as the final alignment. This procedure also infers the relative strand of
+reads that span canonical splicing sites.
+
+In the spliced alignment mode, minimap2 further increases the density of
+minimizers and disables banded alignment. Together with the two-round DP-based
+alignment, spliced alignment is several times slower than DNA sequence
+alignment.
 
-\citet{Suzuki:2016} first derived a similar set of equations under affine gap
-cost but with different notations.
 \end{methods}
+
+\section{Results}
+
+\subsection{Aligning genomic reads}
+
+\begin{figure}[!tb]
+\centering
+\includegraphics[width=.5\textwidth]{roc-color.pdf}
+\caption{Evaluation on simulated SMRT reads aligned against human genome
+GRCh38.  33,088 $\ge$1000bp reads were simulated using pbsim~\citep{Ono:2013aa}
+with error profile sampled from file `m131017\_060208\_42213\_*.1.*' downloaded
+at \href{http://bit.ly/chm1p5c3}{http://bit.ly/chm1p5c3}. The N50 read length
+is 11,628. A read is considered correctly mapped if the true position overlaps
+with the best mapping position by 10\% of the read length. All aligners were
+run under the default setting for SMRT reads.  (a) ROC-like curve. Alignments
+are sorted by mapping quality in the descending order. For each mapping quality
+threshold, the fraction of alignments with mapping quality above the threshold
+and their error rate are plotted.  Kart outputted all alignments at mapping
+quality 60, so is not shown in the figure. It mapped nearly all reads with
+4.1\% of alignments being wrong, less accurate than others.  (b) Accumulative
+mapping error rate as a function of mapping quality.}\label{fig:eval}
+\end{figure}
+
+As a sanity check, we evaluated minimap2 on simulated human reads along with
+BLASR~(v1.MC.rc64; \citealp{Chaisson:2012aa}),
+BWA-MEM~(v0.7.15; \citealp{Li:2013aa}),
+GraphMap~(v0.5.2; \citealp{Sovic:2016aa}),
+Kart~(v2.2.5; \citealp{Lin:2017aa}),
+minialign~(v0.5.3; \citealp{Suzuki:2016}) and
+NGMLR~(v0.2.5; \citealp{Sedlazeck169557}). We excluded rHAT~\citep{Liu:2016ab}
+and LAMSA~\citep{Liu:2017aa} because they either
+crashed or produced malformatted output. In this evaluation, minimap2 has
+higher power to distinguish unique and repetitive hits, and achieves overall
+higher mapping accuracy (Fig.~\ref{fig:eval}a). It is still the most accurate
+even if we skip DP-based alignment (data not shown), confirming chaining alone
+is sufficient to achieve high accuracy for approximate mapping. Minimap2 and
+NGMLR provide better mapping quality estimate: they rarely give repetitive hits
+high mapping quality (Fig.~\ref{fig:eval}b).  Apparently, other aligners may
+occasionally miss close suboptimal hits and be overconfident in wrong mappings.
+On run time, minialign is slightly faster than minimap2 and Kart. They are over
+30 times faster than the rest.  Minimap2 consumed 6.1GB memory at the peak,
+more than BWA-MEM but less than others.
+
+On real human SMRT reads, the relative performance and sensitivity of
+these aligners are broadly similar to the metrics on simulated data. We are
+unable to provide a good estimate of mapping error rate due to the lack of the
+truth.  On ONT $\sim$100kb human reads~\citep{Jain128835}, BWA-MEM failed.
+Kart, minialign and minimap2 are over 70 times faster than others. We have also
+examined tens of $\ge$100bp INDELs in IGV~\citep{Robinson:2011aa} and can
+confirm the observation by~\citet{Sedlazeck169557} that BWA-MEM often breaks
+them into shorter gaps. The issue is much alleviated with minimap2, thanks
+to the 2-piece affine gap cost.
+
+\subsection{Aligning spliced reads}
+
+We evaluated minimap2 on SIRV control data~(AC:SRR5286959;
+\citealp{Byrne:2017aa}) where the truth is known. Minimap2 predicted 59\,916
+introns from 11\,017 reads. 93.0\% of splice juctions are precise. We examined
+wrongly predicted junctions and found the majority were caused by clustered
+splicing signals (e.g. two adjacent ${\tt GT}$ sites). When INDEL sequencing
+errors are frequent, it is difficult to find precise splicing sites in this
+case. If we allow up to 10bp distance from true splicing sites, 98.4\% of
+aligned introns are approximately correct.  Given this observation, we might be
+able to improve boundary detection by initializing $d(\cdot)$ and $a(\cdot)$ in
+Eq.~(\ref{eq:splice}) with position-specific scoring matrices or more
+sophisticated models. We have not tried this approach.
+
+\begin{table}[!tb]
+\processtable{Evaluation of junction accuracy on 2D ONT reads}
+{\footnotesize\label{tab:intron}
+\begin{tabular}{p{3.1cm}rrrr}
+\toprule
+& GMAP & minimap2 & SpAln & STAR\\
+\midrule
+Run time (CPU min) & 631 & 15.5 & 2\,076 & 33.9 \\
+Peak RAM (GByte) & 8.9 & 14.5 & 3.2 & 29.2\vspace{1em}\\
+\# aligned reads & 103\,669 & 103\,917 & 103\,711 & 26\,479\\
+\# chimeric alignments & 1\,904 & 1\,671 & 0 & 0\\
+\# non-spliced alignments & 15\,854 & 14\,483 & 17\,033 & 10\,545\vspace{1em}\\
+\# aligned introns & 692\,275 & 694\,237 & 692\,945 & 78\,603 \\
+\# novel introns & 11\,239 & 3\,217 & 8\,550 & 1\,214 \\
+\% exact introns & 83.8\% & 91.8\% & 87.9\% & 55.2\% \\
+\% approx. introns & 91.8\% & 96.5\% & 92.5\% & 82.4\% \\
+\botrule
+\end{tabular}
+}{Mouse reads (AC:SRR5286960) were mapped to the primary assembly of mouse
+genome GRCm38 with the following tools and command options: minimap2 (`-ax
+splice'); GMAP (`-n 0 --min-intronlength 30 --cross-species'); SpAln (`-Q7 -LS
+-S3'); STARlong (according to
+\href{http://bit.ly/star-pb}{http://bit.ly/star-pb}). The alignments were
+compared to the EnsEMBL gene annotation, release 89. A predicted intron
+is \emph{novel} if it has no overlaps with any annotated introns. An intron
+is \emph{exact} if it is identical to an annotated intron. An intron is
+\emph{approximate} if both its 5'- and 3'-end are within 10bp around the ends
+of an annotated intron.}
+\end{table}
+
+We next aligned real mouse reads~\citep{Byrne:2017aa} with GMAP~(v2017-06-20;
+\citealp{Wu:2005vn}), minimap2, SpAln~(v2.3.1; \citealp{Iwata:2012aa}) and
+STAR~(v2.5.3a; \citealp{Dobin:2013kx}). In general, minimap2 is more
+consistent with existing annotations (Table~\ref{tab:intron}): it finds
+more junctions with a higher percentage being exactly or approximately correct.
+Minimap2 is over 40 times faster than GMAP and SpAln. While STAR is close to
+minimap2 in speed, it does not work well with noisy reads.  We have also
+evaluated spliced aligners on public Iso-Seq data (human Alzheimer brain
+from \href{http://bit.ly/isoseqpub}{http://bit.ly/isoseqpub}). The observation
+is similar: minimap2 is faster at higher junction accuracy.
+
+We noted that GMAP and SpAln have not been optimized for noisy reads. We are
+showing the best setting we have experimented, but their developers should be
+able to improve their accuracy further.
+
+%\begin{table}[!tb]
+%\processtable{Evaluation of junction accuracy on SMRT Iso-Seq reads}
+%{\footnotesize
+%\begin{tabular}{lrrrr}
+%\toprule
+%& GMAP & minimap2 & SpAln & STAR\\
+%\midrule
+%Run time (CPU min) & & 243 & 2\,352 & 1\,647 \\
+%\# aligned reads & & 1\,123\,025 & 1\,094\,092 & 682\,452\\
+%\# chimeric alignments & & 33\,091 & 0 & 0\\
+%\# non-spliced alignments & & 339\,081 & 291\,447 & 272\,536\vspace{1em}\\
+%\# aligned introns & & 9\,071\,755 & 9\,208\,564 & 3\,029\,121 \\
+%\# novel introns & & 42\,773 & 82\,230 & 17\,791 \\
+%\% exact introns & & 94.9\% & 91.7\% & 84.7\% \\
+%\% approx. introns&& 96.9\% & 93.4\% & 93.8\% \\
+%\botrule
+%\end{tabular}
+%}{}
+%\end{table}
+
+
+\section{Conclusion}
+
+Minimap2 is a fast, accurate and versatile aligner for long nucleotide
+sequences.  In addition to reference-based read mapping, minimap2 inherits
+minimap's functionality to search against huge multi-species databases and to
+find read overlaps. On a few test data sets, minimap2 appears to yield slightly
+better miniasm assembly~\citep{Li:2016aa}. Minimap2 can also align similar
+genomes or different assemblies of the same species. However, full-genome
+alignment is an intricate research topic. More thorough evaluations would be
+necessary to justify the use of minimap2 for such applications.
+
+\section*{Acknowledgements}
+We owe a debt of gratitude to Hajime Suzuki for releasing his masterpiece and
+insightful notes before formal publication. We thank M. Schatz, P. Rescheneder
+and F.  Sedlazeck for pointing out the limitation of BWA-MEM. We are also
+grateful to early minimap2 testers who have greatly helped to suggest features
+and to fix various issues.
+
+\bibliography{minimap2}
+
 \end{document}