Merge branch 'master' into short

This commit is contained in:
Heng Li 2017-08-28 07:02:22 +08:00
commit c4080aaf7e
23 changed files with 1487 additions and 322 deletions

View File

@ -2,8 +2,8 @@ CC= gcc
CFLAGS= -g -Wall -O2 -Wc++-compat
CPPFLAGS= -DHAVE_KALLOC
INCLUDES= -I.
OBJS= kthread.o kalloc.o ksw2_extz2_sse.o ksw2_extd2_sse.o ksw2_ll_sse.o misc.o bseq.o \
sketch.o sdust.o index.o chain.o align.o hit.o map.o format.o
OBJS= kthread.o kalloc.o ksw2_extz2_sse.o ksw2_extd2_sse.o ksw2_exts2_sse.o ksw2_ll_sse.o \
misc.o bseq.o sketch.o sdust.o index.o chain.o align.o hit.o map.o format.o
PROG= minimap2
PROG_EXTRA= sdust minimap2-lite
LIBS= -lm -lz -lpthread
@ -45,11 +45,12 @@ align.o: minimap.h mmpriv.h bseq.h ksw2.h kalloc.h
bseq.o: bseq.h kseq.h
chain.o: minimap.h mmpriv.h bseq.h kalloc.h
example.o: minimap.h kseq.h
format.o: mmpriv.h minimap.h bseq.h
format.o: kalloc.h mmpriv.h minimap.h bseq.h
hit.o: mmpriv.h minimap.h bseq.h kalloc.h
index.o: kthread.h bseq.h minimap.h mmpriv.h kvec.h kalloc.h khash.h
kalloc.o: kalloc.h
ksw2_extd2_sse.o: ksw2.h kalloc.h
ksw2_exts2_sse.o: ksw2.h kalloc.h
ksw2_extz2_sse.o: ksw2.h kalloc.h
ksw2_ll_sse.o: ksw2.h kalloc.h
main.o: bseq.h minimap.h mmpriv.h

38
NEWS.md
View File

@ -1,3 +1,41 @@
Release 2.1-r311 (25 August 2017)
---------------------------------
This release adds spliced alignment for long noisy RNA-seq reads. On a SMRT
Iso-Seq and a Oxford Nanopore data sets, minimap2 appears to outperform
traditional mRNA aligners. For DNA alignment, this release gives almost
identical output to v2.0. Other changes include:
* Added option `-R` to set the read group header line in SAM.
* Optionally output the `cs:Z` tag in PAF to encode both the query and the
reference sequences in the alignment.
* Fixed an issue where DP alignment uses excessive memory.
The minimap2 technical report has been updated with more details and the
evaluation of spliced alignment:
* Li, H. (2017). Minimap2: fast pairwise alignment for long nucleotide
sequences. [arXiv:1708.01492v2](https://arxiv.org/abs/1708.01492v2).
(2.1: 25 August 2017, r311)
Release 2.0-r275 (8 August 2017)
--------------------------------
This release is identical to version 2.0rc1, except the version number. It is
described and evaluated in the following technical report:
* Li, H. (2017). Minimap2: fast pairwise alignment for long DNA sequences.
[arXiv:1708.01492v1](https://arxiv.org/abs/1708.01492v1).
(2.0: 8 August 2017, r275)
Release 2.0rc1-r232 (30 July 2017)
----------------------------------

View File

@ -1,3 +1,4 @@
[![Build Status](https://travis-ci.org/lh3/minimap2.svg?branch=master)](https://travis-ci.org/lh3/minimap2)
## Getting Started
```sh
git clone https://github.com/lh3/minimap2
@ -9,6 +10,8 @@ cd minimap2 && make
./minimap2 -ax map10k MT-human.mmi test/MT-orang.fa > test.sam
# long-read overlap (no test data)
./minimap2 -x ava-pb your-reads.fa your-reads.fa > overlaps.paf
# spliced alignment (no test data)
./minimap2 -ax splice ref.fa rna-seq-reads.fa > spliced.sam
# man page
man ./minimap2.1
```

74
align.c
View File

@ -53,7 +53,7 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c
} else if (op == 1) {
score -= q + e * len, j += len;
if (test_zdrop_aux(score, i, j, &max, &max_i, &max_j, e, zdrop)) return 1;
} else if (op == 2) {
} else if (op == 2 || op == 3) {
score -= q + e * len, i += len;
if (test_zdrop_aux(score, i, j, &max, &max_i, &max_j, e, zdrop)) return 1;
}
@ -64,11 +64,11 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c
static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e)
{
uint32_t k, l, toff = 0, qoff = 0;
int32_t s = 0, max = 0;
int32_t s = 0, max = 0, n_gtag = 0, n_ctac = 0;
if (p == 0) return;
for (k = 0; k < p->n_cigar; ++k) {
uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
if (op == 0) {
if (op == 0) { // match/mismatch
for (l = 0; l < len; ++l) {
int cq = qseq[qoff + l], ct = tseq[toff + l];
if (ct > 3 || cq > 3) ++p->n_ambi;
@ -78,7 +78,7 @@ static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *t
else max = max > s? max : s;
}
toff += len, qoff += len, p->blen += len;
} else if (op == 1) {
} else if (op == 1) { // insertion
int n_ambi = 0;
for (l = 0; l < len; ++l)
if (qseq[qoff + l] > 3) ++n_ambi;
@ -86,7 +86,7 @@ static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *t
p->n_ambi += n_ambi, p->n_diff += len - n_ambi;
s -= q + e * len;
if (s < 0) s = 0;
} else if (op == 2) {
} else if (op == 2) { // deletion
int n_ambi = 0;
for (l = 0; l < len; ++l)
if (tseq[toff + l] > 3) ++n_ambi;
@ -94,9 +94,18 @@ static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *t
p->n_ambi += n_ambi, p->n_diff += len - n_ambi;
s -= q + e * len;
if (s < 0) s = 0;
} else if (op == 3) { // intron
uint8_t b[4];
b[0] = tseq[toff], b[1] = tseq[toff+1];
b[2] = tseq[toff+len-2], b[3] = tseq[toff+len-1];
if (memcmp(b, "\2\3\0\2", 4) == 0) ++n_gtag;
else if (memcmp(b, "\1\3\0\1", 4) == 0) ++n_ctac;
toff += len, p->blen += len;
}
}
p->dp_max = max;
if (n_gtag > n_ctac) p->trans_strand = 1;
else if (n_gtag < n_ctac) p->trans_strand = 2;
}
static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) // TODO: this calls the libc realloc()
@ -132,7 +141,9 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
for (i = 0; i < tlen; ++i) fputc("ACGTN"[tseq[i]], stderr); fputc('\n', stderr);
for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr); fputc('\n', stderr);
}
if (opt->q == opt->q2 && opt->e == opt->e2)
if (opt->flag & MM_F_SPLICE)
ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, opt->zdrop, flag, ez);
else if (opt->q == opt->q2 && opt->e == opt->e2)
ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, opt->zdrop, flag, ez);
else
ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, w, opt->zdrop, flag, ez);
@ -159,8 +170,8 @@ static inline void mm_adjust_minier(const mm_idx_t *mi, uint8_t *const qseq0[2],
c = mm_get_hplen_back(mi, a->x<<1>>33, (int32_t)a->x);
*r = (int32_t)a->x + 1 - c;
} else {
*r = (int32_t)a->x + 1;
*q = (int32_t)a->y + 1;
*r = (int32_t)a->x - (mi->k>>1);
*q = (int32_t)a->y - (mi->k>>1);
}
}
@ -240,11 +251,11 @@ static void mm_fix_bad_ends(const mm_reg1_t *r, const mm128_t *a, int bw, int32_
}
}
static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], mm_reg1_t *r, mm_reg1_t *r2, mm128_t *a, ksw_extz_t *ez)
static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], mm_reg1_t *r, mm_reg1_t *r2, mm128_t *a, ksw_extz_t *ez, int splice_flag)
{
int32_t rid = a[r->as].x<<1>>33, rev = a[r->as].x>>63, as1, cnt1;
uint8_t *tseq, *qseq;
int32_t i, l, bw, dropped = 0, rs0, re0, qs0, qe0;
int32_t i, l, bw, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0;
int32_t rs, re, qs, qe;
int32_t rs1, qs1, re1, qe1;
int8_t mat[25];
@ -254,11 +265,19 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
bw = (int)(opt->bw * 1.5 + 1.);
r2->cnt = 0;
mm_fix_bad_ends(r, a, opt->bw, &as1, &cnt1);
if (!(opt->flag & MM_F_SPLICE))
mm_fix_bad_ends(r, a, opt->bw, &as1, &cnt1);
else as1 = r->as, cnt1 = r->cnt;
mm_filter_bad_seeds(km, as1, cnt1, a, 10, 40, opt->max_gap>>1, 10);
mm_adjust_minier(mi, qseq0, &a[as1], &rs, &qs);
mm_adjust_minier(mi, qseq0, &a[as1 + cnt1 - 1], &re, &qe);
if (opt->flag & MM_F_SPLICE) {
if (splice_flag & MM_F_SPLICE_FOR) extra_flag |= rev? KSW_EZ_SPLICE_REV : KSW_EZ_SPLICE_FOR;
if (splice_flag & MM_F_SPLICE_REV) extra_flag |= rev? KSW_EZ_SPLICE_FOR : KSW_EZ_SPLICE_REV;
if (splice_flag & MM_F_SPLICE_BOTH) extra_flag |= KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV;
}
// compute rs0 and qs0
if (r->split && as1 > 0) {
mm_adjust_minier(mi, qseq0, &a[as1-1], &rs0, &qs0);
@ -291,7 +310,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
mm_idx_getseq(mi, rid, rs0, rs, tseq);
mm_seq_rev(qs - qs0, qseq);
mm_seq_rev(rs - rs0, tseq);
mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, mat, bw, KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez);
mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, mat, bw, extra_flag|KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez);
if (ez->n_cigar > 0) {
mm_append_cigar(r, ez->n_cigar, ez->cigar);
r->p->dp_score += ez->max;
@ -313,9 +332,9 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
bw1 = qe - qs > re - rs? qe - qs : re - rs;
qseq = &qseq0[rev][qs];
mm_idx_getseq(mi, rid, rs, re, tseq);
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, KSW_EZ_APPROX_MAX, ez);
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag|KSW_EZ_APPROX_MAX, ez);
if (mm_check_zdrop(qseq, tseq, ez->n_cigar, ez->cigar, mat, opt->q, opt->e, opt->zdrop))
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, 0, ez);
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag, ez);
if (ez->n_cigar > 0)
mm_append_cigar(r, ez->n_cigar, ez->cigar);
if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle.
@ -338,7 +357,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
if (!dropped && qe < qe0 && re < re0) { // right extension
qseq = &qseq0[rev][qe];
mm_idx_getseq(mi, rid, re, re0, tseq);
mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, mat, bw, KSW_EZ_EXTZ_ONLY, ez);
mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, mat, bw, extra_flag|KSW_EZ_EXTZ_ONLY, ez);
if (ez->n_cigar > 0) {
mm_append_cigar(r, ez->n_cigar, ez->cigar);
r->p->dp_score += ez->max;
@ -356,6 +375,8 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
if (r->p) {
mm_idx_getseq(mi, rid, rs1, re1, tseq);
mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e);
if (rev && r->p->trans_strand)
r->p->trans_strand ^= 3; // flip to the read strand
}
kfree(km, tseq);
@ -438,7 +459,28 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m
memset(&ez, 0, sizeof(ksw_extz_t));
for (i = 0; i < n_regs; ++i) {
mm_reg1_t r2;
mm_align1(km, opt, mi, qlen, qseq0, &regs[i], &r2, a, &ez);
if ((opt->flag&MM_F_SPLICE) && (opt->flag&MM_F_SPLICE_FOR) && (opt->flag&MM_F_SPLICE_REV)) {
mm_reg1_t s[2], s2[2];
int which, trans_strand;
s[0] = s[1] = regs[i];
mm_align1(km, opt, mi, qlen, qseq0, &s[0], &s2[0], a, &ez, MM_F_SPLICE_FOR);
mm_align1(km, opt, mi, qlen, qseq0, &s[1], &s2[1], a, &ez, MM_F_SPLICE_REV);
if (s[0].p->dp_score > s[1].p->dp_score) which = 0, trans_strand = 1;
else if (s[0].p->dp_score < s[1].p->dp_score) which = 1, trans_strand = 2;
else trans_strand = 3, which = (qlen + s[0].p->dp_score) & 1; // randomly choose a strand, effectively
if (which == 0) {
regs[i] = s[0], r2 = s2[0];
free(s[1].p);
} else {
regs[i] = s[1], r2 = s2[1];
free(s[0].p);
}
regs[i].p->trans_strand = trans_strand;
} else {
mm_align1(km, opt, mi, qlen, qseq0, &regs[i], &r2, a, &ez, opt->flag);
if ((opt->flag&MM_F_SPLICE) && !(opt->flag&MM_F_SPLICE_BOTH))
regs[i].p->trans_strand = opt->flag&MM_F_SPLICE_FOR? 1 : 2;
}
if (r2.cnt > 0) regs = mm_insert_reg(&r2, i, &n_regs, regs);
if (i > 0 && mm_align1_inv(km, opt, mi, qlen, qseq0, &regs[i-1], &regs[i], &r2, &ez)) {
regs = mm_insert_reg(&r2, i, &n_regs, regs);

4
bseq.c
View File

@ -8,7 +8,6 @@
KSEQ_INIT(gzFile, gzread)
struct mm_bseq_file_s {
int is_eof;
gzFile fp;
kseq_t *ks;
};
@ -53,12 +52,11 @@ mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int chunk_size, int with_qual, int
size += seqs[n++].l_seq;
if (size >= chunk_size) break;
}
if (size < chunk_size) fp->is_eof = 1;
*n_ = n;
return seqs;
}
int mm_bseq_eof(mm_bseq_file_t *fp)
{
return fp->is_eof;
return ks_eof(fp->ks->f);
}

14
chain.c
View File

@ -19,7 +19,7 @@ static inline int ilog2_32(uint32_t v)
return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v];
}
int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int64_t n, mm128_t *a, uint64_t **_u, void *km)
int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km)
{ // TODO: make sure this works when n has more than 32 bits
int32_t st = 0, k, *f, *p, *t, *v, n_u, n_v;
int64_t i, j;
@ -42,17 +42,23 @@ int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int
uint64_t ri = a[i].x;
int32_t qi = (int32_t)a[i].y, q_span = a[i].y>>32&0xff; // NB: only 8 bits of span is used!!!
int32_t max_f = q_span, max_j = -1, n_skip = 0, min_d, max_f_past = -INT32_MAX;
while (st < i && ri - a[st].x > max_dist) ++st;
while (st < i && ri - a[st].x > max_dist_x) ++st;
for (j = i - 1; j >= st; --j) {
int64_t dr = ri - a[j].x;
int32_t dq = qi - (int32_t)a[j].y, dd, sc;
if (dr == 0 || dq <= 0 || dq > max_dist) continue;
if (dr == 0 || dq <= 0 || dq > max_dist_y) continue;
dd = dr > dq? dr - dq : dq - dr;
if (dd > bw) continue;
max_f_past = max_f_past > f[j]? max_f_past : f[j];
min_d = dq < dr? dq : dr;
sc = min_d > q_span? q_span : dq < dr? dq : dr;
sc -= (int)(dd * .01 * avg_qspan) + (ilog2_32(dd)>>1);
if (is_cdna) {
int c_log, c_lin;
c_lin = (int)(dd * .01 * avg_qspan);
c_log = ilog2_32(dd);
if (dr > dq) sc -= c_lin < c_log? c_lin : c_log;
else sc -= c_lin + (c_log>>1);
} else sc -= (int)(dd * .01 * avg_qspan) + (ilog2_32(dd)>>1);
sc += f[j];
if (sc > max_f) {
max_f = sc, max_j = j;

View File

@ -6,6 +6,8 @@
#include "kalloc.h"
#include "mmpriv.h"
static char mm_rg_id[256];
static inline void str_enlarge(kstring_t *s, int l)
{
if (s->l + l + 1 > s->m) {
@ -56,6 +58,70 @@ static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...)
s->s[s->l] = 0;
}
static char *mm_escape(char *s)
{
char *p, *q;
for (p = q = s; *p; ++p) {
if (*p == '\\') {
++p;
if (*p == 't') *q++ = '\t';
else if (*p == '\\') *q++ = '\\';
} else *q++ = *p;
}
*q = '\0';
return s;
}
static void sam_write_rg_line(kstring_t *str, const char *s)
{
char *p, *q, *r, *rg_line = 0;
memset(mm_rg_id, 0, 256);
if (s == 0) return;
if (strstr(s, "@RG") != s) {
if (mm_verbose >= 1) fprintf(stderr, "[ERROR] the read group line is not started with @RG\n");
goto err_set_rg;
}
if (strstr(s, "\t") != NULL) {
if (mm_verbose >= 1) fprintf(stderr, "[ERROR] the read group line contained literal <tab> characters -- replace with escaped tabs: \\t\n");
goto err_set_rg;
}
rg_line = strdup(s);
mm_escape(rg_line);
if ((p = strstr(rg_line, "\tID:")) == 0) {
if (mm_verbose >= 1) fprintf(stderr, "[ERROR] no ID within the read group line\n");
goto err_set_rg;
}
p += 4;
for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
if (q - p + 1 > 256) {
if (mm_verbose >= 1) fprintf(stderr, "[ERROR] @RG:ID is longer than 255 characters\n");
goto err_set_rg;
}
for (q = p, r = mm_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
*r++ = *q;
mm_sprintf_lite(str, "%s\n", rg_line);
err_set_rg:
free(rg_line);
}
void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *argv[])
{
kstring_t str = {0,0,0};
sam_write_rg_line(&str, rg);
mm_sprintf_lite(&str, "@PG\tID:minimap2\tPN:minimap2");
if (ver) mm_sprintf_lite(&str, "\tVN:%s", ver);
if (argc > 1) {
int i;
mm_sprintf_lite(&str, "\tCL:minimap2");
for (i = 1; i < argc; ++i)
mm_sprintf_lite(&str, " %s", argv[i]);
}
mm_sprintf_lite(&str, "\n");
fputs(str.s, stdout);
free(str.s);
}
static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r)
{
extern unsigned char seq_nt4_table[256];
@ -119,7 +185,11 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
mm_sprintf_lite(s, "\ttp:A:%c\tcm:i:%d\ts1:i:%d", type, r->cnt, r->score);
if (r->parent == r->id) mm_sprintf_lite(s, "\ts2:i:%d", r->subsc);
if (r->split) mm_sprintf_lite(s, "\tzd:i:%d", r->split);
if (r->p) mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi);
if (r->p) {
mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi);
if (r->p->trans_strand == 1 || r->p->trans_strand == 2)
mm_sprintf_lite(s, "\tts:A:%c", "?+-?"[r->p->trans_strand]);
}
}
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag)
@ -137,7 +207,7 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
uint32_t k;
mm_sprintf_lite(s, "\tcg:Z:");
for (k = 0; k < r->p->n_cigar; ++k)
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MID"[r->p->cigar[k]&0xf]);
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]);
}
if (r->p && (opt_flag & MM_F_OUT_CS))
write_cs(km, s, mi, t, r);
@ -154,6 +224,13 @@ static char comp_tab[] = {
'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127
};
void mm_write_sam_SQ(const mm_idx_t *idx)
{
uint32_t i;
for (i = 0; i < idx->n_seq; ++i)
printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len);
}
static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp)
{
if (rev) {
@ -187,7 +264,7 @@ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
int clip_char = (flag&0x800)? 'H' : 'S';
if (clip_len) mm_sprintf_lite(s, "%d%c", clip_len, clip_char);
for (k = 0; k < r->p->n_cigar; ++k)
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MID"[r->p->cigar[k]&0xf]);
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]);
clip_len = r->rev? r->qs : t->l_seq - r->qe;
if (clip_len) mm_sprintf_lite(s, "%d%c", clip_len, clip_char);
} else mm_sprintf_lite(s, "*");
@ -206,6 +283,7 @@ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
else mm_sprintf_lite(s, "*");
}
write_tags(s, r);
if (mm_rg_id[0]) mm_sprintf_lite(s, "\tRG:Z:%s", mm_rg_id);
if (r->parent == r->id && r->p && n_regs > 1 && regs && r >= regs && r - regs < n_regs) { // supplementary aln may exist
int i, n_sa = 0; // n_sa: number of SA fields
for (i = 0; i < n_regs; ++i)

View File

@ -285,12 +285,12 @@ static void *worker_pipeline(void *shared, int step, void *in)
mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int is_hpc, int mini_batch_size, int n_threads, uint64_t batch_size, int keep_name)
{
pipeline_t pl;
if (fp == 0 || mm_bseq_eof(fp)) return 0;
memset(&pl, 0, sizeof(pipeline_t));
pl.mini_batch_size = mini_batch_size < batch_size? mini_batch_size : batch_size;
pl.keep_name = keep_name;
pl.batch_size = batch_size;
pl.fp = fp;
if (pl.fp == 0) return 0;
pl.mi = mm_idx_init(w, k, b, is_hpc);
kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3);

11
ksw2.h
View File

@ -12,6 +12,8 @@
#define KSW_EZ_APPROX_DROP 0x10 // approximate Z-drop; faster with sse
#define KSW_EZ_EXTZ_ONLY 0x40 // only perform extension
#define KSW_EZ_REV_CIGAR 0x80 // reverse CIGAR in the output
#define KSW_EZ_SPLICE_FOR 0x100
#define KSW_EZ_SPLICE_REV 0x200
#ifdef __cplusplus
extern "C" {
@ -53,6 +55,9 @@ void ksw_extd(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t
void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int flag, ksw_extz_t *ez);
void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t gapo, int8_t gape, int8_t gapo2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez);
void ksw_extf2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez);
/**
@ -106,7 +111,8 @@ static inline uint32_t *ksw_push_cigar(void *km, int *n_cigar, int *m_cigar, uin
// bit 0-2: which type gets the max - 0 for H, 1 for E, 2 for F, 3 for \tilde{E} and 4 for \tilde{F}
// bit 3/0x08: 1 if a continuation on the E state (bit 5/0x20 for a continuation on \tilde{E})
// bit 4/0x10: 1 if a continuation on the F state (bit 6/0x40 for a continuation on \tilde{F})
static inline void ksw_backtrack(void *km, int is_rot, int is_rev, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0, int *m_cigar_, int *n_cigar_, uint32_t **cigar_)
static inline void ksw_backtrack(void *km, int is_rot, int is_rev, int with_N, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0,
int *m_cigar_, int *n_cigar_, uint32_t **cigar_)
{ // p[] - lower 3 bits: which type gets the max; bit
int n_cigar = 0, m_cigar = *m_cigar_, i = i0, j = j0, r, state = 0;
uint32_t *cigar = *cigar_, tmp;
@ -127,7 +133,8 @@ static inline void ksw_backtrack(void *km, int is_rot, int is_rev, const uint8_t
if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure
if (force_state >= 0) state = force_state;
if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match
else if (state == 1 || state == 3) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion
else if (state == 1 || (state == 3 && !with_N)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion
else if (state == 3 && with_N) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 3, 1), --i; // intron
else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion
}
if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, i + 1); // first deletion

View File

@ -1,5 +1,6 @@
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include "ksw2.h"
#ifdef __SSE2__
@ -66,7 +67,8 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
if (w < 0) w = tlen > qlen? tlen : qlen;
wl = wr = w;
tlen_ = (tlen + 15) / 16;
n_col_ = ((w + 1 < tlen? (w + 1 < qlen? w + 1 : qlen): tlen) + 15) / 16 + 1;
n_col_ = qlen < tlen? qlen : tlen;
n_col_ = ((n_col_ < w + 1? n_col_ : w + 1) + 15) / 16 + 1;
qlen_ = (qlen + 15) / 16;
for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) {
max_sc = max_sc > mat[t]? max_sc : mat[t];
@ -161,6 +163,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
x21_ = _mm_cvtsi32_si128((uint8_t)x21);
v1_ = _mm_cvtsi32_si128((uint8_t)v1);
st_ = st / 16, en_ = en / 16;
assert(en_ - st_ + 1 <= n_col_);
if (!with_cigar) { // score only
for (t = st_; t <= en_; ++t) {
__m128i z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp;
@ -362,9 +365,9 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
if (with_cigar) { // backtrack
int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY))
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
else if (ez->max_t >= 0 && ez->max_q >= 0)
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
kfree(km, mem2); kfree(km, off);
}
}

357
ksw2_exts2_sse.c 100644
View File

@ -0,0 +1,357 @@
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include "ksw2.h"
#ifdef __SSE2__
#include <emmintrin.h>
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif
void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez)
{
#define __dp_code_block1 \
z = _mm_load_si128(&s[t]); \
xt1 = _mm_load_si128(&x[t]); /* xt1 <- x[r-1][t..t+15] */ \
tmp = _mm_srli_si128(xt1, 15); /* tmp <- x[r-1][t+15] */ \
xt1 = _mm_or_si128(_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \
x1_ = tmp; \
vt1 = _mm_load_si128(&v[t]); /* vt1 <- v[r-1][t..t+15] */ \
tmp = _mm_srli_si128(vt1, 15); /* tmp <- v[r-1][t+15] */ \
vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \
v1_ = tmp; \
a = _mm_add_epi8(xt1, vt1); /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \
ut = _mm_load_si128(&u[t]); /* ut <- u[t..t+15] */ \
b = _mm_add_epi8(_mm_load_si128(&y[t]), ut); /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */ \
x2t1= _mm_load_si128(&x2[t]); \
tmp = _mm_srli_si128(x2t1, 15); \
x2t1= _mm_or_si128(_mm_slli_si128(x2t1, 1), x21_); \
x21_= tmp; \
a2 = _mm_add_epi8(x2t1, vt1); \
a2a = _mm_add_epi8(a2, _mm_load_si128(&acceptor[t]));
#define __dp_code_block2 \
_mm_store_si128(&u[t], _mm_sub_epi8(z, vt1)); /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \
_mm_store_si128(&v[t], _mm_sub_epi8(z, ut)); /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \
tmp = _mm_sub_epi8(z, q_); \
a = _mm_sub_epi8(a, tmp); \
b = _mm_sub_epi8(b, tmp); \
a2= _mm_sub_epi8(a2, _mm_sub_epi8(z, q2_));
int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, max_sc, min_sc, long_thres, long_diff;
int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX);
int32_t *H = 0, H0 = 0, last_H0_t = 0;
uint8_t *qr, *sf, *mem, *mem2 = 0;
__m128i q_, q2_, qe_, zero_, sc_mch_, sc_mis_, m1_;
__m128i *u, *v, *x, *y, *x2, *s, *p = 0, *donor, *acceptor;
ksw_reset_extz(ez);
if (m <= 1 || qlen <= 0 || tlen <= 0 || q2 <= q + e) return;
zero_ = _mm_set1_epi8(0);
q_ = _mm_set1_epi8(q);
q2_ = _mm_set1_epi8(q2);
qe_ = _mm_set1_epi8(q + e);
sc_mch_ = _mm_set1_epi8(mat[0]);
sc_mis_ = _mm_set1_epi8(mat[1]);
m1_ = _mm_set1_epi8(m - 1); // wildcard
tlen_ = (tlen + 15) / 16;
n_col_ = ((qlen < tlen? qlen : tlen) + 15) / 16 + 1;
qlen_ = (qlen + 15) / 16;
for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) {
max_sc = max_sc > mat[t]? max_sc : mat[t];
min_sc = min_sc < mat[t]? min_sc : mat[t];
}
if (-min_sc > 2 * (q + e)) return; // otherwise, we won't see any mismatches
long_thres = (q2 - q) / e - 1;
if (q2 > q + e + long_thres * e)
++long_thres;
long_diff = long_thres * e - (q2 - q);
mem = (uint8_t*)kcalloc(km, tlen_ * 9 + qlen_ + 1, 16);
u = (__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned
v = u + tlen_, x = v + tlen_, y = x + tlen_, x2 = y + tlen_;
donor = x2 + tlen_, acceptor = donor + tlen_;
s = acceptor + tlen_, sf = (uint8_t*)(s + tlen_), qr = sf + tlen_ * 16;
memset(u, -q - e, tlen_ * 16 * 4); // this set u, v, x, y (because they are in the same array)
memset(x2, -q2, tlen_ * 16);
if (!approx_max) {
H = (int32_t*)kmalloc(km, tlen_ * 16 * 4);
for (t = 0; t < tlen_ * 16; ++t) H[t] = KSW_NEG_INF;
}
if (with_cigar) {
mem2 = (uint8_t*)kmalloc(km, ((qlen + tlen - 1) * n_col_ + 1) * 16);
p = (__m128i*)(((size_t)mem2 + 15) >> 4 << 4);
off = (int*)kmalloc(km, (qlen + tlen - 1) * sizeof(int) * 2);
off_end = off + qlen + tlen - 1;
}
for (t = 0; t < qlen; ++t) qr[t] = query[qlen - 1 - t];
memcpy(sf, target, tlen);
// set the donor and acceptor arrays. TODO: this assumes 0/1/2/3 encoding!
if (flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) {
memset(donor, -noncan, tlen_ * 16);
for (t = 0; t < tlen - 2; ++t) {
int is_can = 0; // is a canonical site
if ((flag & KSW_EZ_SPLICE_FOR) && target[t+1] == 2 && target[t+2] == 3) is_can = 1;
if ((flag & KSW_EZ_SPLICE_REV) && target[t+1] == 1 && target[t+2] == 3) is_can = 1;
if (is_can) ((int8_t*)donor)[t] = 0;
}
memset(acceptor, -noncan, tlen_ * 16);
for (t = 2; t < tlen; ++t) {
int is_can = 0;
if ((flag & KSW_EZ_SPLICE_FOR) && target[t-1] == 0 && target[t] == 2) is_can = 1;
if ((flag & KSW_EZ_SPLICE_REV) && target[t-1] == 0 && target[t] == 1) is_can = 1;
if (is_can) ((int8_t*)acceptor)[t] = 0;
}
}
for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) {
int st = 0, en = tlen - 1, st0, en0, st_, en_;
int8_t x1, x21, v1, *u8 = (int8_t*)u, *v8 = (int8_t*)v;
uint8_t *qrr = qr + (qlen - 1 - r);
__m128i x1_, x21_, v1_;
// find the boundaries
if (st < r - qlen + 1) st = r - qlen + 1;
if (en > r) en = r;
st0 = st, en0 = en;
st = st / 16 * 16, en = (en + 16) / 16 * 16 - 1;
// set boundary conditions
if (st > 0) {
if (st - 1 >= last_st && st - 1 <= last_en)
x1 = ((int8_t*)x)[st - 1], x21 = ((int8_t*)x2)[st - 1], v1 = v8[st - 1]; // (r-1,s-1) calculated in the last round
else x1 = -q - e, x21 = -q2, v1 = -q - e;
} else {
x1 = -q - e, x21 = -q2;
v1 = r == 0? -q - e : r < long_thres? -e : r == long_thres? long_diff : 0;
}
if (en >= r) {
((int8_t*)y)[r] = -q - e;
u8[r] = r == 0? -q - e : r < long_thres? -e : r == long_thres? long_diff : 0;
}
// loop fission: set scores first
if (!(flag & KSW_EZ_GENERIC_SC)) {
for (t = st0; t <= en0; t += 16) {
__m128i sq, st, tmp, mask;
sq = _mm_loadu_si128((__m128i*)&sf[t]);
st = _mm_loadu_si128((__m128i*)&qrr[t]);
mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
tmp = _mm_cmpeq_epi8(sq, st);
#ifdef __SSE4_1__
tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
#else
tmp = _mm_or_si128(_mm_andnot_si128(tmp, sc_mis_), _mm_and_si128(tmp, sc_mch_));
#endif
tmp = _mm_andnot_si128(mask, tmp);
_mm_storeu_si128((__m128i*)((int8_t*)s + t), tmp);
}
} else {
for (t = st0; t <= en0; ++t)
((uint8_t*)s)[t] = mat[sf[t] * m + qrr[t]];
}
// core loop
x1_ = _mm_cvtsi32_si128((uint8_t)x1);
x21_ = _mm_cvtsi32_si128((uint8_t)x21);
v1_ = _mm_cvtsi32_si128((uint8_t)v1);
st_ = st / 16, en_ = en / 16;
assert(en_ - st_ + 1 <= n_col_);
if (!with_cigar) { // score only
for (t = st_; t <= en_; ++t) {
__m128i z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp;
__dp_code_block1;
#ifdef __SSE4_1__
z = _mm_max_epi8(z, a);
z = _mm_max_epi8(z, b);
z = _mm_max_epi8(z, a2a);
__dp_code_block2; // save u[] and v[]; update a, b and a2
_mm_store_si128(&x[t], _mm_sub_epi8(_mm_max_epi8(a, zero_), qe_));
_mm_store_si128(&y[t], _mm_sub_epi8(_mm_max_epi8(b, zero_), qe_));
tmp = _mm_load_si128(&donor[t]);
_mm_store_si128(&x2[t], _mm_sub_epi8(_mm_max_epi8(a2, tmp), q2_));
#else
tmp = _mm_cmpgt_epi8(a, z);
z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
tmp = _mm_cmpgt_epi8(b, z);
z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
tmp = _mm_cmpgt_epi8(a2a, z);
z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2a));
__dp_code_block2;
tmp = _mm_cmpgt_epi8(a, zero_);
_mm_store_si128(&x[t], _mm_sub_epi8(_mm_and_si128(tmp, a), qe_));
tmp = _mm_cmpgt_epi8(b, zero_);
_mm_store_si128(&y[t], _mm_sub_epi8(_mm_and_si128(tmp, b), qe_));
tmp = _mm_load_si128(&donor[t]); // TODO: check if this is correct
tmp = _mm_cmpgt_epi8(a2, tmp);
tmp = _mm_or_si128(_mm_andnot_si128(tmp, tmp), _mm_and_si128(tmp, a2));
_mm_store_si128(&x2[t], _mm_sub_epi8(tmp, q2_));
#endif
}
} else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
__m128i *pr = p + r * n_col_ - st_;
off[r] = st, off_end[r] = en;
for (t = st_; t <= en_; ++t) {
__m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2;
__dp_code_block1;
#ifdef __SSE4_1__
d = _mm_and_si128(_mm_cmpgt_epi8(a, z), _mm_set1_epi8(1)); // d = a > z? 1 : 0
z = _mm_max_epi8(z, a);
d = _mm_blendv_epi8(d, _mm_set1_epi8(2), _mm_cmpgt_epi8(b, z)); // d = b > z? 2 : d
z = _mm_max_epi8(z, b);
d = _mm_blendv_epi8(d, _mm_set1_epi8(3), _mm_cmpgt_epi8(a2a, z)); // d = a2 > z? 3 : d
z = _mm_max_epi8(z, a2a);
#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
tmp = _mm_cmpgt_epi8(a, z);
d = _mm_and_si128(tmp, _mm_set1_epi8(1));
z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
tmp = _mm_cmpgt_epi8(b, z);
d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(2)));
z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
tmp = _mm_cmpgt_epi8(a2a, z);
d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(3)));
z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2a));
#endif
__dp_code_block2;
tmp = _mm_cmpgt_epi8(a, zero_);
_mm_store_si128(&x[t], _mm_sub_epi8(_mm_and_si128(tmp, a), qe_));
d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0
tmp = _mm_cmpgt_epi8(b, zero_);
_mm_store_si128(&y[t], _mm_sub_epi8(_mm_and_si128(tmp, b), qe_));
d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0
tmp2 = _mm_load_si128(&donor[t]);
tmp = _mm_cmpgt_epi8(a2, tmp2);
#ifdef __SSE4_1__
tmp2 = _mm_max_epi8(a2, tmp2);
#else
tmp2 = _mm_or_si128(_mm_andnot_si128(tmp, tmp2), _mm_and_si128(tmp, a2));
#endif
_mm_store_si128(&x2[t], _mm_sub_epi8(tmp2, q2_));
d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x20)));
_mm_store_si128(&pr[t], d);
}
} else { // gap right-alignment
__m128i *pr = p + r * n_col_ - st_;
off[r] = st, off_end[r] = en;
for (t = st_; t <= en_; ++t) {
__m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2;
__dp_code_block1;
#ifdef __SSE4_1__
d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), _mm_set1_epi8(1)); // d = z > a? 0 : 1
z = _mm_max_epi8(z, a);
d = _mm_blendv_epi8(_mm_set1_epi8(2), d, _mm_cmpgt_epi8(z, b)); // d = z > b? d : 2
z = _mm_max_epi8(z, b);
d = _mm_blendv_epi8(_mm_set1_epi8(3), d, _mm_cmpgt_epi8(z, a2a)); // d = z > a2? d : 3
z = _mm_max_epi8(z, a2a);
#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
tmp = _mm_cmpgt_epi8(z, a);
d = _mm_andnot_si128(tmp, _mm_set1_epi8(1));
z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a));
tmp = _mm_cmpgt_epi8(z, b);
d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(2)));
z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b));
tmp = _mm_cmpgt_epi8(z, a2a);
d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(3)));
z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a2a));
#endif
__dp_code_block2;
tmp = _mm_cmpgt_epi8(zero_, a);
_mm_store_si128(&x[t], _mm_sub_epi8(_mm_andnot_si128(tmp, a), qe_));
d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0
tmp = _mm_cmpgt_epi8(zero_, b);
_mm_store_si128(&y[t], _mm_sub_epi8(_mm_andnot_si128(tmp, b), qe_));
d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0
tmp2 = _mm_load_si128(&donor[t]);
tmp = _mm_cmpgt_epi8(tmp2, a2);
#ifdef __SSE4_1__
tmp2 = _mm_max_epi8(tmp2, a2);
#else
tmp2 = _mm_or_si128(_mm_andnot_si128(tmp, a2), _mm_and_si128(tmp, tmp2));
#endif
_mm_store_si128(&x2[t], _mm_sub_epi8(tmp2, q2_));
d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0
_mm_store_si128(&pr[t], d);
}
}
if (!approx_max) { // find the exact max with a 32-bit score array
int32_t max_H, max_t;
// compute H[], max_H and max_t
if (r > 0) {
int32_t HH[4], tt[4], en1 = st0 + (en0 - st0) / 4 * 4, i;
__m128i max_H_, max_t_;
max_H = H[en0] = en0 > 0? H[en0-1] + u8[en0] : H[en0] + v8[en0]; // special casing the last element
max_t = en0;
max_H_ = _mm_set1_epi32(max_H);
max_t_ = _mm_set1_epi32(max_t);
for (t = st0; t < en1; t += 4) { // this implements: H[t]+=v8[t]-qe; if(H[t]>max_H) max_H=H[t],max_t=t;
__m128i H1, tmp, t_;
H1 = _mm_loadu_si128((__m128i*)&H[t]);
t_ = _mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]);
H1 = _mm_add_epi32(H1, t_);
_mm_storeu_si128((__m128i*)&H[t], H1);
t_ = _mm_set1_epi32(t);
tmp = _mm_cmpgt_epi32(H1, max_H_);
#ifdef __SSE4_1__
max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
#else
max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
#endif
}
_mm_storeu_si128((__m128i*)HH, max_H_);
_mm_storeu_si128((__m128i*)tt, max_t_);
for (i = 0; i < 4; ++i)
if (max_H < HH[i]) max_H = HH[i], max_t = tt[i] + i;
for (; t < en0; ++t) { // for the rest of values that haven't been computed with SSE
H[t] += (int32_t)v8[t];
if (H[t] > max_H)
max_H = H[t], max_t = t;
}
} else H[0] = v8[0] - qe, max_H = H[0], max_t = 0; // special casing r==0
// update ez
if (en0 == tlen - 1 && H[en0] > ez->mte)
ez->mte = H[en0], ez->mte_q = r - en;
if (r - st0 == qlen - 1 && H[st0] > ez->mqe)
ez->mqe = H[st0], ez->mqe_t = st0;
if (ksw_apply_zdrop(ez, 1, max_H, r, max_t, zdrop, 0)) break;
if (r == qlen + tlen - 2 && en0 == tlen - 1)
ez->score = H[tlen - 1];
} else { // find approximate max; Z-drop might be inaccurate, too.
if (r > 0) {
if (last_H0_t >= st0 && last_H0_t <= en0 && last_H0_t + 1 >= st0 && last_H0_t + 1 <= en0) {
int32_t d0 = v8[last_H0_t];
int32_t d1 = u8[last_H0_t + 1];
if (d0 > d1) H0 += d0;
else H0 += d1, ++last_H0_t;
} else if (last_H0_t >= st0 && last_H0_t <= en0) {
H0 += v8[last_H0_t];
} else {
++last_H0_t, H0 += u8[last_H0_t];
}
} else H0 = v8[0] - qe, last_H0_t = 0;
if ((flag & KSW_EZ_APPROX_DROP) && ksw_apply_zdrop(ez, 1, H0, r, last_H0_t, zdrop, 0)) break;
if (r == qlen + tlen - 2 && en0 == tlen - 1)
ez->score = H0;
}
last_st = st, last_en = en;
//for (t = st0; t <= en0; ++t) printf("(%d,%d)\t(%d,%d,%d,%d)\t%d\n", r, t, ((int8_t*)u)[t], ((int8_t*)v)[t], ((int8_t*)x)[t], ((int8_t*)y)[t], H[t]); // for debugging
}
kfree(km, mem);
if (!approx_max) kfree(km, H);
if (with_cigar) { // backtrack
int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY))
ksw_backtrack(km, 1, rev_cigar, 1, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
else if (ez->max_t >= 0 && ez->max_q >= 0)
ksw_backtrack(km, 1, rev_cigar, 1, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
kfree(km, mem2); kfree(km, off);
}
}
#endif // __SSE2__

View File

@ -1,4 +1,5 @@
#include <string.h>
#include <assert.h>
#include "ksw2.h"
#ifdef __SSE2__
@ -58,7 +59,8 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
if (w < 0) w = tlen > qlen? tlen : qlen;
wl = wr = w;
tlen_ = (tlen + 15) / 16;
n_col_ = ((w + 1 < tlen? (w + 1 < qlen? w + 1 : qlen): tlen) + 15) / 16 + 1;
n_col_ = qlen < tlen? qlen : tlen;
n_col_ = ((n_col_ < w + 1? n_col_ : w + 1) + 15) / 16 + 1;
qlen_ = (qlen + 15) / 16;
for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) {
max_sc = max_sc > mat[t]? max_sc : mat[t];
@ -130,6 +132,7 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
x1_ = _mm_cvtsi32_si128(x1);
v1_ = _mm_cvtsi32_si128(v1);
st_ = st / 16, en_ = en / 16;
assert(en_ - st_ + 1 <= n_col_);
if (!with_cigar) { // score only
for (t = st_; t <= en_; ++t) {
__m128i z, a, b, xt1, vt1, ut, tmp;
@ -275,9 +278,9 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
if (with_cigar) { // backtrack
int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY))
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
else if (ez->max_t >= 0 && ez->max_q >= 0)
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
kfree(km, mem2); kfree(km, off);
}
}

178
main.c
View File

@ -8,7 +8,7 @@
#include "minimap.h"
#include "mmpriv.h"
#define MM_VERSION "2.0rc1-r271-dirty"
#define MM_VERSION "2.1-r311"
void liftrlimit()
{
@ -31,6 +31,11 @@ static struct option long_options[] = {
{ "max-chain-skip", required_argument, 0, 0 },
{ "min-dp-len", required_argument, 0, 0 },
{ "print-aln-seq", no_argument, 0, 0 },
{ "splice", no_argument, 0, 0 },
{ "cost-non-gt-ag", required_argument, 0, 0 },
{ "no-sam-sq", no_argument, 0, 0 },
{ "help", no_argument, 0, 'h' },
{ "max-intron-len", required_argument, 0, 'G' },
{ "version", no_argument, 0, 'V' },
{ "min-count", required_argument, 0, 'n' },
{ "min-chain-score",required_argument, 0, 'm' },
@ -40,30 +45,42 @@ static struct option long_options[] = {
{ 0, 0, 0, 0}
};
static inline int64_t mm_parse_num(const char *str)
{
double x;
char *p;
x = strtod(optarg, &p);
if (*p == 'G' || *p == 'g') x *= 1e9;
else if (*p == 'M' || *p == 'm') x *= 1e6;
else if (*p == 'K' || *p == 'k') x *= 1e3;
return (int64_t)(x + .499);
}
int main(int argc, char *argv[])
{
mm_mapopt_t opt;
int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0;
int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0, max_intron_len = 0, n_idx_part = 0;
int minibatch_size = 200000000;
uint64_t batch_size = 4000000000ULL;
mm_bseq_file_t *fp = 0;
char *fnw = 0, *s;
FILE *fpr = 0, *fpw = 0;
char *fnw = 0, *rg = 0, *s;
FILE *fpr = 0, *fpw = 0, *fp_help = stderr;
liftrlimit();
mm_realtime0 = realtime();
mm_mapopt_init(&opt);
while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Q", long_options, &long_idx)) >= 0) {
while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:h", long_options, &long_idx)) >= 0) {
if (c == 'w') w = atoi(optarg), idx_par_set = 1;
else if (c == 'k') k = atoi(optarg), idx_par_set = 1;
else if (c == 'H') is_hpc = 1, idx_par_set = 1;
else if (c == 'd') fnw = optarg; // the above are indexing related options, except -I
else if (c == 'r') opt.bw = atoi(optarg);
else if (c == 'r') opt.bw = (int)mm_parse_num(optarg);
else if (c == 'f') opt.mid_occ_frac = atof(optarg);
else if (c == 't') n_threads = atoi(optarg);
else if (c == 'v') mm_verbose = atoi(optarg);
else if (c == 'g') opt.max_gap = atoi(optarg);
else if (c == 'g') opt.max_gap = (int)mm_parse_num(optarg);
else if (c == 'G') max_intron_len = (int)mm_parse_num(optarg);
else if (c == 'N') opt.best_n = atoi(optarg);
else if (c == 'p') opt.pri_ratio = atof(optarg);
else if (c == 'M') opt.mask_level = atof(optarg);
@ -79,6 +96,10 @@ int main(int argc, char *argv[])
else if (c == 'B') opt.b = atoi(optarg);
else if (c == 'z') opt.zdrop = atoi(optarg);
else if (c == 's') opt.min_dp_max = atoi(optarg);
else if (c == 'I') batch_size = mm_parse_num(optarg);
else if (c == 'K') minibatch_size = (int)mm_parse_num(optarg);
else if (c == 'R') rg = optarg;
else if (c == 'h') fp_help = stdout;
else if (c == 0 && long_idx == 0) bucket_bits = atoi(optarg); // --bucket-bits
else if (c == 0 && long_idx == 2) keep_name = 0; // --int-rname
else if (c == 0 && long_idx == 3) mm_dbg_flag |= MM_DBG_NO_KALLOC; // --no-kalloc
@ -88,24 +109,28 @@ int main(int argc, char *argv[])
else if (c == 0 && long_idx == 7) opt.max_chain_skip = atoi(optarg); // --max-chain-skip
else if (c == 0 && long_idx == 8) opt.min_ksw_len = atoi(optarg); // --min-dp-len
else if (c == 0 && long_idx == 9) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ; // --print-aln-seq
else if (c == 0 && long_idx ==10) opt.flag |= MM_F_SPLICE; // --splice
else if (c == 0 && long_idx ==11) opt.noncan = atoi(optarg); // --cost-non-gt-ag
else if (c == 0 && long_idx ==12) opt.flag |= MM_F_NO_SAM_SQ; // --no-sam-sq
else if (c == 'V') {
puts(MM_VERSION);
return 0;
} else if (c == 'u') {
if (*optarg == 'b') opt.flag |= MM_F_SPLICE_FOR|MM_F_SPLICE_REV;
else if (*optarg == 'B') opt.flag |= MM_F_SPLICE_BOTH;
else if (*optarg == 'f') opt.flag |= MM_F_SPLICE_FOR, opt.flag &= ~MM_F_SPLICE_REV;
else if (*optarg == 'r') opt.flag |= MM_F_SPLICE_REV, opt.flag &= ~MM_F_SPLICE_FOR;
else if (*optarg == 'n') opt.flag &= ~(MM_F_SPLICE_FOR|MM_F_SPLICE_REV);
else {
fprintf(stderr, "[E::%s] unrecognized cDNA direction\n", __func__);
return 1;
}
} else if (c == 'O') {
opt.q = opt.q2 = strtol(optarg, &s, 10);
if (*s == ',') opt.q2 = strtol(s + 1, &s, 10);
} else if (c == 'E') {
opt.e = opt.e2 = strtol(optarg, &s, 10);
if (*s == ',') opt.e2 = strtol(s + 1, &s, 10);
} else if (c == 'I' || c == 'K') {
double x;
char *p;
x = strtod(optarg, &p);
if (*p == 'G' || *p == 'g') x *= 1e9;
else if (*p == 'M' || *p == 'm') x *= 1e6;
else if (*p == 'K' || *p == 'k') x *= 1e3;
if (c == 'I') batch_size = (uint64_t)(x + .499);
else minibatch_size = (uint64_t)(x + .499);
} else if (c == 'x') {
if (strcmp(optarg, "ava-ont") == 0) {
opt.flag |= MM_F_AVA | MM_F_NO_SELF;
@ -139,6 +164,13 @@ int main(int argc, char *argv[])
opt.min_dp_max = 50;
opt.best_n = 10;
opt.bw = 100;
} else if (strcmp(optarg, "splice") == 0 || strcmp(optarg, "cdna") == 0) {
k = 15, w = 5;
opt.flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV;
opt.max_gap = 2000, opt.max_gap_ref = opt.bw = 200000;
opt.a = 1, opt.b = 2, opt.q = 2, opt.e = 1, opt.q2 = 32, opt.e2 = 0;
opt.noncan = 5;
opt.zdrop = 200;
} else {
fprintf(stderr, "[E::%s] unknown preset '%s'\n", __func__, optarg);
return 1;
@ -146,73 +178,87 @@ int main(int argc, char *argv[])
}
}
if (w < 0) w = (int)(.6666667 * k + .499);
if ((opt.flag & MM_F_SPLICE) && max_intron_len > 0)
opt.max_gap_ref = opt.bw = max_intron_len;
if (argc == optind) {
fprintf(stderr, "Usage: minimap2 [options] <target.fa>|<target.idx> [query.fa] [...]\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " Indexing:\n");
fprintf(stderr, " -H use homopolymer-compressed k-mer\n");
fprintf(stderr, " -k INT k-mer size (no larger than 28) [%d]\n", k);
fprintf(stderr, " -w INT minizer window size [{-k}*2/3]\n");
fprintf(stderr, " -I NUM split index for every ~NUM input bases [4G]\n");
fprintf(stderr, " -d FILE dump index to FILE []\n");
fprintf(stderr, " Mapping:\n");
fprintf(stderr, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%g]\n", opt.mid_occ_frac);
fprintf(stderr, " -g INT stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap);
fprintf(stderr, " -r INT bandwidth used in chaining and DP-based alignment [%d]\n", opt.bw);
fprintf(stderr, " -n INT minimal number of minimizers on a chain [%d]\n", opt.min_cnt);
fprintf(stderr, " -m INT minimal chaining score (matching bases minus log gap penalty) [%d]\n", opt.min_chain_score);
// fprintf(stderr, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); // TODO: this option is never used; might be buggy
fprintf(stderr, " -X skip self and dual mappings (for the all-vs-all mode)\n");
fprintf(stderr, " -p FLOAT min secondary-to-primary score ratio [%g]\n", opt.pri_ratio);
fprintf(stderr, " -N INT retain at most INT secondary alignments [%d]\n", opt.best_n);
fprintf(stderr, " Alignment:\n");
fprintf(stderr, " -A INT matching score [%d]\n", opt.a);
fprintf(stderr, " -B INT mismatch penalty [%d]\n", opt.b);
fprintf(stderr, " -O INT[,INT] gap open penalty [%d,%d]\n", opt.q, opt.q2);
fprintf(stderr, " -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2);
fprintf(stderr, " -z INT Z-drop score [%d]\n", opt.zdrop);
fprintf(stderr, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max);
fprintf(stderr, " Input/Output:\n");
fprintf(stderr, " -Q ignore base quality in the input\n");
fprintf(stderr, " -a output in the SAM format (PAF by default)\n");
fprintf(stderr, " -c output CIGAR in PAF\n");
fprintf(stderr, " -S output the cs tag in PAF\n");
fprintf(stderr, " -t INT number of threads [%d]\n", n_threads);
fprintf(stderr, " -K NUM minibatch size [200M]\n");
// fprintf(stderr, " -v INT verbose level [%d]\n", mm_verbose);
fprintf(stderr, " -V show version number\n");
fprintf(stderr, " Preset:\n");
fprintf(stderr, " -x STR preset (recommended to be applied before other options) []\n");
fprintf(stderr, " map10k/map-pb: -Hk19 (PacBio/ONT vs reference mapping)\n");
fprintf(stderr, " map-ont: -k15 (slightly more sensitive than 'map10k' for ONT vs reference)\n");
fprintf(stderr, " asm5: -k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 (asm to ref mapping; break at 5%% div.)\n");
fprintf(stderr, " asm10: -k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10%% div.)\n");
fprintf(stderr, " ava-pb: -Hk19 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (PacBio read overlap)\n");
fprintf(stderr, " ava-ont: -k15 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (ONT read overlap)\n");
fprintf(stderr, "\nSee `man ./minimap2.1' for detailed description of command-line options.\n");
return 1;
if (argc == optind || fp_help == stdout) {
fprintf(fp_help, "Usage: minimap2 [options] <target.fa>|<target.idx> [query.fa] [...]\n");
fprintf(fp_help, "Options:\n");
fprintf(fp_help, " Indexing:\n");
fprintf(fp_help, " -H use homopolymer-compressed k-mer\n");
fprintf(fp_help, " -k INT k-mer size (no larger than 28) [%d]\n", k);
fprintf(fp_help, " -w INT minizer window size [{-k}*2/3]\n");
fprintf(fp_help, " -I NUM split index for every ~NUM input bases [4G]\n");
fprintf(fp_help, " -d FILE dump index to FILE []\n");
fprintf(fp_help, " Mapping:\n");
fprintf(fp_help, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%g]\n", opt.mid_occ_frac);
fprintf(fp_help, " -g INT stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap);
fprintf(fp_help, " -r INT bandwidth used in chaining and DP-based alignment [%d]\n", opt.bw);
fprintf(fp_help, " -n INT minimal number of minimizers on a chain [%d]\n", opt.min_cnt);
fprintf(fp_help, " -m INT minimal chaining score (matching bases minus log gap penalty) [%d]\n", opt.min_chain_score);
// fprintf(fp_help, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); // TODO: this option is never used; might be buggy
fprintf(fp_help, " -X skip self and dual mappings (for the all-vs-all mode)\n");
fprintf(fp_help, " -p FLOAT min secondary-to-primary score ratio [%g]\n", opt.pri_ratio);
fprintf(fp_help, " -N INT retain at most INT secondary alignments [%d]\n", opt.best_n);
fprintf(fp_help, " -G NUM max intron length (only effective following -x splice) [200k]\n");
fprintf(fp_help, " Alignment:\n");
fprintf(fp_help, " -A INT matching score [%d]\n", opt.a);
fprintf(fp_help, " -B INT mismatch penalty [%d]\n", opt.b);
fprintf(fp_help, " -O INT[,INT] gap open penalty [%d,%d]\n", opt.q, opt.q2);
fprintf(fp_help, " -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2);
fprintf(fp_help, " -z INT Z-drop score [%d]\n", opt.zdrop);
fprintf(fp_help, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max);
fprintf(fp_help, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n");
fprintf(fp_help, " Input/Output:\n");
fprintf(fp_help, " -a output in the SAM format (PAF by default)\n");
fprintf(fp_help, " -Q don't output base quality in SAM\n");
fprintf(fp_help, " -R STR SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n");
fprintf(fp_help, " -c output CIGAR in PAF\n");
fprintf(fp_help, " -S output the cs tag in PAF (cs encodes both query and ref sequences)\n");
fprintf(fp_help, " -t INT number of threads [%d]\n", n_threads);
fprintf(fp_help, " -K NUM minibatch size [200M]\n");
// fprintf(fp_help, " -v INT verbose level [%d]\n", mm_verbose);
fprintf(fp_help, " --version show version number\n");
fprintf(fp_help, " Preset:\n");
fprintf(fp_help, " -x STR preset (recommended to be applied before other options) []\n");
fprintf(fp_help, " map10k/map-pb: -Hk19 (PacBio/ONT vs reference mapping)\n");
fprintf(fp_help, " map-ont: -k15 (slightly more sensitive than 'map10k' for ONT vs reference)\n");
fprintf(fp_help, " asm5: -k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 (asm to ref mapping; break at 5%% div.)\n");
fprintf(fp_help, " asm10: -k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10%% div.)\n");
fprintf(fp_help, " ava-pb: -Hk19 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (PacBio read overlap)\n");
fprintf(fp_help, " ava-ont: -k15 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (ONT read overlap)\n");
fprintf(fp_help, " splice: long-read spliced alignment (see minimap2.1 for details)\n");
fprintf(fp_help, "\nSee `man ./minimap2.1' for detailed description of command-line options.\n");
return fp_help == stdout? 0 : 1;
}
is_idx = mm_idx_is_idx(argv[optind]);
if (is_idx < 0) {
fprintf(stderr, "[E::%s] failed to open file '%s'\n", __func__, argv[optind]);
fprintf(stderr, "[ERROR] failed to open file '%s'\n", argv[optind]);
return 1;
}
if (!is_idx && fnw == 0 && argc - optind < 2) {
fprintf(stderr, "[ERROR] missing input: please specify a query file to map or option -d to keep the index\n");
return 1;
}
if (is_idx) fpr = fopen(argv[optind], "rb");
else fp = mm_bseq_open(argv[optind]);
if (fnw) fpw = fopen(fnw, "wb");
if (opt.flag & MM_F_OUT_SAM)
mm_write_sam_hdr_no_SQ(rg, MM_VERSION, argc, argv);
for (;;) {
mm_idx_t *mi = 0;
mm_idx_t *mi;
if (fpr) {
mi = mm_idx_load(fpr);
if (idx_par_set && mm_verbose >= 2 && (mi->k != k || mi->w != w || mi->is_hpc != is_hpc))
fprintf(stderr, "[W::%s::%.3f*%.2f] Indexing parameters on the command line (-k/-w/-H) overridden by parameters in the prebuilt index.\n",
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
} else if (!mm_bseq_eof(fp)) {
fprintf(stderr, "[WARNING] \033[1;31mIndexing parameters on the command line (-k/-w/-H) overridden by parameters in the prebuilt index.\033[0m\n");
} else {
mi = mm_idx_gen(fp, w, k, bucket_bits, is_hpc, minibatch_size, n_threads, batch_size, keep_name);
}
if (mi == 0) break;
++n_idx_part;
if (mm_verbose >= 2 && n_idx_part > 1 && (opt.flag&MM_F_OUT_SAM) && !(opt.flag&MM_F_NO_SAM_SQ))
fprintf(stderr, "[WARNING] \033[1;31mSAM output is malformated due to internal @SQ lines. Please add option --no-sam-sq or filter afterwards.\033[0m\n");
if (mm_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n",
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq);

24
map.c
View File

@ -19,6 +19,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->min_chain_score = 40;
opt->bw = 500;
opt->max_gap = 5000;
opt->max_gap_ref = -1;
opt->max_chain_skip = 25;
opt->mask_level = 0.5f;
@ -37,6 +38,8 @@ void mm_mapopt_init(mm_mapopt_t *opt)
void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
{
if (opt->flag & MM_F_SPLICE_BOTH)
opt->flag &= ~(MM_F_SPLICE_FOR|MM_F_SPLICE_REV);
opt->max_occ = mm_idx_cal_max_occ(mi, opt->max_occ_frac);
opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac);
if (mm_verbose >= 3)
@ -167,7 +170,7 @@ void mm_pair_thin(mm_tbuf_t *b, int radius, mm_match_t *m1, mm_match_t *m2)
#endif
mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, uint32_t m_st, uint32_t m_en, const char *qname, int qlen, const char *seq, int *n_regs)
{
int i, n = m_en - m_st, j, n_u;
int i, n = m_en - m_st, j, n_u, max_gap_ref;
int64_t n_a;
uint64_t *u;
mm_match_t *m;
@ -243,7 +246,8 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b,
fprintf(stderr, "SD\t%s\t%d\t%c\t%d\t%d\t%d\n", mi->seq[a[i].x<<1>>33].name, (int32_t)a[i].x, "+-"[a[i].x>>63], (int32_t)a[i].y, (int32_t)(a[i].y>>32&0xff),
i == 0? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x));
n_u = mm_chain_dp(opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, n_a, a, &u, b->km);
max_gap_ref = opt->max_gap_ref >= 0? opt->max_gap_ref : opt->max_gap;
n_u = mm_chain_dp(max_gap_ref, opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, !!(opt->flag&MM_F_SPLICE), n_a, a, &u, b->km);
regs = mm_gen_regs(b->km, qlen, n_u, u, a);
*n_regs = n_u;
@ -256,7 +260,8 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b,
if (!(opt->flag & MM_F_AVA)) { // don't choose primary mapping(s) for read overlap
mm_set_parent(b->km, opt->mask_level, *n_regs, regs);
mm_select_sub(b->km, opt->mask_level, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
mm_join_long(b->km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle
if (!(opt->flag & MM_F_SPLICE))
mm_join_long(b->km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle
}
if (opt->flag & MM_F_CIGAR) {
regs = mm_align_skeleton(b->km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs()
@ -348,8 +353,10 @@ static void *worker_pipeline(void *shared, int step, void *in)
mm_bseq1_t *t = &s->seq[i];
for (j = 0; j < s->n_reg[i]; ++j) {
mm_reg1_t *r = &s->reg[i][j];
if (p->opt->flag & MM_F_OUT_SAM) mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i]);
else mm_write_paf(&p->str, mi, t, r, km, p->opt->flag);
if (p->opt->flag & MM_F_OUT_SAM)
mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i]);
else
mm_write_paf(&p->str, mi, t, r, km, p->opt->flag);
puts(p->str.s);
}
if (s->n_reg[i] == 0 && (p->opt->flag & MM_F_OUT_SAM)) {
@ -378,11 +385,8 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int
if (pl.fp == 0) return -1;
pl.opt = opt, pl.mi = idx;
pl.n_threads = n_threads, pl.mini_batch_size = mini_batch_size;
if (opt->flag & MM_F_OUT_SAM) {
uint32_t i;
for (i = 0; i < idx->n_seq; ++i)
printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len);
}
if ((opt->flag & MM_F_OUT_SAM) && !(opt->flag & MM_F_NO_SAM_SQ))
mm_write_sam_SQ(idx);
kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3);
free(pl.str.s);
mm_bseq_close(pl.fp);

View File

@ -5,15 +5,20 @@
#include <stdio.h>
#include <sys/types.h>
#define MM_IDX_DEF_B 14
#define MM_IDX_DEF_B 14
#define MM_F_NO_SELF 0x01
#define MM_F_AVA 0x02
#define MM_F_CIGAR 0x04
#define MM_F_OUT_SAM 0x08
#define MM_F_NO_QUAL 0x10
#define MM_F_OUT_CG 0x20
#define MM_F_OUT_CS 0x40
#define MM_F_NO_SELF 0x001
#define MM_F_AVA 0x002
#define MM_F_CIGAR 0x004
#define MM_F_OUT_SAM 0x008
#define MM_F_NO_QUAL 0x010
#define MM_F_OUT_CG 0x020
#define MM_F_OUT_CS 0x040
#define MM_F_SPLICE 0x080
#define MM_F_SPLICE_FOR 0x100
#define MM_F_SPLICE_REV 0x200
#define MM_F_SPLICE_BOTH 0x400
#define MM_F_NO_SAM_SQ 0x800
#define MM_IDX_MAGIC "MMI\2"
@ -55,7 +60,8 @@ typedef struct {
uint32_t capacity;
int32_t dp_score, dp_max, dp_max2;
uint32_t blen;
uint32_t n_diff, n_ambi;
uint32_t n_diff;
uint32_t n_ambi:30, trans_strand:2;
uint32_t n_cigar;
uint32_t cigar[];
} mm_extra_t;
@ -80,7 +86,7 @@ typedef struct {
int flag; // see MM_F_* macros
int bw; // bandwidth
int max_gap; // break a chain if there are no minimizers in a max_gap window
int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window
int max_chain_skip;
int min_cnt;
int min_chain_score;
@ -93,6 +99,7 @@ typedef struct {
int min_join_flank_sc;
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
int noncan;
int zdrop;
int min_dp_max;
int min_ksw_len;

View File

@ -1,4 +1,4 @@
.TH minimap2 1 "30 July 2017" "minimap2-2.0rc1-r232" "Bioinformatics tools"
.TH minimap2 1 "25 August 2017" "minimap2-2.1-r311" "Bioinformatics tools"
.SH NAME
.PP
minimap2 - mapping and alignment between collections of DNA sequences
@ -162,6 +162,12 @@ secondary alignments [5]. This option has no effect when
.B -X
is applied.
.TP
.BI -G \ NUM
Maximal intron length in the splice mode [200k]. This option also changes the
bandwidth to
.IR NUM .
Increasing this option slows down spliced alignment.
.TP
.BI --max-chain-skip \ INT
A heuristics that stops chaining early [50]. Minimap2 uses dynamic programming
for chaining. The time complexity is quadratic in the number of seeds. This
@ -199,15 +205,32 @@ the contiguity of the alignment at the cost of poor alignment in the middle
Minimal peak DP alignment score to output [40]. The peak score is computed from
the final CIGAR. It is the score of the max scoring segment in the alignment
and may be different from the total alignment score.
.TP
.BI -u \ CHAR
How to find canonical splicing sites GT-AG -
.BR f :
transcript strand;
.BR b :
both strands;
.BR n :
no attempt to match GT-AG [n]
.TP
.BI --cost-non-gt-ag \ INT
Cost of non-canonical splicing sites [0].
.SS Input/output options
.TP 10
.B -Q
Ignore base quality in the input file.
.TP
.B -a
Generate CIGAR and output alignments in the SAM format. Minimap2 outputs in PAF
by default.
.TP
.B -Q
Ignore base quality in the input file.
.TP
.BI -R \ STR
SAM read group line in a format like
.B @RG\\\\tID:foo\\\\tSM:bar
[].
.TP
.B -c
Generate CIGAR. In PAF, the CIGAR is written to the `cg' custom tag.
.TP
@ -232,8 +255,13 @@ and
use
.BR -K500m .
.TP
.B -V
.B --version
Print version number to stdout
.TP
.B --no-sam-hdr
Don't output SAM header lines. Use this option if the index consists of
multiple parts; otherwise the SAM output is malformated due to internal header
lines.
.SS Preset options
.TP 10
.BI -x \ STR
@ -247,37 +275,64 @@ are:
.RS
.TP 8
.B map-pb
PacBio/Oxford Nanopore read to reference mapping (-Hk19)
PacBio/Oxford Nanopore read to reference mapping
.RB ( -Hk19 )
.TP
.B map10k
The same as
.B map-pb
(-Hk19)
.RB ( -Hk19 )
.TP
.B map-ont
Slightly more sensitive for Oxford Nanopore to reference mapping (-k15). For
PacBio reads, HPC minimizers consistently leads to faster performance and more
sensitive results in comparison to normal minimizers. For Oxford Nanopore data,
normal minimizers are better, though not much. The effectiveness of HPC is
determined by the sequencing error mode.
Slightly more sensitive for Oxford Nanopore to reference mapping
.RB ( -k15 ).
For PacBio reads, HPC minimizers consistently leads to faster performance and
more sensitive results in comparison to normal minimizers. For Oxford Nanopore
data, normal minimizers are better, though not much. The effectiveness of HPC
is determined by the sequencing error mode.
.TP
.B asm5
Long assembly to reference mapping (-k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200).
Long assembly to reference mapping
.RB ( -k19
.B -w19 -A1 -B19 -O39,81 -E3,1 -s200
.BR -z200 ).
Typically, the alignment will not extend to regions with 5% or higher sequence
divergence. Only use this preset if the average divergence is far below 5%.
.TP
.B asm10
Long assembly to reference mapping (-k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200). Up
to 10% sequence divergence.
.TP 8
Long assembly to reference mapping
.RB ( -k19
.B -w19 -A1 -B9 -O16,41 -E2,1 -s200
.BR -z200 ).
Up to 10% sequence divergence.
.TP
.B ava-pb
PacBio all-vs-all overlap mapping (-Hk19 -w5 -Xp0 -m100 -K500m -g10000 --max-chain-skip 25)
.TP 8
PacBio all-vs-all overlap mapping
.RB ( -Hk19
.B -w5 -Xp0 -m100 -K500m -g10000 --max-chain-skip
.BR 25 ).
.TP
.B ava-ont
Oxford Nanopore all-vs-all overlap mapping (-k15 -w5 -Xp0 -m100 -K500m -g10000
--max-chain-skip 25). Similarly, the major difference from
Oxford Nanopore all-vs-all overlap mapping
.RB ( -k15
.B -w5 -Xp0 -m100 -K500m -g10000 --max-chain-skip
.BR 25 ).
Similarly, the major difference from
.B ava-pb
is that this preset is not using HPC minimizers.
.TP
.B splice
Long-read spliced alignment
.RB ( -k15
.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -z200 -ub --cost-non-gt-ag
.BR 5 ).
In the splice mode, 1) long deletions are taken as introns and represented as
the
.RB ` N '
CIGAR operator; 2) long insertions are disabled; 3) deletion and insertion gap
costs are different during chaining; 4) the computation of the
.RB ` ms '
tag ignores introns to demote hits to pseudogenes.
.RE
.SS Miscellaneous options
.TP 10

266
misc/intron-eval.js 100644
View File

@ -0,0 +1,266 @@
/*******************************
* Command line option parsing *
*******************************/
var getopt = function(args, ostr) {
var oli; // option letter list index
if (typeof(getopt.place) == 'undefined')
getopt.ind = 0, getopt.arg = null, getopt.place = -1;
if (getopt.place == -1) { // update scanning pointer
if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
getopt.place = -1;
return null;
}
if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
++getopt.ind;
getopt.place = -1;
return null;
}
}
var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
if (getopt.place < 0) ++getopt.ind;
return '?';
}
if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
getopt.arg = null;
if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
} else { // need an argument
if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
getopt.arg = args[getopt.ind].substr(getopt.place);
else if (args.length <= ++getopt.ind) { // no arg
getopt.place = -1;
if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
return '?';
} else getopt.arg = args[getopt.ind]; // white space
getopt.place = -1;
++getopt.ind;
}
return optopt;
}
/***********************
* Interval operations *
***********************/
Interval = {};
Interval.sort = function(a)
{
if (typeof a[0] == 'number')
a.sort(function(x, y) { return x - y });
else a.sort(function(x, y) { return x[0] != y[0]? x[0] - y[0] : x[1] - y[1] });
}
Interval.merge = function(a, sorted)
{
if (typeof sorted == 'undefined') sorted = true;
if (!sorted) Interval.sort(a);
var k = 0;
for (var i = 1; i < a.length; ++i) {
if (a[k][1] >= a[i][0])
a[k][1] = a[k][1] > a[i][1]? a[k][1] : a[i][1];
else a[++k] = a[i].slice(0);
}
a.length = k + 1;
}
Interval.index_end = function(a, sorted)
{
if (a.length == 0) return;
if (typeof sorted == 'undefined') sorted = true;
if (!sorted) Interval.sort(a);
a[0].push(0);
var k = 0, k_en = a[0][1];
for (var i = 1; i < a.length; ++i) {
if (k_en <= a[i][0]) {
for (++k; k < i; ++k)
if (a[k][1] > a[i][0])
break;
k_en = a[k][1];
}
a[i].push(k);
}
}
Interval.find_intv = function(a, x)
{
var left = -1, right = a.length;
if (typeof a[0] == 'number') {
while (right - left > 1) {
var mid = left + ((right - left) >> 1);
if (a[mid] > x) right = mid;
else if (a[mid] < x) left = mid;
else return mid;
}
} else {
while (right - left > 1) {
var mid = left + ((right - left) >> 1);
if (a[mid][0] > x) right = mid;
else if (a[mid][0] < x) left = mid;
else return mid;
}
}
return left;
}
Interval.find_ovlp = function(a, st, en)
{
if (a.length == 0 || st >= en) return [];
var l = Interval.find_intv(a, st);
var k = l < 0? 0 : a[l][a[l].length - 1];
var b = [];
for (var i = k; i < a.length; ++i) {
if (a[i][0] >= en) break;
else if (st < a[i][1])
b.push(a[i]);
}
return b;
}
/*****************
* Main function *
*****************/
var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false;
while ((c = getopt(arguments, "l:ep")) != null) {
if (c == 'l') l_fuzzy = parseInt(getopt.arg);
else if (c == 'e') print_err_only = print_ovlp = true;
else if (c == 'p') print_ovlp = true;
}
if (arguments.length - getopt.ind < 2) {
print("Usage: k8 intron-eval.js [options] <gene.gtf> <aln.sam>");
exit(1);
}
var file, buf = new Bytes();
var tr = {};
file = new File(arguments[getopt.ind]);
while (file.readline(buf) >= 0) {
var m, t = buf.toString().split("\t");
if (t[0].charAt(0) == '#') continue;
if (t[2] != 'exon') continue;
var st = parseInt(t[3]) - 1;
var en = parseInt(t[4]);
if ((m = /transcript_id "(\S+)"/.exec(t[8])) == null) continue;
var tid = m[1];
if (tr[tid] == null) tr[tid] = [t[0], t[6], 0, 0, []];
tr[tid][4].push([st, en]);
}
file.close();
var anno = {};
for (var tid in tr) {
var t = tr[tid];
Interval.sort(t[4]);
t[2] = t[4][0][0];
t[3] = t[4][t[4].length - 1][1];
if (anno[t[0]] == null) anno[t[0]] = [];
var s = t[4];
for (var i = 0; i < s.length - 1; ++i) {
if (s[i][1] >= s[i+1][0]) throw Error("ERROR: wrong annotation!");
anno[t[0]].push([s[i][1], s[i+1][0]]);
}
}
tr = null;
for (var chr in anno) {
var e = anno[chr];
if (e.length == 0) continue;
Interval.sort(e);
var k = 0;
for (var i = 1; i < e.length; ++i) // dedup
if (e[i][0] != e[k][0] || e[i][1] != e[k][1])
e[++k] = e[i].slice(0);
e.length = k + 1;
Interval.index_end(e);
}
var n_pri = 0, n_unmapped = 0, n_mapped = 0;
var n_sgl = 0, n_splice = 0, n_splice_hit = 0, n_splice_novel = 0;
file = new File(arguments[getopt.ind+1]);
var last_qname = null;
var re_cigar = /(\d+)([MIDNSHX=])/g;
while (file.readline(buf) >= 0) {
var m, t = buf.toString().split("\t");
if (t[0].charAt(0) == '@') continue;
var flag = parseInt(t[1]);
if (flag&0x100) continue;
if (first_only && last_qname == t[0]) continue;
if (t[2] == '*') {
++n_unmapped;
continue;
} else {
++n_pri;
if (last_qname != t[0]) ++n_mapped;
}
var pos = parseInt(t[3]) - 1, intron = [];
while ((m = re_cigar.exec(t[5])) != null) {
var len = parseInt(m[1]), op = m[2];
if (op == 'N') {
intron.push([pos, pos + len]);
pos += len;
} else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len;
}
if (intron.length == 0) {
++n_sgl;
continue;
}
n_splice += intron.length;
var chr = anno[t[2]];
if (chr != null) {
for (var i = 0; i < intron.length; ++i) {
var o = Interval.find_ovlp(chr, intron[i][0], intron[i][1]);
if (o.length > 0) {
var hit = false;
for (var j = 0; j < o.length; ++j) {
var st_diff = intron[i][0] - o[j][0];
var en_diff = intron[i][1] - o[j][1];
if (st_diff < 0) st_diff = -st_diff;
if (en_diff < 0) en_diff = -en_diff;
if (st_diff <= l_fuzzy && en_diff <= l_fuzzy)
++n_splice_hit, hit = true;
if (hit) break;
}
if (print_ovlp) {
var type = hit? 'C' : 'P';
if (hit && print_err_only) continue;
var x = '[';
for (var j = 0; j < o.length; ++j) {
if (j) x += ', ';
x += '(' + o[j][0] + "," + o[j][1] + ')';
}
x += ']';
print(type, t[0], i+1, t[2], intron[i][0], intron[i][1], x);
}
} else {
++n_splice_novel;
if (print_ovlp)
print('N', t[0], i+1, t[2], intron[i][0], intron[i][1]);
}
}
} else {
n_splice_novel += intron.length;
}
last_qname = t[0];
}
file.close();
buf.destroy();
if (!print_ovlp) {
print("# unmapped reads: " + n_unmapped);
print("# mapped reads: " + n_mapped);
print("# primary alignments: " + n_pri);
print("# singletons: " + n_sgl);
print("# predicted introns: " + n_splice);
print("# non-overlapping introns: " + n_splice_novel);
print("# correct introns: " + n_splice_hit + " (" + (n_splice_hit / n_splice * 100).toFixed(2) + "%)");
}

View File

@ -94,7 +94,7 @@ while (file.readline(buf) >= 0) {
ori_qlen = parseInt(t[1]);
} else { // SAM
var flag = parseInt(t[1]);
if (flag & 4) continue;
if ((flag & 4) || t[2] == '*' || t[5] == '*') continue;
if (flag & 0x100) {
++n_2nd;
continue;

View File

@ -40,9 +40,11 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end);
void radix_sort_64(uint64_t *beg, uint64_t *end);
uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
void mm_write_sam_SQ(const mm_idx_t *idx);
void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *argv[]);
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);
int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int64_t n, mm128_t *a, uint64_t **_u, void *km);
int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km);
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
mm_reg1_t *mm_gen_regs(void *km, int qlen, int n_u, uint64_t *u, mm128_t *a);

2
test/q2.fa 100644
View File

@ -0,0 +1,2 @@
>q2
GGACATCCCGATGGTGCAGTCCTACCTGTACGAAAGGAC

2
test/t2.fa 100644
View File

@ -0,0 +1,2 @@
>t2
GGACATCCCGATGGTGCAGgtGCTATTAAAGGTTCGTTTGTTCAACGATTAAagTCCTACCTGTACGAAAGGAC

View File

@ -179,3 +179,83 @@
note = {doi:10.1101/130633},
publisher = {Cold Spring Harbor Labs Journals},
journal = {bioRxiv}}
@article{Gotoh:1982aa,
Author = {Gotoh, O},
Journal = {J Mol Biol},
Pages = {705-8},
Title = {An improved algorithm for matching biological sequences},
Volume = {162},
Year = {1982}}
@article{Altschul:1986aa,
Author = {Altschul, S F and Erickson, B W},
Journal = {Bull Math Biol},
Pages = {603-16},
Title = {Optimal sequence alignment using affine gap costs},
Volume = {48},
Year = {1986}}
@article{Wu:2005vn,
Author = {Wu, Thomas D and Watanabe, Colin K},
Journal = {Bioinformatics},
Pages = {1859-75},
Title = {{GMAP}: a genomic mapping and alignment program for {mRNA} and {EST} sequences},
Volume = {21},
Year = {2005}}
@article{Iwata:2012aa,
Author = {Iwata, Hiroaki and Gotoh, Osamu},
Journal = {Nucleic Acids Res},
Pages = {e161},
Title = {Benchmarking spliced alignment programs including {Spaln2}, an extended version of {Spaln} that incorporates additional species-specific features},
Volume = {40},
Year = {2012}}
@article{Dobin:2013kx,
Author = {Dobin, Alexander and others},
Journal = {Bioinformatics},
Pages = {15-21},
Title = {{STAR}: ultrafast universal {RNA-seq} aligner},
Volume = {29},
Year = {2013}}
@article{Byrne:2017aa,
Author = {Byrne, Ashley and others},
Journal = {Nat Commun},
Pages = {16027},
Title = {Nanopore long-read {RNAseq} reveals widespread transcriptional variation among the surface receptors of individual {B} cells},
Volume = {8},
Year = {2017}}
@article{Roberts:2004fv,
Author = {Roberts, Michael and others},
Journal = {Bioinformatics},
Pages = {3363-9},
Title = {Reducing storage requirements for biological sequence comparison},
Volume = {20},
Year = {2004}}
@article{Zhang:2006aa,
Author = {Zhang, Miao and Gish, Warren},
Journal = {Bioinformatics},
Pages = {13-20},
Title = {Improved spliced alignment from an information theoretic approach},
Volume = {22},
Year = {2006}}
@article{Li:2007aa,
Author = {Li, Heng and others},
Journal = {BMC Bioinformatics},
Pages = {349},
Title = {A cross-species alignment tool {(CAT)}},
Volume = {8},
Year = {2007}}
@article{Farrar:2007hs,
Author = {Farrar, Michael},
Journal = {Bioinformatics},
Pages = {156-61},
Title = {{Striped Smith-Waterman speeds database searches six times over other SIMD implementations}},
Volume = {23},
Year = {2007}}

View File

@ -20,7 +20,7 @@
\begin{document}
\firstpage{1}
\title[Long DNA sequence alignment with minimap2]{Minimap2: fast pairwise alignment for long DNA sequences}
\title[Aligning long nucleotide sequences with minimap2]{Minimap2: fast pairwise alignment for long nucleotide sequences}
\author[Li]{Heng Li}
\address{Broad Institute, 415 Main Street, Cambridge, MA 02142, USA}
@ -28,11 +28,14 @@
\begin{abstract}
\section{Summary:} Minimap2 is a general-purpose mapper to align long noisy DNA
sequences against a large reference database. It targets query sequences of
1kb--100Mb in length with per-base divergence typically below 25\%. Minimap2 is
$\sim$30 times faster than many mainstream long-read aligners and achieves
higher accuracy on simulated data. It also employs concave gap cost and rescues
inversions for improved alignment around potential structural variations.
or mRNA sequences against a large reference database. It targets query
sequences of 1kb--100Mb in length with per-base divergence typically below
25\%. For DNA sequence reads, minimap2 is $\sim$30 times faster than many
mainstream long-read aligners and achieves higher accuracy on simulated data.
It also employs concave gap cost and rescues inversions for improved alignment
around potential structural variations. For real long RNA-seq reads, minimap2
is $\sim$40 times faster than peers and produces alignment more consistent with
existing gene annotations.
\section{Availability and implementation:}
\href{https://github.com/lh3/minimap2}{https://github.com/lh3/minimap2}
@ -46,43 +49,62 @@ Single Molecule Real-Time (SMRT) sequencing technology and Oxford Nanopore
technologies (ONT) produce reads over 10kbp in length at an error rate
$\sim$15\%. Several aligners have been developed for such
data~\citep{Chaisson:2012aa,Li:2013aa,Liu:2016ab,Sovic:2016aa,Liu:2017aa,Lin:2017aa,Sedlazeck169557}.
They are usually five times as slow as mainstream short-read
aligners~\citep{Langmead:2012fk,Li:2013aa}. We speculated there could be
substantial room for speedup on the thought that 10kb long sequences should be
easier to map than 100bp reads because we can more effectively skip repetitive
regions, which are often the bottleneck of short-read alignment. We confirmed
our speculation by achieving approximate mapping 50 times faster than
BWA-MEM~\citep{Li:2016aa}. \citet{Suzuki:2016} extended our work with a fast
and novel algorithm on generating detailed alignment, which in turn inspired us
to develop minimap2 towards higher accuracy and more practical functionality.
Most of them were five times as slow as mainstream short-read
aligners~\citep{Langmead:2012fk,Li:2013aa} in terms of the number of bases
mapped per second. We speculated there could be substantial room for speedup on
the thought that 10kb long sequences should be easier to map than 100bp reads
because we can more effectively skip repetitive regions, which are often the
bottleneck of short-read alignment. We confirmed our speculation by achieving
approximate mapping 50 times faster than BWA-MEM~\citep{Li:2016aa}.
\citet{Suzuki:2016} extended our work with a fast and novel algorithm on
generating base-level alignment, which in turn inspired us to develop minimap2
towards higher accuracy and more practical functionality.
Both SMRT and ONT have been applied to sequence spliced mRNAs (RNA-seq). While
traditional mRNA aligners work~\citep{Wu:2005vn,Iwata:2012aa}, they are not
optimized for long noisy sequence reads and are tens of times slower than
dedicated long-read aligners. When developing minimap2 initially for aligning
genomic DNA only, we realized minor modifications could make it competitive for
aligning mRNAs as well. Minimap2 is a first RNA-seq aligner specifically
designed for long noisy reads.
\begin{methods}
\section{Methods}
Minimap2 is the successor of minimap~\citep{Li:2016aa}. It uses similar
indexing and seeding algorithms, and furthers it with more accurate chaining
and the ability to produce detailed alignment.
Minimap2 follows a typical seed-chain-align procedure as is used by most
full-genome aligners. It collects minimizers~\citep{Roberts:2004fv} of the
reference sequences and indexes them in a hash table. Then for each query
sequence, minimap2 takes query minimizers as \emph{seeds}, finds matches to the
reference, and identifies sets of colinear seeds, which are called
\emph{chains}. If base-level alignment is requested, minimap2 applies dynamic
programming (DP) to extend from the ends of chains and to close unseeded
regions between adjacent seeds in chains.
Minimap2 uses indexing and seeding algorithms similar to
minimap~\citep{Li:2016aa}, and furthers the predecessor with more accurate
chaining, the ability to produce base-level alignment and the support of
spliced alignment.
\subsection{Chaining}
%\subsubsection{Chaining}
\subsubsection{Chaining}
An \emph{anchor} is a 3-tuple $(x,y,w)$, indicating interval $[x-w+1,x]$ on the
reference matching interval $[y-w+1,y]$ on the query. Given a list of anchors
sorted by ending reference position $x$, let $f(i)$ be the maximal chaining
score up to the $i$-th anchor in the list. $f(i)$ can be calculated with
dynamic programming (DP):
dynamic programming:
\begin{equation}\label{eq:chain}
f(i)=\max\big\{\max_{i>j\ge 1} \{ f(j)+d(j,i)-\gamma(j,i) \},w_i\big\}
f(i)=\max\big\{\max_{i>j\ge 1} \{ f(j)+\alpha(j,i)-\beta(j,i) \},w_i\big\}
\end{equation}
where $d(j,i)=\min\big\{\min\{y_i-y_j,x_i-x_j\},w_i\big\}$ is the number of
matching bases between the two anchors. $\gamma(j,i)>0$ is the gap cost. It
where $\alpha(j,i)=\min\big\{\min\{y_i-y_j,x_i-x_j\},w_i\big\}$ is the number of
matching bases between the two anchors. $\beta(j,i)>0$ is the gap cost. It
equals $\infty$ if $y_j\ge y_i$ or $\max\{y_i-y_j,x_i-x_j\}>G$ (i.e. the
distance between two anchors is too large); otherwise
\[
\gamma(j,i)=\gamma'(\max\{y_i-y_j,x_i-x_j\}-\min\{y_i-y_j,x_i-x_j\})
\]
In implementation, a gap of length $l$ costs $\gamma'(l)=\alpha\cdot
l+\beta\log_2(l)$. For $m$ anchors, directly computing all $f(\cdot)$ with
\begin{equation}\label{eq:chain-gap}
\beta(j,i)=\gamma_c\big((y_i-y_j)-(x_i-x_j)\big)
\end{equation}
In implementation, a gap of length $l$ costs $\gamma_c(l)=0.01\cdot \bar{w}\cdot
|l|+0.5\log_2|l|$, where $\bar{w}$ is the average seed length. For $m$ anchors, directly computing all $f(\cdot)$ with
Eq.~(\ref{eq:chain}) takes $O(m^2)$ time. Although theoretically faster
chaining algorithms exist~\citep{Abouelhoda:2005aa}, they
are inapplicable to generic gap cost, complex to implement and usually
@ -91,20 +113,20 @@ accelerate chaining.
We note that if anchor $i$ is chained to $j$, chaining $i$ to a predecessor
of $j$ is likely to yield a lower score. When evaluating Eq.~(\ref{eq:chain}),
we start from anchor $i-1$ and stop the evaluation if we cannot find a better
we start from anchor $i-1$ and stop the process if we cannot find a better
score after up to $h$ iterations. This approach reduces the average time to
$O(h\cdot m)$. In practice, we can almost always find the optimal chain with
$h=50$; even if the heuristic fails, the optimal chain is often close.
%\subsubsection{Backtracking}
For backtracking, let $P(i)$ be the index of the best predecessor of anchor
$i$. It equals 0 if $f(i)=w_i$ or $\argmax_j\{f(j)+\eta(j,i)-\gamma(j,i)\}$
otherwise. For each anchor $i$ in the descending order of $f(i)$, we apply
$P(\cdot)$ repeatedly to find its predecessor and mark each visited $i$ as
`used', until $P(i)=0$ or we reach an already `used' $i$. This way we find all
chains with no anchors used in more than one chains.
\subsubsection{Backtracking}
Let $P(i)$ be the index of the best predecessor of anchor $i$. It equals 0 if
$f(i)=w_i$ or $\argmax_j\{f(j)+\eta(j,i)-\gamma(j,i)\}$ otherwise. For each
anchor $i$ in the descending order of $f(i)$, we apply $P(\cdot)$ repeatedly to
find its predecessor and mark each visited $i$ as `used', until $P(i)=0$ or we
reach an already `used' $i$. This way we find all chains with no anchors used
in more than one chains.
%\subsubsection{Identifying primary chains}
\subsubsection{Identifying primary chains}
In the absence of copy number changes, each query segment should not be mapped
to two places in the reference. However, chains found at the previous step may
have significant or complete overlaps due to repeats in the reference.
@ -117,132 +139,22 @@ add the chain to $Q$. In the end, $Q$ contains all the primary chains. We did
not choose a more sophisticated data structure (e.g. range tree or k-d tree)
because this step is not the performance bottleneck.
\subsection{Alignment}
\subsection{Aligning genomic DNA}
Minimap2 performs global alignment between adjacent anchors in a chain. It
adopted difference-based formulation to derive
alignment~\citep{Wu:1996aa,Suzuki:2016}. When combined with SSE vectorization,
this formulation has two advantages. First, because each score in the DP matrix
is bounded by the gap cost and the maximum matching score, we can usually
achieve 16-way SSE vectorization regardless of the peak score of the
alignment. Second, filling the DP matrix along the diagonal, we can simplify
banded alignment, which is critical to performance. In practice, our
implementation is three times as fast as Parasail's 4-way
vectorization~\citep{Daily:2016aa} for global alignment.
Without banding, our implementation is slower than Edlib~\citep{Sosic:2017aa},
but with a 1000bp band, it is considerably faster. When performing global
alignment between anchors, we expect the alignment to stay close to the
diagonal of the DP matrix. Banding is often applicable.
\subsubsection{Alignment with 2-piece affine gap cost}
Minimap2 uses a 2-piece affine gap cost
$\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\}$.
On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, this
cost function is concave. It applies cost $q+l\cdot e$ to gaps shorter than
$\lceil(\tilde{q}-q)/(e-\tilde{e})\rceil$ and applies
$\tilde{q}+l\cdot\tilde{e}$ to longer gaps. This scheme helps to recover
longer insertions and deletions~(INDEL; \citealp{Gotoh:1990aa}).
Minimap2 performs DP-based global alignment between adjacent anchors in a
chain. It uses a 2-piece affine gap cost~\citep{Gotoh:1990aa}:
\begin{equation}\label{eq:2-piece}
\gamma_a(l)=\min\{q+|l|\cdot e,\tilde{q}+|l|\cdot\tilde{e}\}
\end{equation}
Without losing generality, we always assume $q+e<\tilde{q}+\tilde{e}$.
On the condition that $e>\tilde{e}$, it applies cost $q+|l|\cdot e$ to gaps
shorter than $\lceil(\tilde{q}-q)/(e-\tilde{e})\rceil$ and applies
$\tilde{q}+|l|\cdot\tilde{e}$ to longer gaps. This scheme helps to recover
longer insertions and deletions~(INDELs).
With global alignment, minimap2 may force to align unrelated sequences between
two adjacent anchors. To avoid such an artifact, we compute accumulative
alignment score along the alignment path and break the alignment where the
score drops too fast in the diagonal direction. More precisely, let $S(i,j)$ be
the alignment score along the alignment path ending at cell $(i,j)$ in the DP
matrix. We break the alignment if there exist $(i',j')$ and $(i,j)$, $i'<i$ and
$j'<j$, such that
\[
S(i',j')-S(i,j)>Z+e\cdot(\max\{i-i',j-j'\}-\min\{i-i',j-j'\})
\]
where $e$ is the gap extension cost and $Z$ is an arbitrary threshold.
This strategy is similar to X-drop employed in BLAST~\citep{Altschul:1997vn}.
However, unlike X-drop, it would not break the alignment in the presence of a
single long gap.
When minimap2 breaks a global alignment between two anchors, it performs local
alignment between the two subsequences involved in the global alignment, but
this time with the one subsequence reverse complemented. This additional
alignment step may identify short inversions that are missed during chaining.
\end{methods}
\section{Results}
\begin{figure}[!tb]
\centering
\includegraphics[width=.5\textwidth]{roc-color.pdf}
\caption{Evaluation on simulated SMRT reads aligned against human genome
GRCh38. (a) ROC-like curve. (b) Accumulative mapping error rate as a function
of mapping quality. 33,088 $\ge$1000bp reads were simulated using
pbsim~\citep{Ono:2013aa} with error profile sampled from file
`m131017\_060208\_42213\_*.1.*' downloaded at
\href{http://bit.ly/chm1p5c3}{http://bit.ly/chm1p5c3}. The N50 read length is
11,628. A read is considered correctly mapped if the true position overlaps
with the best mapping position by 10\% of the read length. All aligners were
run under the default setting for SMRT reads.}\label{fig:eval}
\end{figure}
As a sanity check, we evaluated minimap2 on simulated human reads along with
BLASR~\citep{Chaisson:2012aa},
BWA-MEM~\citep{Li:2013aa},
GraphMap~\citep{Sovic:2016aa},
minialign~\citep{Suzuki:2016} and
NGMLR~\citep{Sedlazeck169557}. We excluded rHAT~\citep{Liu:2016ab},
LAMSA~\citep{Liu:2017aa} and Kart~\citep{Lin:2017aa} because they either
crashed or produced malformatted output. In this evaluation, Minimap2 has
higher power to distinguish unique and repetitive hits, and achieves overall
higher mapping accuracy (Fig.~\ref{fig:eval}a). It is still the most accurate
even if we skip DP-based alignment (data not shown), suggesting chaining alone
is sufficient to achieve high accuracy for approximate mapping. Minimap2 and
NGMLR provide better mapping quality estimate: they rarely give repetitive hits
high mapping quality (Fig.~\ref{fig:eval}b). Apparently, other aligners may
occasionally miss close suboptimal hits and be overconfident in wrong mappings.
On run time, minialign is slightly faster than minimap2. They are over 30 times
faster than the rest. Minimap2 consumed 6.1GB memory at the peak, more than
BWA-MEM but less than others.
On real human SMRT reads, the relative performance and sensitivity of
these aligners are broadly similar to those on simulated data. We are unable to
provide a good estimate of mapping error rate due to the lack of the truth. On
ONT ultra-long human reads~\citep{Jain128835}, BWA-MEM failed. Minialign and
minimap2 are over 70 times faster than others. We have also examined tens of
$\ge$100bp INDELs in IGV~\citep{Robinson:2011aa} and can confirm the
observation by~\citet{Sedlazeck169557} that BWA-MEM often breaks them into
shorter gaps. Minimap2 does not have this issue.
\section{Discussions}
Minialign and minimap2 are fast because a) with chaining, they can quickly
filter out most false seed hits~\citep{Li:2016aa} and reduce unsuccessful but
costly DP-based alignments; b) they implemented so far the fastest DP-based
alignment algorithm for long sequences~\citep{Suzuki:2016}. It is possible to
further accelerate minimap2 with a few other tweaks such as adaptive
banding~\citep{Suzuki130633} or incremental banding.
In addition to reference-based read mapping, minimap2 inherits minimap's
ability to search against huge multi-species databases and to find read
overlaps. On a few test data sets, minimap2 appears to yield slightly better
miniasm assembly. Minimap2 can also align closely related genomes, though it
would benefit from more thorough evaluations. Genome alignment is an intricate
topic.
\section*{Acknowledgements}
We owe a debt of gratitude to Hajime Suzuki for releasing his masterpiece and
insightful notes before formal publication. We thank M. Schatz, P. Rescheneder
and F. Sedlazeck for pointing out the limitation of BWA-MEM. We are also
grateful to early minimap2 testers who have greatly helped to fix various
issues.
\bibliography{minimap2}
\pagebreak
\appendix
\begin{methods}
\section*{Appendix}
A 2-piece gap cost function is
\[
\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\}
\]
Without losing generality, we assume $q+e\le\tilde{q}+\tilde{e}$. The equation
to compute the optimal alignment under such a gap cost is~\citep{Gotoh:1990aa}
The equation to compute the optimal alignment under $\gamma_a(\cdot)$ is
\begin{equation}\label{eq:ae86}
\left\{\begin{array}{l}
H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij},\tilde{F}_{ij}\}\\
@ -253,7 +165,20 @@ F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\
\end{array}\right.
\end{equation}
where $s(i,j)$ is the score between the $i$-th reference base and $j$-th query
base. If we define
base. Eq.~(\ref{eq:ae86}) is a natural extension to the equation under affine
gap cost~\citep{Gotoh:1982aa,Altschul:1986aa}.
\subsubsection{Suzuki's formulation}
When we allow gaps longer than several hundred base pairs, nucleotide-level
alignment is much slower than chaining. SSE acceleration is critical to the
performance of minimap2. Traditional SSE implementations~\citep{Farrar:2007hs}
based on Eq.~(\ref{eq:ae86}) can achieve 16-way parallelization for short
sequences, but only 4-way parallelization when the peak alignment score reaches
32767. Long sequence alignment may exceed this threshold. Inspired by
\citet{Wu:1996aa} and the following work, \citet{Suzuki:2016} proposed a
difference-based formulation that lifted this limitation.
In case of 2-piece gap cost, define
\[
\left\{\begin{array}{ll}
u_{ij}\triangleq H_{ij}-H_{i-1,j} & v_{ij}\triangleq H_{ij}-H_{i,j-1} \\
@ -261,7 +186,7 @@ x_{ij}\triangleq E_{i+1,j}-H_{ij} & \tilde{x}_{ij}\triangleq \tilde{E}_{i+1,j}-\
y_{ij}\triangleq F_{i,j+1}-H_{ij} & \tilde{y}_{ij}\triangleq \tilde{F}_{i,j+1}-\tilde{H}_{ij}
\end{array}\right.
\]
we can transform Eq.~(\ref{eq:ae86}) to
We can transform Eq.~(\ref{eq:ae86}) to
\begin{equation}\label{eq:suzuki}
\left\{\begin{array}{lll}
z_{ij}&=&\max\{s(i,j),x_{i-1,j}+v_{i-1,j},y_{i,j-1}+u_{i,j-1},\\
@ -276,7 +201,8 @@ y_{ij}&=&\max\{0,y_{i,j-1}+u_{i,j-1}-z_{ij}+q\}-q-e\\
\end{equation}
where $z_{ij}$ is a temporary variable that does not need to be stored.
All values in Eq.~(\ref{eq:suzuki}) are bounded. To see that,
An important property of Eq.~(\ref{eq:suzuki}) is that all values are bounded
by scoring parameters. To see that,
\[
x_{ij}=E_{i+1,j}-H_{ij}=\max\{-q,E_{ij}-H_{ij}\}-e
\]
@ -294,19 +220,19 @@ matching score, we can derive
\[
u_{ij}\le M-v_{i-1,j}\le M+q+e
\]
In conclusion, $x$ and $y$ are bounded by $[-q-e,-e]$, $\tilde{x}$ and $\tilde{y}$ by
$[-\tilde{q}-\tilde{e},-\tilde{e}]$, and $u$ and $v$ by $[-q-e,M+q+e]$. When
matching score and gap cost are small, each of them can be stored as a 8-bit
integer. This enables 16-way SSE vectorization regardless of the peak score of
the alignment.
In conclusion, in Eq.~(\ref{eq:suzuki}), $x$ and $y$ are bounded by $[-q-e,-e]$,
$\tilde{x}$ and $\tilde{y}$ by $[-\tilde{q}-\tilde{e},-\tilde{e}]$, and $u$ and
$v$ by $[-q-e,M+q+e]$. When $-128\le-q-e<M+q+e\le127$, each of them can be stored as
a 8-bit integer. This enables 16-way SSE vectorization regardless of the peak
score of the alignment.
For a more efficient SSE implementation, we transform the row-column coordinate
to diagonal-anti-diagonal coordinate by letting $r\gets i+j$ and $t\gets i$.
to the diagonal-antidiagonal coordinate by letting $r\gets i+j$ and $t\gets i$.
Eq.~(\ref{eq:suzuki}) becomes:
\begin{equation*}
\left\{\begin{array}{lll}
z_{rt}&=&\max\{s(t,r-t),x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}+u_{r-1,t},\\
&&\tilde{x}_{r-1,t-1}+v_{r-1,t-1},\tilde{y}_{r-1,t}+u_{r-1,t}\}\\
z_{rt}&=&\max\{s(t,r-t),x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}\\
&&+u_{r-1,t},\tilde{x}_{r-1,t-1}+v_{r-1,t-1},\tilde{y}_{r-1,t}+u_{r-1,t}\}\\
u_{rt}&=&z_{rt}-v_{r-1,t-1}\\
v_{rt}&=&z_{rt}-u_{r-1,t}\\
x_{rt}&=&\max\{0,x_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+q\}-q-e\\
@ -317,10 +243,11 @@ y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\}-q-e\\
\end{equation*}
In this formulation, cells with the same diagonal index $r$ are independent of
each other. This allows us to fully vectorize the computation of all cells on
the same anti-diagonal in one inner loop.
the same anti-diagonal in one inner loop. It also simplifies banded alignment,
which would be difficult with striped vectorization~\citep{Farrar:2007hs}.
On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the boundary
condition of this equation in the diagonal-anti-diagonal coordinate is
On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the initial
values in the diagonal-antidiagonal formuation is
\[
\left\{\begin{array}{l}
x_{r-1,-1}=y_{r-1,r}=-q-e\\
@ -337,8 +264,246 @@ r\cdot(e-\tilde{e})-(\tilde{q}-q)-\tilde{e} & (r=\lceil\frac{\tilde{q}-q}{e-\til
-\tilde{e} & (r>\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil)
\end{array}\right.
\]
These can be derived from the initial values for Eq.~(\ref{eq:ae86}).
In practice, our 16-way vectorized implementation of global alignment is three
times as fast as Parasail's 4-way vectorization~\citep{Daily:2016aa}. Without
banding, our implementation is slower than Edlib~\citep{Sosic:2017aa}, but with
a 1000bp band, it is considerably faster. When performing global alignment
between anchors, we expect the alignment to stay close to the diagonal of the
DP matrix. Banding is applicable most of time.
\subsubsection{The Z-drop heuristic}
With global alignment, minimap2 may force to align unrelated sequences between
two adjacent anchors. To avoid such an artifact, we compute accumulative
alignment score along the alignment path and break the alignment where the
score drops too fast in the diagonal direction. More precisely, let $S(i,j)$ be
the alignment score along the alignment path ending at cell $(i,j)$ in the DP
matrix. We break the alignment if there exist $(i',j')$ and $(i,j)$, $i'<i$ and
$j'<j$, such that
\[
S(i',j')-S(i,j)>Z+e\cdot|(i-i')-(j-j')|
\]
where $e$ is the gap extension cost and $Z$ is an arbitrary threshold.
This strategy is first used in BWA-MEM. It is similar to X-drop employed in
BLAST~\citep{Altschul:1997vn}, but unlike X-drop, it would not break the
alignment in the presence of a single long gap.
When minimap2 breaks a global alignment between two anchors, it performs local
alignment between the two subsequences involved in the global alignment, but
this time with the one subsequence reverse complemented. This additional
alignment step may identify short inversions that are missed during chaining.
\subsection{Aligning spliced sequences}
The algorithm described above can be adapted to spliced alignment. In this
mode, the chaining gap cost distinguishes insertions to and deletions from the
reference: $\gamma_c(l)$ in Eq.~(\ref{eq:chain-gap}) takes the form of
\[
\gamma_c(l)=\left\{\begin{array}{ll}
0.01\cdot\bar{w}\cdot l+0.5\log_2 l & (l>0) \\
\min\{0.01\cdot\bar{w}\cdot|l|,\log_2|l|\} & (l<0)
\end{array}\right.
\]
Similarly, the gap cost function used for DP-based alignment is changed to
\[
\gamma_a(l)=\left\{\begin{array}{ll}
q+l\cdot e & (l>0) \\
\min\{q+|l|\cdot e,\tilde{q}\} & (l<0)
\end{array}\right.
\]
In alignment, a deletion no shorter than $\lceil(\tilde{q}-q)/e\rceil$ is
regarded as an intron, which pays no cost to gap extensions.
To pinpoint precise splicing junctions, minimap2 introduces reference-dependent
cost to penalize non-canonical splicing:
\begin{equation}\label{eq:splice}
\left\{\begin{array}{l}
H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij}-a(i)\}\\
E_{i+1,j}= \max\{H_{ij}-q,E_{ij}\}-e\\
F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\
\tilde{E}_{i+1,j}= \max\{H_{ij}-d(i)-\tilde{q},\tilde{E}_{ij}\}\\
\end{array}\right.
\end{equation}
Let $T$ be the reference sequence. $d(i)$ is the cost of a non-canonical donor
site, which takes 0 if $T[i+1,i+2]={\tt GT}$, or a positive number $p$
otherwise. Similarly, $a(i)$ is the cost of a non-canonical acceptor site, which
takes 0 if $T[i-1,i]={\tt AG}$, or $p$ otherwise. Eq.~(\ref{eq:splice}) is
almost equivalent to the equation used by EXALIN~\citep{Zhang:2006aa} except
that we allow insertions immediately followed by deletions and vice versa; in
addition, we use Suzuki's diagonal formulation in actual implementation.
%Given that $d_i$ and $a_i$
%are a function of the reference sequence, it is possible to incorporate
%splicing signals with more sophisticated models, such as positional weight
%matrices. We have not tried this approach.
If RNA-seq reads are not sequenced from stranded libraries, the read strand
relative to the underlying transcript is unknown. By default, minimap2 aligns
each chain twice, first assuming ${\tt GT}$--${\tt AG}$ as the splicing signal
and then assuming ${\tt CT}$--${\tt AC}$, the reverse complement of ${\tt
GT}$--${\tt AG}$, as the splicing signal. The alignment with a higher score is
taken as the final alignment. This procedure also infers the relative strand of
reads that span canonical splicing sites.
In the spliced alignment mode, minimap2 further increases the density of
minimizers and disables banded alignment. Together with the two-round DP-based
alignment, spliced alignment is several times slower than DNA sequence
alignment.
\citet{Suzuki:2016} first derived a similar set of equations under affine gap
cost but with different notations.
\end{methods}
\section{Results}
\subsection{Aligning genomic reads}
\begin{figure}[!tb]
\centering
\includegraphics[width=.5\textwidth]{roc-color.pdf}
\caption{Evaluation on simulated SMRT reads aligned against human genome
GRCh38. 33,088 $\ge$1000bp reads were simulated using pbsim~\citep{Ono:2013aa}
with error profile sampled from file `m131017\_060208\_42213\_*.1.*' downloaded
at \href{http://bit.ly/chm1p5c3}{http://bit.ly/chm1p5c3}. The N50 read length
is 11,628. A read is considered correctly mapped if the true position overlaps
with the best mapping position by 10\% of the read length. All aligners were
run under the default setting for SMRT reads. (a) ROC-like curve. Alignments
are sorted by mapping quality in the descending order. For each mapping quality
threshold, the fraction of alignments with mapping quality above the threshold
and their error rate are plotted. Kart outputted all alignments at mapping
quality 60, so is not shown in the figure. It mapped nearly all reads with
4.1\% of alignments being wrong, less accurate than others. (b) Accumulative
mapping error rate as a function of mapping quality.}\label{fig:eval}
\end{figure}
As a sanity check, we evaluated minimap2 on simulated human reads along with
BLASR~(v1.MC.rc64; \citealp{Chaisson:2012aa}),
BWA-MEM~(v0.7.15; \citealp{Li:2013aa}),
GraphMap~(v0.5.2; \citealp{Sovic:2016aa}),
Kart~(v2.2.5; \citealp{Lin:2017aa}),
minialign~(v0.5.3; \citealp{Suzuki:2016}) and
NGMLR~(v0.2.5; \citealp{Sedlazeck169557}). We excluded rHAT~\citep{Liu:2016ab}
and LAMSA~\citep{Liu:2017aa} because they either
crashed or produced malformatted output. In this evaluation, minimap2 has
higher power to distinguish unique and repetitive hits, and achieves overall
higher mapping accuracy (Fig.~\ref{fig:eval}a). It is still the most accurate
even if we skip DP-based alignment (data not shown), confirming chaining alone
is sufficient to achieve high accuracy for approximate mapping. Minimap2 and
NGMLR provide better mapping quality estimate: they rarely give repetitive hits
high mapping quality (Fig.~\ref{fig:eval}b). Apparently, other aligners may
occasionally miss close suboptimal hits and be overconfident in wrong mappings.
On run time, minialign is slightly faster than minimap2 and Kart. They are over
30 times faster than the rest. Minimap2 consumed 6.1GB memory at the peak,
more than BWA-MEM but less than others.
On real human SMRT reads, the relative performance and sensitivity of
these aligners are broadly similar to the metrics on simulated data. We are
unable to provide a good estimate of mapping error rate due to the lack of the
truth. On ONT $\sim$100kb human reads~\citep{Jain128835}, BWA-MEM failed.
Kart, minialign and minimap2 are over 70 times faster than others. We have also
examined tens of $\ge$100bp INDELs in IGV~\citep{Robinson:2011aa} and can
confirm the observation by~\citet{Sedlazeck169557} that BWA-MEM often breaks
them into shorter gaps. The issue is much alleviated with minimap2, thanks
to the 2-piece affine gap cost.
\subsection{Aligning spliced reads}
We evaluated minimap2 on SIRV control data~(AC:SRR5286959;
\citealp{Byrne:2017aa}) where the truth is known. Minimap2 predicted 59\,916
introns from 11\,017 reads. 93.0\% of splice juctions are precise. We examined
wrongly predicted junctions and found the majority were caused by clustered
splicing signals (e.g. two adjacent ${\tt GT}$ sites). When INDEL sequencing
errors are frequent, it is difficult to find precise splicing sites in this
case. If we allow up to 10bp distance from true splicing sites, 98.4\% of
aligned introns are approximately correct. Given this observation, we might be
able to improve boundary detection by initializing $d(\cdot)$ and $a(\cdot)$ in
Eq.~(\ref{eq:splice}) with position-specific scoring matrices or more
sophisticated models. We have not tried this approach.
\begin{table}[!tb]
\processtable{Evaluation of junction accuracy on 2D ONT reads}
{\footnotesize\label{tab:intron}
\begin{tabular}{p{3.1cm}rrrr}
\toprule
& GMAP & minimap2 & SpAln & STAR\\
\midrule
Run time (CPU min) & 631 & 15.5 & 2\,076 & 33.9 \\
Peak RAM (GByte) & 8.9 & 14.5 & 3.2 & 29.2\vspace{1em}\\
\# aligned reads & 103\,669 & 103\,917 & 103\,711 & 26\,479\\
\# chimeric alignments & 1\,904 & 1\,671 & 0 & 0\\
\# non-spliced alignments & 15\,854 & 14\,483 & 17\,033 & 10\,545\vspace{1em}\\
\# aligned introns & 692\,275 & 694\,237 & 692\,945 & 78\,603 \\
\# novel introns & 11\,239 & 3\,217 & 8\,550 & 1\,214 \\
\% exact introns & 83.8\% & 91.8\% & 87.9\% & 55.2\% \\
\% approx. introns & 91.8\% & 96.5\% & 92.5\% & 82.4\% \\
\botrule
\end{tabular}
}{Mouse reads (AC:SRR5286960) were mapped to the primary assembly of mouse
genome GRCm38 with the following tools and command options: minimap2 (`-ax
splice'); GMAP (`-n 0 --min-intronlength 30 --cross-species'); SpAln (`-Q7 -LS
-S3'); STARlong (according to
\href{http://bit.ly/star-pb}{http://bit.ly/star-pb}). The alignments were
compared to the EnsEMBL gene annotation, release 89. A predicted intron
is \emph{novel} if it has no overlaps with any annotated introns. An intron
is \emph{exact} if it is identical to an annotated intron. An intron is
\emph{approximate} if both its 5'- and 3'-end are within 10bp around the ends
of an annotated intron.}
\end{table}
We next aligned real mouse reads~\citep{Byrne:2017aa} with GMAP~(v2017-06-20;
\citealp{Wu:2005vn}), minimap2, SpAln~(v2.3.1; \citealp{Iwata:2012aa}) and
STAR~(v2.5.3a; \citealp{Dobin:2013kx}). In general, minimap2 is more
consistent with existing annotations (Table~\ref{tab:intron}): it finds
more junctions with a higher percentage being exactly or approximately correct.
Minimap2 is over 40 times faster than GMAP and SpAln. While STAR is close to
minimap2 in speed, it does not work well with noisy reads. We have also
evaluated spliced aligners on public Iso-Seq data (human Alzheimer brain
from \href{http://bit.ly/isoseqpub}{http://bit.ly/isoseqpub}). The observation
is similar: minimap2 is faster at higher junction accuracy.
We noted that GMAP and SpAln have not been optimized for noisy reads. We are
showing the best setting we have experimented, but their developers should be
able to improve their accuracy further.
%\begin{table}[!tb]
%\processtable{Evaluation of junction accuracy on SMRT Iso-Seq reads}
%{\footnotesize
%\begin{tabular}{lrrrr}
%\toprule
%& GMAP & minimap2 & SpAln & STAR\\
%\midrule
%Run time (CPU min) & & 243 & 2\,352 & 1\,647 \\
%\# aligned reads & & 1\,123\,025 & 1\,094\,092 & 682\,452\\
%\# chimeric alignments & & 33\,091 & 0 & 0\\
%\# non-spliced alignments & & 339\,081 & 291\,447 & 272\,536\vspace{1em}\\
%\# aligned introns & & 9\,071\,755 & 9\,208\,564 & 3\,029\,121 \\
%\# novel introns & & 42\,773 & 82\,230 & 17\,791 \\
%\% exact introns & & 94.9\% & 91.7\% & 84.7\% \\
%\% approx. introns&& 96.9\% & 93.4\% & 93.8\% \\
%\botrule
%\end{tabular}
%}{}
%\end{table}
\section{Conclusion}
Minimap2 is a fast, accurate and versatile aligner for long nucleotide
sequences. In addition to reference-based read mapping, minimap2 inherits
minimap's functionality to search against huge multi-species databases and to
find read overlaps. On a few test data sets, minimap2 appears to yield slightly
better miniasm assembly~\citep{Li:2016aa}. Minimap2 can also align similar
genomes or different assemblies of the same species. However, full-genome
alignment is an intricate research topic. More thorough evaluations would be
necessary to justify the use of minimap2 for such applications.
\section*{Acknowledgements}
We owe a debt of gratitude to Hajime Suzuki for releasing his masterpiece and
insightful notes before formal publication. We thank M. Schatz, P. Rescheneder
and F. Sedlazeck for pointing out the limitation of BWA-MEM. We are also
grateful to early minimap2 testers who have greatly helped to suggest features
and to fix various issues.
\bibliography{minimap2}
\end{document}