Merge branch 'master' into short
This commit is contained in:
commit
c4080aaf7e
7
Makefile
7
Makefile
|
|
@ -2,8 +2,8 @@ CC= gcc
|
|||
CFLAGS= -g -Wall -O2 -Wc++-compat
|
||||
CPPFLAGS= -DHAVE_KALLOC
|
||||
INCLUDES= -I.
|
||||
OBJS= kthread.o kalloc.o ksw2_extz2_sse.o ksw2_extd2_sse.o ksw2_ll_sse.o misc.o bseq.o \
|
||||
sketch.o sdust.o index.o chain.o align.o hit.o map.o format.o
|
||||
OBJS= kthread.o kalloc.o ksw2_extz2_sse.o ksw2_extd2_sse.o ksw2_exts2_sse.o ksw2_ll_sse.o \
|
||||
misc.o bseq.o sketch.o sdust.o index.o chain.o align.o hit.o map.o format.o
|
||||
PROG= minimap2
|
||||
PROG_EXTRA= sdust minimap2-lite
|
||||
LIBS= -lm -lz -lpthread
|
||||
|
|
@ -45,11 +45,12 @@ align.o: minimap.h mmpriv.h bseq.h ksw2.h kalloc.h
|
|||
bseq.o: bseq.h kseq.h
|
||||
chain.o: minimap.h mmpriv.h bseq.h kalloc.h
|
||||
example.o: minimap.h kseq.h
|
||||
format.o: mmpriv.h minimap.h bseq.h
|
||||
format.o: kalloc.h mmpriv.h minimap.h bseq.h
|
||||
hit.o: mmpriv.h minimap.h bseq.h kalloc.h
|
||||
index.o: kthread.h bseq.h minimap.h mmpriv.h kvec.h kalloc.h khash.h
|
||||
kalloc.o: kalloc.h
|
||||
ksw2_extd2_sse.o: ksw2.h kalloc.h
|
||||
ksw2_exts2_sse.o: ksw2.h kalloc.h
|
||||
ksw2_extz2_sse.o: ksw2.h kalloc.h
|
||||
ksw2_ll_sse.o: ksw2.h kalloc.h
|
||||
main.o: bseq.h minimap.h mmpriv.h
|
||||
|
|
|
|||
38
NEWS.md
38
NEWS.md
|
|
@ -1,3 +1,41 @@
|
|||
Release 2.1-r311 (25 August 2017)
|
||||
---------------------------------
|
||||
|
||||
This release adds spliced alignment for long noisy RNA-seq reads. On a SMRT
|
||||
Iso-Seq and a Oxford Nanopore data sets, minimap2 appears to outperform
|
||||
traditional mRNA aligners. For DNA alignment, this release gives almost
|
||||
identical output to v2.0. Other changes include:
|
||||
|
||||
* Added option `-R` to set the read group header line in SAM.
|
||||
|
||||
* Optionally output the `cs:Z` tag in PAF to encode both the query and the
|
||||
reference sequences in the alignment.
|
||||
|
||||
* Fixed an issue where DP alignment uses excessive memory.
|
||||
|
||||
The minimap2 technical report has been updated with more details and the
|
||||
evaluation of spliced alignment:
|
||||
|
||||
* Li, H. (2017). Minimap2: fast pairwise alignment for long nucleotide
|
||||
sequences. [arXiv:1708.01492v2](https://arxiv.org/abs/1708.01492v2).
|
||||
|
||||
(2.1: 25 August 2017, r311)
|
||||
|
||||
|
||||
|
||||
Release 2.0-r275 (8 August 2017)
|
||||
--------------------------------
|
||||
|
||||
This release is identical to version 2.0rc1, except the version number. It is
|
||||
described and evaluated in the following technical report:
|
||||
|
||||
* Li, H. (2017). Minimap2: fast pairwise alignment for long DNA sequences.
|
||||
[arXiv:1708.01492v1](https://arxiv.org/abs/1708.01492v1).
|
||||
|
||||
(2.0: 8 August 2017, r275)
|
||||
|
||||
|
||||
|
||||
Release 2.0rc1-r232 (30 July 2017)
|
||||
----------------------------------
|
||||
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
[](https://travis-ci.org/lh3/minimap2)
|
||||
## Getting Started
|
||||
```sh
|
||||
git clone https://github.com/lh3/minimap2
|
||||
|
|
@ -9,6 +10,8 @@ cd minimap2 && make
|
|||
./minimap2 -ax map10k MT-human.mmi test/MT-orang.fa > test.sam
|
||||
# long-read overlap (no test data)
|
||||
./minimap2 -x ava-pb your-reads.fa your-reads.fa > overlaps.paf
|
||||
# spliced alignment (no test data)
|
||||
./minimap2 -ax splice ref.fa rna-seq-reads.fa > spliced.sam
|
||||
# man page
|
||||
man ./minimap2.1
|
||||
```
|
||||
|
|
|
|||
74
align.c
74
align.c
|
|
@ -53,7 +53,7 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c
|
|||
} else if (op == 1) {
|
||||
score -= q + e * len, j += len;
|
||||
if (test_zdrop_aux(score, i, j, &max, &max_i, &max_j, e, zdrop)) return 1;
|
||||
} else if (op == 2) {
|
||||
} else if (op == 2 || op == 3) {
|
||||
score -= q + e * len, i += len;
|
||||
if (test_zdrop_aux(score, i, j, &max, &max_i, &max_j, e, zdrop)) return 1;
|
||||
}
|
||||
|
|
@ -64,11 +64,11 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c
|
|||
static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e)
|
||||
{
|
||||
uint32_t k, l, toff = 0, qoff = 0;
|
||||
int32_t s = 0, max = 0;
|
||||
int32_t s = 0, max = 0, n_gtag = 0, n_ctac = 0;
|
||||
if (p == 0) return;
|
||||
for (k = 0; k < p->n_cigar; ++k) {
|
||||
uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4;
|
||||
if (op == 0) {
|
||||
if (op == 0) { // match/mismatch
|
||||
for (l = 0; l < len; ++l) {
|
||||
int cq = qseq[qoff + l], ct = tseq[toff + l];
|
||||
if (ct > 3 || cq > 3) ++p->n_ambi;
|
||||
|
|
@ -78,7 +78,7 @@ static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *t
|
|||
else max = max > s? max : s;
|
||||
}
|
||||
toff += len, qoff += len, p->blen += len;
|
||||
} else if (op == 1) {
|
||||
} else if (op == 1) { // insertion
|
||||
int n_ambi = 0;
|
||||
for (l = 0; l < len; ++l)
|
||||
if (qseq[qoff + l] > 3) ++n_ambi;
|
||||
|
|
@ -86,7 +86,7 @@ static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *t
|
|||
p->n_ambi += n_ambi, p->n_diff += len - n_ambi;
|
||||
s -= q + e * len;
|
||||
if (s < 0) s = 0;
|
||||
} else if (op == 2) {
|
||||
} else if (op == 2) { // deletion
|
||||
int n_ambi = 0;
|
||||
for (l = 0; l < len; ++l)
|
||||
if (tseq[toff + l] > 3) ++n_ambi;
|
||||
|
|
@ -94,9 +94,18 @@ static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *t
|
|||
p->n_ambi += n_ambi, p->n_diff += len - n_ambi;
|
||||
s -= q + e * len;
|
||||
if (s < 0) s = 0;
|
||||
} else if (op == 3) { // intron
|
||||
uint8_t b[4];
|
||||
b[0] = tseq[toff], b[1] = tseq[toff+1];
|
||||
b[2] = tseq[toff+len-2], b[3] = tseq[toff+len-1];
|
||||
if (memcmp(b, "\2\3\0\2", 4) == 0) ++n_gtag;
|
||||
else if (memcmp(b, "\1\3\0\1", 4) == 0) ++n_ctac;
|
||||
toff += len, p->blen += len;
|
||||
}
|
||||
}
|
||||
p->dp_max = max;
|
||||
if (n_gtag > n_ctac) p->trans_strand = 1;
|
||||
else if (n_gtag < n_ctac) p->trans_strand = 2;
|
||||
}
|
||||
|
||||
static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) // TODO: this calls the libc realloc()
|
||||
|
|
@ -132,7 +141,9 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
|
|||
for (i = 0; i < tlen; ++i) fputc("ACGTN"[tseq[i]], stderr); fputc('\n', stderr);
|
||||
for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr); fputc('\n', stderr);
|
||||
}
|
||||
if (opt->q == opt->q2 && opt->e == opt->e2)
|
||||
if (opt->flag & MM_F_SPLICE)
|
||||
ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, opt->zdrop, flag, ez);
|
||||
else if (opt->q == opt->q2 && opt->e == opt->e2)
|
||||
ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, opt->zdrop, flag, ez);
|
||||
else
|
||||
ksw_extd2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->e2, w, opt->zdrop, flag, ez);
|
||||
|
|
@ -159,8 +170,8 @@ static inline void mm_adjust_minier(const mm_idx_t *mi, uint8_t *const qseq0[2],
|
|||
c = mm_get_hplen_back(mi, a->x<<1>>33, (int32_t)a->x);
|
||||
*r = (int32_t)a->x + 1 - c;
|
||||
} else {
|
||||
*r = (int32_t)a->x + 1;
|
||||
*q = (int32_t)a->y + 1;
|
||||
*r = (int32_t)a->x - (mi->k>>1);
|
||||
*q = (int32_t)a->y - (mi->k>>1);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -240,11 +251,11 @@ static void mm_fix_bad_ends(const mm_reg1_t *r, const mm128_t *a, int bw, int32_
|
|||
}
|
||||
}
|
||||
|
||||
static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], mm_reg1_t *r, mm_reg1_t *r2, mm128_t *a, ksw_extz_t *ez)
|
||||
static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], mm_reg1_t *r, mm_reg1_t *r2, mm128_t *a, ksw_extz_t *ez, int splice_flag)
|
||||
{
|
||||
int32_t rid = a[r->as].x<<1>>33, rev = a[r->as].x>>63, as1, cnt1;
|
||||
uint8_t *tseq, *qseq;
|
||||
int32_t i, l, bw, dropped = 0, rs0, re0, qs0, qe0;
|
||||
int32_t i, l, bw, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0;
|
||||
int32_t rs, re, qs, qe;
|
||||
int32_t rs1, qs1, re1, qe1;
|
||||
int8_t mat[25];
|
||||
|
|
@ -254,11 +265,19 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|||
bw = (int)(opt->bw * 1.5 + 1.);
|
||||
|
||||
r2->cnt = 0;
|
||||
mm_fix_bad_ends(r, a, opt->bw, &as1, &cnt1);
|
||||
if (!(opt->flag & MM_F_SPLICE))
|
||||
mm_fix_bad_ends(r, a, opt->bw, &as1, &cnt1);
|
||||
else as1 = r->as, cnt1 = r->cnt;
|
||||
mm_filter_bad_seeds(km, as1, cnt1, a, 10, 40, opt->max_gap>>1, 10);
|
||||
mm_adjust_minier(mi, qseq0, &a[as1], &rs, &qs);
|
||||
mm_adjust_minier(mi, qseq0, &a[as1 + cnt1 - 1], &re, &qe);
|
||||
|
||||
if (opt->flag & MM_F_SPLICE) {
|
||||
if (splice_flag & MM_F_SPLICE_FOR) extra_flag |= rev? KSW_EZ_SPLICE_REV : KSW_EZ_SPLICE_FOR;
|
||||
if (splice_flag & MM_F_SPLICE_REV) extra_flag |= rev? KSW_EZ_SPLICE_FOR : KSW_EZ_SPLICE_REV;
|
||||
if (splice_flag & MM_F_SPLICE_BOTH) extra_flag |= KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV;
|
||||
}
|
||||
|
||||
// compute rs0 and qs0
|
||||
if (r->split && as1 > 0) {
|
||||
mm_adjust_minier(mi, qseq0, &a[as1-1], &rs0, &qs0);
|
||||
|
|
@ -291,7 +310,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|||
mm_idx_getseq(mi, rid, rs0, rs, tseq);
|
||||
mm_seq_rev(qs - qs0, qseq);
|
||||
mm_seq_rev(rs - rs0, tseq);
|
||||
mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, mat, bw, KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez);
|
||||
mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, mat, bw, extra_flag|KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez);
|
||||
if (ez->n_cigar > 0) {
|
||||
mm_append_cigar(r, ez->n_cigar, ez->cigar);
|
||||
r->p->dp_score += ez->max;
|
||||
|
|
@ -313,9 +332,9 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|||
bw1 = qe - qs > re - rs? qe - qs : re - rs;
|
||||
qseq = &qseq0[rev][qs];
|
||||
mm_idx_getseq(mi, rid, rs, re, tseq);
|
||||
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, KSW_EZ_APPROX_MAX, ez);
|
||||
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag|KSW_EZ_APPROX_MAX, ez);
|
||||
if (mm_check_zdrop(qseq, tseq, ez->n_cigar, ez->cigar, mat, opt->q, opt->e, opt->zdrop))
|
||||
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, 0, ez);
|
||||
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag, ez);
|
||||
if (ez->n_cigar > 0)
|
||||
mm_append_cigar(r, ez->n_cigar, ez->cigar);
|
||||
if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle.
|
||||
|
|
@ -338,7 +357,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|||
if (!dropped && qe < qe0 && re < re0) { // right extension
|
||||
qseq = &qseq0[rev][qe];
|
||||
mm_idx_getseq(mi, rid, re, re0, tseq);
|
||||
mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, mat, bw, KSW_EZ_EXTZ_ONLY, ez);
|
||||
mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, mat, bw, extra_flag|KSW_EZ_EXTZ_ONLY, ez);
|
||||
if (ez->n_cigar > 0) {
|
||||
mm_append_cigar(r, ez->n_cigar, ez->cigar);
|
||||
r->p->dp_score += ez->max;
|
||||
|
|
@ -356,6 +375,8 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|||
if (r->p) {
|
||||
mm_idx_getseq(mi, rid, rs1, re1, tseq);
|
||||
mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e);
|
||||
if (rev && r->p->trans_strand)
|
||||
r->p->trans_strand ^= 3; // flip to the read strand
|
||||
}
|
||||
|
||||
kfree(km, tseq);
|
||||
|
|
@ -438,7 +459,28 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m
|
|||
memset(&ez, 0, sizeof(ksw_extz_t));
|
||||
for (i = 0; i < n_regs; ++i) {
|
||||
mm_reg1_t r2;
|
||||
mm_align1(km, opt, mi, qlen, qseq0, ®s[i], &r2, a, &ez);
|
||||
if ((opt->flag&MM_F_SPLICE) && (opt->flag&MM_F_SPLICE_FOR) && (opt->flag&MM_F_SPLICE_REV)) {
|
||||
mm_reg1_t s[2], s2[2];
|
||||
int which, trans_strand;
|
||||
s[0] = s[1] = regs[i];
|
||||
mm_align1(km, opt, mi, qlen, qseq0, &s[0], &s2[0], a, &ez, MM_F_SPLICE_FOR);
|
||||
mm_align1(km, opt, mi, qlen, qseq0, &s[1], &s2[1], a, &ez, MM_F_SPLICE_REV);
|
||||
if (s[0].p->dp_score > s[1].p->dp_score) which = 0, trans_strand = 1;
|
||||
else if (s[0].p->dp_score < s[1].p->dp_score) which = 1, trans_strand = 2;
|
||||
else trans_strand = 3, which = (qlen + s[0].p->dp_score) & 1; // randomly choose a strand, effectively
|
||||
if (which == 0) {
|
||||
regs[i] = s[0], r2 = s2[0];
|
||||
free(s[1].p);
|
||||
} else {
|
||||
regs[i] = s[1], r2 = s2[1];
|
||||
free(s[0].p);
|
||||
}
|
||||
regs[i].p->trans_strand = trans_strand;
|
||||
} else {
|
||||
mm_align1(km, opt, mi, qlen, qseq0, ®s[i], &r2, a, &ez, opt->flag);
|
||||
if ((opt->flag&MM_F_SPLICE) && !(opt->flag&MM_F_SPLICE_BOTH))
|
||||
regs[i].p->trans_strand = opt->flag&MM_F_SPLICE_FOR? 1 : 2;
|
||||
}
|
||||
if (r2.cnt > 0) regs = mm_insert_reg(&r2, i, &n_regs, regs);
|
||||
if (i > 0 && mm_align1_inv(km, opt, mi, qlen, qseq0, ®s[i-1], ®s[i], &r2, &ez)) {
|
||||
regs = mm_insert_reg(&r2, i, &n_regs, regs);
|
||||
|
|
|
|||
4
bseq.c
4
bseq.c
|
|
@ -8,7 +8,6 @@
|
|||
KSEQ_INIT(gzFile, gzread)
|
||||
|
||||
struct mm_bseq_file_s {
|
||||
int is_eof;
|
||||
gzFile fp;
|
||||
kseq_t *ks;
|
||||
};
|
||||
|
|
@ -53,12 +52,11 @@ mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int chunk_size, int with_qual, int
|
|||
size += seqs[n++].l_seq;
|
||||
if (size >= chunk_size) break;
|
||||
}
|
||||
if (size < chunk_size) fp->is_eof = 1;
|
||||
*n_ = n;
|
||||
return seqs;
|
||||
}
|
||||
|
||||
int mm_bseq_eof(mm_bseq_file_t *fp)
|
||||
{
|
||||
return fp->is_eof;
|
||||
return ks_eof(fp->ks->f);
|
||||
}
|
||||
|
|
|
|||
14
chain.c
14
chain.c
|
|
@ -19,7 +19,7 @@ static inline int ilog2_32(uint32_t v)
|
|||
return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v];
|
||||
}
|
||||
|
||||
int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int64_t n, mm128_t *a, uint64_t **_u, void *km)
|
||||
int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km)
|
||||
{ // TODO: make sure this works when n has more than 32 bits
|
||||
int32_t st = 0, k, *f, *p, *t, *v, n_u, n_v;
|
||||
int64_t i, j;
|
||||
|
|
@ -42,17 +42,23 @@ int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int
|
|||
uint64_t ri = a[i].x;
|
||||
int32_t qi = (int32_t)a[i].y, q_span = a[i].y>>32&0xff; // NB: only 8 bits of span is used!!!
|
||||
int32_t max_f = q_span, max_j = -1, n_skip = 0, min_d, max_f_past = -INT32_MAX;
|
||||
while (st < i && ri - a[st].x > max_dist) ++st;
|
||||
while (st < i && ri - a[st].x > max_dist_x) ++st;
|
||||
for (j = i - 1; j >= st; --j) {
|
||||
int64_t dr = ri - a[j].x;
|
||||
int32_t dq = qi - (int32_t)a[j].y, dd, sc;
|
||||
if (dr == 0 || dq <= 0 || dq > max_dist) continue;
|
||||
if (dr == 0 || dq <= 0 || dq > max_dist_y) continue;
|
||||
dd = dr > dq? dr - dq : dq - dr;
|
||||
if (dd > bw) continue;
|
||||
max_f_past = max_f_past > f[j]? max_f_past : f[j];
|
||||
min_d = dq < dr? dq : dr;
|
||||
sc = min_d > q_span? q_span : dq < dr? dq : dr;
|
||||
sc -= (int)(dd * .01 * avg_qspan) + (ilog2_32(dd)>>1);
|
||||
if (is_cdna) {
|
||||
int c_log, c_lin;
|
||||
c_lin = (int)(dd * .01 * avg_qspan);
|
||||
c_log = ilog2_32(dd);
|
||||
if (dr > dq) sc -= c_lin < c_log? c_lin : c_log;
|
||||
else sc -= c_lin + (c_log>>1);
|
||||
} else sc -= (int)(dd * .01 * avg_qspan) + (ilog2_32(dd)>>1);
|
||||
sc += f[j];
|
||||
if (sc > max_f) {
|
||||
max_f = sc, max_j = j;
|
||||
|
|
|
|||
84
format.c
84
format.c
|
|
@ -6,6 +6,8 @@
|
|||
#include "kalloc.h"
|
||||
#include "mmpriv.h"
|
||||
|
||||
static char mm_rg_id[256];
|
||||
|
||||
static inline void str_enlarge(kstring_t *s, int l)
|
||||
{
|
||||
if (s->l + l + 1 > s->m) {
|
||||
|
|
@ -56,6 +58,70 @@ static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...)
|
|||
s->s[s->l] = 0;
|
||||
}
|
||||
|
||||
static char *mm_escape(char *s)
|
||||
{
|
||||
char *p, *q;
|
||||
for (p = q = s; *p; ++p) {
|
||||
if (*p == '\\') {
|
||||
++p;
|
||||
if (*p == 't') *q++ = '\t';
|
||||
else if (*p == '\\') *q++ = '\\';
|
||||
} else *q++ = *p;
|
||||
}
|
||||
*q = '\0';
|
||||
return s;
|
||||
}
|
||||
|
||||
static void sam_write_rg_line(kstring_t *str, const char *s)
|
||||
{
|
||||
char *p, *q, *r, *rg_line = 0;
|
||||
memset(mm_rg_id, 0, 256);
|
||||
if (s == 0) return;
|
||||
if (strstr(s, "@RG") != s) {
|
||||
if (mm_verbose >= 1) fprintf(stderr, "[ERROR] the read group line is not started with @RG\n");
|
||||
goto err_set_rg;
|
||||
}
|
||||
if (strstr(s, "\t") != NULL) {
|
||||
if (mm_verbose >= 1) fprintf(stderr, "[ERROR] the read group line contained literal <tab> characters -- replace with escaped tabs: \\t\n");
|
||||
goto err_set_rg;
|
||||
}
|
||||
rg_line = strdup(s);
|
||||
mm_escape(rg_line);
|
||||
if ((p = strstr(rg_line, "\tID:")) == 0) {
|
||||
if (mm_verbose >= 1) fprintf(stderr, "[ERROR] no ID within the read group line\n");
|
||||
goto err_set_rg;
|
||||
}
|
||||
p += 4;
|
||||
for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
|
||||
if (q - p + 1 > 256) {
|
||||
if (mm_verbose >= 1) fprintf(stderr, "[ERROR] @RG:ID is longer than 255 characters\n");
|
||||
goto err_set_rg;
|
||||
}
|
||||
for (q = p, r = mm_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
|
||||
*r++ = *q;
|
||||
mm_sprintf_lite(str, "%s\n", rg_line);
|
||||
|
||||
err_set_rg:
|
||||
free(rg_line);
|
||||
}
|
||||
|
||||
void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *argv[])
|
||||
{
|
||||
kstring_t str = {0,0,0};
|
||||
sam_write_rg_line(&str, rg);
|
||||
mm_sprintf_lite(&str, "@PG\tID:minimap2\tPN:minimap2");
|
||||
if (ver) mm_sprintf_lite(&str, "\tVN:%s", ver);
|
||||
if (argc > 1) {
|
||||
int i;
|
||||
mm_sprintf_lite(&str, "\tCL:minimap2");
|
||||
for (i = 1; i < argc; ++i)
|
||||
mm_sprintf_lite(&str, " %s", argv[i]);
|
||||
}
|
||||
mm_sprintf_lite(&str, "\n");
|
||||
fputs(str.s, stdout);
|
||||
free(str.s);
|
||||
}
|
||||
|
||||
static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r)
|
||||
{
|
||||
extern unsigned char seq_nt4_table[256];
|
||||
|
|
@ -119,7 +185,11 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
|
|||
mm_sprintf_lite(s, "\ttp:A:%c\tcm:i:%d\ts1:i:%d", type, r->cnt, r->score);
|
||||
if (r->parent == r->id) mm_sprintf_lite(s, "\ts2:i:%d", r->subsc);
|
||||
if (r->split) mm_sprintf_lite(s, "\tzd:i:%d", r->split);
|
||||
if (r->p) mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi);
|
||||
if (r->p) {
|
||||
mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi);
|
||||
if (r->p->trans_strand == 1 || r->p->trans_strand == 2)
|
||||
mm_sprintf_lite(s, "\tts:A:%c", "?+-?"[r->p->trans_strand]);
|
||||
}
|
||||
}
|
||||
|
||||
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag)
|
||||
|
|
@ -137,7 +207,7 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
|
|||
uint32_t k;
|
||||
mm_sprintf_lite(s, "\tcg:Z:");
|
||||
for (k = 0; k < r->p->n_cigar; ++k)
|
||||
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MID"[r->p->cigar[k]&0xf]);
|
||||
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]);
|
||||
}
|
||||
if (r->p && (opt_flag & MM_F_OUT_CS))
|
||||
write_cs(km, s, mi, t, r);
|
||||
|
|
@ -154,6 +224,13 @@ static char comp_tab[] = {
|
|||
'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127
|
||||
};
|
||||
|
||||
void mm_write_sam_SQ(const mm_idx_t *idx)
|
||||
{
|
||||
uint32_t i;
|
||||
for (i = 0; i < idx->n_seq; ++i)
|
||||
printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len);
|
||||
}
|
||||
|
||||
static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp)
|
||||
{
|
||||
if (rev) {
|
||||
|
|
@ -187,7 +264,7 @@ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
|
|||
int clip_char = (flag&0x800)? 'H' : 'S';
|
||||
if (clip_len) mm_sprintf_lite(s, "%d%c", clip_len, clip_char);
|
||||
for (k = 0; k < r->p->n_cigar; ++k)
|
||||
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MID"[r->p->cigar[k]&0xf]);
|
||||
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]);
|
||||
clip_len = r->rev? r->qs : t->l_seq - r->qe;
|
||||
if (clip_len) mm_sprintf_lite(s, "%d%c", clip_len, clip_char);
|
||||
} else mm_sprintf_lite(s, "*");
|
||||
|
|
@ -206,6 +283,7 @@ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
|
|||
else mm_sprintf_lite(s, "*");
|
||||
}
|
||||
write_tags(s, r);
|
||||
if (mm_rg_id[0]) mm_sprintf_lite(s, "\tRG:Z:%s", mm_rg_id);
|
||||
if (r->parent == r->id && r->p && n_regs > 1 && regs && r >= regs && r - regs < n_regs) { // supplementary aln may exist
|
||||
int i, n_sa = 0; // n_sa: number of SA fields
|
||||
for (i = 0; i < n_regs; ++i)
|
||||
|
|
|
|||
2
index.c
2
index.c
|
|
@ -285,12 +285,12 @@ static void *worker_pipeline(void *shared, int step, void *in)
|
|||
mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int is_hpc, int mini_batch_size, int n_threads, uint64_t batch_size, int keep_name)
|
||||
{
|
||||
pipeline_t pl;
|
||||
if (fp == 0 || mm_bseq_eof(fp)) return 0;
|
||||
memset(&pl, 0, sizeof(pipeline_t));
|
||||
pl.mini_batch_size = mini_batch_size < batch_size? mini_batch_size : batch_size;
|
||||
pl.keep_name = keep_name;
|
||||
pl.batch_size = batch_size;
|
||||
pl.fp = fp;
|
||||
if (pl.fp == 0) return 0;
|
||||
pl.mi = mm_idx_init(w, k, b, is_hpc);
|
||||
|
||||
kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3);
|
||||
|
|
|
|||
11
ksw2.h
11
ksw2.h
|
|
@ -12,6 +12,8 @@
|
|||
#define KSW_EZ_APPROX_DROP 0x10 // approximate Z-drop; faster with sse
|
||||
#define KSW_EZ_EXTZ_ONLY 0x40 // only perform extension
|
||||
#define KSW_EZ_REV_CIGAR 0x80 // reverse CIGAR in the output
|
||||
#define KSW_EZ_SPLICE_FOR 0x100
|
||||
#define KSW_EZ_SPLICE_REV 0x200
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
|
@ -53,6 +55,9 @@ void ksw_extd(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t
|
|||
void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
|
||||
int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int flag, ksw_extz_t *ez);
|
||||
|
||||
void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
|
||||
int8_t gapo, int8_t gape, int8_t gapo2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez);
|
||||
|
||||
void ksw_extf2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez);
|
||||
|
||||
/**
|
||||
|
|
@ -106,7 +111,8 @@ static inline uint32_t *ksw_push_cigar(void *km, int *n_cigar, int *m_cigar, uin
|
|||
// bit 0-2: which type gets the max - 0 for H, 1 for E, 2 for F, 3 for \tilde{E} and 4 for \tilde{F}
|
||||
// bit 3/0x08: 1 if a continuation on the E state (bit 5/0x20 for a continuation on \tilde{E})
|
||||
// bit 4/0x10: 1 if a continuation on the F state (bit 6/0x40 for a continuation on \tilde{F})
|
||||
static inline void ksw_backtrack(void *km, int is_rot, int is_rev, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0, int *m_cigar_, int *n_cigar_, uint32_t **cigar_)
|
||||
static inline void ksw_backtrack(void *km, int is_rot, int is_rev, int with_N, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0,
|
||||
int *m_cigar_, int *n_cigar_, uint32_t **cigar_)
|
||||
{ // p[] - lower 3 bits: which type gets the max; bit
|
||||
int n_cigar = 0, m_cigar = *m_cigar_, i = i0, j = j0, r, state = 0;
|
||||
uint32_t *cigar = *cigar_, tmp;
|
||||
|
|
@ -127,7 +133,8 @@ static inline void ksw_backtrack(void *km, int is_rot, int is_rev, const uint8_t
|
|||
if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure
|
||||
if (force_state >= 0) state = force_state;
|
||||
if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match
|
||||
else if (state == 1 || state == 3) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion
|
||||
else if (state == 1 || (state == 3 && !with_N)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion
|
||||
else if (state == 3 && with_N) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 3, 1), --i; // intron
|
||||
else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion
|
||||
}
|
||||
if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, i + 1); // first deletion
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
#include "ksw2.h"
|
||||
|
||||
#ifdef __SSE2__
|
||||
|
|
@ -66,7 +67,8 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
if (w < 0) w = tlen > qlen? tlen : qlen;
|
||||
wl = wr = w;
|
||||
tlen_ = (tlen + 15) / 16;
|
||||
n_col_ = ((w + 1 < tlen? (w + 1 < qlen? w + 1 : qlen): tlen) + 15) / 16 + 1;
|
||||
n_col_ = qlen < tlen? qlen : tlen;
|
||||
n_col_ = ((n_col_ < w + 1? n_col_ : w + 1) + 15) / 16 + 1;
|
||||
qlen_ = (qlen + 15) / 16;
|
||||
for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) {
|
||||
max_sc = max_sc > mat[t]? max_sc : mat[t];
|
||||
|
|
@ -161,6 +163,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
x21_ = _mm_cvtsi32_si128((uint8_t)x21);
|
||||
v1_ = _mm_cvtsi32_si128((uint8_t)v1);
|
||||
st_ = st / 16, en_ = en / 16;
|
||||
assert(en_ - st_ + 1 <= n_col_);
|
||||
if (!with_cigar) { // score only
|
||||
for (t = st_; t <= en_; ++t) {
|
||||
__m128i z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp;
|
||||
|
|
@ -362,9 +365,9 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
if (with_cigar) { // backtrack
|
||||
int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
|
||||
if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY))
|
||||
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
else if (ez->max_t >= 0 && ez->max_q >= 0)
|
||||
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
kfree(km, mem2); kfree(km, off);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,357 @@
|
|||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
#include "ksw2.h"
|
||||
|
||||
#ifdef __SSE2__
|
||||
#include <emmintrin.h>
|
||||
|
||||
#ifdef __SSE4_1__
|
||||
#include <smmintrin.h>
|
||||
#endif
|
||||
|
||||
void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
|
||||
int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez)
|
||||
{
|
||||
#define __dp_code_block1 \
|
||||
z = _mm_load_si128(&s[t]); \
|
||||
xt1 = _mm_load_si128(&x[t]); /* xt1 <- x[r-1][t..t+15] */ \
|
||||
tmp = _mm_srli_si128(xt1, 15); /* tmp <- x[r-1][t+15] */ \
|
||||
xt1 = _mm_or_si128(_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \
|
||||
x1_ = tmp; \
|
||||
vt1 = _mm_load_si128(&v[t]); /* vt1 <- v[r-1][t..t+15] */ \
|
||||
tmp = _mm_srli_si128(vt1, 15); /* tmp <- v[r-1][t+15] */ \
|
||||
vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \
|
||||
v1_ = tmp; \
|
||||
a = _mm_add_epi8(xt1, vt1); /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \
|
||||
ut = _mm_load_si128(&u[t]); /* ut <- u[t..t+15] */ \
|
||||
b = _mm_add_epi8(_mm_load_si128(&y[t]), ut); /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */ \
|
||||
x2t1= _mm_load_si128(&x2[t]); \
|
||||
tmp = _mm_srli_si128(x2t1, 15); \
|
||||
x2t1= _mm_or_si128(_mm_slli_si128(x2t1, 1), x21_); \
|
||||
x21_= tmp; \
|
||||
a2 = _mm_add_epi8(x2t1, vt1); \
|
||||
a2a = _mm_add_epi8(a2, _mm_load_si128(&acceptor[t]));
|
||||
|
||||
#define __dp_code_block2 \
|
||||
_mm_store_si128(&u[t], _mm_sub_epi8(z, vt1)); /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \
|
||||
_mm_store_si128(&v[t], _mm_sub_epi8(z, ut)); /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \
|
||||
tmp = _mm_sub_epi8(z, q_); \
|
||||
a = _mm_sub_epi8(a, tmp); \
|
||||
b = _mm_sub_epi8(b, tmp); \
|
||||
a2= _mm_sub_epi8(a2, _mm_sub_epi8(z, q2_));
|
||||
|
||||
int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, max_sc, min_sc, long_thres, long_diff;
|
||||
int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX);
|
||||
int32_t *H = 0, H0 = 0, last_H0_t = 0;
|
||||
uint8_t *qr, *sf, *mem, *mem2 = 0;
|
||||
__m128i q_, q2_, qe_, zero_, sc_mch_, sc_mis_, m1_;
|
||||
__m128i *u, *v, *x, *y, *x2, *s, *p = 0, *donor, *acceptor;
|
||||
|
||||
ksw_reset_extz(ez);
|
||||
if (m <= 1 || qlen <= 0 || tlen <= 0 || q2 <= q + e) return;
|
||||
|
||||
zero_ = _mm_set1_epi8(0);
|
||||
q_ = _mm_set1_epi8(q);
|
||||
q2_ = _mm_set1_epi8(q2);
|
||||
qe_ = _mm_set1_epi8(q + e);
|
||||
sc_mch_ = _mm_set1_epi8(mat[0]);
|
||||
sc_mis_ = _mm_set1_epi8(mat[1]);
|
||||
m1_ = _mm_set1_epi8(m - 1); // wildcard
|
||||
|
||||
tlen_ = (tlen + 15) / 16;
|
||||
n_col_ = ((qlen < tlen? qlen : tlen) + 15) / 16 + 1;
|
||||
qlen_ = (qlen + 15) / 16;
|
||||
for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) {
|
||||
max_sc = max_sc > mat[t]? max_sc : mat[t];
|
||||
min_sc = min_sc < mat[t]? min_sc : mat[t];
|
||||
}
|
||||
if (-min_sc > 2 * (q + e)) return; // otherwise, we won't see any mismatches
|
||||
|
||||
long_thres = (q2 - q) / e - 1;
|
||||
if (q2 > q + e + long_thres * e)
|
||||
++long_thres;
|
||||
long_diff = long_thres * e - (q2 - q);
|
||||
|
||||
mem = (uint8_t*)kcalloc(km, tlen_ * 9 + qlen_ + 1, 16);
|
||||
u = (__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned
|
||||
v = u + tlen_, x = v + tlen_, y = x + tlen_, x2 = y + tlen_;
|
||||
donor = x2 + tlen_, acceptor = donor + tlen_;
|
||||
s = acceptor + tlen_, sf = (uint8_t*)(s + tlen_), qr = sf + tlen_ * 16;
|
||||
memset(u, -q - e, tlen_ * 16 * 4); // this set u, v, x, y (because they are in the same array)
|
||||
memset(x2, -q2, tlen_ * 16);
|
||||
if (!approx_max) {
|
||||
H = (int32_t*)kmalloc(km, tlen_ * 16 * 4);
|
||||
for (t = 0; t < tlen_ * 16; ++t) H[t] = KSW_NEG_INF;
|
||||
}
|
||||
if (with_cigar) {
|
||||
mem2 = (uint8_t*)kmalloc(km, ((qlen + tlen - 1) * n_col_ + 1) * 16);
|
||||
p = (__m128i*)(((size_t)mem2 + 15) >> 4 << 4);
|
||||
off = (int*)kmalloc(km, (qlen + tlen - 1) * sizeof(int) * 2);
|
||||
off_end = off + qlen + tlen - 1;
|
||||
}
|
||||
|
||||
for (t = 0; t < qlen; ++t) qr[t] = query[qlen - 1 - t];
|
||||
memcpy(sf, target, tlen);
|
||||
|
||||
// set the donor and acceptor arrays. TODO: this assumes 0/1/2/3 encoding!
|
||||
if (flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) {
|
||||
memset(donor, -noncan, tlen_ * 16);
|
||||
for (t = 0; t < tlen - 2; ++t) {
|
||||
int is_can = 0; // is a canonical site
|
||||
if ((flag & KSW_EZ_SPLICE_FOR) && target[t+1] == 2 && target[t+2] == 3) is_can = 1;
|
||||
if ((flag & KSW_EZ_SPLICE_REV) && target[t+1] == 1 && target[t+2] == 3) is_can = 1;
|
||||
if (is_can) ((int8_t*)donor)[t] = 0;
|
||||
}
|
||||
memset(acceptor, -noncan, tlen_ * 16);
|
||||
for (t = 2; t < tlen; ++t) {
|
||||
int is_can = 0;
|
||||
if ((flag & KSW_EZ_SPLICE_FOR) && target[t-1] == 0 && target[t] == 2) is_can = 1;
|
||||
if ((flag & KSW_EZ_SPLICE_REV) && target[t-1] == 0 && target[t] == 1) is_can = 1;
|
||||
if (is_can) ((int8_t*)acceptor)[t] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) {
|
||||
int st = 0, en = tlen - 1, st0, en0, st_, en_;
|
||||
int8_t x1, x21, v1, *u8 = (int8_t*)u, *v8 = (int8_t*)v;
|
||||
uint8_t *qrr = qr + (qlen - 1 - r);
|
||||
__m128i x1_, x21_, v1_;
|
||||
// find the boundaries
|
||||
if (st < r - qlen + 1) st = r - qlen + 1;
|
||||
if (en > r) en = r;
|
||||
st0 = st, en0 = en;
|
||||
st = st / 16 * 16, en = (en + 16) / 16 * 16 - 1;
|
||||
// set boundary conditions
|
||||
if (st > 0) {
|
||||
if (st - 1 >= last_st && st - 1 <= last_en)
|
||||
x1 = ((int8_t*)x)[st - 1], x21 = ((int8_t*)x2)[st - 1], v1 = v8[st - 1]; // (r-1,s-1) calculated in the last round
|
||||
else x1 = -q - e, x21 = -q2, v1 = -q - e;
|
||||
} else {
|
||||
x1 = -q - e, x21 = -q2;
|
||||
v1 = r == 0? -q - e : r < long_thres? -e : r == long_thres? long_diff : 0;
|
||||
}
|
||||
if (en >= r) {
|
||||
((int8_t*)y)[r] = -q - e;
|
||||
u8[r] = r == 0? -q - e : r < long_thres? -e : r == long_thres? long_diff : 0;
|
||||
}
|
||||
// loop fission: set scores first
|
||||
if (!(flag & KSW_EZ_GENERIC_SC)) {
|
||||
for (t = st0; t <= en0; t += 16) {
|
||||
__m128i sq, st, tmp, mask;
|
||||
sq = _mm_loadu_si128((__m128i*)&sf[t]);
|
||||
st = _mm_loadu_si128((__m128i*)&qrr[t]);
|
||||
mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
|
||||
tmp = _mm_cmpeq_epi8(sq, st);
|
||||
#ifdef __SSE4_1__
|
||||
tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
|
||||
#else
|
||||
tmp = _mm_or_si128(_mm_andnot_si128(tmp, sc_mis_), _mm_and_si128(tmp, sc_mch_));
|
||||
#endif
|
||||
tmp = _mm_andnot_si128(mask, tmp);
|
||||
_mm_storeu_si128((__m128i*)((int8_t*)s + t), tmp);
|
||||
}
|
||||
} else {
|
||||
for (t = st0; t <= en0; ++t)
|
||||
((uint8_t*)s)[t] = mat[sf[t] * m + qrr[t]];
|
||||
}
|
||||
// core loop
|
||||
x1_ = _mm_cvtsi32_si128((uint8_t)x1);
|
||||
x21_ = _mm_cvtsi32_si128((uint8_t)x21);
|
||||
v1_ = _mm_cvtsi32_si128((uint8_t)v1);
|
||||
st_ = st / 16, en_ = en / 16;
|
||||
assert(en_ - st_ + 1 <= n_col_);
|
||||
if (!with_cigar) { // score only
|
||||
for (t = st_; t <= en_; ++t) {
|
||||
__m128i z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp;
|
||||
__dp_code_block1;
|
||||
#ifdef __SSE4_1__
|
||||
z = _mm_max_epi8(z, a);
|
||||
z = _mm_max_epi8(z, b);
|
||||
z = _mm_max_epi8(z, a2a);
|
||||
__dp_code_block2; // save u[] and v[]; update a, b and a2
|
||||
_mm_store_si128(&x[t], _mm_sub_epi8(_mm_max_epi8(a, zero_), qe_));
|
||||
_mm_store_si128(&y[t], _mm_sub_epi8(_mm_max_epi8(b, zero_), qe_));
|
||||
tmp = _mm_load_si128(&donor[t]);
|
||||
_mm_store_si128(&x2[t], _mm_sub_epi8(_mm_max_epi8(a2, tmp), q2_));
|
||||
#else
|
||||
tmp = _mm_cmpgt_epi8(a, z);
|
||||
z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
|
||||
tmp = _mm_cmpgt_epi8(b, z);
|
||||
z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
|
||||
tmp = _mm_cmpgt_epi8(a2a, z);
|
||||
z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2a));
|
||||
__dp_code_block2;
|
||||
tmp = _mm_cmpgt_epi8(a, zero_);
|
||||
_mm_store_si128(&x[t], _mm_sub_epi8(_mm_and_si128(tmp, a), qe_));
|
||||
tmp = _mm_cmpgt_epi8(b, zero_);
|
||||
_mm_store_si128(&y[t], _mm_sub_epi8(_mm_and_si128(tmp, b), qe_));
|
||||
tmp = _mm_load_si128(&donor[t]); // TODO: check if this is correct
|
||||
tmp = _mm_cmpgt_epi8(a2, tmp);
|
||||
tmp = _mm_or_si128(_mm_andnot_si128(tmp, tmp), _mm_and_si128(tmp, a2));
|
||||
_mm_store_si128(&x2[t], _mm_sub_epi8(tmp, q2_));
|
||||
#endif
|
||||
}
|
||||
} else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
|
||||
__m128i *pr = p + r * n_col_ - st_;
|
||||
off[r] = st, off_end[r] = en;
|
||||
for (t = st_; t <= en_; ++t) {
|
||||
__m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2;
|
||||
__dp_code_block1;
|
||||
#ifdef __SSE4_1__
|
||||
d = _mm_and_si128(_mm_cmpgt_epi8(a, z), _mm_set1_epi8(1)); // d = a > z? 1 : 0
|
||||
z = _mm_max_epi8(z, a);
|
||||
d = _mm_blendv_epi8(d, _mm_set1_epi8(2), _mm_cmpgt_epi8(b, z)); // d = b > z? 2 : d
|
||||
z = _mm_max_epi8(z, b);
|
||||
d = _mm_blendv_epi8(d, _mm_set1_epi8(3), _mm_cmpgt_epi8(a2a, z)); // d = a2 > z? 3 : d
|
||||
z = _mm_max_epi8(z, a2a);
|
||||
#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
|
||||
tmp = _mm_cmpgt_epi8(a, z);
|
||||
d = _mm_and_si128(tmp, _mm_set1_epi8(1));
|
||||
z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
|
||||
tmp = _mm_cmpgt_epi8(b, z);
|
||||
d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(2)));
|
||||
z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
|
||||
tmp = _mm_cmpgt_epi8(a2a, z);
|
||||
d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(3)));
|
||||
z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2a));
|
||||
#endif
|
||||
__dp_code_block2;
|
||||
tmp = _mm_cmpgt_epi8(a, zero_);
|
||||
_mm_store_si128(&x[t], _mm_sub_epi8(_mm_and_si128(tmp, a), qe_));
|
||||
d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0
|
||||
tmp = _mm_cmpgt_epi8(b, zero_);
|
||||
_mm_store_si128(&y[t], _mm_sub_epi8(_mm_and_si128(tmp, b), qe_));
|
||||
d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0
|
||||
|
||||
tmp2 = _mm_load_si128(&donor[t]);
|
||||
tmp = _mm_cmpgt_epi8(a2, tmp2);
|
||||
#ifdef __SSE4_1__
|
||||
tmp2 = _mm_max_epi8(a2, tmp2);
|
||||
#else
|
||||
tmp2 = _mm_or_si128(_mm_andnot_si128(tmp, tmp2), _mm_and_si128(tmp, a2));
|
||||
#endif
|
||||
_mm_store_si128(&x2[t], _mm_sub_epi8(tmp2, q2_));
|
||||
d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x20)));
|
||||
_mm_store_si128(&pr[t], d);
|
||||
}
|
||||
} else { // gap right-alignment
|
||||
__m128i *pr = p + r * n_col_ - st_;
|
||||
off[r] = st, off_end[r] = en;
|
||||
for (t = st_; t <= en_; ++t) {
|
||||
__m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2;
|
||||
__dp_code_block1;
|
||||
#ifdef __SSE4_1__
|
||||
d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), _mm_set1_epi8(1)); // d = z > a? 0 : 1
|
||||
z = _mm_max_epi8(z, a);
|
||||
d = _mm_blendv_epi8(_mm_set1_epi8(2), d, _mm_cmpgt_epi8(z, b)); // d = z > b? d : 2
|
||||
z = _mm_max_epi8(z, b);
|
||||
d = _mm_blendv_epi8(_mm_set1_epi8(3), d, _mm_cmpgt_epi8(z, a2a)); // d = z > a2? d : 3
|
||||
z = _mm_max_epi8(z, a2a);
|
||||
#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
|
||||
tmp = _mm_cmpgt_epi8(z, a);
|
||||
d = _mm_andnot_si128(tmp, _mm_set1_epi8(1));
|
||||
z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a));
|
||||
tmp = _mm_cmpgt_epi8(z, b);
|
||||
d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(2)));
|
||||
z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b));
|
||||
tmp = _mm_cmpgt_epi8(z, a2a);
|
||||
d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(3)));
|
||||
z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a2a));
|
||||
#endif
|
||||
__dp_code_block2;
|
||||
tmp = _mm_cmpgt_epi8(zero_, a);
|
||||
_mm_store_si128(&x[t], _mm_sub_epi8(_mm_andnot_si128(tmp, a), qe_));
|
||||
d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0
|
||||
tmp = _mm_cmpgt_epi8(zero_, b);
|
||||
_mm_store_si128(&y[t], _mm_sub_epi8(_mm_andnot_si128(tmp, b), qe_));
|
||||
d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0
|
||||
|
||||
tmp2 = _mm_load_si128(&donor[t]);
|
||||
tmp = _mm_cmpgt_epi8(tmp2, a2);
|
||||
#ifdef __SSE4_1__
|
||||
tmp2 = _mm_max_epi8(tmp2, a2);
|
||||
#else
|
||||
tmp2 = _mm_or_si128(_mm_andnot_si128(tmp, a2), _mm_and_si128(tmp, tmp2));
|
||||
#endif
|
||||
_mm_store_si128(&x2[t], _mm_sub_epi8(tmp2, q2_));
|
||||
d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0
|
||||
_mm_store_si128(&pr[t], d);
|
||||
}
|
||||
}
|
||||
if (!approx_max) { // find the exact max with a 32-bit score array
|
||||
int32_t max_H, max_t;
|
||||
// compute H[], max_H and max_t
|
||||
if (r > 0) {
|
||||
int32_t HH[4], tt[4], en1 = st0 + (en0 - st0) / 4 * 4, i;
|
||||
__m128i max_H_, max_t_;
|
||||
max_H = H[en0] = en0 > 0? H[en0-1] + u8[en0] : H[en0] + v8[en0]; // special casing the last element
|
||||
max_t = en0;
|
||||
max_H_ = _mm_set1_epi32(max_H);
|
||||
max_t_ = _mm_set1_epi32(max_t);
|
||||
for (t = st0; t < en1; t += 4) { // this implements: H[t]+=v8[t]-qe; if(H[t]>max_H) max_H=H[t],max_t=t;
|
||||
__m128i H1, tmp, t_;
|
||||
H1 = _mm_loadu_si128((__m128i*)&H[t]);
|
||||
t_ = _mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]);
|
||||
H1 = _mm_add_epi32(H1, t_);
|
||||
_mm_storeu_si128((__m128i*)&H[t], H1);
|
||||
t_ = _mm_set1_epi32(t);
|
||||
tmp = _mm_cmpgt_epi32(H1, max_H_);
|
||||
#ifdef __SSE4_1__
|
||||
max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
|
||||
max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
|
||||
#else
|
||||
max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
|
||||
max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
|
||||
#endif
|
||||
}
|
||||
_mm_storeu_si128((__m128i*)HH, max_H_);
|
||||
_mm_storeu_si128((__m128i*)tt, max_t_);
|
||||
for (i = 0; i < 4; ++i)
|
||||
if (max_H < HH[i]) max_H = HH[i], max_t = tt[i] + i;
|
||||
for (; t < en0; ++t) { // for the rest of values that haven't been computed with SSE
|
||||
H[t] += (int32_t)v8[t];
|
||||
if (H[t] > max_H)
|
||||
max_H = H[t], max_t = t;
|
||||
}
|
||||
} else H[0] = v8[0] - qe, max_H = H[0], max_t = 0; // special casing r==0
|
||||
// update ez
|
||||
if (en0 == tlen - 1 && H[en0] > ez->mte)
|
||||
ez->mte = H[en0], ez->mte_q = r - en;
|
||||
if (r - st0 == qlen - 1 && H[st0] > ez->mqe)
|
||||
ez->mqe = H[st0], ez->mqe_t = st0;
|
||||
if (ksw_apply_zdrop(ez, 1, max_H, r, max_t, zdrop, 0)) break;
|
||||
if (r == qlen + tlen - 2 && en0 == tlen - 1)
|
||||
ez->score = H[tlen - 1];
|
||||
} else { // find approximate max; Z-drop might be inaccurate, too.
|
||||
if (r > 0) {
|
||||
if (last_H0_t >= st0 && last_H0_t <= en0 && last_H0_t + 1 >= st0 && last_H0_t + 1 <= en0) {
|
||||
int32_t d0 = v8[last_H0_t];
|
||||
int32_t d1 = u8[last_H0_t + 1];
|
||||
if (d0 > d1) H0 += d0;
|
||||
else H0 += d1, ++last_H0_t;
|
||||
} else if (last_H0_t >= st0 && last_H0_t <= en0) {
|
||||
H0 += v8[last_H0_t];
|
||||
} else {
|
||||
++last_H0_t, H0 += u8[last_H0_t];
|
||||
}
|
||||
} else H0 = v8[0] - qe, last_H0_t = 0;
|
||||
if ((flag & KSW_EZ_APPROX_DROP) && ksw_apply_zdrop(ez, 1, H0, r, last_H0_t, zdrop, 0)) break;
|
||||
if (r == qlen + tlen - 2 && en0 == tlen - 1)
|
||||
ez->score = H0;
|
||||
}
|
||||
last_st = st, last_en = en;
|
||||
//for (t = st0; t <= en0; ++t) printf("(%d,%d)\t(%d,%d,%d,%d)\t%d\n", r, t, ((int8_t*)u)[t], ((int8_t*)v)[t], ((int8_t*)x)[t], ((int8_t*)y)[t], H[t]); // for debugging
|
||||
}
|
||||
kfree(km, mem);
|
||||
if (!approx_max) kfree(km, H);
|
||||
if (with_cigar) { // backtrack
|
||||
int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
|
||||
if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY))
|
||||
ksw_backtrack(km, 1, rev_cigar, 1, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
else if (ez->max_t >= 0 && ez->max_q >= 0)
|
||||
ksw_backtrack(km, 1, rev_cigar, 1, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
kfree(km, mem2); kfree(km, off);
|
||||
}
|
||||
}
|
||||
#endif // __SSE2__
|
||||
|
|
@ -1,4 +1,5 @@
|
|||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include "ksw2.h"
|
||||
|
||||
#ifdef __SSE2__
|
||||
|
|
@ -58,7 +59,8 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
if (w < 0) w = tlen > qlen? tlen : qlen;
|
||||
wl = wr = w;
|
||||
tlen_ = (tlen + 15) / 16;
|
||||
n_col_ = ((w + 1 < tlen? (w + 1 < qlen? w + 1 : qlen): tlen) + 15) / 16 + 1;
|
||||
n_col_ = qlen < tlen? qlen : tlen;
|
||||
n_col_ = ((n_col_ < w + 1? n_col_ : w + 1) + 15) / 16 + 1;
|
||||
qlen_ = (qlen + 15) / 16;
|
||||
for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) {
|
||||
max_sc = max_sc > mat[t]? max_sc : mat[t];
|
||||
|
|
@ -130,6 +132,7 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
x1_ = _mm_cvtsi32_si128(x1);
|
||||
v1_ = _mm_cvtsi32_si128(v1);
|
||||
st_ = st / 16, en_ = en / 16;
|
||||
assert(en_ - st_ + 1 <= n_col_);
|
||||
if (!with_cigar) { // score only
|
||||
for (t = st_; t <= en_; ++t) {
|
||||
__m128i z, a, b, xt1, vt1, ut, tmp;
|
||||
|
|
@ -275,9 +278,9 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
if (with_cigar) { // backtrack
|
||||
int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
|
||||
if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY))
|
||||
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
else if (ez->max_t >= 0 && ez->max_q >= 0)
|
||||
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
kfree(km, mem2); kfree(km, off);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
178
main.c
178
main.c
|
|
@ -8,7 +8,7 @@
|
|||
#include "minimap.h"
|
||||
#include "mmpriv.h"
|
||||
|
||||
#define MM_VERSION "2.0rc1-r271-dirty"
|
||||
#define MM_VERSION "2.1-r311"
|
||||
|
||||
void liftrlimit()
|
||||
{
|
||||
|
|
@ -31,6 +31,11 @@ static struct option long_options[] = {
|
|||
{ "max-chain-skip", required_argument, 0, 0 },
|
||||
{ "min-dp-len", required_argument, 0, 0 },
|
||||
{ "print-aln-seq", no_argument, 0, 0 },
|
||||
{ "splice", no_argument, 0, 0 },
|
||||
{ "cost-non-gt-ag", required_argument, 0, 0 },
|
||||
{ "no-sam-sq", no_argument, 0, 0 },
|
||||
{ "help", no_argument, 0, 'h' },
|
||||
{ "max-intron-len", required_argument, 0, 'G' },
|
||||
{ "version", no_argument, 0, 'V' },
|
||||
{ "min-count", required_argument, 0, 'n' },
|
||||
{ "min-chain-score",required_argument, 0, 'm' },
|
||||
|
|
@ -40,30 +45,42 @@ static struct option long_options[] = {
|
|||
{ 0, 0, 0, 0}
|
||||
};
|
||||
|
||||
static inline int64_t mm_parse_num(const char *str)
|
||||
{
|
||||
double x;
|
||||
char *p;
|
||||
x = strtod(optarg, &p);
|
||||
if (*p == 'G' || *p == 'g') x *= 1e9;
|
||||
else if (*p == 'M' || *p == 'm') x *= 1e6;
|
||||
else if (*p == 'K' || *p == 'k') x *= 1e3;
|
||||
return (int64_t)(x + .499);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
mm_mapopt_t opt;
|
||||
int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0;
|
||||
int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0, max_intron_len = 0, n_idx_part = 0;
|
||||
int minibatch_size = 200000000;
|
||||
uint64_t batch_size = 4000000000ULL;
|
||||
mm_bseq_file_t *fp = 0;
|
||||
char *fnw = 0, *s;
|
||||
FILE *fpr = 0, *fpw = 0;
|
||||
char *fnw = 0, *rg = 0, *s;
|
||||
FILE *fpr = 0, *fpw = 0, *fp_help = stderr;
|
||||
|
||||
liftrlimit();
|
||||
mm_realtime0 = realtime();
|
||||
mm_mapopt_init(&opt);
|
||||
|
||||
while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Q", long_options, &long_idx)) >= 0) {
|
||||
while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:h", long_options, &long_idx)) >= 0) {
|
||||
if (c == 'w') w = atoi(optarg), idx_par_set = 1;
|
||||
else if (c == 'k') k = atoi(optarg), idx_par_set = 1;
|
||||
else if (c == 'H') is_hpc = 1, idx_par_set = 1;
|
||||
else if (c == 'd') fnw = optarg; // the above are indexing related options, except -I
|
||||
else if (c == 'r') opt.bw = atoi(optarg);
|
||||
else if (c == 'r') opt.bw = (int)mm_parse_num(optarg);
|
||||
else if (c == 'f') opt.mid_occ_frac = atof(optarg);
|
||||
else if (c == 't') n_threads = atoi(optarg);
|
||||
else if (c == 'v') mm_verbose = atoi(optarg);
|
||||
else if (c == 'g') opt.max_gap = atoi(optarg);
|
||||
else if (c == 'g') opt.max_gap = (int)mm_parse_num(optarg);
|
||||
else if (c == 'G') max_intron_len = (int)mm_parse_num(optarg);
|
||||
else if (c == 'N') opt.best_n = atoi(optarg);
|
||||
else if (c == 'p') opt.pri_ratio = atof(optarg);
|
||||
else if (c == 'M') opt.mask_level = atof(optarg);
|
||||
|
|
@ -79,6 +96,10 @@ int main(int argc, char *argv[])
|
|||
else if (c == 'B') opt.b = atoi(optarg);
|
||||
else if (c == 'z') opt.zdrop = atoi(optarg);
|
||||
else if (c == 's') opt.min_dp_max = atoi(optarg);
|
||||
else if (c == 'I') batch_size = mm_parse_num(optarg);
|
||||
else if (c == 'K') minibatch_size = (int)mm_parse_num(optarg);
|
||||
else if (c == 'R') rg = optarg;
|
||||
else if (c == 'h') fp_help = stdout;
|
||||
else if (c == 0 && long_idx == 0) bucket_bits = atoi(optarg); // --bucket-bits
|
||||
else if (c == 0 && long_idx == 2) keep_name = 0; // --int-rname
|
||||
else if (c == 0 && long_idx == 3) mm_dbg_flag |= MM_DBG_NO_KALLOC; // --no-kalloc
|
||||
|
|
@ -88,24 +109,28 @@ int main(int argc, char *argv[])
|
|||
else if (c == 0 && long_idx == 7) opt.max_chain_skip = atoi(optarg); // --max-chain-skip
|
||||
else if (c == 0 && long_idx == 8) opt.min_ksw_len = atoi(optarg); // --min-dp-len
|
||||
else if (c == 0 && long_idx == 9) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ; // --print-aln-seq
|
||||
else if (c == 0 && long_idx ==10) opt.flag |= MM_F_SPLICE; // --splice
|
||||
else if (c == 0 && long_idx ==11) opt.noncan = atoi(optarg); // --cost-non-gt-ag
|
||||
else if (c == 0 && long_idx ==12) opt.flag |= MM_F_NO_SAM_SQ; // --no-sam-sq
|
||||
else if (c == 'V') {
|
||||
puts(MM_VERSION);
|
||||
return 0;
|
||||
} else if (c == 'u') {
|
||||
if (*optarg == 'b') opt.flag |= MM_F_SPLICE_FOR|MM_F_SPLICE_REV;
|
||||
else if (*optarg == 'B') opt.flag |= MM_F_SPLICE_BOTH;
|
||||
else if (*optarg == 'f') opt.flag |= MM_F_SPLICE_FOR, opt.flag &= ~MM_F_SPLICE_REV;
|
||||
else if (*optarg == 'r') opt.flag |= MM_F_SPLICE_REV, opt.flag &= ~MM_F_SPLICE_FOR;
|
||||
else if (*optarg == 'n') opt.flag &= ~(MM_F_SPLICE_FOR|MM_F_SPLICE_REV);
|
||||
else {
|
||||
fprintf(stderr, "[E::%s] unrecognized cDNA direction\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
} else if (c == 'O') {
|
||||
opt.q = opt.q2 = strtol(optarg, &s, 10);
|
||||
if (*s == ',') opt.q2 = strtol(s + 1, &s, 10);
|
||||
} else if (c == 'E') {
|
||||
opt.e = opt.e2 = strtol(optarg, &s, 10);
|
||||
if (*s == ',') opt.e2 = strtol(s + 1, &s, 10);
|
||||
} else if (c == 'I' || c == 'K') {
|
||||
double x;
|
||||
char *p;
|
||||
x = strtod(optarg, &p);
|
||||
if (*p == 'G' || *p == 'g') x *= 1e9;
|
||||
else if (*p == 'M' || *p == 'm') x *= 1e6;
|
||||
else if (*p == 'K' || *p == 'k') x *= 1e3;
|
||||
if (c == 'I') batch_size = (uint64_t)(x + .499);
|
||||
else minibatch_size = (uint64_t)(x + .499);
|
||||
} else if (c == 'x') {
|
||||
if (strcmp(optarg, "ava-ont") == 0) {
|
||||
opt.flag |= MM_F_AVA | MM_F_NO_SELF;
|
||||
|
|
@ -139,6 +164,13 @@ int main(int argc, char *argv[])
|
|||
opt.min_dp_max = 50;
|
||||
opt.best_n = 10;
|
||||
opt.bw = 100;
|
||||
} else if (strcmp(optarg, "splice") == 0 || strcmp(optarg, "cdna") == 0) {
|
||||
k = 15, w = 5;
|
||||
opt.flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV;
|
||||
opt.max_gap = 2000, opt.max_gap_ref = opt.bw = 200000;
|
||||
opt.a = 1, opt.b = 2, opt.q = 2, opt.e = 1, opt.q2 = 32, opt.e2 = 0;
|
||||
opt.noncan = 5;
|
||||
opt.zdrop = 200;
|
||||
} else {
|
||||
fprintf(stderr, "[E::%s] unknown preset '%s'\n", __func__, optarg);
|
||||
return 1;
|
||||
|
|
@ -146,73 +178,87 @@ int main(int argc, char *argv[])
|
|||
}
|
||||
}
|
||||
if (w < 0) w = (int)(.6666667 * k + .499);
|
||||
if ((opt.flag & MM_F_SPLICE) && max_intron_len > 0)
|
||||
opt.max_gap_ref = opt.bw = max_intron_len;
|
||||
|
||||
if (argc == optind) {
|
||||
fprintf(stderr, "Usage: minimap2 [options] <target.fa>|<target.idx> [query.fa] [...]\n");
|
||||
fprintf(stderr, "Options:\n");
|
||||
fprintf(stderr, " Indexing:\n");
|
||||
fprintf(stderr, " -H use homopolymer-compressed k-mer\n");
|
||||
fprintf(stderr, " -k INT k-mer size (no larger than 28) [%d]\n", k);
|
||||
fprintf(stderr, " -w INT minizer window size [{-k}*2/3]\n");
|
||||
fprintf(stderr, " -I NUM split index for every ~NUM input bases [4G]\n");
|
||||
fprintf(stderr, " -d FILE dump index to FILE []\n");
|
||||
fprintf(stderr, " Mapping:\n");
|
||||
fprintf(stderr, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%g]\n", opt.mid_occ_frac);
|
||||
fprintf(stderr, " -g INT stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap);
|
||||
fprintf(stderr, " -r INT bandwidth used in chaining and DP-based alignment [%d]\n", opt.bw);
|
||||
fprintf(stderr, " -n INT minimal number of minimizers on a chain [%d]\n", opt.min_cnt);
|
||||
fprintf(stderr, " -m INT minimal chaining score (matching bases minus log gap penalty) [%d]\n", opt.min_chain_score);
|
||||
// fprintf(stderr, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); // TODO: this option is never used; might be buggy
|
||||
fprintf(stderr, " -X skip self and dual mappings (for the all-vs-all mode)\n");
|
||||
fprintf(stderr, " -p FLOAT min secondary-to-primary score ratio [%g]\n", opt.pri_ratio);
|
||||
fprintf(stderr, " -N INT retain at most INT secondary alignments [%d]\n", opt.best_n);
|
||||
fprintf(stderr, " Alignment:\n");
|
||||
fprintf(stderr, " -A INT matching score [%d]\n", opt.a);
|
||||
fprintf(stderr, " -B INT mismatch penalty [%d]\n", opt.b);
|
||||
fprintf(stderr, " -O INT[,INT] gap open penalty [%d,%d]\n", opt.q, opt.q2);
|
||||
fprintf(stderr, " -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2);
|
||||
fprintf(stderr, " -z INT Z-drop score [%d]\n", opt.zdrop);
|
||||
fprintf(stderr, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max);
|
||||
fprintf(stderr, " Input/Output:\n");
|
||||
fprintf(stderr, " -Q ignore base quality in the input\n");
|
||||
fprintf(stderr, " -a output in the SAM format (PAF by default)\n");
|
||||
fprintf(stderr, " -c output CIGAR in PAF\n");
|
||||
fprintf(stderr, " -S output the cs tag in PAF\n");
|
||||
fprintf(stderr, " -t INT number of threads [%d]\n", n_threads);
|
||||
fprintf(stderr, " -K NUM minibatch size [200M]\n");
|
||||
// fprintf(stderr, " -v INT verbose level [%d]\n", mm_verbose);
|
||||
fprintf(stderr, " -V show version number\n");
|
||||
fprintf(stderr, " Preset:\n");
|
||||
fprintf(stderr, " -x STR preset (recommended to be applied before other options) []\n");
|
||||
fprintf(stderr, " map10k/map-pb: -Hk19 (PacBio/ONT vs reference mapping)\n");
|
||||
fprintf(stderr, " map-ont: -k15 (slightly more sensitive than 'map10k' for ONT vs reference)\n");
|
||||
fprintf(stderr, " asm5: -k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 (asm to ref mapping; break at 5%% div.)\n");
|
||||
fprintf(stderr, " asm10: -k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10%% div.)\n");
|
||||
fprintf(stderr, " ava-pb: -Hk19 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (PacBio read overlap)\n");
|
||||
fprintf(stderr, " ava-ont: -k15 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (ONT read overlap)\n");
|
||||
fprintf(stderr, "\nSee `man ./minimap2.1' for detailed description of command-line options.\n");
|
||||
return 1;
|
||||
if (argc == optind || fp_help == stdout) {
|
||||
fprintf(fp_help, "Usage: minimap2 [options] <target.fa>|<target.idx> [query.fa] [...]\n");
|
||||
fprintf(fp_help, "Options:\n");
|
||||
fprintf(fp_help, " Indexing:\n");
|
||||
fprintf(fp_help, " -H use homopolymer-compressed k-mer\n");
|
||||
fprintf(fp_help, " -k INT k-mer size (no larger than 28) [%d]\n", k);
|
||||
fprintf(fp_help, " -w INT minizer window size [{-k}*2/3]\n");
|
||||
fprintf(fp_help, " -I NUM split index for every ~NUM input bases [4G]\n");
|
||||
fprintf(fp_help, " -d FILE dump index to FILE []\n");
|
||||
fprintf(fp_help, " Mapping:\n");
|
||||
fprintf(fp_help, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%g]\n", opt.mid_occ_frac);
|
||||
fprintf(fp_help, " -g INT stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap);
|
||||
fprintf(fp_help, " -r INT bandwidth used in chaining and DP-based alignment [%d]\n", opt.bw);
|
||||
fprintf(fp_help, " -n INT minimal number of minimizers on a chain [%d]\n", opt.min_cnt);
|
||||
fprintf(fp_help, " -m INT minimal chaining score (matching bases minus log gap penalty) [%d]\n", opt.min_chain_score);
|
||||
// fprintf(fp_help, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); // TODO: this option is never used; might be buggy
|
||||
fprintf(fp_help, " -X skip self and dual mappings (for the all-vs-all mode)\n");
|
||||
fprintf(fp_help, " -p FLOAT min secondary-to-primary score ratio [%g]\n", opt.pri_ratio);
|
||||
fprintf(fp_help, " -N INT retain at most INT secondary alignments [%d]\n", opt.best_n);
|
||||
fprintf(fp_help, " -G NUM max intron length (only effective following -x splice) [200k]\n");
|
||||
fprintf(fp_help, " Alignment:\n");
|
||||
fprintf(fp_help, " -A INT matching score [%d]\n", opt.a);
|
||||
fprintf(fp_help, " -B INT mismatch penalty [%d]\n", opt.b);
|
||||
fprintf(fp_help, " -O INT[,INT] gap open penalty [%d,%d]\n", opt.q, opt.q2);
|
||||
fprintf(fp_help, " -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2);
|
||||
fprintf(fp_help, " -z INT Z-drop score [%d]\n", opt.zdrop);
|
||||
fprintf(fp_help, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max);
|
||||
fprintf(fp_help, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n");
|
||||
fprintf(fp_help, " Input/Output:\n");
|
||||
fprintf(fp_help, " -a output in the SAM format (PAF by default)\n");
|
||||
fprintf(fp_help, " -Q don't output base quality in SAM\n");
|
||||
fprintf(fp_help, " -R STR SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n");
|
||||
fprintf(fp_help, " -c output CIGAR in PAF\n");
|
||||
fprintf(fp_help, " -S output the cs tag in PAF (cs encodes both query and ref sequences)\n");
|
||||
fprintf(fp_help, " -t INT number of threads [%d]\n", n_threads);
|
||||
fprintf(fp_help, " -K NUM minibatch size [200M]\n");
|
||||
// fprintf(fp_help, " -v INT verbose level [%d]\n", mm_verbose);
|
||||
fprintf(fp_help, " --version show version number\n");
|
||||
fprintf(fp_help, " Preset:\n");
|
||||
fprintf(fp_help, " -x STR preset (recommended to be applied before other options) []\n");
|
||||
fprintf(fp_help, " map10k/map-pb: -Hk19 (PacBio/ONT vs reference mapping)\n");
|
||||
fprintf(fp_help, " map-ont: -k15 (slightly more sensitive than 'map10k' for ONT vs reference)\n");
|
||||
fprintf(fp_help, " asm5: -k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 (asm to ref mapping; break at 5%% div.)\n");
|
||||
fprintf(fp_help, " asm10: -k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10%% div.)\n");
|
||||
fprintf(fp_help, " ava-pb: -Hk19 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (PacBio read overlap)\n");
|
||||
fprintf(fp_help, " ava-ont: -k15 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (ONT read overlap)\n");
|
||||
fprintf(fp_help, " splice: long-read spliced alignment (see minimap2.1 for details)\n");
|
||||
fprintf(fp_help, "\nSee `man ./minimap2.1' for detailed description of command-line options.\n");
|
||||
return fp_help == stdout? 0 : 1;
|
||||
}
|
||||
|
||||
is_idx = mm_idx_is_idx(argv[optind]);
|
||||
if (is_idx < 0) {
|
||||
fprintf(stderr, "[E::%s] failed to open file '%s'\n", __func__, argv[optind]);
|
||||
fprintf(stderr, "[ERROR] failed to open file '%s'\n", argv[optind]);
|
||||
return 1;
|
||||
}
|
||||
if (!is_idx && fnw == 0 && argc - optind < 2) {
|
||||
fprintf(stderr, "[ERROR] missing input: please specify a query file to map or option -d to keep the index\n");
|
||||
return 1;
|
||||
}
|
||||
if (is_idx) fpr = fopen(argv[optind], "rb");
|
||||
else fp = mm_bseq_open(argv[optind]);
|
||||
if (fnw) fpw = fopen(fnw, "wb");
|
||||
if (opt.flag & MM_F_OUT_SAM)
|
||||
mm_write_sam_hdr_no_SQ(rg, MM_VERSION, argc, argv);
|
||||
for (;;) {
|
||||
mm_idx_t *mi = 0;
|
||||
mm_idx_t *mi;
|
||||
if (fpr) {
|
||||
mi = mm_idx_load(fpr);
|
||||
if (idx_par_set && mm_verbose >= 2 && (mi->k != k || mi->w != w || mi->is_hpc != is_hpc))
|
||||
fprintf(stderr, "[W::%s::%.3f*%.2f] Indexing parameters on the command line (-k/-w/-H) overridden by parameters in the prebuilt index.\n",
|
||||
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
|
||||
} else if (!mm_bseq_eof(fp)) {
|
||||
fprintf(stderr, "[WARNING] \033[1;31mIndexing parameters on the command line (-k/-w/-H) overridden by parameters in the prebuilt index.\033[0m\n");
|
||||
} else {
|
||||
mi = mm_idx_gen(fp, w, k, bucket_bits, is_hpc, minibatch_size, n_threads, batch_size, keep_name);
|
||||
}
|
||||
if (mi == 0) break;
|
||||
++n_idx_part;
|
||||
if (mm_verbose >= 2 && n_idx_part > 1 && (opt.flag&MM_F_OUT_SAM) && !(opt.flag&MM_F_NO_SAM_SQ))
|
||||
fprintf(stderr, "[WARNING] \033[1;31mSAM output is malformated due to internal @SQ lines. Please add option --no-sam-sq or filter afterwards.\033[0m\n");
|
||||
if (mm_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n",
|
||||
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq);
|
||||
|
|
|
|||
24
map.c
24
map.c
|
|
@ -19,6 +19,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
|
|||
opt->min_chain_score = 40;
|
||||
opt->bw = 500;
|
||||
opt->max_gap = 5000;
|
||||
opt->max_gap_ref = -1;
|
||||
opt->max_chain_skip = 25;
|
||||
|
||||
opt->mask_level = 0.5f;
|
||||
|
|
@ -37,6 +38,8 @@ void mm_mapopt_init(mm_mapopt_t *opt)
|
|||
|
||||
void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
|
||||
{
|
||||
if (opt->flag & MM_F_SPLICE_BOTH)
|
||||
opt->flag &= ~(MM_F_SPLICE_FOR|MM_F_SPLICE_REV);
|
||||
opt->max_occ = mm_idx_cal_max_occ(mi, opt->max_occ_frac);
|
||||
opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac);
|
||||
if (mm_verbose >= 3)
|
||||
|
|
@ -167,7 +170,7 @@ void mm_pair_thin(mm_tbuf_t *b, int radius, mm_match_t *m1, mm_match_t *m2)
|
|||
#endif
|
||||
mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, uint32_t m_st, uint32_t m_en, const char *qname, int qlen, const char *seq, int *n_regs)
|
||||
{
|
||||
int i, n = m_en - m_st, j, n_u;
|
||||
int i, n = m_en - m_st, j, n_u, max_gap_ref;
|
||||
int64_t n_a;
|
||||
uint64_t *u;
|
||||
mm_match_t *m;
|
||||
|
|
@ -243,7 +246,8 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b,
|
|||
fprintf(stderr, "SD\t%s\t%d\t%c\t%d\t%d\t%d\n", mi->seq[a[i].x<<1>>33].name, (int32_t)a[i].x, "+-"[a[i].x>>63], (int32_t)a[i].y, (int32_t)(a[i].y>>32&0xff),
|
||||
i == 0? 0 : ((int32_t)a[i].y - (int32_t)a[i-1].y) - ((int32_t)a[i].x - (int32_t)a[i-1].x));
|
||||
|
||||
n_u = mm_chain_dp(opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, n_a, a, &u, b->km);
|
||||
max_gap_ref = opt->max_gap_ref >= 0? opt->max_gap_ref : opt->max_gap;
|
||||
n_u = mm_chain_dp(max_gap_ref, opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, !!(opt->flag&MM_F_SPLICE), n_a, a, &u, b->km);
|
||||
regs = mm_gen_regs(b->km, qlen, n_u, u, a);
|
||||
*n_regs = n_u;
|
||||
|
||||
|
|
@ -256,7 +260,8 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b,
|
|||
if (!(opt->flag & MM_F_AVA)) { // don't choose primary mapping(s) for read overlap
|
||||
mm_set_parent(b->km, opt->mask_level, *n_regs, regs);
|
||||
mm_select_sub(b->km, opt->mask_level, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
|
||||
mm_join_long(b->km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle
|
||||
if (!(opt->flag & MM_F_SPLICE))
|
||||
mm_join_long(b->km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle
|
||||
}
|
||||
if (opt->flag & MM_F_CIGAR) {
|
||||
regs = mm_align_skeleton(b->km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs()
|
||||
|
|
@ -348,8 +353,10 @@ static void *worker_pipeline(void *shared, int step, void *in)
|
|||
mm_bseq1_t *t = &s->seq[i];
|
||||
for (j = 0; j < s->n_reg[i]; ++j) {
|
||||
mm_reg1_t *r = &s->reg[i][j];
|
||||
if (p->opt->flag & MM_F_OUT_SAM) mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i]);
|
||||
else mm_write_paf(&p->str, mi, t, r, km, p->opt->flag);
|
||||
if (p->opt->flag & MM_F_OUT_SAM)
|
||||
mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i]);
|
||||
else
|
||||
mm_write_paf(&p->str, mi, t, r, km, p->opt->flag);
|
||||
puts(p->str.s);
|
||||
}
|
||||
if (s->n_reg[i] == 0 && (p->opt->flag & MM_F_OUT_SAM)) {
|
||||
|
|
@ -378,11 +385,8 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int
|
|||
if (pl.fp == 0) return -1;
|
||||
pl.opt = opt, pl.mi = idx;
|
||||
pl.n_threads = n_threads, pl.mini_batch_size = mini_batch_size;
|
||||
if (opt->flag & MM_F_OUT_SAM) {
|
||||
uint32_t i;
|
||||
for (i = 0; i < idx->n_seq; ++i)
|
||||
printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len);
|
||||
}
|
||||
if ((opt->flag & MM_F_OUT_SAM) && !(opt->flag & MM_F_NO_SAM_SQ))
|
||||
mm_write_sam_SQ(idx);
|
||||
kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3);
|
||||
free(pl.str.s);
|
||||
mm_bseq_close(pl.fp);
|
||||
|
|
|
|||
27
minimap.h
27
minimap.h
|
|
@ -5,15 +5,20 @@
|
|||
#include <stdio.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#define MM_IDX_DEF_B 14
|
||||
#define MM_IDX_DEF_B 14
|
||||
|
||||
#define MM_F_NO_SELF 0x01
|
||||
#define MM_F_AVA 0x02
|
||||
#define MM_F_CIGAR 0x04
|
||||
#define MM_F_OUT_SAM 0x08
|
||||
#define MM_F_NO_QUAL 0x10
|
||||
#define MM_F_OUT_CG 0x20
|
||||
#define MM_F_OUT_CS 0x40
|
||||
#define MM_F_NO_SELF 0x001
|
||||
#define MM_F_AVA 0x002
|
||||
#define MM_F_CIGAR 0x004
|
||||
#define MM_F_OUT_SAM 0x008
|
||||
#define MM_F_NO_QUAL 0x010
|
||||
#define MM_F_OUT_CG 0x020
|
||||
#define MM_F_OUT_CS 0x040
|
||||
#define MM_F_SPLICE 0x080
|
||||
#define MM_F_SPLICE_FOR 0x100
|
||||
#define MM_F_SPLICE_REV 0x200
|
||||
#define MM_F_SPLICE_BOTH 0x400
|
||||
#define MM_F_NO_SAM_SQ 0x800
|
||||
|
||||
#define MM_IDX_MAGIC "MMI\2"
|
||||
|
||||
|
|
@ -55,7 +60,8 @@ typedef struct {
|
|||
uint32_t capacity;
|
||||
int32_t dp_score, dp_max, dp_max2;
|
||||
uint32_t blen;
|
||||
uint32_t n_diff, n_ambi;
|
||||
uint32_t n_diff;
|
||||
uint32_t n_ambi:30, trans_strand:2;
|
||||
uint32_t n_cigar;
|
||||
uint32_t cigar[];
|
||||
} mm_extra_t;
|
||||
|
|
@ -80,7 +86,7 @@ typedef struct {
|
|||
int flag; // see MM_F_* macros
|
||||
|
||||
int bw; // bandwidth
|
||||
int max_gap; // break a chain if there are no minimizers in a max_gap window
|
||||
int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window
|
||||
int max_chain_skip;
|
||||
int min_cnt;
|
||||
int min_chain_score;
|
||||
|
|
@ -93,6 +99,7 @@ typedef struct {
|
|||
int min_join_flank_sc;
|
||||
|
||||
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
|
||||
int noncan;
|
||||
int zdrop;
|
||||
int min_dp_max;
|
||||
int min_ksw_len;
|
||||
|
|
|
|||
95
minimap2.1
95
minimap2.1
|
|
@ -1,4 +1,4 @@
|
|||
.TH minimap2 1 "30 July 2017" "minimap2-2.0rc1-r232" "Bioinformatics tools"
|
||||
.TH minimap2 1 "25 August 2017" "minimap2-2.1-r311" "Bioinformatics tools"
|
||||
.SH NAME
|
||||
.PP
|
||||
minimap2 - mapping and alignment between collections of DNA sequences
|
||||
|
|
@ -162,6 +162,12 @@ secondary alignments [5]. This option has no effect when
|
|||
.B -X
|
||||
is applied.
|
||||
.TP
|
||||
.BI -G \ NUM
|
||||
Maximal intron length in the splice mode [200k]. This option also changes the
|
||||
bandwidth to
|
||||
.IR NUM .
|
||||
Increasing this option slows down spliced alignment.
|
||||
.TP
|
||||
.BI --max-chain-skip \ INT
|
||||
A heuristics that stops chaining early [50]. Minimap2 uses dynamic programming
|
||||
for chaining. The time complexity is quadratic in the number of seeds. This
|
||||
|
|
@ -199,15 +205,32 @@ the contiguity of the alignment at the cost of poor alignment in the middle
|
|||
Minimal peak DP alignment score to output [40]. The peak score is computed from
|
||||
the final CIGAR. It is the score of the max scoring segment in the alignment
|
||||
and may be different from the total alignment score.
|
||||
.TP
|
||||
.BI -u \ CHAR
|
||||
How to find canonical splicing sites GT-AG -
|
||||
.BR f :
|
||||
transcript strand;
|
||||
.BR b :
|
||||
both strands;
|
||||
.BR n :
|
||||
no attempt to match GT-AG [n]
|
||||
.TP
|
||||
.BI --cost-non-gt-ag \ INT
|
||||
Cost of non-canonical splicing sites [0].
|
||||
.SS Input/output options
|
||||
.TP 10
|
||||
.B -Q
|
||||
Ignore base quality in the input file.
|
||||
.TP
|
||||
.B -a
|
||||
Generate CIGAR and output alignments in the SAM format. Minimap2 outputs in PAF
|
||||
by default.
|
||||
.TP
|
||||
.B -Q
|
||||
Ignore base quality in the input file.
|
||||
.TP
|
||||
.BI -R \ STR
|
||||
SAM read group line in a format like
|
||||
.B @RG\\\\tID:foo\\\\tSM:bar
|
||||
[].
|
||||
.TP
|
||||
.B -c
|
||||
Generate CIGAR. In PAF, the CIGAR is written to the `cg' custom tag.
|
||||
.TP
|
||||
|
|
@ -232,8 +255,13 @@ and
|
|||
use
|
||||
.BR -K500m .
|
||||
.TP
|
||||
.B -V
|
||||
.B --version
|
||||
Print version number to stdout
|
||||
.TP
|
||||
.B --no-sam-hdr
|
||||
Don't output SAM header lines. Use this option if the index consists of
|
||||
multiple parts; otherwise the SAM output is malformated due to internal header
|
||||
lines.
|
||||
.SS Preset options
|
||||
.TP 10
|
||||
.BI -x \ STR
|
||||
|
|
@ -247,37 +275,64 @@ are:
|
|||
.RS
|
||||
.TP 8
|
||||
.B map-pb
|
||||
PacBio/Oxford Nanopore read to reference mapping (-Hk19)
|
||||
PacBio/Oxford Nanopore read to reference mapping
|
||||
.RB ( -Hk19 )
|
||||
.TP
|
||||
.B map10k
|
||||
The same as
|
||||
.B map-pb
|
||||
(-Hk19)
|
||||
.RB ( -Hk19 )
|
||||
.TP
|
||||
.B map-ont
|
||||
Slightly more sensitive for Oxford Nanopore to reference mapping (-k15). For
|
||||
PacBio reads, HPC minimizers consistently leads to faster performance and more
|
||||
sensitive results in comparison to normal minimizers. For Oxford Nanopore data,
|
||||
normal minimizers are better, though not much. The effectiveness of HPC is
|
||||
determined by the sequencing error mode.
|
||||
Slightly more sensitive for Oxford Nanopore to reference mapping
|
||||
.RB ( -k15 ).
|
||||
For PacBio reads, HPC minimizers consistently leads to faster performance and
|
||||
more sensitive results in comparison to normal minimizers. For Oxford Nanopore
|
||||
data, normal minimizers are better, though not much. The effectiveness of HPC
|
||||
is determined by the sequencing error mode.
|
||||
.TP
|
||||
.B asm5
|
||||
Long assembly to reference mapping (-k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200).
|
||||
Long assembly to reference mapping
|
||||
.RB ( -k19
|
||||
.B -w19 -A1 -B19 -O39,81 -E3,1 -s200
|
||||
.BR -z200 ).
|
||||
Typically, the alignment will not extend to regions with 5% or higher sequence
|
||||
divergence. Only use this preset if the average divergence is far below 5%.
|
||||
.TP
|
||||
.B asm10
|
||||
Long assembly to reference mapping (-k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200). Up
|
||||
to 10% sequence divergence.
|
||||
.TP 8
|
||||
Long assembly to reference mapping
|
||||
.RB ( -k19
|
||||
.B -w19 -A1 -B9 -O16,41 -E2,1 -s200
|
||||
.BR -z200 ).
|
||||
Up to 10% sequence divergence.
|
||||
.TP
|
||||
.B ava-pb
|
||||
PacBio all-vs-all overlap mapping (-Hk19 -w5 -Xp0 -m100 -K500m -g10000 --max-chain-skip 25)
|
||||
.TP 8
|
||||
PacBio all-vs-all overlap mapping
|
||||
.RB ( -Hk19
|
||||
.B -w5 -Xp0 -m100 -K500m -g10000 --max-chain-skip
|
||||
.BR 25 ).
|
||||
.TP
|
||||
.B ava-ont
|
||||
Oxford Nanopore all-vs-all overlap mapping (-k15 -w5 -Xp0 -m100 -K500m -g10000
|
||||
--max-chain-skip 25). Similarly, the major difference from
|
||||
Oxford Nanopore all-vs-all overlap mapping
|
||||
.RB ( -k15
|
||||
.B -w5 -Xp0 -m100 -K500m -g10000 --max-chain-skip
|
||||
.BR 25 ).
|
||||
Similarly, the major difference from
|
||||
.B ava-pb
|
||||
is that this preset is not using HPC minimizers.
|
||||
.TP
|
||||
.B splice
|
||||
Long-read spliced alignment
|
||||
.RB ( -k15
|
||||
.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -z200 -ub --cost-non-gt-ag
|
||||
.BR 5 ).
|
||||
In the splice mode, 1) long deletions are taken as introns and represented as
|
||||
the
|
||||
.RB ` N '
|
||||
CIGAR operator; 2) long insertions are disabled; 3) deletion and insertion gap
|
||||
costs are different during chaining; 4) the computation of the
|
||||
.RB ` ms '
|
||||
tag ignores introns to demote hits to pseudogenes.
|
||||
.RE
|
||||
.SS Miscellaneous options
|
||||
.TP 10
|
||||
|
|
|
|||
|
|
@ -0,0 +1,266 @@
|
|||
/*******************************
|
||||
* Command line option parsing *
|
||||
*******************************/
|
||||
|
||||
var getopt = function(args, ostr) {
|
||||
var oli; // option letter list index
|
||||
if (typeof(getopt.place) == 'undefined')
|
||||
getopt.ind = 0, getopt.arg = null, getopt.place = -1;
|
||||
if (getopt.place == -1) { // update scanning pointer
|
||||
if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
|
||||
getopt.place = -1;
|
||||
return null;
|
||||
}
|
||||
if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
|
||||
++getopt.ind;
|
||||
getopt.place = -1;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
|
||||
if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
|
||||
if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
|
||||
if (getopt.place < 0) ++getopt.ind;
|
||||
return '?';
|
||||
}
|
||||
if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
|
||||
getopt.arg = null;
|
||||
if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
|
||||
} else { // need an argument
|
||||
if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
|
||||
getopt.arg = args[getopt.ind].substr(getopt.place);
|
||||
else if (args.length <= ++getopt.ind) { // no arg
|
||||
getopt.place = -1;
|
||||
if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
|
||||
return '?';
|
||||
} else getopt.arg = args[getopt.ind]; // white space
|
||||
getopt.place = -1;
|
||||
++getopt.ind;
|
||||
}
|
||||
return optopt;
|
||||
}
|
||||
|
||||
/***********************
|
||||
* Interval operations *
|
||||
***********************/
|
||||
|
||||
Interval = {};
|
||||
|
||||
Interval.sort = function(a)
|
||||
{
|
||||
if (typeof a[0] == 'number')
|
||||
a.sort(function(x, y) { return x - y });
|
||||
else a.sort(function(x, y) { return x[0] != y[0]? x[0] - y[0] : x[1] - y[1] });
|
||||
}
|
||||
|
||||
Interval.merge = function(a, sorted)
|
||||
{
|
||||
if (typeof sorted == 'undefined') sorted = true;
|
||||
if (!sorted) Interval.sort(a);
|
||||
var k = 0;
|
||||
for (var i = 1; i < a.length; ++i) {
|
||||
if (a[k][1] >= a[i][0])
|
||||
a[k][1] = a[k][1] > a[i][1]? a[k][1] : a[i][1];
|
||||
else a[++k] = a[i].slice(0);
|
||||
}
|
||||
a.length = k + 1;
|
||||
}
|
||||
|
||||
Interval.index_end = function(a, sorted)
|
||||
{
|
||||
if (a.length == 0) return;
|
||||
if (typeof sorted == 'undefined') sorted = true;
|
||||
if (!sorted) Interval.sort(a);
|
||||
a[0].push(0);
|
||||
var k = 0, k_en = a[0][1];
|
||||
for (var i = 1; i < a.length; ++i) {
|
||||
if (k_en <= a[i][0]) {
|
||||
for (++k; k < i; ++k)
|
||||
if (a[k][1] > a[i][0])
|
||||
break;
|
||||
k_en = a[k][1];
|
||||
}
|
||||
a[i].push(k);
|
||||
}
|
||||
}
|
||||
|
||||
Interval.find_intv = function(a, x)
|
||||
{
|
||||
var left = -1, right = a.length;
|
||||
if (typeof a[0] == 'number') {
|
||||
while (right - left > 1) {
|
||||
var mid = left + ((right - left) >> 1);
|
||||
if (a[mid] > x) right = mid;
|
||||
else if (a[mid] < x) left = mid;
|
||||
else return mid;
|
||||
}
|
||||
} else {
|
||||
while (right - left > 1) {
|
||||
var mid = left + ((right - left) >> 1);
|
||||
if (a[mid][0] > x) right = mid;
|
||||
else if (a[mid][0] < x) left = mid;
|
||||
else return mid;
|
||||
}
|
||||
}
|
||||
return left;
|
||||
}
|
||||
|
||||
Interval.find_ovlp = function(a, st, en)
|
||||
{
|
||||
if (a.length == 0 || st >= en) return [];
|
||||
var l = Interval.find_intv(a, st);
|
||||
var k = l < 0? 0 : a[l][a[l].length - 1];
|
||||
var b = [];
|
||||
for (var i = k; i < a.length; ++i) {
|
||||
if (a[i][0] >= en) break;
|
||||
else if (st < a[i][1])
|
||||
b.push(a[i]);
|
||||
}
|
||||
return b;
|
||||
}
|
||||
|
||||
/*****************
|
||||
* Main function *
|
||||
*****************/
|
||||
|
||||
var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false;
|
||||
while ((c = getopt(arguments, "l:ep")) != null) {
|
||||
if (c == 'l') l_fuzzy = parseInt(getopt.arg);
|
||||
else if (c == 'e') print_err_only = print_ovlp = true;
|
||||
else if (c == 'p') print_ovlp = true;
|
||||
}
|
||||
|
||||
if (arguments.length - getopt.ind < 2) {
|
||||
print("Usage: k8 intron-eval.js [options] <gene.gtf> <aln.sam>");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
var file, buf = new Bytes();
|
||||
|
||||
var tr = {};
|
||||
file = new File(arguments[getopt.ind]);
|
||||
while (file.readline(buf) >= 0) {
|
||||
var m, t = buf.toString().split("\t");
|
||||
if (t[0].charAt(0) == '#') continue;
|
||||
if (t[2] != 'exon') continue;
|
||||
var st = parseInt(t[3]) - 1;
|
||||
var en = parseInt(t[4]);
|
||||
if ((m = /transcript_id "(\S+)"/.exec(t[8])) == null) continue;
|
||||
var tid = m[1];
|
||||
if (tr[tid] == null) tr[tid] = [t[0], t[6], 0, 0, []];
|
||||
tr[tid][4].push([st, en]);
|
||||
}
|
||||
file.close();
|
||||
|
||||
var anno = {};
|
||||
for (var tid in tr) {
|
||||
var t = tr[tid];
|
||||
Interval.sort(t[4]);
|
||||
t[2] = t[4][0][0];
|
||||
t[3] = t[4][t[4].length - 1][1];
|
||||
if (anno[t[0]] == null) anno[t[0]] = [];
|
||||
var s = t[4];
|
||||
for (var i = 0; i < s.length - 1; ++i) {
|
||||
if (s[i][1] >= s[i+1][0]) throw Error("ERROR: wrong annotation!");
|
||||
anno[t[0]].push([s[i][1], s[i+1][0]]);
|
||||
}
|
||||
}
|
||||
tr = null;
|
||||
|
||||
for (var chr in anno) {
|
||||
var e = anno[chr];
|
||||
if (e.length == 0) continue;
|
||||
Interval.sort(e);
|
||||
var k = 0;
|
||||
for (var i = 1; i < e.length; ++i) // dedup
|
||||
if (e[i][0] != e[k][0] || e[i][1] != e[k][1])
|
||||
e[++k] = e[i].slice(0);
|
||||
e.length = k + 1;
|
||||
Interval.index_end(e);
|
||||
}
|
||||
|
||||
var n_pri = 0, n_unmapped = 0, n_mapped = 0;
|
||||
var n_sgl = 0, n_splice = 0, n_splice_hit = 0, n_splice_novel = 0;
|
||||
|
||||
file = new File(arguments[getopt.ind+1]);
|
||||
var last_qname = null;
|
||||
var re_cigar = /(\d+)([MIDNSHX=])/g;
|
||||
while (file.readline(buf) >= 0) {
|
||||
var m, t = buf.toString().split("\t");
|
||||
|
||||
if (t[0].charAt(0) == '@') continue;
|
||||
var flag = parseInt(t[1]);
|
||||
if (flag&0x100) continue;
|
||||
if (first_only && last_qname == t[0]) continue;
|
||||
if (t[2] == '*') {
|
||||
++n_unmapped;
|
||||
continue;
|
||||
} else {
|
||||
++n_pri;
|
||||
if (last_qname != t[0]) ++n_mapped;
|
||||
}
|
||||
|
||||
var pos = parseInt(t[3]) - 1, intron = [];
|
||||
while ((m = re_cigar.exec(t[5])) != null) {
|
||||
var len = parseInt(m[1]), op = m[2];
|
||||
if (op == 'N') {
|
||||
intron.push([pos, pos + len]);
|
||||
pos += len;
|
||||
} else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len;
|
||||
}
|
||||
if (intron.length == 0) {
|
||||
++n_sgl;
|
||||
continue;
|
||||
}
|
||||
n_splice += intron.length;
|
||||
|
||||
var chr = anno[t[2]];
|
||||
if (chr != null) {
|
||||
for (var i = 0; i < intron.length; ++i) {
|
||||
var o = Interval.find_ovlp(chr, intron[i][0], intron[i][1]);
|
||||
if (o.length > 0) {
|
||||
var hit = false;
|
||||
for (var j = 0; j < o.length; ++j) {
|
||||
var st_diff = intron[i][0] - o[j][0];
|
||||
var en_diff = intron[i][1] - o[j][1];
|
||||
if (st_diff < 0) st_diff = -st_diff;
|
||||
if (en_diff < 0) en_diff = -en_diff;
|
||||
if (st_diff <= l_fuzzy && en_diff <= l_fuzzy)
|
||||
++n_splice_hit, hit = true;
|
||||
if (hit) break;
|
||||
}
|
||||
if (print_ovlp) {
|
||||
var type = hit? 'C' : 'P';
|
||||
if (hit && print_err_only) continue;
|
||||
var x = '[';
|
||||
for (var j = 0; j < o.length; ++j) {
|
||||
if (j) x += ', ';
|
||||
x += '(' + o[j][0] + "," + o[j][1] + ')';
|
||||
}
|
||||
x += ']';
|
||||
print(type, t[0], i+1, t[2], intron[i][0], intron[i][1], x);
|
||||
}
|
||||
} else {
|
||||
++n_splice_novel;
|
||||
if (print_ovlp)
|
||||
print('N', t[0], i+1, t[2], intron[i][0], intron[i][1]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
n_splice_novel += intron.length;
|
||||
}
|
||||
last_qname = t[0];
|
||||
}
|
||||
file.close();
|
||||
|
||||
buf.destroy();
|
||||
|
||||
if (!print_ovlp) {
|
||||
print("# unmapped reads: " + n_unmapped);
|
||||
print("# mapped reads: " + n_mapped);
|
||||
print("# primary alignments: " + n_pri);
|
||||
print("# singletons: " + n_sgl);
|
||||
print("# predicted introns: " + n_splice);
|
||||
print("# non-overlapping introns: " + n_splice_novel);
|
||||
print("# correct introns: " + n_splice_hit + " (" + (n_splice_hit / n_splice * 100).toFixed(2) + "%)");
|
||||
}
|
||||
|
|
@ -94,7 +94,7 @@ while (file.readline(buf) >= 0) {
|
|||
ori_qlen = parseInt(t[1]);
|
||||
} else { // SAM
|
||||
var flag = parseInt(t[1]);
|
||||
if (flag & 4) continue;
|
||||
if ((flag & 4) || t[2] == '*' || t[5] == '*') continue;
|
||||
if (flag & 0x100) {
|
||||
++n_2nd;
|
||||
continue;
|
||||
|
|
|
|||
4
mmpriv.h
4
mmpriv.h
|
|
@ -40,9 +40,11 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end);
|
|||
void radix_sort_64(uint64_t *beg, uint64_t *end);
|
||||
uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
|
||||
|
||||
void mm_write_sam_SQ(const mm_idx_t *idx);
|
||||
void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *argv[]);
|
||||
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
|
||||
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);
|
||||
int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int64_t n, mm128_t *a, uint64_t **_u, void *km);
|
||||
int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km);
|
||||
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
|
||||
|
||||
mm_reg1_t *mm_gen_regs(void *km, int qlen, int n_u, uint64_t *u, mm128_t *a);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,2 @@
|
|||
>q2
|
||||
GGACATCCCGATGGTGCAGTCCTACCTGTACGAAAGGAC
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
>t2
|
||||
GGACATCCCGATGGTGCAGgtGCTATTAAAGGTTCGTTTGTTCAACGATTAAagTCCTACCTGTACGAAAGGAC
|
||||
|
|
@ -179,3 +179,83 @@
|
|||
note = {doi:10.1101/130633},
|
||||
publisher = {Cold Spring Harbor Labs Journals},
|
||||
journal = {bioRxiv}}
|
||||
|
||||
@article{Gotoh:1982aa,
|
||||
Author = {Gotoh, O},
|
||||
Journal = {J Mol Biol},
|
||||
Pages = {705-8},
|
||||
Title = {An improved algorithm for matching biological sequences},
|
||||
Volume = {162},
|
||||
Year = {1982}}
|
||||
|
||||
@article{Altschul:1986aa,
|
||||
Author = {Altschul, S F and Erickson, B W},
|
||||
Journal = {Bull Math Biol},
|
||||
Pages = {603-16},
|
||||
Title = {Optimal sequence alignment using affine gap costs},
|
||||
Volume = {48},
|
||||
Year = {1986}}
|
||||
|
||||
@article{Wu:2005vn,
|
||||
Author = {Wu, Thomas D and Watanabe, Colin K},
|
||||
Journal = {Bioinformatics},
|
||||
Pages = {1859-75},
|
||||
Title = {{GMAP}: a genomic mapping and alignment program for {mRNA} and {EST} sequences},
|
||||
Volume = {21},
|
||||
Year = {2005}}
|
||||
|
||||
@article{Iwata:2012aa,
|
||||
Author = {Iwata, Hiroaki and Gotoh, Osamu},
|
||||
Journal = {Nucleic Acids Res},
|
||||
Pages = {e161},
|
||||
Title = {Benchmarking spliced alignment programs including {Spaln2}, an extended version of {Spaln} that incorporates additional species-specific features},
|
||||
Volume = {40},
|
||||
Year = {2012}}
|
||||
|
||||
@article{Dobin:2013kx,
|
||||
Author = {Dobin, Alexander and others},
|
||||
Journal = {Bioinformatics},
|
||||
Pages = {15-21},
|
||||
Title = {{STAR}: ultrafast universal {RNA-seq} aligner},
|
||||
Volume = {29},
|
||||
Year = {2013}}
|
||||
|
||||
@article{Byrne:2017aa,
|
||||
Author = {Byrne, Ashley and others},
|
||||
Journal = {Nat Commun},
|
||||
Pages = {16027},
|
||||
Title = {Nanopore long-read {RNAseq} reveals widespread transcriptional variation among the surface receptors of individual {B} cells},
|
||||
Volume = {8},
|
||||
Year = {2017}}
|
||||
|
||||
@article{Roberts:2004fv,
|
||||
Author = {Roberts, Michael and others},
|
||||
Journal = {Bioinformatics},
|
||||
Pages = {3363-9},
|
||||
Title = {Reducing storage requirements for biological sequence comparison},
|
||||
Volume = {20},
|
||||
Year = {2004}}
|
||||
|
||||
@article{Zhang:2006aa,
|
||||
Author = {Zhang, Miao and Gish, Warren},
|
||||
Journal = {Bioinformatics},
|
||||
Pages = {13-20},
|
||||
Title = {Improved spliced alignment from an information theoretic approach},
|
||||
Volume = {22},
|
||||
Year = {2006}}
|
||||
|
||||
@article{Li:2007aa,
|
||||
Author = {Li, Heng and others},
|
||||
Journal = {BMC Bioinformatics},
|
||||
Pages = {349},
|
||||
Title = {A cross-species alignment tool {(CAT)}},
|
||||
Volume = {8},
|
||||
Year = {2007}}
|
||||
|
||||
@article{Farrar:2007hs,
|
||||
Author = {Farrar, Michael},
|
||||
Journal = {Bioinformatics},
|
||||
Pages = {156-61},
|
||||
Title = {{Striped Smith-Waterman speeds database searches six times over other SIMD implementations}},
|
||||
Volume = {23},
|
||||
Year = {2007}}
|
||||
|
|
|
|||
517
tex/minimap2.tex
517
tex/minimap2.tex
|
|
@ -20,7 +20,7 @@
|
|||
\begin{document}
|
||||
\firstpage{1}
|
||||
|
||||
\title[Long DNA sequence alignment with minimap2]{Minimap2: fast pairwise alignment for long DNA sequences}
|
||||
\title[Aligning long nucleotide sequences with minimap2]{Minimap2: fast pairwise alignment for long nucleotide sequences}
|
||||
\author[Li]{Heng Li}
|
||||
\address{Broad Institute, 415 Main Street, Cambridge, MA 02142, USA}
|
||||
|
||||
|
|
@ -28,11 +28,14 @@
|
|||
|
||||
\begin{abstract}
|
||||
\section{Summary:} Minimap2 is a general-purpose mapper to align long noisy DNA
|
||||
sequences against a large reference database. It targets query sequences of
|
||||
1kb--100Mb in length with per-base divergence typically below 25\%. Minimap2 is
|
||||
$\sim$30 times faster than many mainstream long-read aligners and achieves
|
||||
higher accuracy on simulated data. It also employs concave gap cost and rescues
|
||||
inversions for improved alignment around potential structural variations.
|
||||
or mRNA sequences against a large reference database. It targets query
|
||||
sequences of 1kb--100Mb in length with per-base divergence typically below
|
||||
25\%. For DNA sequence reads, minimap2 is $\sim$30 times faster than many
|
||||
mainstream long-read aligners and achieves higher accuracy on simulated data.
|
||||
It also employs concave gap cost and rescues inversions for improved alignment
|
||||
around potential structural variations. For real long RNA-seq reads, minimap2
|
||||
is $\sim$40 times faster than peers and produces alignment more consistent with
|
||||
existing gene annotations.
|
||||
|
||||
\section{Availability and implementation:}
|
||||
\href{https://github.com/lh3/minimap2}{https://github.com/lh3/minimap2}
|
||||
|
|
@ -46,43 +49,62 @@ Single Molecule Real-Time (SMRT) sequencing technology and Oxford Nanopore
|
|||
technologies (ONT) produce reads over 10kbp in length at an error rate
|
||||
$\sim$15\%. Several aligners have been developed for such
|
||||
data~\citep{Chaisson:2012aa,Li:2013aa,Liu:2016ab,Sovic:2016aa,Liu:2017aa,Lin:2017aa,Sedlazeck169557}.
|
||||
They are usually five times as slow as mainstream short-read
|
||||
aligners~\citep{Langmead:2012fk,Li:2013aa}. We speculated there could be
|
||||
substantial room for speedup on the thought that 10kb long sequences should be
|
||||
easier to map than 100bp reads because we can more effectively skip repetitive
|
||||
regions, which are often the bottleneck of short-read alignment. We confirmed
|
||||
our speculation by achieving approximate mapping 50 times faster than
|
||||
BWA-MEM~\citep{Li:2016aa}. \citet{Suzuki:2016} extended our work with a fast
|
||||
and novel algorithm on generating detailed alignment, which in turn inspired us
|
||||
to develop minimap2 towards higher accuracy and more practical functionality.
|
||||
Most of them were five times as slow as mainstream short-read
|
||||
aligners~\citep{Langmead:2012fk,Li:2013aa} in terms of the number of bases
|
||||
mapped per second. We speculated there could be substantial room for speedup on
|
||||
the thought that 10kb long sequences should be easier to map than 100bp reads
|
||||
because we can more effectively skip repetitive regions, which are often the
|
||||
bottleneck of short-read alignment. We confirmed our speculation by achieving
|
||||
approximate mapping 50 times faster than BWA-MEM~\citep{Li:2016aa}.
|
||||
\citet{Suzuki:2016} extended our work with a fast and novel algorithm on
|
||||
generating base-level alignment, which in turn inspired us to develop minimap2
|
||||
towards higher accuracy and more practical functionality.
|
||||
|
||||
Both SMRT and ONT have been applied to sequence spliced mRNAs (RNA-seq). While
|
||||
traditional mRNA aligners work~\citep{Wu:2005vn,Iwata:2012aa}, they are not
|
||||
optimized for long noisy sequence reads and are tens of times slower than
|
||||
dedicated long-read aligners. When developing minimap2 initially for aligning
|
||||
genomic DNA only, we realized minor modifications could make it competitive for
|
||||
aligning mRNAs as well. Minimap2 is a first RNA-seq aligner specifically
|
||||
designed for long noisy reads.
|
||||
|
||||
\begin{methods}
|
||||
\section{Methods}
|
||||
|
||||
Minimap2 is the successor of minimap~\citep{Li:2016aa}. It uses similar
|
||||
indexing and seeding algorithms, and furthers it with more accurate chaining
|
||||
and the ability to produce detailed alignment.
|
||||
Minimap2 follows a typical seed-chain-align procedure as is used by most
|
||||
full-genome aligners. It collects minimizers~\citep{Roberts:2004fv} of the
|
||||
reference sequences and indexes them in a hash table. Then for each query
|
||||
sequence, minimap2 takes query minimizers as \emph{seeds}, finds matches to the
|
||||
reference, and identifies sets of colinear seeds, which are called
|
||||
\emph{chains}. If base-level alignment is requested, minimap2 applies dynamic
|
||||
programming (DP) to extend from the ends of chains and to close unseeded
|
||||
regions between adjacent seeds in chains.
|
||||
|
||||
Minimap2 uses indexing and seeding algorithms similar to
|
||||
minimap~\citep{Li:2016aa}, and furthers the predecessor with more accurate
|
||||
chaining, the ability to produce base-level alignment and the support of
|
||||
spliced alignment.
|
||||
|
||||
\subsection{Chaining}
|
||||
|
||||
%\subsubsection{Chaining}
|
||||
\subsubsection{Chaining}
|
||||
An \emph{anchor} is a 3-tuple $(x,y,w)$, indicating interval $[x-w+1,x]$ on the
|
||||
reference matching interval $[y-w+1,y]$ on the query. Given a list of anchors
|
||||
sorted by ending reference position $x$, let $f(i)$ be the maximal chaining
|
||||
score up to the $i$-th anchor in the list. $f(i)$ can be calculated with
|
||||
dynamic programming (DP):
|
||||
dynamic programming:
|
||||
\begin{equation}\label{eq:chain}
|
||||
f(i)=\max\big\{\max_{i>j\ge 1} \{ f(j)+d(j,i)-\gamma(j,i) \},w_i\big\}
|
||||
f(i)=\max\big\{\max_{i>j\ge 1} \{ f(j)+\alpha(j,i)-\beta(j,i) \},w_i\big\}
|
||||
\end{equation}
|
||||
where $d(j,i)=\min\big\{\min\{y_i-y_j,x_i-x_j\},w_i\big\}$ is the number of
|
||||
matching bases between the two anchors. $\gamma(j,i)>0$ is the gap cost. It
|
||||
where $\alpha(j,i)=\min\big\{\min\{y_i-y_j,x_i-x_j\},w_i\big\}$ is the number of
|
||||
matching bases between the two anchors. $\beta(j,i)>0$ is the gap cost. It
|
||||
equals $\infty$ if $y_j\ge y_i$ or $\max\{y_i-y_j,x_i-x_j\}>G$ (i.e. the
|
||||
distance between two anchors is too large); otherwise
|
||||
\[
|
||||
\gamma(j,i)=\gamma'(\max\{y_i-y_j,x_i-x_j\}-\min\{y_i-y_j,x_i-x_j\})
|
||||
\]
|
||||
In implementation, a gap of length $l$ costs $\gamma'(l)=\alpha\cdot
|
||||
l+\beta\log_2(l)$. For $m$ anchors, directly computing all $f(\cdot)$ with
|
||||
\begin{equation}\label{eq:chain-gap}
|
||||
\beta(j,i)=\gamma_c\big((y_i-y_j)-(x_i-x_j)\big)
|
||||
\end{equation}
|
||||
In implementation, a gap of length $l$ costs $\gamma_c(l)=0.01\cdot \bar{w}\cdot
|
||||
|l|+0.5\log_2|l|$, where $\bar{w}$ is the average seed length. For $m$ anchors, directly computing all $f(\cdot)$ with
|
||||
Eq.~(\ref{eq:chain}) takes $O(m^2)$ time. Although theoretically faster
|
||||
chaining algorithms exist~\citep{Abouelhoda:2005aa}, they
|
||||
are inapplicable to generic gap cost, complex to implement and usually
|
||||
|
|
@ -91,20 +113,20 @@ accelerate chaining.
|
|||
|
||||
We note that if anchor $i$ is chained to $j$, chaining $i$ to a predecessor
|
||||
of $j$ is likely to yield a lower score. When evaluating Eq.~(\ref{eq:chain}),
|
||||
we start from anchor $i-1$ and stop the evaluation if we cannot find a better
|
||||
we start from anchor $i-1$ and stop the process if we cannot find a better
|
||||
score after up to $h$ iterations. This approach reduces the average time to
|
||||
$O(h\cdot m)$. In practice, we can almost always find the optimal chain with
|
||||
$h=50$; even if the heuristic fails, the optimal chain is often close.
|
||||
|
||||
%\subsubsection{Backtracking}
|
||||
For backtracking, let $P(i)$ be the index of the best predecessor of anchor
|
||||
$i$. It equals 0 if $f(i)=w_i$ or $\argmax_j\{f(j)+\eta(j,i)-\gamma(j,i)\}$
|
||||
otherwise. For each anchor $i$ in the descending order of $f(i)$, we apply
|
||||
$P(\cdot)$ repeatedly to find its predecessor and mark each visited $i$ as
|
||||
`used', until $P(i)=0$ or we reach an already `used' $i$. This way we find all
|
||||
chains with no anchors used in more than one chains.
|
||||
\subsubsection{Backtracking}
|
||||
Let $P(i)$ be the index of the best predecessor of anchor $i$. It equals 0 if
|
||||
$f(i)=w_i$ or $\argmax_j\{f(j)+\eta(j,i)-\gamma(j,i)\}$ otherwise. For each
|
||||
anchor $i$ in the descending order of $f(i)$, we apply $P(\cdot)$ repeatedly to
|
||||
find its predecessor and mark each visited $i$ as `used', until $P(i)=0$ or we
|
||||
reach an already `used' $i$. This way we find all chains with no anchors used
|
||||
in more than one chains.
|
||||
|
||||
%\subsubsection{Identifying primary chains}
|
||||
\subsubsection{Identifying primary chains}
|
||||
In the absence of copy number changes, each query segment should not be mapped
|
||||
to two places in the reference. However, chains found at the previous step may
|
||||
have significant or complete overlaps due to repeats in the reference.
|
||||
|
|
@ -117,132 +139,22 @@ add the chain to $Q$. In the end, $Q$ contains all the primary chains. We did
|
|||
not choose a more sophisticated data structure (e.g. range tree or k-d tree)
|
||||
because this step is not the performance bottleneck.
|
||||
|
||||
\subsection{Alignment}
|
||||
\subsection{Aligning genomic DNA}
|
||||
|
||||
Minimap2 performs global alignment between adjacent anchors in a chain. It
|
||||
adopted difference-based formulation to derive
|
||||
alignment~\citep{Wu:1996aa,Suzuki:2016}. When combined with SSE vectorization,
|
||||
this formulation has two advantages. First, because each score in the DP matrix
|
||||
is bounded by the gap cost and the maximum matching score, we can usually
|
||||
achieve 16-way SSE vectorization regardless of the peak score of the
|
||||
alignment. Second, filling the DP matrix along the diagonal, we can simplify
|
||||
banded alignment, which is critical to performance. In practice, our
|
||||
implementation is three times as fast as Parasail's 4-way
|
||||
vectorization~\citep{Daily:2016aa} for global alignment.
|
||||
Without banding, our implementation is slower than Edlib~\citep{Sosic:2017aa},
|
||||
but with a 1000bp band, it is considerably faster. When performing global
|
||||
alignment between anchors, we expect the alignment to stay close to the
|
||||
diagonal of the DP matrix. Banding is often applicable.
|
||||
\subsubsection{Alignment with 2-piece affine gap cost}
|
||||
|
||||
Minimap2 uses a 2-piece affine gap cost
|
||||
$\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\}$.
|
||||
On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, this
|
||||
cost function is concave. It applies cost $q+l\cdot e$ to gaps shorter than
|
||||
$\lceil(\tilde{q}-q)/(e-\tilde{e})\rceil$ and applies
|
||||
$\tilde{q}+l\cdot\tilde{e}$ to longer gaps. This scheme helps to recover
|
||||
longer insertions and deletions~(INDEL; \citealp{Gotoh:1990aa}).
|
||||
Minimap2 performs DP-based global alignment between adjacent anchors in a
|
||||
chain. It uses a 2-piece affine gap cost~\citep{Gotoh:1990aa}:
|
||||
\begin{equation}\label{eq:2-piece}
|
||||
\gamma_a(l)=\min\{q+|l|\cdot e,\tilde{q}+|l|\cdot\tilde{e}\}
|
||||
\end{equation}
|
||||
Without losing generality, we always assume $q+e<\tilde{q}+\tilde{e}$.
|
||||
On the condition that $e>\tilde{e}$, it applies cost $q+|l|\cdot e$ to gaps
|
||||
shorter than $\lceil(\tilde{q}-q)/(e-\tilde{e})\rceil$ and applies
|
||||
$\tilde{q}+|l|\cdot\tilde{e}$ to longer gaps. This scheme helps to recover
|
||||
longer insertions and deletions~(INDELs).
|
||||
|
||||
With global alignment, minimap2 may force to align unrelated sequences between
|
||||
two adjacent anchors. To avoid such an artifact, we compute accumulative
|
||||
alignment score along the alignment path and break the alignment where the
|
||||
score drops too fast in the diagonal direction. More precisely, let $S(i,j)$ be
|
||||
the alignment score along the alignment path ending at cell $(i,j)$ in the DP
|
||||
matrix. We break the alignment if there exist $(i',j')$ and $(i,j)$, $i'<i$ and
|
||||
$j'<j$, such that
|
||||
\[
|
||||
S(i',j')-S(i,j)>Z+e\cdot(\max\{i-i',j-j'\}-\min\{i-i',j-j'\})
|
||||
\]
|
||||
where $e$ is the gap extension cost and $Z$ is an arbitrary threshold.
|
||||
This strategy is similar to X-drop employed in BLAST~\citep{Altschul:1997vn}.
|
||||
However, unlike X-drop, it would not break the alignment in the presence of a
|
||||
single long gap.
|
||||
|
||||
When minimap2 breaks a global alignment between two anchors, it performs local
|
||||
alignment between the two subsequences involved in the global alignment, but
|
||||
this time with the one subsequence reverse complemented. This additional
|
||||
alignment step may identify short inversions that are missed during chaining.
|
||||
|
||||
\end{methods}
|
||||
|
||||
\section{Results}
|
||||
\begin{figure}[!tb]
|
||||
\centering
|
||||
\includegraphics[width=.5\textwidth]{roc-color.pdf}
|
||||
\caption{Evaluation on simulated SMRT reads aligned against human genome
|
||||
GRCh38. (a) ROC-like curve. (b) Accumulative mapping error rate as a function
|
||||
of mapping quality. 33,088 $\ge$1000bp reads were simulated using
|
||||
pbsim~\citep{Ono:2013aa} with error profile sampled from file
|
||||
`m131017\_060208\_42213\_*.1.*' downloaded at
|
||||
\href{http://bit.ly/chm1p5c3}{http://bit.ly/chm1p5c3}. The N50 read length is
|
||||
11,628. A read is considered correctly mapped if the true position overlaps
|
||||
with the best mapping position by 10\% of the read length. All aligners were
|
||||
run under the default setting for SMRT reads.}\label{fig:eval}
|
||||
\end{figure}
|
||||
|
||||
As a sanity check, we evaluated minimap2 on simulated human reads along with
|
||||
BLASR~\citep{Chaisson:2012aa},
|
||||
BWA-MEM~\citep{Li:2013aa},
|
||||
GraphMap~\citep{Sovic:2016aa},
|
||||
minialign~\citep{Suzuki:2016} and
|
||||
NGMLR~\citep{Sedlazeck169557}. We excluded rHAT~\citep{Liu:2016ab},
|
||||
LAMSA~\citep{Liu:2017aa} and Kart~\citep{Lin:2017aa} because they either
|
||||
crashed or produced malformatted output. In this evaluation, Minimap2 has
|
||||
higher power to distinguish unique and repetitive hits, and achieves overall
|
||||
higher mapping accuracy (Fig.~\ref{fig:eval}a). It is still the most accurate
|
||||
even if we skip DP-based alignment (data not shown), suggesting chaining alone
|
||||
is sufficient to achieve high accuracy for approximate mapping. Minimap2 and
|
||||
NGMLR provide better mapping quality estimate: they rarely give repetitive hits
|
||||
high mapping quality (Fig.~\ref{fig:eval}b). Apparently, other aligners may
|
||||
occasionally miss close suboptimal hits and be overconfident in wrong mappings.
|
||||
On run time, minialign is slightly faster than minimap2. They are over 30 times
|
||||
faster than the rest. Minimap2 consumed 6.1GB memory at the peak, more than
|
||||
BWA-MEM but less than others.
|
||||
|
||||
On real human SMRT reads, the relative performance and sensitivity of
|
||||
these aligners are broadly similar to those on simulated data. We are unable to
|
||||
provide a good estimate of mapping error rate due to the lack of the truth. On
|
||||
ONT ultra-long human reads~\citep{Jain128835}, BWA-MEM failed. Minialign and
|
||||
minimap2 are over 70 times faster than others. We have also examined tens of
|
||||
$\ge$100bp INDELs in IGV~\citep{Robinson:2011aa} and can confirm the
|
||||
observation by~\citet{Sedlazeck169557} that BWA-MEM often breaks them into
|
||||
shorter gaps. Minimap2 does not have this issue.
|
||||
|
||||
\section{Discussions}
|
||||
|
||||
Minialign and minimap2 are fast because a) with chaining, they can quickly
|
||||
filter out most false seed hits~\citep{Li:2016aa} and reduce unsuccessful but
|
||||
costly DP-based alignments; b) they implemented so far the fastest DP-based
|
||||
alignment algorithm for long sequences~\citep{Suzuki:2016}. It is possible to
|
||||
further accelerate minimap2 with a few other tweaks such as adaptive
|
||||
banding~\citep{Suzuki130633} or incremental banding.
|
||||
|
||||
In addition to reference-based read mapping, minimap2 inherits minimap's
|
||||
ability to search against huge multi-species databases and to find read
|
||||
overlaps. On a few test data sets, minimap2 appears to yield slightly better
|
||||
miniasm assembly. Minimap2 can also align closely related genomes, though it
|
||||
would benefit from more thorough evaluations. Genome alignment is an intricate
|
||||
topic.
|
||||
|
||||
\section*{Acknowledgements}
|
||||
We owe a debt of gratitude to Hajime Suzuki for releasing his masterpiece and
|
||||
insightful notes before formal publication. We thank M. Schatz, P. Rescheneder
|
||||
and F. Sedlazeck for pointing out the limitation of BWA-MEM. We are also
|
||||
grateful to early minimap2 testers who have greatly helped to fix various
|
||||
issues.
|
||||
|
||||
\bibliography{minimap2}
|
||||
|
||||
\pagebreak
|
||||
\appendix
|
||||
|
||||
\begin{methods}
|
||||
\section*{Appendix}
|
||||
A 2-piece gap cost function is
|
||||
\[
|
||||
\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\}
|
||||
\]
|
||||
Without losing generality, we assume $q+e\le\tilde{q}+\tilde{e}$. The equation
|
||||
to compute the optimal alignment under such a gap cost is~\citep{Gotoh:1990aa}
|
||||
The equation to compute the optimal alignment under $\gamma_a(\cdot)$ is
|
||||
\begin{equation}\label{eq:ae86}
|
||||
\left\{\begin{array}{l}
|
||||
H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij},\tilde{F}_{ij}\}\\
|
||||
|
|
@ -253,7 +165,20 @@ F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\
|
|||
\end{array}\right.
|
||||
\end{equation}
|
||||
where $s(i,j)$ is the score between the $i$-th reference base and $j$-th query
|
||||
base. If we define
|
||||
base. Eq.~(\ref{eq:ae86}) is a natural extension to the equation under affine
|
||||
gap cost~\citep{Gotoh:1982aa,Altschul:1986aa}.
|
||||
|
||||
\subsubsection{Suzuki's formulation}
|
||||
|
||||
When we allow gaps longer than several hundred base pairs, nucleotide-level
|
||||
alignment is much slower than chaining. SSE acceleration is critical to the
|
||||
performance of minimap2. Traditional SSE implementations~\citep{Farrar:2007hs}
|
||||
based on Eq.~(\ref{eq:ae86}) can achieve 16-way parallelization for short
|
||||
sequences, but only 4-way parallelization when the peak alignment score reaches
|
||||
32767. Long sequence alignment may exceed this threshold. Inspired by
|
||||
\citet{Wu:1996aa} and the following work, \citet{Suzuki:2016} proposed a
|
||||
difference-based formulation that lifted this limitation.
|
||||
In case of 2-piece gap cost, define
|
||||
\[
|
||||
\left\{\begin{array}{ll}
|
||||
u_{ij}\triangleq H_{ij}-H_{i-1,j} & v_{ij}\triangleq H_{ij}-H_{i,j-1} \\
|
||||
|
|
@ -261,7 +186,7 @@ x_{ij}\triangleq E_{i+1,j}-H_{ij} & \tilde{x}_{ij}\triangleq \tilde{E}_{i+1,j}-\
|
|||
y_{ij}\triangleq F_{i,j+1}-H_{ij} & \tilde{y}_{ij}\triangleq \tilde{F}_{i,j+1}-\tilde{H}_{ij}
|
||||
\end{array}\right.
|
||||
\]
|
||||
we can transform Eq.~(\ref{eq:ae86}) to
|
||||
We can transform Eq.~(\ref{eq:ae86}) to
|
||||
\begin{equation}\label{eq:suzuki}
|
||||
\left\{\begin{array}{lll}
|
||||
z_{ij}&=&\max\{s(i,j),x_{i-1,j}+v_{i-1,j},y_{i,j-1}+u_{i,j-1},\\
|
||||
|
|
@ -276,7 +201,8 @@ y_{ij}&=&\max\{0,y_{i,j-1}+u_{i,j-1}-z_{ij}+q\}-q-e\\
|
|||
\end{equation}
|
||||
where $z_{ij}$ is a temporary variable that does not need to be stored.
|
||||
|
||||
All values in Eq.~(\ref{eq:suzuki}) are bounded. To see that,
|
||||
An important property of Eq.~(\ref{eq:suzuki}) is that all values are bounded
|
||||
by scoring parameters. To see that,
|
||||
\[
|
||||
x_{ij}=E_{i+1,j}-H_{ij}=\max\{-q,E_{ij}-H_{ij}\}-e
|
||||
\]
|
||||
|
|
@ -294,19 +220,19 @@ matching score, we can derive
|
|||
\[
|
||||
u_{ij}\le M-v_{i-1,j}\le M+q+e
|
||||
\]
|
||||
In conclusion, $x$ and $y$ are bounded by $[-q-e,-e]$, $\tilde{x}$ and $\tilde{y}$ by
|
||||
$[-\tilde{q}-\tilde{e},-\tilde{e}]$, and $u$ and $v$ by $[-q-e,M+q+e]$. When
|
||||
matching score and gap cost are small, each of them can be stored as a 8-bit
|
||||
integer. This enables 16-way SSE vectorization regardless of the peak score of
|
||||
the alignment.
|
||||
In conclusion, in Eq.~(\ref{eq:suzuki}), $x$ and $y$ are bounded by $[-q-e,-e]$,
|
||||
$\tilde{x}$ and $\tilde{y}$ by $[-\tilde{q}-\tilde{e},-\tilde{e}]$, and $u$ and
|
||||
$v$ by $[-q-e,M+q+e]$. When $-128\le-q-e<M+q+e\le127$, each of them can be stored as
|
||||
a 8-bit integer. This enables 16-way SSE vectorization regardless of the peak
|
||||
score of the alignment.
|
||||
|
||||
For a more efficient SSE implementation, we transform the row-column coordinate
|
||||
to diagonal-anti-diagonal coordinate by letting $r\gets i+j$ and $t\gets i$.
|
||||
to the diagonal-antidiagonal coordinate by letting $r\gets i+j$ and $t\gets i$.
|
||||
Eq.~(\ref{eq:suzuki}) becomes:
|
||||
\begin{equation*}
|
||||
\left\{\begin{array}{lll}
|
||||
z_{rt}&=&\max\{s(t,r-t),x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}+u_{r-1,t},\\
|
||||
&&\tilde{x}_{r-1,t-1}+v_{r-1,t-1},\tilde{y}_{r-1,t}+u_{r-1,t}\}\\
|
||||
z_{rt}&=&\max\{s(t,r-t),x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}\\
|
||||
&&+u_{r-1,t},\tilde{x}_{r-1,t-1}+v_{r-1,t-1},\tilde{y}_{r-1,t}+u_{r-1,t}\}\\
|
||||
u_{rt}&=&z_{rt}-v_{r-1,t-1}\\
|
||||
v_{rt}&=&z_{rt}-u_{r-1,t}\\
|
||||
x_{rt}&=&\max\{0,x_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+q\}-q-e\\
|
||||
|
|
@ -317,10 +243,11 @@ y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\}-q-e\\
|
|||
\end{equation*}
|
||||
In this formulation, cells with the same diagonal index $r$ are independent of
|
||||
each other. This allows us to fully vectorize the computation of all cells on
|
||||
the same anti-diagonal in one inner loop.
|
||||
the same anti-diagonal in one inner loop. It also simplifies banded alignment,
|
||||
which would be difficult with striped vectorization~\citep{Farrar:2007hs}.
|
||||
|
||||
On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the boundary
|
||||
condition of this equation in the diagonal-anti-diagonal coordinate is
|
||||
On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the initial
|
||||
values in the diagonal-antidiagonal formuation is
|
||||
\[
|
||||
\left\{\begin{array}{l}
|
||||
x_{r-1,-1}=y_{r-1,r}=-q-e\\
|
||||
|
|
@ -337,8 +264,246 @@ r\cdot(e-\tilde{e})-(\tilde{q}-q)-\tilde{e} & (r=\lceil\frac{\tilde{q}-q}{e-\til
|
|||
-\tilde{e} & (r>\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil)
|
||||
\end{array}\right.
|
||||
\]
|
||||
These can be derived from the initial values for Eq.~(\ref{eq:ae86}).
|
||||
|
||||
In practice, our 16-way vectorized implementation of global alignment is three
|
||||
times as fast as Parasail's 4-way vectorization~\citep{Daily:2016aa}. Without
|
||||
banding, our implementation is slower than Edlib~\citep{Sosic:2017aa}, but with
|
||||
a 1000bp band, it is considerably faster. When performing global alignment
|
||||
between anchors, we expect the alignment to stay close to the diagonal of the
|
||||
DP matrix. Banding is applicable most of time.
|
||||
|
||||
\subsubsection{The Z-drop heuristic}
|
||||
|
||||
With global alignment, minimap2 may force to align unrelated sequences between
|
||||
two adjacent anchors. To avoid such an artifact, we compute accumulative
|
||||
alignment score along the alignment path and break the alignment where the
|
||||
score drops too fast in the diagonal direction. More precisely, let $S(i,j)$ be
|
||||
the alignment score along the alignment path ending at cell $(i,j)$ in the DP
|
||||
matrix. We break the alignment if there exist $(i',j')$ and $(i,j)$, $i'<i$ and
|
||||
$j'<j$, such that
|
||||
\[
|
||||
S(i',j')-S(i,j)>Z+e\cdot|(i-i')-(j-j')|
|
||||
\]
|
||||
where $e$ is the gap extension cost and $Z$ is an arbitrary threshold.
|
||||
This strategy is first used in BWA-MEM. It is similar to X-drop employed in
|
||||
BLAST~\citep{Altschul:1997vn}, but unlike X-drop, it would not break the
|
||||
alignment in the presence of a single long gap.
|
||||
|
||||
When minimap2 breaks a global alignment between two anchors, it performs local
|
||||
alignment between the two subsequences involved in the global alignment, but
|
||||
this time with the one subsequence reverse complemented. This additional
|
||||
alignment step may identify short inversions that are missed during chaining.
|
||||
|
||||
\subsection{Aligning spliced sequences}
|
||||
|
||||
The algorithm described above can be adapted to spliced alignment. In this
|
||||
mode, the chaining gap cost distinguishes insertions to and deletions from the
|
||||
reference: $\gamma_c(l)$ in Eq.~(\ref{eq:chain-gap}) takes the form of
|
||||
\[
|
||||
\gamma_c(l)=\left\{\begin{array}{ll}
|
||||
0.01\cdot\bar{w}\cdot l+0.5\log_2 l & (l>0) \\
|
||||
\min\{0.01\cdot\bar{w}\cdot|l|,\log_2|l|\} & (l<0)
|
||||
\end{array}\right.
|
||||
\]
|
||||
Similarly, the gap cost function used for DP-based alignment is changed to
|
||||
\[
|
||||
\gamma_a(l)=\left\{\begin{array}{ll}
|
||||
q+l\cdot e & (l>0) \\
|
||||
\min\{q+|l|\cdot e,\tilde{q}\} & (l<0)
|
||||
\end{array}\right.
|
||||
\]
|
||||
In alignment, a deletion no shorter than $\lceil(\tilde{q}-q)/e\rceil$ is
|
||||
regarded as an intron, which pays no cost to gap extensions.
|
||||
|
||||
To pinpoint precise splicing junctions, minimap2 introduces reference-dependent
|
||||
cost to penalize non-canonical splicing:
|
||||
\begin{equation}\label{eq:splice}
|
||||
\left\{\begin{array}{l}
|
||||
H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij}-a(i)\}\\
|
||||
E_{i+1,j}= \max\{H_{ij}-q,E_{ij}\}-e\\
|
||||
F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\
|
||||
\tilde{E}_{i+1,j}= \max\{H_{ij}-d(i)-\tilde{q},\tilde{E}_{ij}\}\\
|
||||
\end{array}\right.
|
||||
\end{equation}
|
||||
Let $T$ be the reference sequence. $d(i)$ is the cost of a non-canonical donor
|
||||
site, which takes 0 if $T[i+1,i+2]={\tt GT}$, or a positive number $p$
|
||||
otherwise. Similarly, $a(i)$ is the cost of a non-canonical acceptor site, which
|
||||
takes 0 if $T[i-1,i]={\tt AG}$, or $p$ otherwise. Eq.~(\ref{eq:splice}) is
|
||||
almost equivalent to the equation used by EXALIN~\citep{Zhang:2006aa} except
|
||||
that we allow insertions immediately followed by deletions and vice versa; in
|
||||
addition, we use Suzuki's diagonal formulation in actual implementation.
|
||||
|
||||
%Given that $d_i$ and $a_i$
|
||||
%are a function of the reference sequence, it is possible to incorporate
|
||||
%splicing signals with more sophisticated models, such as positional weight
|
||||
%matrices. We have not tried this approach.
|
||||
|
||||
If RNA-seq reads are not sequenced from stranded libraries, the read strand
|
||||
relative to the underlying transcript is unknown. By default, minimap2 aligns
|
||||
each chain twice, first assuming ${\tt GT}$--${\tt AG}$ as the splicing signal
|
||||
and then assuming ${\tt CT}$--${\tt AC}$, the reverse complement of ${\tt
|
||||
GT}$--${\tt AG}$, as the splicing signal. The alignment with a higher score is
|
||||
taken as the final alignment. This procedure also infers the relative strand of
|
||||
reads that span canonical splicing sites.
|
||||
|
||||
In the spliced alignment mode, minimap2 further increases the density of
|
||||
minimizers and disables banded alignment. Together with the two-round DP-based
|
||||
alignment, spliced alignment is several times slower than DNA sequence
|
||||
alignment.
|
||||
|
||||
\citet{Suzuki:2016} first derived a similar set of equations under affine gap
|
||||
cost but with different notations.
|
||||
\end{methods}
|
||||
|
||||
\section{Results}
|
||||
|
||||
\subsection{Aligning genomic reads}
|
||||
|
||||
\begin{figure}[!tb]
|
||||
\centering
|
||||
\includegraphics[width=.5\textwidth]{roc-color.pdf}
|
||||
\caption{Evaluation on simulated SMRT reads aligned against human genome
|
||||
GRCh38. 33,088 $\ge$1000bp reads were simulated using pbsim~\citep{Ono:2013aa}
|
||||
with error profile sampled from file `m131017\_060208\_42213\_*.1.*' downloaded
|
||||
at \href{http://bit.ly/chm1p5c3}{http://bit.ly/chm1p5c3}. The N50 read length
|
||||
is 11,628. A read is considered correctly mapped if the true position overlaps
|
||||
with the best mapping position by 10\% of the read length. All aligners were
|
||||
run under the default setting for SMRT reads. (a) ROC-like curve. Alignments
|
||||
are sorted by mapping quality in the descending order. For each mapping quality
|
||||
threshold, the fraction of alignments with mapping quality above the threshold
|
||||
and their error rate are plotted. Kart outputted all alignments at mapping
|
||||
quality 60, so is not shown in the figure. It mapped nearly all reads with
|
||||
4.1\% of alignments being wrong, less accurate than others. (b) Accumulative
|
||||
mapping error rate as a function of mapping quality.}\label{fig:eval}
|
||||
\end{figure}
|
||||
|
||||
As a sanity check, we evaluated minimap2 on simulated human reads along with
|
||||
BLASR~(v1.MC.rc64; \citealp{Chaisson:2012aa}),
|
||||
BWA-MEM~(v0.7.15; \citealp{Li:2013aa}),
|
||||
GraphMap~(v0.5.2; \citealp{Sovic:2016aa}),
|
||||
Kart~(v2.2.5; \citealp{Lin:2017aa}),
|
||||
minialign~(v0.5.3; \citealp{Suzuki:2016}) and
|
||||
NGMLR~(v0.2.5; \citealp{Sedlazeck169557}). We excluded rHAT~\citep{Liu:2016ab}
|
||||
and LAMSA~\citep{Liu:2017aa} because they either
|
||||
crashed or produced malformatted output. In this evaluation, minimap2 has
|
||||
higher power to distinguish unique and repetitive hits, and achieves overall
|
||||
higher mapping accuracy (Fig.~\ref{fig:eval}a). It is still the most accurate
|
||||
even if we skip DP-based alignment (data not shown), confirming chaining alone
|
||||
is sufficient to achieve high accuracy for approximate mapping. Minimap2 and
|
||||
NGMLR provide better mapping quality estimate: they rarely give repetitive hits
|
||||
high mapping quality (Fig.~\ref{fig:eval}b). Apparently, other aligners may
|
||||
occasionally miss close suboptimal hits and be overconfident in wrong mappings.
|
||||
On run time, minialign is slightly faster than minimap2 and Kart. They are over
|
||||
30 times faster than the rest. Minimap2 consumed 6.1GB memory at the peak,
|
||||
more than BWA-MEM but less than others.
|
||||
|
||||
On real human SMRT reads, the relative performance and sensitivity of
|
||||
these aligners are broadly similar to the metrics on simulated data. We are
|
||||
unable to provide a good estimate of mapping error rate due to the lack of the
|
||||
truth. On ONT $\sim$100kb human reads~\citep{Jain128835}, BWA-MEM failed.
|
||||
Kart, minialign and minimap2 are over 70 times faster than others. We have also
|
||||
examined tens of $\ge$100bp INDELs in IGV~\citep{Robinson:2011aa} and can
|
||||
confirm the observation by~\citet{Sedlazeck169557} that BWA-MEM often breaks
|
||||
them into shorter gaps. The issue is much alleviated with minimap2, thanks
|
||||
to the 2-piece affine gap cost.
|
||||
|
||||
\subsection{Aligning spliced reads}
|
||||
|
||||
We evaluated minimap2 on SIRV control data~(AC:SRR5286959;
|
||||
\citealp{Byrne:2017aa}) where the truth is known. Minimap2 predicted 59\,916
|
||||
introns from 11\,017 reads. 93.0\% of splice juctions are precise. We examined
|
||||
wrongly predicted junctions and found the majority were caused by clustered
|
||||
splicing signals (e.g. two adjacent ${\tt GT}$ sites). When INDEL sequencing
|
||||
errors are frequent, it is difficult to find precise splicing sites in this
|
||||
case. If we allow up to 10bp distance from true splicing sites, 98.4\% of
|
||||
aligned introns are approximately correct. Given this observation, we might be
|
||||
able to improve boundary detection by initializing $d(\cdot)$ and $a(\cdot)$ in
|
||||
Eq.~(\ref{eq:splice}) with position-specific scoring matrices or more
|
||||
sophisticated models. We have not tried this approach.
|
||||
|
||||
\begin{table}[!tb]
|
||||
\processtable{Evaluation of junction accuracy on 2D ONT reads}
|
||||
{\footnotesize\label{tab:intron}
|
||||
\begin{tabular}{p{3.1cm}rrrr}
|
||||
\toprule
|
||||
& GMAP & minimap2 & SpAln & STAR\\
|
||||
\midrule
|
||||
Run time (CPU min) & 631 & 15.5 & 2\,076 & 33.9 \\
|
||||
Peak RAM (GByte) & 8.9 & 14.5 & 3.2 & 29.2\vspace{1em}\\
|
||||
\# aligned reads & 103\,669 & 103\,917 & 103\,711 & 26\,479\\
|
||||
\# chimeric alignments & 1\,904 & 1\,671 & 0 & 0\\
|
||||
\# non-spliced alignments & 15\,854 & 14\,483 & 17\,033 & 10\,545\vspace{1em}\\
|
||||
\# aligned introns & 692\,275 & 694\,237 & 692\,945 & 78\,603 \\
|
||||
\# novel introns & 11\,239 & 3\,217 & 8\,550 & 1\,214 \\
|
||||
\% exact introns & 83.8\% & 91.8\% & 87.9\% & 55.2\% \\
|
||||
\% approx. introns & 91.8\% & 96.5\% & 92.5\% & 82.4\% \\
|
||||
\botrule
|
||||
\end{tabular}
|
||||
}{Mouse reads (AC:SRR5286960) were mapped to the primary assembly of mouse
|
||||
genome GRCm38 with the following tools and command options: minimap2 (`-ax
|
||||
splice'); GMAP (`-n 0 --min-intronlength 30 --cross-species'); SpAln (`-Q7 -LS
|
||||
-S3'); STARlong (according to
|
||||
\href{http://bit.ly/star-pb}{http://bit.ly/star-pb}). The alignments were
|
||||
compared to the EnsEMBL gene annotation, release 89. A predicted intron
|
||||
is \emph{novel} if it has no overlaps with any annotated introns. An intron
|
||||
is \emph{exact} if it is identical to an annotated intron. An intron is
|
||||
\emph{approximate} if both its 5'- and 3'-end are within 10bp around the ends
|
||||
of an annotated intron.}
|
||||
\end{table}
|
||||
|
||||
We next aligned real mouse reads~\citep{Byrne:2017aa} with GMAP~(v2017-06-20;
|
||||
\citealp{Wu:2005vn}), minimap2, SpAln~(v2.3.1; \citealp{Iwata:2012aa}) and
|
||||
STAR~(v2.5.3a; \citealp{Dobin:2013kx}). In general, minimap2 is more
|
||||
consistent with existing annotations (Table~\ref{tab:intron}): it finds
|
||||
more junctions with a higher percentage being exactly or approximately correct.
|
||||
Minimap2 is over 40 times faster than GMAP and SpAln. While STAR is close to
|
||||
minimap2 in speed, it does not work well with noisy reads. We have also
|
||||
evaluated spliced aligners on public Iso-Seq data (human Alzheimer brain
|
||||
from \href{http://bit.ly/isoseqpub}{http://bit.ly/isoseqpub}). The observation
|
||||
is similar: minimap2 is faster at higher junction accuracy.
|
||||
|
||||
We noted that GMAP and SpAln have not been optimized for noisy reads. We are
|
||||
showing the best setting we have experimented, but their developers should be
|
||||
able to improve their accuracy further.
|
||||
|
||||
%\begin{table}[!tb]
|
||||
%\processtable{Evaluation of junction accuracy on SMRT Iso-Seq reads}
|
||||
%{\footnotesize
|
||||
%\begin{tabular}{lrrrr}
|
||||
%\toprule
|
||||
%& GMAP & minimap2 & SpAln & STAR\\
|
||||
%\midrule
|
||||
%Run time (CPU min) & & 243 & 2\,352 & 1\,647 \\
|
||||
%\# aligned reads & & 1\,123\,025 & 1\,094\,092 & 682\,452\\
|
||||
%\# chimeric alignments & & 33\,091 & 0 & 0\\
|
||||
%\# non-spliced alignments & & 339\,081 & 291\,447 & 272\,536\vspace{1em}\\
|
||||
%\# aligned introns & & 9\,071\,755 & 9\,208\,564 & 3\,029\,121 \\
|
||||
%\# novel introns & & 42\,773 & 82\,230 & 17\,791 \\
|
||||
%\% exact introns & & 94.9\% & 91.7\% & 84.7\% \\
|
||||
%\% approx. introns&& 96.9\% & 93.4\% & 93.8\% \\
|
||||
%\botrule
|
||||
%\end{tabular}
|
||||
%}{}
|
||||
%\end{table}
|
||||
|
||||
|
||||
\section{Conclusion}
|
||||
|
||||
Minimap2 is a fast, accurate and versatile aligner for long nucleotide
|
||||
sequences. In addition to reference-based read mapping, minimap2 inherits
|
||||
minimap's functionality to search against huge multi-species databases and to
|
||||
find read overlaps. On a few test data sets, minimap2 appears to yield slightly
|
||||
better miniasm assembly~\citep{Li:2016aa}. Minimap2 can also align similar
|
||||
genomes or different assemblies of the same species. However, full-genome
|
||||
alignment is an intricate research topic. More thorough evaluations would be
|
||||
necessary to justify the use of minimap2 for such applications.
|
||||
|
||||
\section*{Acknowledgements}
|
||||
We owe a debt of gratitude to Hajime Suzuki for releasing his masterpiece and
|
||||
insightful notes before formal publication. We thank M. Schatz, P. Rescheneder
|
||||
and F. Sedlazeck for pointing out the limitation of BWA-MEM. We are also
|
||||
grateful to early minimap2 testers who have greatly helped to suggest features
|
||||
and to fix various issues.
|
||||
|
||||
\bibliography{minimap2}
|
||||
|
||||
\end{document}
|
||||
|
|
|
|||
Loading…
Reference in New Issue