diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..e386b27 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,5 @@ +language: c +compiler: + - gcc + - clang +script: make diff --git a/format.c b/format.c index a8d681e..fd51c00 100644 --- a/format.c +++ b/format.c @@ -1,7 +1,9 @@ #include #include #include +#include #include +#include "kalloc.h" #include "mmpriv.h" static inline void str_enlarge(kstring_t *s, int l) @@ -54,6 +56,63 @@ static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...) s->s[s->l] = 0; } +static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r) +{ + extern unsigned char seq_nt4_table[256]; + int i, q_off, t_off; + uint8_t *qseq, *tseq; + char *tmp; + if (r->p == 0) return; + mm_sprintf_lite(s, "\tcs:Z:"); + qseq = (uint8_t*)kmalloc(km, r->qe - r->qs); + tseq = (uint8_t*)kmalloc(km, r->re - r->rs); + tmp = (char*)kmalloc(km, r->re - r->rs > r->qe - r->qs? r->re - r->rs + 1 : r->qe - r->qs + 1); + mm_idx_getseq(mi, r->rid, r->rs, r->re, tseq); + if (!r->rev) { + for (i = r->qs; i < r->qe; ++i) + qseq[i - r->qs] = seq_nt4_table[(uint8_t)t->seq[i]]; + } else { + for (i = r->qs; i < r->qe; ++i) { + uint8_t c = seq_nt4_table[(uint8_t)t->seq[i]]; + qseq[r->qe - i - 1] = c >= 4? 4 : 3 - c; + } + } + for (i = q_off = t_off = 0; i < r->p->n_cigar; ++i) { + int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4; + assert(op >= 0 && op <= 2); + if (op == 0) { + int l_tmp = 0; + for (j = 0; j < len; ++j) { + if (qseq[q_off + j] != tseq[t_off + j]) { + if (l_tmp > 0) { + tmp[l_tmp] = 0; + mm_sprintf_lite(s, "=%s", tmp); + l_tmp = 0; + } + mm_sprintf_lite(s, "*%c%c", "acgtn"[tseq[t_off + j]], "acgtn"[qseq[q_off + j]]); + } else tmp[l_tmp++] = "ACGTN"[qseq[q_off + j]]; + } + if (l_tmp > 0) { + tmp[l_tmp] = 0; + mm_sprintf_lite(s, "=%s", tmp); + } + q_off += len, t_off += len; + } else if (op == 1) { + for (j = 0, tmp[len] = 0; j < len; ++j) + tmp[j] = "acgtn"[qseq[q_off + j]]; + mm_sprintf_lite(s, "+%s", tmp); + q_off += len; + } else if (op == 2) { + for (j = 0, tmp[len] = 0; j < len; ++j) + tmp[j] = "acgtn"[tseq[t_off + j]]; + mm_sprintf_lite(s, "-%s", tmp); + t_off += len; + } + } + assert(t_off == r->re - r->rs && q_off == r->qe - r->qs); + kfree(km, qseq); kfree(km, tseq); kfree(km, tmp); +} + static inline void write_tags(kstring_t *s, const mm_reg1_t *r) { int type = r->inv? 'I' : r->id == r->parent? 'P' : 'S'; @@ -63,7 +122,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r) if (r->p) mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi); } -void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r) +void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag) { s->l = 0; mm_sprintf_lite(s, "%s\t%d\t%d\t%d\t%c\t", t->name, t->l_seq, r->qs, r->qe, "+-"[r->rev]); @@ -74,12 +133,14 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m else mm_sprintf_lite(s, "\t%d\t%d", r->fuzzy_mlen, r->fuzzy_blen); mm_sprintf_lite(s, "\t%d", r->mapq); write_tags(s, r); - if (r->p) { + if (r->p && (opt_flag & MM_F_OUT_CG)) { uint32_t k; mm_sprintf_lite(s, "\tcg:Z:"); for (k = 0; k < r->p->n_cigar; ++k) mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MID"[r->p->cigar[k]&0xf]); } + if (r->p && (opt_flag & MM_F_OUT_CS)) + write_cs(km, s, mi, t, r); } static char comp_tab[] = { diff --git a/ksw2_extd2_sse.c b/ksw2_extd2_sse.c index e0f5337..c7f5843 100644 --- a/ksw2_extd2_sse.c +++ b/ksw2_extd2_sse.c @@ -1,5 +1,6 @@ #include #include +#include #include "ksw2.h" #ifdef __SSE2__ @@ -66,7 +67,8 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin if (w < 0) w = tlen > qlen? tlen : qlen; wl = wr = w; tlen_ = (tlen + 15) / 16; - n_col_ = ((w + 1 < tlen? (w + 1 < qlen? w + 1 : qlen): tlen) + 15) / 16 + 1; + n_col_ = qlen < tlen? qlen : tlen; + n_col_ = ((n_col_ < w + 1? n_col_ : w + 1) + 15) / 16 + 1; qlen_ = (qlen + 15) / 16; for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) { max_sc = max_sc > mat[t]? max_sc : mat[t]; @@ -161,6 +163,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin x21_ = _mm_cvtsi32_si128((uint8_t)x21); v1_ = _mm_cvtsi32_si128((uint8_t)v1); st_ = st / 16, en_ = en / 16; + assert(en_ - st_ + 1 <= n_col_); if (!with_cigar) { // score only for (t = st_; t <= en_; ++t) { __m128i z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp; diff --git a/ksw2_extz2_sse.c b/ksw2_extz2_sse.c index d665c6f..04669b9 100644 --- a/ksw2_extz2_sse.c +++ b/ksw2_extz2_sse.c @@ -1,4 +1,5 @@ #include +#include #include "ksw2.h" #ifdef __SSE2__ @@ -58,7 +59,8 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin if (w < 0) w = tlen > qlen? tlen : qlen; wl = wr = w; tlen_ = (tlen + 15) / 16; - n_col_ = ((w + 1 < tlen? (w + 1 < qlen? w + 1 : qlen): tlen) + 15) / 16 + 1; + n_col_ = qlen < tlen? qlen : tlen; + n_col_ = ((n_col_ < w + 1? n_col_ : w + 1) + 15) / 16 + 1; qlen_ = (qlen + 15) / 16; for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) { max_sc = max_sc > mat[t]? max_sc : mat[t]; @@ -130,6 +132,7 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin x1_ = _mm_cvtsi32_si128(x1); v1_ = _mm_cvtsi32_si128(v1); st_ = st / 16, en_ = en / 16; + assert(en_ - st_ + 1 <= n_col_); if (!with_cigar) { // score only for (t = st_; t <= en_; ++t) { __m128i z, a, b, xt1, vt1, ut, tmp; diff --git a/main.c b/main.c index 9cc467d..54c49d8 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r275" +#define MM_VERSION "2.0-r276-dirty" void liftrlimit() { @@ -54,7 +54,7 @@ int main(int argc, char *argv[]) mm_realtime0 = realtime(); mm_mapopt_init(&opt); - while ((c = getopt_long(argc, argv, "aw:k:K:t:r:f:Vv:g:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Q", long_options, &long_idx)) >= 0) { + while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Q", long_options, &long_idx)) >= 0) { if (c == 'w') w = atoi(optarg), idx_par_set = 1; else if (c == 'k') k = atoi(optarg), idx_par_set = 1; else if (c == 'H') is_hpc = 1, idx_par_set = 1; @@ -67,7 +67,8 @@ int main(int argc, char *argv[]) else if (c == 'N') opt.best_n = atoi(optarg); else if (c == 'p') opt.pri_ratio = atof(optarg); else if (c == 'M') opt.mask_level = atof(optarg); - else if (c == 'c') opt.flag |= MM_F_CIGAR; + else if (c == 'c') opt.flag |= MM_F_OUT_CG | MM_F_CIGAR; + else if (c == 'S') opt.flag |= MM_F_OUT_CS | MM_F_CIGAR; else if (c == 'X') opt.flag |= MM_F_AVA | MM_F_NO_SELF; else if (c == 'a') opt.flag |= MM_F_OUT_SAM | MM_F_CIGAR; else if (c == 'Q') opt.flag |= MM_F_NO_QUAL; @@ -166,6 +167,7 @@ int main(int argc, char *argv[]) fprintf(stderr, " -Q ignore base quality in the input\n"); fprintf(stderr, " -a output in the SAM format (PAF by default)\n"); fprintf(stderr, " -c output CIGAR in PAF\n"); + fprintf(stderr, " -S output the cs tag in PAF\n"); fprintf(stderr, " -t INT number of threads [%d]\n", n_threads); fprintf(stderr, " -K NUM minibatch size [200M]\n"); // fprintf(stderr, " -v INT verbose level [%d]\n", mm_verbose); diff --git a/map.c b/map.c index bfb5d3d..4ea111e 100644 --- a/map.c +++ b/map.c @@ -338,16 +338,18 @@ static void *worker_pipeline(void *shared, int step, void *in) kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_seq); return in; } else if (step == 2) { // step 2: output + void *km = 0; step_t *s = (step_t*)in; const mm_idx_t *mi = p->mi; for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]); free(s->buf); + if ((p->opt->flag & MM_F_OUT_CS) && !(mm_dbg_flag & MM_DBG_NO_KALLOC)) km = km_init(); for (i = 0; i < s->n_seq; ++i) { mm_bseq1_t *t = &s->seq[i]; for (j = 0; j < s->n_reg[i]; ++j) { mm_reg1_t *r = &s->reg[i][j]; if (p->opt->flag & MM_F_OUT_SAM) mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i]); - else mm_write_paf(&p->str, mi, t, r); + else mm_write_paf(&p->str, mi, t, r, km, p->opt->flag); puts(p->str.s); } if (s->n_reg[i] == 0 && (p->opt->flag & MM_F_OUT_SAM)) { @@ -360,6 +362,7 @@ static void *worker_pipeline(void *shared, int step, void *in) if (s->seq[i].qual) free(s->seq[i].qual); } free(s->reg); free(s->n_reg); free(s->seq); + km_destroy(km); if (mm_verbose >= 3) fprintf(stderr, "[M::%s::%.3f*%.2f] mapped %d sequences\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), s->n_seq); free(s); diff --git a/minimap.h b/minimap.h index 27eb3b4..076017f 100644 --- a/minimap.h +++ b/minimap.h @@ -12,6 +12,8 @@ #define MM_F_CIGAR 0x04 #define MM_F_OUT_SAM 0x08 #define MM_F_NO_QUAL 0x10 +#define MM_F_OUT_CG 0x20 +#define MM_F_OUT_CS 0x40 #define MM_IDX_MAGIC "MMI\2" diff --git a/misc/README.md b/misc/README.md new file mode 100644 index 0000000..c6e3371 --- /dev/null +++ b/misc/README.md @@ -0,0 +1,28 @@ +The [K8 Javascript shell][k8] is needed to run Javascripts in this directory. +Precompiled k8 binaries for Mac and Linux can be found at the [K8 release +page][k8bin]. + +* [paf2aln.js](paf2aln.js): convert PAF to [MAF][maf] or BLAST-like output for + eyeballing. PAF has to be generated with minimap2 option `-S`, which writes + the aligned sequences to the `cs` tag. An example: + ```sh + ../minimap2 -S ../test/MT-*.fa | k8 paf2aln.js /dev/stdin + ``` + +* [mapstat.js](mapstat.js): output basic statistics such as the number of + non-redundant mapped bases, number of split and secondary alignments and + number of long gaps. This scripts seamlessly works with both SAM and PAF. + +* [sim-pbsim.js](sim-pbsim.js): convert reads simulated with [PBSIM][pbsim] to + FASTA and encode the true mapping positions to read names in a format like + `S1_33!chr1!225258409!225267761!-`. + +* [sim-eval.js](sim-eval.js): evaluate mapping accuracy for FASTA generated + with [sim-pbsim.js](sim-pbsim.js) or [sim-mason2.js](sim-mason2.js). + +* [sam2paf.js](sam2paf.js): convert SAM to PAF. + +[k8]: https://github.com/attractivechaos/k8 +[k8bin]: https://github.com/attractivechaos/k8/releases +[maf]: https://genome.ucsc.edu/FAQ/FAQformat#format5 +[pbsim]: https://github.com/pfaucon/PBSIM-PacBio-Simulator diff --git a/misc/paf2aln.js b/misc/paf2aln.js new file mode 100644 index 0000000..eea4e6d --- /dev/null +++ b/misc/paf2aln.js @@ -0,0 +1,171 @@ +var getopt = function(args, ostr) { + var oli; // option letter list index + if (typeof(getopt.place) == 'undefined') + getopt.ind = 0, getopt.arg = null, getopt.place = -1; + if (getopt.place == -1) { // update scanning pointer + if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') { + getopt.place = -1; + return null; + } + if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--" + ++getopt.ind; + getopt.place = -1; + return null; + } + } + var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity + if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) { + if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null. + if (getopt.place < 0) ++getopt.ind; + return '?'; + } + if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument + getopt.arg = null; + if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1; + } else { // need an argument + if (getopt.place >= 0 && getopt.place < args[getopt.ind].length) + getopt.arg = args[getopt.ind].substr(getopt.place); + else if (args.length <= ++getopt.ind) { // no arg + getopt.place = -1; + if (ostr.length > 0 && ostr.charAt(0) == ':') return ':'; + return '?'; + } else getopt.arg = args[getopt.ind]; // white space + getopt.place = -1; + ++getopt.ind; + } + return optopt; +} + +var c, maf_out = false, line_len = 80; +while ((c = getopt(arguments, "ml:")) != null) { + if (c == 'm') maf_out = true; + else if (c == 'l') line_len = parseInt(getopt.arg); // TODO: not implemented yet +} +if (line_len == 0) line_len = 0x7fffffff; + +if (getopt.ind == arguments.length) { + print("Usage: k8 paf2aln.js [options] "); + print("Options:"); + print(" -m MAF output (BLAST-like output by default)"); + print(" -l INT line length in BLAST-like output [80]"); + print(""); + print("Note: this script only works when minimap2 is run with option '-S'"); + exit(1); +} + +function padding_str(x, len, right) +{ + var s = x.toString(); + if (s.length < len) { + if (right) s += Array(len - s.length + 1).join(" "); + else s = Array(len - s.length + 1).join(" ") + s; + } + return s; +} + +function update_aln(s_ref, s_qry, s_mid, type, seq, slen) +{ + var l = type == '*'? 1 : seq.length; + if (type == '=') { + s_ref.set(seq); + s_qry.set(seq); + s_mid.set(Array(l+1).join("|")); + slen[0] += l, slen[1] += l; + } else if (type == '*') { + s_ref.set(seq.charAt(0)); + s_qry.set(seq.charAt(1)); + s_mid.set(' '); + slen[0] += 1, slen[1] += 1; + } else if (type == '+') { + s_ref.set(Array(l+1).join("-")); + s_qry.set(seq); + s_mid.set(Array(l+1).join(" ")); + slen[1] += l; + } else if (type == '-') { + s_ref.set(seq); + s_qry.set(Array(l+1).join("-")); + s_mid.set(Array(l+1).join(" ")); + slen[0] += l; + } +} + +function print_aln(rs, qs, strand, slen, elen, s_ref, s_qry, s_mid) +{ + print(["Ref+:", padding_str(rs + slen[0] + 1, 10, false), s_ref.toString(), padding_str(rs + elen[0], 10, true)].join(" ")); + print(" " + s_mid.toString()); + var st, en; + if (strand == '+') st = qs + slen[1] + 1, en = qs + elen[1]; + else st = qs - slen[1], en = qs - elen[1] + 1; + print(["Qry" + strand + ":", padding_str(st, 10, false), s_qry.toString(), padding_str(en , 10, true)].join(" ")); +} + +var s_ref = new Bytes(), s_qry = new Bytes(), s_mid = new Bytes(); +var re = /([=\-\+\*])([A-Za-z]+)/g; + +var buf = new Bytes(); +var file = new File(arguments[getopt.ind]); +if (maf_out) print("##maf version=1\n"); +while (file.readline(buf) >= 0) { + var m, line = buf.toString(); + var t = line.split("\t", 12); + if ((m = /\tcs:Z:(\S+)/.exec(line)) == null) continue; + var cs = m[1]; + s_ref.length = s_qry.length = s_mid.length = 0; + var slen = [0, 0], elen = [0, 0]; + if (maf_out) { + while ((m = re.exec(cs)) != null) + update_aln(s_ref, s_qry, s_mid, m[1], m[2], elen); + if (maf_out) { + var score = (m = /\tAS:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : 0; + var len = t[0].length > t[5].length? t[0].length : t[5].length; + print("a " + score); + print(["s", padding_str(t[5], len, true), padding_str(t[7], 10, false), padding_str(parseInt(t[8]) - parseInt(t[7]), 10, false), + "+", padding_str(t[6], 10, false), s_ref.toString()].join(" ")); + var qs, qe, ql = parseInt(t[1]); + if (t[4] == '+') { + qs = parseInt(t[2]); + qe = parseInt(t[3]); + } else { + qs = ql - parseInt(t[3]); + qe = ql - parseInt(t[2]); + } + print(["s", padding_str(t[0], len, true), padding_str(qs, 10, false), padding_str(qe - qs, 10, false), + t[4], padding_str(ql, 10, false), s_qry.toString()].join(" ")); + print(""); + } + } else { + line = line.replace(/\tc[sg]:Z:\S+/g, ""); + print('>' + line); + var rs = parseInt(t[7]), qs = t[4] == '+'? parseInt(t[2]) : parseInt(t[3]); + var n_blocks = 0; + while ((m = re.exec(cs)) != null) { + var start = 0, rest = m[1] == '*'? 1 : m[2].length; + while (rest > 0) { + var l_proc; + if (s_ref.length + rest >= line_len) { + l_proc = line_len - s_ref.length; + update_aln(s_ref, s_qry, s_mid, m[1], m[1] == '*'? m[2] : m[2].substr(start, l_proc), elen); + if (n_blocks > 0) print(""); + print_aln(rs, qs, t[4], slen, elen, s_ref, s_qry, s_mid); + ++n_blocks; + s_ref.length = s_qry.length = s_mid.length = 0; + slen[0] = elen[0], slen[1] = elen[1]; + } else { + l_proc = rest; + update_aln(s_ref, s_qry, s_mid, m[1], m[1] == '*'? m[2] : m[2].substr(start, l_proc), elen); + } + rest -= l_proc, start += l_proc; + } + } + if (s_ref.length > 0) { + if (n_blocks > 0) print(""); + print_aln(rs, qs, t[4], slen, elen, s_ref, s_qry, s_mid); + ++n_blocks; + } + print("//"); + } +} +file.close(); +buf.destroy(); + +s_ref.destroy(); s_qry.destroy(); s_mid.destroy(); diff --git a/misc/sam2paf.js b/misc/sam2paf.js new file mode 100644 index 0000000..86d7c0e --- /dev/null +++ b/misc/sam2paf.js @@ -0,0 +1,111 @@ +var getopt = function(args, ostr) { + var oli; // option letter list index + if (typeof(getopt.place) == 'undefined') + getopt.ind = 0, getopt.arg = null, getopt.place = -1; + if (getopt.place == -1) { // update scanning pointer + if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') { + getopt.place = -1; + return null; + } + if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--" + ++getopt.ind; + getopt.place = -1; + return null; + } + } + var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity + if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) { + if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null. + if (getopt.place < 0) ++getopt.ind; + return '?'; + } + if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument + getopt.arg = null; + if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1; + } else { // need an argument + if (getopt.place >= 0 && getopt.place < args[getopt.ind].length) + getopt.arg = args[getopt.ind].substr(getopt.place); + else if (args.length <= ++getopt.ind) { // no arg + getopt.place = -1; + if (ostr.length > 0 && ostr.charAt(0) == ':') return ':'; + return '?'; + } else getopt.arg = args[getopt.ind]; // white space + getopt.place = -1; + ++getopt.ind; + } + return optopt; +} + +var c, pri_only = false; +while ((c = getopt(arguments, "p")) != null) + if (c == 'p') pri_only = true; + +var file = arguments.length == getopt.ind? new File() : new File(arguments[getopt.ind]); +var buf = new Bytes(); +var re = /(\d+)([MIDSHNX=])/g; + +var len = {}, lineno = 0; +while (file.readline(buf) >= 0) { + var m, n_cigar = 0, line = buf.toString(); + ++lineno; + if (line.charAt(0) == '@') { + if (/^@SQ/.test(line)) { + var name = (m = /\tSN:(\S+)/.exec(line)) != null? m[1] : null; + var l = (m = /\tLN:(\d+)/.exec(line)) != null? parseInt(m[1]) : null; + if (name != null && l != null) len[name] = l; + } + continue; + } + var t = line.split("\t"); + var flag = parseInt(t[1]); + if (t[9] != '*' && t[10] != '*' && t[9].length != t[10].length) throw Error("ERROR at line " + lineno + ": inconsistent SEQ and QUAL lengths - " + t[9].length + " != " + t[10].length); + if (t[2] == '*' || (flag&4)) continue; + if (pri_only && (flag&0x100)) continue; + var tlen = len[t[2]]; + if (tlen == null) throw Error("ERROR at line " + lineno + ": can't find the length of contig " + t[2]); + var nn = (m = /\tnn:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : 0; + var NM = (m = /\tNM:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : null; + var have_NM = NM == null? false : true; + NM += nn; + var clip = [0, 0], I = [0, 0], D = [0, 0], M = 0, N = 0, ql = 0, tl = 0, mm = 0, ext_cigar = false; + while ((m = re.exec(t[5])) != null) { + var l = parseInt(m[1]); + if (m[2] == 'M') M += l, ql += l, tl += l, ext_cigar = false; + else if (m[2] == 'I') ++I[0], I[1] += l, ql += l; + else if (m[2] == 'D') ++D[0], D[1] += l, tl += l; + else if (m[2] == 'N') N += l, tl += l; + else if (m[2] == 'S') clip[M == 0? 0 : 1] = l, ql += l; + else if (m[2] == 'H') clip[M == 0? 0 : 1] = l; + else if (m[2] == '=') M += l, ql += l, tl += l, ext_cigar = true; + else if (m[2] == 'X') M += l, ql += l, tl += l, mm += l, ext_cigar = true; + ++n_cigar; + } + if (n_cigar > 65535) + warn("WARNING at line " + lineno + ": " + n_cigar + " CIGAR operations"); + if (tl + parseInt(t[3]) - 1 > tlen) { + warn("WARNING at line " + lineno + ": alignment end position larger than ref length; skipped"); + continue; + } + if (t[9] != '*' && t[9].length != ql) { + warn("WARNING at line " + lineno + ": SEQ length inconsistent with CIGAR (" + t[9].length + " != " + ql + "); skipped"); + continue; + } + if (!have_NM || ext_cigar) NM = I[1] + D[1] + mm; + if (NM < I[1] + D[1] + mm) { + warn("WARNING at line " + lineno + ": NM is less than the total number of gaps (" + NM + " < " + (I[1]+D[1]+mm) + ")"); + NM = I[1] + D[1] + mm; + } + var extra = ["mm:i:"+(NM-I[1]-D[1]), "io:i:"+I[0], "in:i:"+I[1], "do:i:"+D[0], "dn:i:"+D[1]]; + var match = M - (NM - I[1] - D[1]); + var blen = M + I[1] + D[1]; + var qlen = M + I[1] + clip[0] + clip[1]; + var qs, qe; + if (flag&16) qs = clip[1], qe = qlen - clip[0]; + else qs = clip[0], qe = qlen - clip[1]; + var ts = parseInt(t[3]) - 1, te = ts + M + D[1] + N; + var a = [t[0], qlen, qs, qe, flag&16? '-' : '+', t[2], tlen, ts, te, match, blen, t[4]]; + print(a.join("\t"), extra.join("\t")); +} + +buf.destroy(); +file.close(); diff --git a/misc/pbsim-eval.js b/misc/sim-eval.js similarity index 62% rename from misc/pbsim-eval.js rename to misc/sim-eval.js index 3386f16..07e9256 100644 --- a/misc/pbsim-eval.js +++ b/misc/sim-eval.js @@ -36,11 +36,12 @@ var getopt = function(args, ostr) { return optopt; } -var c, max_mapq = 60, mode = 0, err_out_q = 256, print_err = false, ovlp_ratio = 0.333; -while ((c = getopt(arguments, "Q:r:m:")) != null) { +var c, max_mapq = 60, mode = 0, err_out_q = 256, print_err = false, ovlp_ratio = 0.1, cap_short_mapq = false; +while ((c = getopt(arguments, "Q:r:m:c")) != null) { if (c == 'Q') err_out_q = parseInt(getopt.arg), print_err = true; else if (c == 'r') ovlp_ratio = parseFloat(getopt.arg); else if (c == 'm') mode = parseInt(getopt.arg); + else if (c == 'c') cap_short_mapq = true; } var file = arguments.length == getopt.ind? new File() : new File(arguments[getopt.ind]); @@ -68,20 +69,27 @@ function is_correct(s, b) function count_err(qname, a, tot, err, mode) { - var s = qname.split("!"); if (a.length == 0) return; - if (s.length < 5 || (s[4] != '+' && s[4] != '-')) - throw Error("Failed to parse pbsim2fa read names '" + qname + "'"); - s[2] = parseInt(s[2]); - s[3] = parseInt(s[3]); - s.shift(); // skip pbsim orginal read name + + var m, s; + if ((m = /^(\S+)!(\S+)!(\d+)!(\d+)!([\+\-])$/.exec(qname)) != null) { // pbsim single-end reads + s = [m[1], m[2], parseInt(m[3]), parseInt(m[4]), m[5]]; + } else if ((m = /^(\S+)!(\S+)!(\d+)_(\d+)!(\d+)_(\d+)!([\+\-])([\+\-])\/([12])$/.exec(qname)) != null) { // mason2 paired-end reads + if (m[9] == '1') { + s = [m[1], m[2], parseInt(m[3]), parseInt(m[5]), m[7]]; + } else { + s = [m[1], m[2], parseInt(m[4]), parseInt(m[6]), m[8]]; + } + } else throw Error("Failed to parse simulated read names '" + qname + "'"); + s.shift(); // skip the orginal read name + if (mode == 0 || mode == 1) { // longest only or first only var max_i = 0; - if (mode == 0) { + if (mode == 0) { // longest only var max = 0; for (var i = 0; i < a.length; ++i) - if (a[i][2] - a[i][1] > max) - max = a[i][2] - a[i][1], max_i = i; + if (a[i][5] > max) + max = a[i][5], max_i = i; } var mapq = a[max_i][4]; ++tot[mapq]; @@ -92,6 +100,14 @@ function count_err(qname, a, tot, err, mode) } } else if (mode == 2) { // all primary mode var max_err_mapq = -1, max_mapq = 0, max_err_i = -1; + if (cap_short_mapq) { + var max = 0, max_q = 0; + for (var i = 0; i < a.length; ++i) + if (a[i][5] > max) + max = a[i][5], max_q = a[i][4]; + for (var i = 0; i < a.length; ++i) + a[i][4] = max_q < a[i][4]? max_q : a[i][4]; + } for (var i = 0; i < a.length; ++i) { max_mapq = max_mapq > a[i][4]? max_mapq : a[i][4]; if (!is_correct(s, a[i])) @@ -106,22 +122,53 @@ function count_err(qname, a, tot, err, mode) } } -var lineno = 0, last = null, a = []; +var lineno = 0, last = null, a = [], n_unmapped = null; +var re_cigar = /(\d+)([MIDSHN])/g; while (file.readline(buf) >= 0) { - var line = buf.toString(); + var m, line = buf.toString(); ++lineno; if (line[0] != '@') { var t = line.split("\t"); - if (last != t[0]) { - if (last != null) count_err(last, a, tot, err, mode); - a = [], last = t[0]; - } if (t[4] == '+' || t[4] == '-') { // PAF + if (last != t[0]) { + if (last != null) count_err(last, a, tot, err, mode); + a = [], last = t[0]; + } if (/\ts1:i:\d+/.test(line) && !/\ts2:i:\d+/.test(line)) // secondary alignment in minimap2 PAF continue; var mapq = parseInt(t[11]); if (mapq > max_mapq) mapq = max_mapq; - a.push([t[5], parseInt(t[7]), parseInt(t[8]), t[4], mapq]); + a.push([t[5], parseInt(t[7]), parseInt(t[8]), t[4], mapq, parseInt(t[9])]); + } else { // SAM + var flag = parseInt(t[1]); + var read_no = flag>>6&0x3; + var qname = read_no == 1 || read_no == 2? t[0] + '/' + read_no : t[0]; + if (last != qname) { + if (last != null) count_err(last, a, tot, err, mode); + a = [], last = qname; + } + if (flag&0x100) continue; // secondary alignment + if ((flag&0x4) || t[2] == '*') { // unmapped + if (n_unmapped == null) n_unmapped = 0; + ++n_unmapped; + continue; + } + var mapq = parseInt(t[4]); + if (mapq > max_mapq) mapq = max_mapq; + var pos = parseInt(t[3]) - 1, pos_end = pos; + var n_gap = 0, mlen = 0; + while ((m = re_cigar.exec(t[5])) != null) { + var len = parseInt(m[1]); + if (m[2] == 'M') pos_end += len, mlen += len; + else if (m[2] == 'I') n_gap += len; + else if (m[2] == 'D') n_gap += len, pos_end += len; + } + var score = pos_end - pos; + if ((m = /\tNM:i:(\d+)/.exec(line)) != null) { + var NM = parseInt(m[1]); + if (NM >= n_gap) score = mlen - (NM - n_gap); + } + a.push([t[2], pos, pos_end, (flag&16)? '-' : '+', mapq, score]); } } } @@ -141,3 +188,4 @@ for (var q = max_mapq; q >= 0; --q) { sum_tot2 += tot[q], sum_err2 += err[q]; } print('Q', q_out, sum_tot, sum_err, (sum_err2/sum_tot2).toFixed(9)); +if (n_unmapped != null) print('U', n_unmapped); diff --git a/misc/sim-mason2.js b/misc/sim-mason2.js new file mode 100644 index 0000000..5c66cab --- /dev/null +++ b/misc/sim-mason2.js @@ -0,0 +1,105 @@ +Bytes.prototype.reverse = function() +{ + for (var i = 0; i < this.length>>1; ++i) { + var tmp = this[i]; + this[i] = this[this.length - i - 1]; + this[this.length - i - 1] = tmp; + } +} + +// reverse complement a DNA string +Bytes.prototype.revcomp = function() +{ + if (Bytes.rctab == null) { + var s1 = 'WSATUGCYRKMBDHVNwsatugcyrkmbdhvn'; + var s2 = 'WSTAACGRYMKVHDBNwstaacgrymkvhdbn'; + Bytes.rctab = []; + for (var i = 0; i < 256; ++i) Bytes.rctab[i] = 0; + for (var i = 0; i < s1.length; ++i) + Bytes.rctab[s1.charCodeAt(i)] = s2.charCodeAt(i); + } + for (var i = 0; i < this.length>>1; ++i) { + var tmp = this[this.length - i - 1]; + this[this.length - i - 1] = Bytes.rctab[this[i]]; + this[i] = Bytes.rctab[tmp]; + } + if (this.length&1) + this[this.length>>1] = Bytes.rctab[this[this.length>>1]]; +} + +if (arguments.length == 0) { + print("Usage: k8 sim-mason2.js "); + exit(1); +} + +function print_se(a) +{ + print('@' + a.slice(0, 5).join("!") + " " + a[8]); + print(a[5]); + print("+"); + print(a[6]); +} + +var buf = new Bytes(), buf2 = new Bytes(); +var file = new File(arguments[0]); +var re = /(\d+)([MIDSHN])/g; +var last = null; +while (file.readline(buf) >= 0) { + var t = buf.toString().split("\t"); + if (t[0].charAt(0) == '@') continue; + var m, l_ref = 0; + while ((m = re.exec(t[5])) != null) + if (m[2] == 'D' || m[2] == 'M' || m[2] == 'N') + l_ref += parseInt(m[1]); + var flag = parseInt(t[1]); + var rev = !!(flag&16); + var seq, qual; + if (rev) { + buf2.length = 0; + buf2.set(t[9], 0); + buf2.revcomp(); + seq = buf2.toString(); + buf2.set(t[10], 0); + buf2.reverse(); + qual = buf2.toString(); + } else seq = t[9], qual = t[10]; + var qname = t[0]; + qname = qname.replace(/^simulated./, ""); + var chr = t[2]; + var pos = parseInt(t[3]) - 1; + var strand = (flag&16)? '-' : '+'; + var read_no = flag&0xc0; + if (read_no == 0x40) read_no = 1; + else if (read_no == 0x80) read_no = 2; + else read_no = 0; + var err = 0, snp = 0, indel = 0; + for (var i = 11; i < t.length; ++i) { + if ((m = /^XE:i:(\d+)/.exec(t[i])) != null) err = m[1]; + else if ((m = /^XS:i:(\d+)/.exec(t[i])) != null) snp = m[1]; + else if ((m = /^XI:i:(\d+)/.exec(t[i])) != null) indel = m[1]; + } + var comment = [err, snp, indel].join(":"); + if (last == null) { + last = [qname, chr, pos, pos + l_ref, strand, seq, qual, read_no, comment]; + } else if (last[0] != qname) { + print_se(last); + last = [qname, chr, pos, pos + l_ref, strand, seq, qual, read_no, comment]; + } else { + if (read_no == 2) { // last[] is the first read + if (last[7] != 1) throw Error("ERROR: can't find read1"); + var name = [qname, chr, last[2] + "_" + pos, last[3] + "_" + (pos + l_ref), last[4] + strand].join("!"); + print('@' + name + '/1' + ' ' + last[8]); print(last[5]); print("+"); print(last[6]); + print('@' + name + '/2' + ' ' + comment); print(seq); print("+"); print(qual); + } else { + if (last[7] != 2) throw Error("ERROR: can't find read2"); + var name = [qname, chr, pos + "_" + last[2], (pos + l_ref) + "_" + last[3], strand + last[4]].join("!"); + print('@' + name + '/1' + ' ' + comment); print(seq); print("+"); print(qual); + print('@' + name + '/2' + ' ' + last[8]); print(last[5]); print("+"); print(last[6]); + } + last = null; + } +} +if (last != null) print_se(last); +file.close(); +buf.destroy(); +buf2.destroy(); diff --git a/misc/pbsim2fa.js b/misc/sim-pbsim.js similarity index 96% rename from misc/pbsim2fa.js rename to misc/sim-pbsim.js index b5629c5..1c92e21 100644 --- a/misc/pbsim2fa.js +++ b/misc/sim-pbsim.js @@ -28,7 +28,7 @@ Bytes.prototype.revcomp = function() } if (arguments.length < 2) { - print("Usage: k8 pbsim2paf.js [[pbsim2.maf] ...]"); + print("Usage: k8 sim-pbsim.js [[pbsim2.maf] ...]"); exit(1); } diff --git a/mmpriv.h b/mmpriv.h index dcb78d6..32b3fd1 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -40,7 +40,7 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end); void radix_sort_64(uint64_t *beg, uint64_t *end); uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); -void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r); +void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag); void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs); int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int64_t n, mm128_t *a, uint64_t **_u, void *km); mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a); diff --git a/tex/Makefile b/tex/Makefile new file mode 100644 index 0000000..418a962 --- /dev/null +++ b/tex/Makefile @@ -0,0 +1,21 @@ +.SUFFIXES: .gp .tex .eps .pdf .eps.gz + +.eps.pdf: + epstopdf --outfile $@ $< + +.eps.gz.pdf: + gzip -dc $< | epstopdf --filter > $@ + +.pdf.eps: + pdftops -eps $< $@ + +all:minimap2.pdf + +roc-color.eps:roc.gp + gnuplot roc.gp + +minimap2.pdf:minimap2.tex minimap2.bib roc-color.pdf + pdflatex minimap2; bibtex minimap2; pdflatex minimap2; pdflatex minimap2; + +clean: + rm -fr *.toc *.aux *.bbl *.blg *.idx *.log *.out *~ minimap2.pdf diff --git a/tex/bioinfo.cls b/tex/bioinfo.cls new file mode 100644 index 0000000..48f7866 --- /dev/null +++ b/tex/bioinfo.cls @@ -0,0 +1,930 @@ +\newcommand\classname{bioinfo} +\newcommand\lastmodifieddate{2003/02/08} +\newcommand\versionnumber{0.1} + +% Are we printing crop marks? +\newif\if@cropmarkson \@cropmarksontrue + +\NeedsTeXFormat{LaTeX2e}[2001/06/01] +\ProvidesClass{\classname}[\lastmodifieddate\space\versionnumber] + +\setlength{\paperheight}{11truein} +\setlength{\paperwidth}{8.5truein} + +\newif\if@final + +\DeclareOption{draft}{\PassOptionsToPackage{draft}{graphicx}} +\DeclareOption{a4paper}{\PassOptionsToPackage{a4}{crop}} +\DeclareOption{centre}{\PassOptionsToPackage{center}{crop}} +\DeclareOption{crop}{\PassOptionsToPackage{cam}{crop}\global\@cropmarksontrue} +\DeclareOption{nocrop}{\PassOptionsToPackage{off}{crop}\global\@cropmarksonfalse} +\DeclareOption{info}{\PassOptionsToPackage{info}{crop}} +\DeclareOption{noinfo}{\PassOptionsToPackage{noinfo}{crop}} +\DeclareOption{final}{\global\@finaltrue} + +\ExecuteOptions{a4paper,nocrop,centre,info} + +\ProcessOptions + +% Load all necessary packages +\RequirePackage{inputenc,crop,graphicx,amsmath,array,color,amssymb,flushend,stfloats,amsthm,chngpage,times} +%\RequirePackage[LY1]{fontenc} +%\RequirePackage[LY1,mtbold]{mathtime} +\def\authoraffliate{\fontfamily{phv}\selectfont} +\def\helvetica{\fontfamily{phv}\selectfont} +\def\helveticaitalic{\fontfamily{phv}\itshape\selectfont} +\def\helveticabold{\fontfamily{phv}\bfseries\selectfont} +\def\helveticabolditalic{\fontfamily{phv}\bfseries\itshape\selectfont} + +% Not sure if needed. +\newcommand\@ptsize{0} + +% Set twoside printing +\@twosidetrue + +% Marginal notes are on the outside edge +\@mparswitchfalse + +\reversemarginpar + +\renewcommand\normalsize{% + \@setfontsize\normalsize{9}{11}% + \abovedisplayskip 10\p@ \@plus2\p@ \@minus5\p@ + \abovedisplayshortskip \z@ \@plus3\p@ + \belowdisplayshortskip 6\p@ \@plus3\p@ \@minus3\p@ + \belowdisplayskip \abovedisplayskip + \let\@listi\@listI} +\normalsize +\let\@bls\baselineskip + +\newcommand\small{% + \@setfontsize\small{9}{11}% + \abovedisplayskip 11\p@ minus 3\p@ + \belowdisplayskip \abovedisplayskip + \abovedisplayshortskip \z@ plus 2\p@ + \belowdisplayshortskip 4\p@ plus 2\p@ minus2\p@ + \def\@listi{\topsep 4.5\p@ plus 2\p@ minus 1\p@ + \itemsep \parsep + \topsep 4\p@ plus 2\p@ minus 2\p@}} + +\newcommand\footnotesize{% + \@setfontsize\footnotesize{8}{10}% + \abovedisplayskip 6\p@ minus 3\p@ + \belowdisplayskip\abovedisplayskip + \abovedisplayshortskip \z@ plus 3\p@ + \belowdisplayshortskip 6\p@ plus 3\p@ minus 3\p@ + \def\@listi{\topsep 3\p@ plus 1\p@ minus 1\p@ + \parsep 2\p@ plus 1\p@ minus 1\p@\itemsep \parsep}} + +\def\scriptsize{\@setfontsize\scriptsize{7pt}{9pt}} +\def\tiny{\@setfontsize\tiny{5pt}{7pt}} +\def\large{\@setfontsize\large{11.5pt}{12pt}} +\def\Large{\@setfontsize\Large{14pt}{16}} +\def\LARGE{\@setfontsize\LARGE{15pt}{17pt}} +\def\huge{\@setfontsize\huge{22pt}{22pt}} +\def\Huge{\@setfontsize\Huge{30pt}{30pt}} + +\DeclareOldFontCommand{\rm}{\normalfont\rmfamily}{\mathrm} +\DeclareOldFontCommand{\sf}{\normalfont\sffamily}{\mathsf} +\DeclareOldFontCommand{\tt}{\normalfont\ttfamily}{\mathtt} +\DeclareOldFontCommand{\bf}{\normalfont\bfseries}{\mathbf} +\DeclareOldFontCommand{\it}{\normalfont\itshape}{\mathit} +\DeclareOldFontCommand{\sl}{\normalfont\slshape}{\@nomath\sl} +\DeclareOldFontCommand{\sc}{\normalfont\scshape}{\@nomath\sc} + +% Line spacing +\setlength\lineskip{1\p@} +\setlength\normallineskip{1\p@} +\renewcommand\baselinestretch{} + +% Paragraph dimensions and inter-para spacing +\setlength\parskip{0\p@} +\setlength\parindent{3mm} + +% Set inter-para skips +\setlength\smallskipamount{3\p@ \@plus 1\p@ \@minus 1\p@} +\setlength\medskipamount{6\p@ \@plus 2\p@} +\setlength\bigskipamount{12\p@ \@plus 4\p@ \@minus 4\p@} + +% Page break penalties +\@lowpenalty 51 +\@medpenalty 151 +\@highpenalty 301 + +% Disallow widows and orphans +\clubpenalty 10000 +\widowpenalty 10000 + +% Disable page breaks before equations, allow pagebreaks after +% equations and discourage widow lines before equations. +\displaywidowpenalty 100 +\predisplaypenalty 10000 +\postdisplaypenalty 2500 + +% Allow breaking the page in the middle of a paragraph +\interlinepenalty 0 + +% Disallow breaking the page after a hyphenated line +\brokenpenalty 10000 + +% Hyphenation; don't split words into less than three characters +\lefthyphenmin=3 +\righthyphenmin=3 + +% +% Set page layout dimensions +% +\setlength\headheight{16\p@} % height of running head +\setlength\topmargin{2.9pc} % head margin +\addtolength\topmargin{-1in} % subtract out the 1 inch driver margin + +\setlength\topskip{10\p@} % height of first line of text +\setlength\headsep{19\p@} % space below running head -- + +\setlength\footskip{34\p@} % space above footer line +\setlength\maxdepth{.5\topskip} % pages can be short or deep by half a line? + +\setlength\textwidth{42pc} % text measure excluding margins + +\setlength\textheight{58\baselineskip} % 54 lines on a full page, +\addtolength\textheight{\topskip} % including the first + % line on the page + +% Set the margins +\setlength\marginparsep{3\p@} +\setlength\marginparpush{3\p@} +\setlength\marginparwidth{35\p@} + +\setlength\oddsidemargin{4.5pc} +\addtolength\oddsidemargin{-1in} % subtract out the 1 inch driver margin +\setlength\@tempdima{\paperwidth} +\addtolength\@tempdima{-\textwidth} +\addtolength\@tempdima{-4.5pc} +\setlength\evensidemargin{\@tempdima} +\addtolength\evensidemargin{-1in} + +\setlength\columnsep{1.5pc} % space between columns for double-column text +\setlength\columnseprule{0\p@} % width of rule between two columns + +% Footnotes +\setlength\footnotesep{9\p@} % space between footnotes +% space between text and footnote +\setlength{\skip\footins}{12\p@ \@plus 6\p@ \@minus 1\p@} + +% Float placement parameters + +% The total number of floats that can be allowed on a page. +\setcounter{totalnumber}{10} +% The maximum number of floats at the top and bottom of a page. +\setcounter{topnumber}{5} +\setcounter{bottomnumber}{5} +% The maximum part of the top or bottom of a text page that can be +% occupied by floats. This is set so that at least four lines of text +% fit on the page. +\renewcommand\topfraction{.9} +\renewcommand\bottomfraction{.9} +% The minimum amount of a text page that must be occupied by text. +% This should accomodate four lines of text. +\renewcommand\textfraction{.06} +% The minimum amount of a float page that must be occupied by floats. +\renewcommand\floatpagefraction{.94} + +% The same parameters repeated for double column output +\renewcommand\dbltopfraction{.9} +\renewcommand\dblfloatpagefraction{.9} + +% Space between floats +\setlength\floatsep {12\p@ \@plus 2\p@ \@minus 2\p@} +% Space between floats and text +\setlength\textfloatsep{20\p@ \@plus 2\p@ \@minus 4\p@} +% Space above and below an inline figure +\setlength\intextsep {18\p@ \@plus 2\p@ \@minus 2\p@} + +% For double column floats +\setlength\dblfloatsep {12\p@ \@plus 2\p@ \@minus 2\p@} +\setlength\dbltextfloatsep{20\p@ \@plus 2\p@ \@minus 4\p@} + +% Space left at top, bottom and inbetween floats on a float page. +\setlength\@fptop{0\p@} % no space above float page figures +\setlength\@fpsep{12\p@ \@plus 1fil} +\setlength\@fpbot{0\p@} + +% The same for double column +\setlength\@dblfptop{0\p@} +\setlength\@dblfpsep{12\p@ \@plus 1fil} +\setlength\@dblfpbot{0\p@} + +% Override settings in mathtime back to TeX defaults +\DeclareMathSizes{5} {5} {5} {5} +\DeclareMathSizes{6} {6} {5} {5} +\DeclareMathSizes{7} {7} {5} {5} +\DeclareMathSizes{8} {8} {6} {5} +\DeclareMathSizes{9} {9} {6.5} {5} +\DeclareMathSizes{10} {10} {7.5} {5} +\DeclareMathSizes{12} {12} {9} {7} + +% Page styles +\def\ps@headings + {% + \def\@oddfoot{\vbox to 12.5\p@{\hbox{\rule{\textwidth}{0.5\p@}}\vss + \hbox to \textwidth{\hfill\helveticabold\small\thepage}% + }}% + \def\@evenfoot{\vbox to 12.5\p@{\rule{\textwidth}{0.5\p@}\vss + \hbox to \textwidth{\helveticabold\small\thepage\hfill}% + }}% + \def\@evenhead{\vbox{\hbox to \textwidth{\fontsize{8}{10}\selectfont + \helveticabold{\fontshape{it}\selectfont + \strut\leftmark}\hfill}\vspace{6.5\p@}\rule{\textwidth}{0.5\p@}}}% + \def\@oddhead{\vbox{\hbox to \textwidth{\hfill\fontsize{8}{10}\selectfont + \helveticabold{\fontshape{it}\selectfont\strut\rightmark}}% + \vspace{6.5\p@}\rule{\textwidth}{0.5\p@}}}% + \def\titlemark##1{\markboth{##1}{##1}}% + \def\authormark##1{\gdef\leftmark{##1}}% + } + +\def\ps@opening + {% + \def\@oddfoot{\vbox to 13\p@{\hbox{\rule{\textwidth}{1\p@}}\vss + \hbox to \textwidth{\helvetica + \fontsize{7}{9}\fontshape{n}\selectfont% + \hfill\small\helveticabold\thepage}% + }}% + \def\@evenfoot{\vbox to 13\p@{\rule{\textwidth}\vss + \hbox to \textwidth{\helvetica\thepage\hfill + \fontsize{7}{9}\fontshape{n}\selectfont}% + }}% + \let\@evenhead\relax + \let\@oddhead\relax} + +% Page range +\newif\iflastpagegiven \lastpagegivenfalse +\newcommand\firstpage[1]{% + \gdef\@firstpage{#1}% + \ifnum\@firstpage>\c@page + \setcounter{page}{#1}% + \ClassWarning{BIO}{Increasing pagenumber to \@firstpage}% + \else \ifnum\@firstpage<\c@page + \ClassWarning{BIO}{Firstpage lower than pagenumber}\fi\fi + \xdef\@firstpage{\the\c@page}% + } +\def\@firstpage{1} +\def\pagenumbering#1{% + \global\c@page \@ne + \gdef\thepage{\csname @#1\endcsname \c@page}% + \gdef\thefirstpage{% + \csname @#1\endcsname \@firstpage}% + \gdef\thelastpage{% + \csname @#1\endcsname \@lastpage}% + } + +\newcommand\lastpage[1]{\xdef\@lastpage{#1}% + \global\lastpagegiventrue} +\def\@lastpage{0} +\def\setlastpage{\iflastpagegiven\else + \edef\@tempa{@lastpage@}% + \expandafter + \ifx \csname \@tempa \endcsname \relax + \gdef\@lastpage{0}% + \else + \xdef\@lastpage{\@nameuse{@lastpage@}}% + \fi + \fi } +\def\writelastpage{% + \iflastpagegiven \else + \immediate\write\@auxout% + {\string\global\string\@namedef{@lastpage@}{\the\c@page}}% + \fi + } +\def\thepagerange{% + \ifnum\@lastpage =0 {\ \bf ???} \else + \ifnum\@lastpage = \@firstpage \ \thefirstpage\else + \thefirstpage--\thelastpage \fi\fi} + +\AtBeginDocument{\setlastpage + \pagenumbering{arabic}% + } +\AtEndDocument{% + \writelastpage + \if@final + \clearemptydoublepage + \else + \clearpage + \fi} + +% +% Sectional units +% + +% Counters +\newcounter{section} +\newcounter{subsection}[section] +\newcounter{subsubsection}[subsection] +\newcounter{paragraph}[subsubsection] +\newcounter{subparagraph}[paragraph] +\newcounter{figure} +\newcounter{table} + +% Form of the numbers +\newcommand\thepage{\arabic{page}} +\renewcommand\thesection{\arabic{section}} +\renewcommand\thesubsection{{\thesection.\arabic{subsection}}} +\renewcommand\thesubsubsection{{\thesubsection.\arabic{subsubsection}}} +\renewcommand\theparagraph{\thesubsubsection.\arabic{paragraph}} +\renewcommand\thesubparagraph{\theparagraph.\arabic{subparagraph}} +\renewcommand\theequation{\arabic{equation}} + +% Form of the words +\newcommand\contentsname{Contents} +\newcommand\listfigurename{List of Figures} +\newcommand\listtablename{List of Tables} +\newcommand\partname{Part} +\newcommand\appendixname{Appendix} +\newcommand\abstractname{Abstract} +\newcommand\refname{References} +\newcommand\bibname{References} +\newcommand\indexname{Index} +\newcommand\figurename{Fig.} +\newcommand\tablename{Table} + +% Clearemptydoublepage should really clear the running heads too +\newcommand{\clearemptydoublepage}{\newpage{\pagestyle{empty}\cleardoublepage}} + +% Frontmatter, mainmatter and backmatter + +\newif\if@mainmatter \@mainmattertrue + +\newcommand\frontmatter{% + \clearpage + \@mainmatterfalse + \pagenumbering{roman}} + +\newcommand\mainmatter{% + \clearpage + \@mainmattertrue + \pagenumbering{arabic}} + +\newcommand\backmatter{% + \clearpage + \@mainmatterfalse} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% TITLE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\newlength{\dropfromtop} +\setlength{\dropfromtop}{\z@} + +% Application Notes +\newif\if@appnotes +\newcommand{\application}{% +% \setlength{\dropfromtop}{-2.25pc}% + \global\@appnotestrue} + +\long\def\title{\@ifnextchar[{\short@title}{\@@title}} +\def\short@title[#1]{\titlemark{#1}\@@@title} +\def\@@title#1{\authormark{#1}\@@@title{#1}} +\long\def\@@@title#1{\gdef\@title{#1}} + +\long\def\author{\@ifnextchar[{\short@uthor}{\@uthor}} +\def\short@uthor[#1]{\authormark{#1}\@@author} +\def\@uthor#1{\authormark{#1}\@@author{#1}} +\long\def\@@author#1{\gdef\@author{#1}} + +\def\vol#1{\global\def\@vol{#1}} +\def\issue#1{\global\def\@issue{#1}} +\def\address#1{\global\def\@issue{#1}} +\def\history#1{\global\def\@history{#1}} +\def\editor#1{\global\def\@editor{#1}} +\def\pubyear#1{\global\def\@pubyear{#1}} +\def\copyrightyear#1{\global\def\@copyrightyear{#1}} +\def\address#1{\global\def\@address{#1}} +\def\DOI#1{\global\def\@DOI{#1}} + +\definecolor{gray}{cmyk}{0, 0, 0, 0.15} +\newlength{\extraspace} +\setlength{\extraspace}{\z@} + +\newcommand\maketitle{\par + \begingroup + \renewcommand\thefootnote{\@fnsymbol\c@footnote}% + \def\@makefnmark{\rlap{\@textsuperscript{\normalfont\@thefnmark}}}% + \long\def\@makefntext##1{\parindent 3mm\noindent +% \@textsuperscript{\normalfont\@thefnmark}\raggedright##1}% + \@textsuperscript{\normalfont\@thefnmark}##1}% + \if@twocolumn + \ifnum \col@number=\@ne + \@maketitle + \else + \twocolumn[\@maketitle]% + \fi + \else + \newpage + \global\@topnum\z@ % Prevents figures from going at top of page. + \@maketitle + \fi + \thispagestyle{opening}\@thanks + \endgroup + \setcounter{footnote}{0}% + \global\let\thanks\relax + \global\let\maketitle\relax + \global\let\@maketitle\relax + \global\let\@address\@empty + \global\let\@history\@empty + \global\let\@editor\@empty + \global\let\@thanks\@empty + \global\let\@author\@empty + \global\let\@date\@empty + \global\let\@title\@empty + \global\let\@pubyear\@empty + \global\let\address\relax + \global\let\history\relax + \global\let\editor\relax + \global\let\title\relax + \global\let\author\relax + \global\let\date\relax + \global\let\pubyear\relax + \global\let\@copyrightline\@empty + \global\let\and\relax + \@afterindentfalse\@afterheading +} + +\newlength{\aboveskipchk}%for checking oddpage or evenpage top skip +\setlength{\aboveskipchk}{\z@}% + +\def\@maketitle{% + \let\footnote\thanks + \clearemptydoublepage + \checkoddpage\ifcpoddpage\setlength{\aboveskipchk}{-3pc}\else\setlength{\aboveskipchk}{-5pc}\fi%for checking oddpage or evenpage top skip%% + \vspace*{\aboveskipchk}% + \vspace{\dropfromtop}% + \hbox to \textwidth{% + {\helvetica\itshape\bfseries\fontsize{19}{12}\selectfont {\color{gray}TECHNICAL REPORT} + \hfil + \if@appnotes APPLICATIONS NOTE\hfil\fi + }% +\enskip \parbox[b]{11.3pc}{% + \helvetica + \flushright\fontsize{8}{10}\fontshape{it}\selectfont + \hfill + }} + \rule{\textwidth}{1\p@}\par% + \helvetica + \hbox to \textwidth{% + \parbox[t]{41pc}{% + \vspace*{1sp} + {\helveticabold\fontsize{16}{21}\selectfont\raggedright \@title \par}% + \vspace{4.5\p@} + {\authoraffliate\fontsize{11}{13}\selectfont\raggedright \@author \par}% + \vspace{4\p@} + {\authoraffliate\fontsize{9}{11}\selectfont\raggedright \@address \par}% + \vspace{4\p@} + %{\helvetica\fontsize{8}{10}\selectfont\raggedright \@history \par} + %\vspace{24\p@} + %{\helvetica\fontsize{10}{12}\selectfont\raggedright \@editor \par} + %\vspace{20\p@} + }% + } + \vspace{4.5\p@}% + \rule{\textwidth}{1\p@}% + \vspace{12\p@ plus 6\p@ minus 6\p@}% + \vspace{\extraspace} + } +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +%%%%%%%%%%%%%%%%%%%%%%%%%%%% Abstract %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\newcommand{\absection}[1]{% + \par\noindent{\bfseries #1}\space\ignorespaces} + +\newenvironment{abstract}{% + \begingroup + \let\section\absection + \fontfamily{\sfdefault}\fontsize{8}{11}\sffamily\selectfont + {\fontseries{b}\selectfont ABSTRACT}\par} +{\endgroup\bigskip\@afterheading\@afterindentfalse\vskip 12pt plus 3pt minus 1pt} + +% Section macros + +% Lowest level heading that takes a number by default +\setcounter{secnumdepth}{3} + +\renewcommand{\@seccntformat}[1]{\csname the#1\endcsname\quad} + +\def\section{% + \@startsection{section}{1}{\z@} + {-22\p@ plus -3\p@}{3\p@} + {\reset@font\raggedright\helveticabold\fontsize{10}{12}\selectfont\MakeUppercase}} + +\def\subsection{% + \@startsection{subsection}{2}{\z@} + {-11\p@ plus -2\p@}{3\p@} + {\reset@font\raggedright\mathversion{bold}\fontseries{b}\fontsize{10}{12}\selectfont}} + +\def\subsubsection{% + \@startsection{subsubsection}{3}{\z@} + %{-11\p@ plus -1\p@}{-1em} + {-11\p@ plus -1\p@}{0.001em} + {\reset@font\normalfont\normalsize\itshape}} + +\def\textcolon{\text{\rm :}} + + \def\paragraph{% + \@startsection{paragraph}{4}{\z@} + {-6\p@} + {-.4em} + {\reset@font\itshape}} + +% ******************** +% Figures and tables * +% ******************** + +% Table and array parameters +\setlength\arraycolsep{.5em} +\setlength\tabcolsep{.5em} +\setlength\arrayrulewidth{.5pt} +\setlength\doublerulesep{2.5pt} +\setlength\extrarowheight{\z@} +\renewcommand\arraystretch{1} + +\newlength{\abovecaptionskip} +\newlength{\belowcaptionskip} +\setlength{\abovecaptionskip}{13pt} +\setlength{\belowcaptionskip}{10.5pt} + +\long\def\@makecaption#1#2{\vspace{\abovecaptionskip}% + \begingroup + \footnotesize + \textbf{#1.}\enskip{#2}\par + \endgroup} + +\long\def\@tablecaption#1#2{% + \begingroup + \footnotesize + \textbf{#1.}\enskip{#2\strut\par} + \endgroup\vspace{\belowcaptionskip}} + +% Table rules +\def\toprule{\noalign{\ifnum0=`}\fi\hrule \@height 0.5pt \hrule \@height 6pt \@width 0pt \futurelet + \@tempa\@xhline} +\def\midrule{\noalign{\ifnum0=`}\fi \hrule \@height 6.75pt \@width 0pt \hrule \@height 0.5pt + \hrule \@height 6pt \@width 0pt \futurelet \@tempa\@xhline} +\def\botrule{\noalign{\ifnum0=`}\fi \hrule \@height 5.75pt \@width 0pt \hrule \@height 0.5pt \futurelet + \@tempa\@xhline} +\def\hrulefill{\leavevmode\leaders\hrule height .5pt\hfill\kern\z@} + +\def\thefigure{\@arabic\c@figure} +\def\fps@figure{tbp} +\def\ftype@figure{1} +\def\ext@figure{lof} +\def\fnum@figure{\figurename~\thefigure} +\def\figure{\@float{figure}} +\let\endfigure\end@float +\@namedef{figure*}{\@dblfloat{figure}} +\@namedef{endfigure*}{\end@dblfloat} +\def\thetable{\@arabic\c@table} +\def\fps@table{tbp} +\def\ftype@table{2} +\def\ext@table{lot} +\def\fnum@table{Table~\thetable} +\def\table{\let\@makecaption\@tablecaption\let\source\tablesource\@float{table}} +\def\endtable{\end@float} +\@namedef{table*}{\let\@makecaption\@tablecaption\@dblfloat{table}} +\@namedef{endtable*}{\end@dblfloat} + +\newif\if@rotate \@rotatefalse +\newif\if@rotatecenter \@rotatecenterfalse +\def\rotatecenter{\global\@rotatecentertrue} +\def\rotateendcenter{\global\@rotatecenterfalse} +\def\rotate{\global\@rotatetrue} +\def\endrotate{\global\@rotatefalse} +\newdimen\rotdimen +\def\rotstart#1{\special{ps: gsave currentpoint currentpoint translate + #1 neg exch neg exch translate}} +\def\rotfinish{\special{ps: currentpoint grestore moveto}} +\def\rotl#1{\rotdimen=\ht#1\advance\rotdimen by \dp#1 + \hbox to \rotdimen{\vbox to\wd#1{\vskip \wd#1 + \rotstart{270 rotate}\box #1\vss}\hss}\rotfinish} +\def\rotr#1{\rotdimen=\ht #1\advance\rotdimen by \dp#1 + \hbox to \rotdimen{\vbox to \wd#1{\vskip \wd#1 + \rotstart{90 rotate}\box #1\vss}\hss}\rotfinish} + +\newdimen\tempdime +\newbox\temptbox + +% From ifmtarg.sty +% Copyright Peter Wilson and Donald Arseneau, 2000 +\begingroup +\catcode`\Q=3 +\long\gdef\@ifmtarg#1{\@xifmtarg#1QQ\@secondoftwo\@firstoftwo\@nil} +\long\gdef\@xifmtarg#1#2Q#3#4#5\@nil{#4} +\long\gdef\@ifnotmtarg#1{\@xifmtarg#1QQ\@firstofone\@gobble\@nil} +\endgroup + +\def\tablesize{\@setfontsize\tablesize{8\p@}{10\p@}} + +\newenvironment{processtable}[3]{\setbox\temptbox=\hbox{{\tablesize #2}}% +\tempdime\wd\temptbox\@processtable{#1}{#2}{#3}{\tempdime}} +{\relax} + +\newcommand{\@processtable}[4]{% +\if@rotate +\setbox4=\vbox to \hsize{\vss\hbox to \textheight{% +\begin{minipage}{#4}% +\@ifmtarg{#1}{}{\caption{#1}}{\tablesize #2}% +\vskip7\p@\noindent +\parbox{#4}{\fontsize{7}{9}\selectfont #3\par}% +\end{minipage}}\vss}% +\rotr{4} +\else +\hbox to \hsize{\hss\begin{minipage}[t]{#4}% +\vskip2.9pt +\@ifmtarg{#1}{}{\caption{#1}}{\tablesize #2}% +\vskip6\p@\noindent +\parbox{#4}{\fontsize{7}{9}\selectfont #3\par}% +\end{minipage}\hss}\fi}% + +\newcolumntype{P}[1]{>{\raggedright\let\\\@arraycr\hangindent1em}p{#1}} + +% ****************************** +% List numbering and lettering * +% ****************************** +\def\labelenumi{{\rm\arabic{enumi}.}} +\def\theenumi{\arabic{enumi}} +\def\labelenumii{{\rm\alph{enumii}.}} +\def\theenumii{\alph{enumii}} +\def\p@enumii{\theenumi} +\def\labelenumiii{{\rm(\arabic{enumiii})}} +\def\theenumiii{\roman{enumiii}} +\def\p@enumiii{\theenumi(\theenumii)} +\def\labelenumiv{{\rm(\arabic{enumiv})}} +\def\theenumiv{\Alph{enumiv}} +\def\p@enumiv{\p@enumiii\theenumiii} +\def\labelitemi{{\small$\bullet$}} +\def\labelitemii{{\small$\bullet$}} +\def\labelitemiii{{\small$\bullet$}} +\def\labelitemiv{{\small$\bullet$}} + +\def\@listI{\leftmargin\leftmargini \topsep\medskipamount} +\let\@listi\@listI +\@listi +\def\@listii{\topsep\z@\leftmargin\leftmarginii} +\def\@listiii{\leftmargin\leftmarginiii \topsep\z@} +\def\@listiv{\leftmargin\leftmarginiv \topsep\z@} +\def\@listv{\leftmargin\leftmarginv \topsep\z@} +\def\@listvi{\leftmargin\leftmarginvi \topsep\z@} + +\setlength{\leftmargini}{3mm} +\setlength{\leftmarginii}{\z@} +\setlength{\leftmarginiii}{\z@} +\setlength{\leftmarginiv}{\z@} + +% Changes to the list parameters for enumerate +\def\enumargs{% + \partopsep \z@ + \itemsep 3\p@ + \parsep \z@ + \labelsep 0.5em + \listparindent \parindent + \itemindent \z@ + \topsep 11\p@ +} + +\def\enumerate{% + \@ifnextchar[{\@numerate}{\@numerate[0]}} + +\def\@numerate[#1]{% + \ifnum \@enumdepth >3 \@toodeep\else + \advance\@enumdepth \@ne + \edef\@enumctr{enum\romannumeral\the\@enumdepth} + \list{\csname label\@enumctr\endcsname}{% + \enumargs + \setlength{\leftmargin}{\csname leftmargin\romannumeral\the\@enumdepth\endcsname} + \usecounter{\@enumctr} + \settowidth\labelwidth{#1} + \addtolength{\leftmargin}{\labelwidth} + \addtolength{\leftmargin}{\labelsep} + \def\makelabel##1{\hss \llap{##1}}}% + \fi + } +\let\endenumerate\endlist + +% Changes to the list parameters for itemize +\def\itemargs{% + \partopsep \z@ + \itemsep 3\p@ + \parsep \z@ + \labelsep 0.5em + \rightmargin \z@ + \listparindent \parindent + \itemindent \z@ + \topsep11\p@ +} + +\def\itemize{% + \@ifnextchar[{\@itemize}{\@itemize[$\bullet$]}} + +\def\@itemize[#1]{% + \ifnum \@itemdepth >3 \@toodeep\else + \advance\@itemdepth \@ne + \edef\@itemctr{item\romannumeral\the\@itemdepth} + \list{\csname label\@itemctr\endcsname}{% + \itemargs + \setlength{\leftmargin}{\csname leftmargin\romannumeral\the\@itemdepth\endcsname} + \settowidth\labelwidth{#1} + \addtolength{\leftmargin}{\labelwidth} + \addtolength{\leftmargin}{\labelsep} + \def\makelabel##1{\hss \llap{##1}}}% + \fi + } +\let\enditemize\endlist + +\newenvironment{unlist}{% + \begin{list}{}% + {\setlength{\labelwidth}{\z@}% + \setlength{\labelsep}{\z@}% + \setlength{\topsep}{\medskipamount}% + \setlength{\itemsep}{3\p@}% + \setlength{\leftmargin}{2em}% + \setlength{\itemindent}{-2em}}} +{\end{list}} + + +% *********************** +% Quotes and Quotations * +% *********************** +\def\quotation{\par\begin{list}{}{ + \setlength{\topsep}{\medskipamount} + \setlength{\leftmargin}{2em}% + \setlength{\rightmargin}{\z@}% + \setlength\labelwidth{0pt}% + \setlength\labelsep{0pt}% + \listparindent\parindent}% + \item[]} +\def\endquotation{\end{list}} +\let\quote\quotation +\let\endquote\endquotation + +\skip\@mpfootins = \skip\footins +\fboxsep=6\p@ +\fboxrule=1\p@ + +% ******************* +% Table of contents * +% ******************* +\newcommand\@pnumwidth{4em} +\newcommand\@tocrmarg{2.55em plus 1fil} +\newcommand\@dotsep{1000} +\setcounter{tocdepth}{4} + +\def\numberline#1{\hbox to \@tempdima{{#1}}} + +\def\@authortocline#1#2#3#4#5{% + \vskip 1.5\p@ + \ifnum #1>\c@tocdepth \else + {\leftskip #2\relax \rightskip \@tocrmarg \parfillskip -\rightskip + \parindent #2\relax\@afterindenttrue + \interlinepenalty\@M + \leavevmode + \@tempdima #3\relax + \advance\leftskip \@tempdima \null\nobreak\hskip -\leftskip + {\itshape #4}\nobreak + \leaders\hbox{$\m@th + \mkern \@dotsep mu\hbox{.}\mkern \@dotsep + mu$}\hfill + \nobreak + \hb@xt@\@pnumwidth{\hfil}% + \par}% + \fi} + +\newcommand*\l@author{\@authortocline{2}{0pt}{30pt}} +\newcommand*\l@section{\@dottedtocline{3}{11pt}{20pt}} +\newcommand*\l@subsection{\@dottedtocline{4}{31pt}{29pt}} +\newcommand*\l@subsubsection[2]{} + + + +% *********** +% Footnotes * +% *********** + +\def\footnoterule{\noindent\rule{\columnwidth}{0.5pt}} +\def\@makefnmark{\@textsuperscript{\normalfont\@thefnmark}}% +\newcommand\@makefntext[1]{\noindent{\@makefnmark}\enskip#1} + +% *********** +% References * +% *********** + +\providecommand{\newblock}{} +\newenvironment{thebibliography}{% + \section{\bibname}% + \begingroup + \small + \begin{list}{}{% + \setlength{\topsep}{\z@}% + \setlength{\labelsep}{\z@}% + \settowidth{\labelwidth}{\z@}% + \setlength{\leftmargin}{4mm}% + \setlength{\itemindent}{-4mm}}\small} +{\end{list}\endgroup} + +\RequirePackage{natbib} + +% ********** +% Appendix * +% ********** +\newif\ifappend % Are we in the Appendix? +\def\appendix{\par + \setcounter{section}{0} + \setcounter{subsection}{0} + \appendtrue +} + +%Math parameters + +\setlength{\jot}{5\p@} +\mathchardef\@m=1500 % adapted value + +\def\frenchspacing{\sfcode`\.\@m \sfcode`\?\@m \sfcode`\!\@m + \sfcode`\:\@m \sfcode`\;\@m \sfcode`\,\@m} + +% Theorems +\def\th@plain{% +%% \let\thm@indent\noindent % no indent +\thm@headfont{\quad\scshape}% heading font is bold +\thm@notefont{\upshape\mdseries}% same as heading font +\thm@headpunct{.}% no period after heading +\thm@headsep 5\p@ plus\p@ minus\p@\relax +%% \let\thm@swap\@gobble +%% \thm@preskip\topsep +%% \thm@postskip\theorempreskipamount +\itshape % body font +} + +\vbadness=9999 +\tolerance=9999 +\doublehyphendemerits=10000 +\doublehyphendemerits 640000 % corresponds to badness 800 +\finalhyphendemerits 1000000 % corresponds to badness 1000 + +\flushbottom +\frenchspacing +\ps@headings +\twocolumn + +% Screen PDF compatability +\newcommand{\medline}[1]{% + \unskip\unskip\ignorespaces} + + +%%%%for smaller size text +\newenvironment{methods}{% + \begingroup +\def\section{% + \@startsection{section}{1}{\z@} + {-24\p@ plus -3\p@}{4\p@} + {\reset@font\raggedright\helveticabold\fontsize{10}{12}\selectfont\MakeUppercase}} + \def\subsection{% + \@startsection{subsection}{2}{\z@} + {-5\p@ plus -2\p@}{4\p@} + {\reset@font\raggedright\mathversion{bold}\fontseries{b}\fontsize{10}{12}\selectfont}} + \def\subsubsection{% + \@startsection{subsubsection}{3}{\z@} +% {-6\p@ plus -1\p@}{-1em} + {-6\p@ plus -1\p@}{0.001em} + {\reset@font\normalfont\normalsize\itshape}} +\footnotesize + \par} +{\par\endgroup\bigskip\@afterheading\@afterindentfalse} + + + +\graphicspath{{g:/artwork/oup/bioinfo/}} + +\language=2 + +\hyphenation{Figure Table Figures Tables} + +\newcommand{\href}[2]{#2} + +\renewenvironment{proof}[1][\proofname]{\par + \normalfont \topsep6\p@\@plus6\p@\relax + \labelsep 0.5em + \trivlist + \item[\hskip\labelsep\hskip1em\textsc{#1}.]\ignorespaces +}{\endtrivlist\@endpefalse} + +%%Different Bonds + +\def\sbond{\ensuremath{\raise.25ex\hbox{${-}\!\!\!\!{-}$}}\kern -.9pt} +\def\dbond{\ensuremath{\raise.25ex\hbox{=$\!$=}}} +\def\tbond{\ensuremath{\raise.20ex\hbox{${\equiv}\!\!\!{\equiv}$}}} + +% Author queries +%\fboxsep=4\p@ +%\fboxrule=0.5\p@ +\newcommand{\query}[2][0pt]{}% +% \marginpar{\vspace*{#1}% +% {\parbox{\marginparwidth}{% +% \raggedright\fontsize{6}{8}\selectfont +% #2}}}} + +\renewcommand{\dag}{{\mathversion{normal}$^{\dagger}$}} + +\endinput diff --git a/tex/blasr-mc.eval b/tex/blasr-mc.eval new file mode 100644 index 0000000..1c43148 --- /dev/null +++ b/tex/blasr-mc.eval @@ -0,0 +1,17 @@ +Q 60 32681 57 0.001744133 +Q 39 3 1 0.001774569 +Q 38 3 1 0.001804999 +Q 35 5 1 0.001835311 +Q 34 31 2 0.001894692 +Q 20 11 2 0.001955154 +Q 19 4 1 0.001985460 +Q 15 29 5 0.002136296 +Q 14 6 1 0.002166417 +Q 10 11 1 0.002196193 +Q 6 11 2 0.002256442 +Q 5 1 1 0.002286864 +Q 4 1 1 0.002317285 +Q 3 36 15 0.002771602 +Q 2 5 2 0.002832085 +Q 1 12 9 0.003105023 +Q 0 220 83 0.005594194 diff --git a/tex/bwa.eval b/tex/bwa.eval new file mode 100644 index 0000000..d61596f --- /dev/null +++ b/tex/bwa.eval @@ -0,0 +1,55 @@ +Q 60 31721 27 0.000851171 +Q 59 54 4 0.000975610 +Q 58 29 5 0.001131933 +Q 57 21 2 0.001194030 +Q 56 14 4 0.001319137 +Q 55 22 6 0.001506544 +Q 54 12 4 0.001631475 +Q 53 16 3 0.001724733 +Q 51 10 1 0.001755541 +Q 50 10 1 0.001786330 +Q 49 11 3 0.001879699 +Q 47 8 2 0.001941869 +Q 46 17 1 0.001972140 +Q 44 8 3 0.002065534 +Q 43 10 1 0.002096174 +Q 42 13 1 0.002126595 +Q 41 14 3 0.002219444 +Q 40 13 2 0.002281036 +Q 38 17 4 0.002404747 +Q 37 15 4 0.002528484 +Q 36 12 1 0.002558742 +Q 35 19 3 0.002650783 +Q 34 12 3 0.002743313 +Q 33 7 1 0.002773882 +Q 32 21 3 0.002865508 +Q 31 11 2 0.002926799 +Q 30 14 3 0.003018891 +Q 29 17 1 0.003048401 +Q 28 11 2 0.003109549 +Q 27 20 5 0.003262998 +Q 26 11 1 0.003292948 +Q 25 14 4 0.003415725 +Q 24 16 5 0.003569212 +Q 23 43 6 0.003750426 +Q 21 15 1 0.003779664 +Q 20 29 7 0.003992943 +Q 19 22 2 0.004052089 +Q 18 28 4 0.004172204 +Q 16 25 5 0.004323390 +Q 15 24 5 0.004474480 +Q 14 25 5 0.004625204 +Q 13 23 3 0.004714365 +Q 12 22 1 0.004741963 +Q 11 32 11 0.005075674 +Q 10 35 7 0.005285315 +Q 9 32 12 0.005648503 +Q 8 33 8 0.005888126 +Q 7 39 7 0.006095506 +Q 6 42 14 0.006515953 +Q 5 38 15 0.006966725 +Q 4 37 12 0.007325113 +Q 3 49 18 0.007862737 +Q 2 63 21 0.008486434 +Q 1 55 27 0.009292156 +Q 0 153 77 0.011576593 diff --git a/tex/eval2roc.pl b/tex/eval2roc.pl new file mode 100755 index 0000000..9c32e38 --- /dev/null +++ b/tex/eval2roc.pl @@ -0,0 +1,33 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use Getopt::Std; + +my %opts = (n=>33088, s=>100); +getopts('n:', \%opts); + +my $pseudo = .5; +my $tot = $pseudo; +my $err = $pseudo; +my $tot_last_out = -$opts{s}; +my $state = 0; +my $mapq = 0; +while (<>) { + chomp; + if (/^Q\t(\d+)\t(\d+)\t(\d+)/) { + $tot += $2; + $err += $3; + if ($tot - $tot_last_out >= $opts{s}) { + print join("\t", $1, $err/$tot, $tot / $opts{n}), "\n"; + $tot_last_out = $tot; + $state = 0; + } else { + $state = 1; + $mapq = $1; + } + } +} +if ($state) { + print join("\t", $mapq, $err/$tot, $tot / $opts{n}), "\n"; +} diff --git a/tex/graphmap.eval b/tex/graphmap.eval new file mode 100644 index 0000000..8994281 --- /dev/null +++ b/tex/graphmap.eval @@ -0,0 +1,4 @@ +Q 40 31897 63 0.001975107 +Q 3 423 267 0.010210396 +Q 2 162 120 0.013853827 +Q 1 188 172 0.019038874 diff --git a/tex/hs38-simu.sh b/tex/hs38-simu.sh new file mode 100644 index 0000000..8fce7ac --- /dev/null +++ b/tex/hs38-simu.sh @@ -0,0 +1,4 @@ +./pbsim --prefix pb-1 --depth 0.1 --sample-fastq m131017_060208_42213_c100579642550000001823095604021496_s1_p0.1.subreads.fastq --length-min 1000 --length-max 30000 --seed 11 hs38.fa + +bin/mason_variator -ir hs38.fa -s 1 -ov hs38-s1.vcf --snp-rate 1e-3 --small-indel-rate 2e-4 --sv-indel-rate 0 --sv-inversion-rate 0 --sv-translocation-rate 0 --sv-duplication-rate 0 --max-small-indel-size 10 +bin/mason_simulator -ir hs38.fa -iv hs38-s1.vcf -n 1000000 --seed 1 -o s1_1.fq -or s1_2.fq -oa s1.sam --illumina-prob-mismatch-scale 2.5 diff --git a/tex/minialign.eval b/tex/minialign.eval new file mode 100644 index 0000000..246ddb6 --- /dev/null +++ b/tex/minialign.eval @@ -0,0 +1,49 @@ +Q 60 32070 190 0.005924540 +Q 59 62 2 0.005975352 +Q 58 37 5 0.006123908 +Q 57 40 7 0.006333633 +Q 56 39 6 0.006512032 +Q 55 32 2 0.006567534 +Q 54 54 2 0.006618420 +Q 53 33 4 0.006735255 +Q 52 39 2 0.006788866 +Q 51 48 3 0.006871264 +Q 50 34 2 0.006925634 +Q 49 32 3 0.007011070 +Q 48 35 2 0.007064967 +Q 47 36 4 0.007179896 +Q 46 23 1 0.007205495 +Q 45 25 1 0.007230614 +Q 44 17 3 0.007318716 +Q 43 17 2 0.007376121 +Q 42 31 5 0.007522016 +Q 41 25 4 0.007638486 +Q 40 26 4 0.007754541 +Q 39 35 2 0.007807258 +Q 37 18 4 0.007924896 +Q 36 13 3 0.008013162 +Q 35 15 2 0.008070411 +Q 34 20 3 0.008156805 +Q 33 11 1 0.008184501 +Q 32 15 3 0.008272003 +Q 31 25 1 0.008296107 +Q 29 8 1 0.008324472 +Q 28 7 2 0.008383452 +Q 27 9 2 0.008441894 +Q 26 30 2 0.008494888 +Q 23 2 1 0.008524710 +Q 22 11 3 0.008612846 +Q 20 23 3 0.008697760 +Q 19 6 1 0.008726479 +Q 18 8 1 0.008754658 +Q 16 6 1 0.008783354 +Q 13 2 1 0.008813108 +Q 12 4 2 0.008872604 +Q 11 7 2 0.008931275 +Q 10 4 3 0.009021009 +Q 9 6 4 0.009140436 +Q 8 6 3 0.009229559 +Q 7 5 1 0.009258419 +Q 6 8 3 0.009346925 +Q 4 8 5 0.009495872 +Q 3 17 8 0.009732801 diff --git a/tex/minimap2.bib b/tex/minimap2.bib new file mode 100644 index 0000000..75ace65 --- /dev/null +++ b/tex/minimap2.bib @@ -0,0 +1,181 @@ +@article{Chaisson:2012aa, + Author = {Chaisson, Mark J and Tesler, Glenn}, + Journal = {BMC Bioinformatics}, + Pages = {238}, + Title = {{Mapping single molecule sequencing reads using basic local alignment with successive refinement (BLASR): application and theory}}, + Volume = {13}, + Year = {2012}} + +@article{Liu:2016ab, + Author = {Liu, Bo and others}, + Journal = {Bioinformatics}, + Pages = {1625-31}, + Title = {{rHAT}: fast alignment of noisy long reads with regional hashing}, + Volume = {32}, + Year = {2016}} + +@article{Liu:2017aa, + Author = {Liu, Bo and others}, + Journal = {Bioinformatics}, + Pages = {192-201}, + Title = {{LAMSA}: fast split read alignment with long approximate matches}, + Volume = {33}, + Year = {2017}} + +@article{Lin:2017aa, + Author = {Lin, Hsin-Nan and Hsu, Wen-Lian}, + Journal = {Bioinformatics}, + Title = {Kart: a divide-and-conquer algorithm for {NGS} read alignment}, + Year = {2017}} + +@article{Li:2013aa, + Author = {Li, Heng}, + Journal = {arXiv:1303.3997}, + Title = {Aligning sequence reads, clone sequences and assembly contigs with {BWA-MEM}}, + archivePrefix = "arXiv", + eprint = {1303.3997}, + primaryClass = "q-bio", + Year = {2013}} + +@article{Sovic:2016aa, + Author = {Sovi{\'c}, Ivan and others}, + Journal = {Nat Commun}, + Pages = {11307}, + Title = {Fast and sensitive mapping of nanopore sequencing reads with {GraphMap}}, + Volume = {7}, + Year = {2016}} + +@article{Langmead:2012fk, + Author = {Langmead, Ben and Salzberg, Steven L}, + Journal = {Nat Methods}, + Pages = {357-9}, + Title = {Fast gapped-read alignment with {Bowtie} 2}, + Volume = {9}, + Year = {2012}} + +@article{Li:2016aa, + Author = {Li, Heng}, + Journal = {Bioinformatics}, + Pages = {2103-10}, + Title = {Minimap and miniasm: fast mapping and de novo assembly for noisy long sequences}, + Volume = {32}, + Year = {2016}} + +@misc{Suzuki:2016, + title = {Fast and accurate alignment tool for PacBio and Nanopore long reads}, + author = {Hajime Suzuki}, + journal = {Unpublished}, + howpublished = {\href{https://github.com/ocxtal/minialign}{https://github.com/ocxtal/minialign}}, + year = {2016}} + +@misc{Ruan:2016, + title = {Ultra-fast de novo assembler using long noisy reads}, + author = {Jue Ruan}, + journal = {Unpulished}, + howpublished = {\href{https://github.com/ruanjue/smartdenovo}{https://github.com/ruanjue/smartdenovo}}, + year = {2016}} + +@article{Miller:1988aa, + Author = {Miller, W and Myers, E W}, + Journal = {Bull Math Biol}, + Number = {2}, + Pages = {97-120}, + Title = {Sequence comparison with concave weighting functions}, + Volume = {50}, + Year = {1988}} + +@article{Gotoh:1990aa, + Author = {Gotoh, O}, + Journal = {Bull Math Biol}, + Pages = {359-73}, + Title = {Optimal sequence alignment allowing for long gaps}, + Volume = {52}, + Year = {1990}} + +@article{Wu:1996aa, + Author = {Wu, Sun and others}, + Journal = {Algorithmica}, + Pages = {50-67}, + Title = {A subquadratic algorithm for approximate limited expression matching}, + Volume = {15}, + Year = {1996}} + +@article{Daily:2016aa, + Author = {Daily, Jeff}, + Journal = {BMC Bioinformatics}, + Month = {Feb}, + Pages = {81}, + Title = {Parasail: {SIMD C} library for global, semi-global, and local pairwise sequence alignments}, + Volume = {17}, + Year = {2016}} + +@article{Sedlazeck169557, + author = {Sedlazeck, Fritz J and others}, + title = {Accurate detection of complex structural variations using single molecule sequencing}, + note = {doi:10.1101/169557}, + journal = {bioRxiv}, + year = {2017}} + +@article{Altschul:1997vn, + Author = {Altschul, S F and others}, + Journal = {Nucleic Acids Res}, + Pages = {3389-402}, + Title = {Gapped {BLAST} and {PSI-BLAST}: a new generation of protein database search programs}, + Volume = {25}, + Year = {1997}} + +@article{Sosic:2017aa, + Author = {{\v S}o{\v s}i\'{c}, Martin and {\v S}ikic, Mile}, + Journal = {Bioinformatics}, + Pages = {1394-1395}, + Title = {Edlib: a {C/C++} library for fast, exact sequence alignment using edit distance}, + Volume = {33}, + Year = {2017}} + +@article{Abouelhoda:2005aa, + Author = {Mohamed Ibrahim Abouelhoda and Enno Ohlebusch}, + Journal = {J. Discrete Algorithms}, + Pages = {321-41}, + Title = {Chaining algorithms for multiple genome comparison}, + Volume = {3}, + Year = {2005}} + +@article{Ono:2013aa, + Author = {Ono, Yukiteru and others}, + Journal = {Bioinformatics}, + Pages = {119-21}, + Title = {{PBSIM}: {PacBio} reads simulator--toward accurate genome assembly}, + Volume = {29}, + Year = {2013}} + +@article {Jain128835, + author = {Jain, Miten and others}, + title = {Nanopore sequencing and assembly of a human genome with ultra-long reads}, + year = {2017}, + note = {doi:10.1101/128835}, + publisher = {Cold Spring Harbor Labs Journals}, + journal = {bioRxiv}} + +@article{Lau:2016aa, + Author = {Lau, Bayo and others}, + Journal = {Bioinformatics}, + Pages = {3829-3832}, + Title = {{LongISLND}: in silico sequencing of lengthy and noisy datatypes}, + Volume = {32}, + Year = {2016}} + +@article{Robinson:2011aa, + Author = {Robinson, James T and others}, + Journal = {Nat Biotechnol}, + Pages = {24-6}, + Title = {Integrative genomics viewer}, + Volume = {29}, + Year = {2011}} + +@article {Suzuki130633, + author = {Suzuki, Hajime and Kasahara, Masahiro}, + title = {Acceleration Of Nucleotide Semi-Global Alignment With Adaptive Banded Dynamic Programming}, + year = {2017}, + note = {doi:10.1101/130633}, + publisher = {Cold Spring Harbor Labs Journals}, + journal = {bioRxiv}} diff --git a/tex/minimap2.tex b/tex/minimap2.tex new file mode 100644 index 0000000..cd20bf7 --- /dev/null +++ b/tex/minimap2.tex @@ -0,0 +1,344 @@ +\documentclass{bioinfo} +\copyrightyear{2017} +\pubyear{2017} + +\usepackage{graphicx} +\usepackage{hyperref} +\usepackage{url} +\usepackage{amsmath} +\usepackage[ruled,vlined]{algorithm2e} +\newcommand\mycommfont[1]{\footnotesize\rmfamily{\it #1}} +\SetCommentSty{mycommfont} +\SetKwComment{Comment}{$\triangleright$\ }{} + +\usepackage{natbib} +\bibliographystyle{apalike} +\usepackage{hyperref} + +\DeclareMathOperator*{\argmax}{argmax} + +\begin{document} +\firstpage{1} + +\title[Long DNA sequence alignment with minimap2]{Minimap2: fast pairwise alignment for long DNA sequences} +\author[Li]{Heng Li} +\address{Broad Institute, 415 Main Street, Cambridge, MA 02142, USA} + +\maketitle + +\begin{abstract} +\section{Summary:} Minimap2 is a general-purpose mapper to align long noisy DNA +sequences against a large reference database. It targets query sequences of +1kb--100Mb in length with per-base divergence typically below 25\%. Minimap2 is +$\sim$30 times faster than many mainstream long-read aligners and achieves +higher accuracy on simulated data. It also employs concave gap cost and rescues +inversions for improved alignment around potential structural variations. + +\section{Availability and implementation:} +\href{https://github.com/lh3/minimap2}{https://github.com/lh3/minimap2} + +\section{Contact:} hengli@broadinstitute.org +\end{abstract} + +\section{Introduction} + +Single Molecule Real-Time (SMRT) sequencing technology and Oxford Nanopore +technologies (ONT) produce reads over 10kbp in length at an error rate +$\sim$15\%. Several aligners have been developed for such +data~\citep{Chaisson:2012aa,Li:2013aa,Liu:2016ab,Sovic:2016aa,Liu:2017aa,Lin:2017aa,Sedlazeck169557}. +They are usually five times as slow as mainstream short-read +aligners~\citep{Langmead:2012fk,Li:2013aa}. We speculated there could be +substantial room for speedup on the thought that 10kb long sequences should be +easier to map than 100bp reads because we can more effectively skip repetitive +regions, which are often the bottleneck of short-read alignment. We confirmed +our speculation by achieving approximate mapping 50 times faster than +BWA-MEM~\citep{Li:2016aa}. \citet{Suzuki:2016} extended our work with a fast +and novel algorithm on generating detailed alignment, which in turn inspired us +to develop minimap2 towards higher accuracy and more practical functionality. + +\begin{methods} +\section{Methods} + +Minimap2 is the successor of minimap~\citep{Li:2016aa}. It uses similar +indexing and seeding algorithms, and furthers it with more accurate chaining +and the ability to produce detailed alignment. + +\subsection{Chaining} + +%\subsubsection{Chaining} +An \emph{anchor} is a 3-tuple $(x,y,w)$, indicating interval $[x-w+1,x]$ on the +reference matching interval $[y-w+1,y]$ on the query. Given a list of anchors +sorted by ending reference position $x$, let $f(i)$ be the maximal chaining +score up to the $i$-th anchor in the list. $f(i)$ can be calculated with +dynamic programming (DP): +\begin{equation}\label{eq:chain} +f(i)=\max\big\{\max_{i>j\ge 1} \{ f(j)+d(j,i)-\gamma(j,i) \},w_i\big\} +\end{equation} +where $d(j,i)=\min\big\{\min\{y_i-y_j,x_i-x_j\},w_i\big\}$ is the number of +matching bases between the two anchors. $\gamma(j,i)>0$ is the gap cost. It +equals $\infty$ if $y_j\ge y_i$ or $\max\{y_i-y_j,x_i-x_j\}>G$ (i.e. the +distance between two anchors is too large); otherwise +\[ +\gamma(j,i)=\gamma'(\max\{y_i-y_j,x_i-x_j\}-\min\{y_i-y_j,x_i-x_j\}) +\] +In implementation, a gap of length $l$ costs $\gamma'(l)=\alpha\cdot +l+\beta\log_2(l)$. For $m$ anchors, directly computing all $f(\cdot)$ with +Eq.~(\ref{eq:chain}) takes $O(m^2)$ time. Although theoretically faster +chaining algorithms exist~\citep{Abouelhoda:2005aa}, they +are inapplicable to generic gap cost, complex to implement and usually +associated with a large constant. We introduced a simple heuristic to +accelerate chaining. + +We note that if anchor $i$ is chained to $j$, chaining $i$ to a predecessor +of $j$ is likely to yield a lower score. When evaluating Eq.~(\ref{eq:chain}), +we start from anchor $i-1$ and stop the evaluation if we cannot find a better +score after up to $h$ iterations. This approach reduces the average time to +$O(h\cdot m)$. In practice, we can almost always find the optimal chain with +$h=50$; even if the heuristic fails, the optimal chain is often close. + +%\subsubsection{Backtracking} +For backtracking, let $P(i)$ be the index of the best predecessor of anchor +$i$. It equals 0 if $f(i)=w_i$ or $\argmax_j\{f(j)+\eta(j,i)-\gamma(j,i)\}$ +otherwise. For each anchor $i$ in the descending order of $f(i)$, we apply +$P(\cdot)$ repeatedly to find its predecessor and mark each visited $i$ as +`used', until $P(i)=0$ or we reach an already `used' $i$. This way we find all +chains with no anchors used in more than one chains. + +%\subsubsection{Identifying primary chains} +In the absence of copy number changes, each query segment should not be mapped +to two places in the reference. However, chains found at the previous step may +have significant or complete overlaps due to repeats in the reference. +Minimap2 used the following procedure to identify \emph{primary chains} that do +not greatly overlap on the query. Let $Q$ be an empty set initially. For each +chain from the best to the worst according to their chaining scores: if on the +query, the chain overlaps with a chain in $Q$ by 50\% or higher percentage of +the shorter chain, mark the chain as secondary to the chain in $Q$; otherwise, +add the chain to $Q$. In the end, $Q$ contains all the primary chains. We did +not choose a more sophisticated data structure (e.g. range tree or k-d tree) +because this step is not the performance bottleneck. + +\subsection{Alignment} + +Minimap2 performs global alignment between adjacent anchors in a chain. It +adopted difference-based formulation to derive +alignment~\citep{Wu:1996aa,Suzuki:2016}. When combined with SSE vectorization, +this formulation has two advantages. First, because each score in the DP matrix +is bounded by the gap cost and the maximum matching score, we can usually +achieve 16-way SSE vectorization regardless of the peak score of the +alignment. Second, filling the DP matrix along the diagonal, we can simplify +banded alignment, which is critical to performance. In practice, our +implementation is three times as fast as Parasail's 4-way +vectorization~\citep{Daily:2016aa} for global alignment. +Without banding, our implementation is slower than Edlib~\citep{Sosic:2017aa}, +but with a 1000bp band, it is considerably faster. When performing global +alignment between anchors, we expect the alignment to stay close to the +diagonal of the DP matrix. Banding is often applicable. + +Minimap2 uses a 2-piece affine gap cost +$\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\}$. +On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, this +cost function is concave. It applies cost $q+l\cdot e$ to gaps shorter than +$\lceil(\tilde{q}-q)/(e-\tilde{e})\rceil$ and applies +$\tilde{q}+l\cdot\tilde{e}$ to longer gaps. This scheme helps to recover +longer insertions and deletions~(INDEL; \citealp{Gotoh:1990aa}). + +With global alignment, minimap2 may force to align unrelated sequences between +two adjacent anchors. To avoid such an artifact, we compute accumulative +alignment score along the alignment path and break the alignment where the +score drops too fast in the diagonal direction. More precisely, let $S(i,j)$ be +the alignment score along the alignment path ending at cell $(i,j)$ in the DP +matrix. We break the alignment if there exist $(i',j')$ and $(i,j)$, $i'Z+e\cdot(\max\{i-i',j-j'\}-\min\{i-i',j-j'\}) +\] +where $e$ is the gap extension cost and $Z$ is an arbitrary threshold. +This strategy is similar to X-drop employed in BLAST~\citep{Altschul:1997vn}. +However, unlike X-drop, it would not break the alignment in the presence of a +single long gap. + +When minimap2 breaks a global alignment between two anchors, it performs local +alignment between the two subsequences involved in the global alignment, but +this time with the one subsequence reverse complemented. This additional +alignment step may identify short inversions that are missed during chaining. + +\end{methods} + +\section{Results} +\begin{figure}[!tb] +\centering +\includegraphics[width=.5\textwidth]{roc-color.pdf} +\caption{Evaluation on simulated SMRT reads aligned against human genome +GRCh38. (a) ROC-like curve. (b) Accumulative mapping error rate as a function +of mapping quality. 33,088 $\ge$1000bp reads were simulated using +pbsim~\citep{Ono:2013aa} with error profile sampled from file +`m131017\_060208\_42213\_*.1.*' downloaded at +\href{http://bit.ly/chm1p5c3}{http://bit.ly/chm1p5c3}. The N50 read length is +11,628. A read is considered correctly mapped if the true position overlaps +with the best mapping position by 10\% of the read length. All aligners were +run under the default setting for SMRT reads.}\label{fig:eval} +\end{figure} + +As a sanity check, we evaluated minimap2 on simulated human reads along with +BLASR~\citep{Chaisson:2012aa}, +BWA-MEM~\citep{Li:2013aa}, +GraphMap~\citep{Sovic:2016aa}, +minialign~\citep{Suzuki:2016} and +NGMLR~\citep{Sedlazeck169557}. We excluded rHAT~\citep{Liu:2016ab}, +LAMSA~\citep{Liu:2017aa} and Kart~\citep{Lin:2017aa} because they either +crashed or produced malformatted output. In this evaluation, Minimap2 has +higher power to distinguish unique and repetitive hits, and achieves overall +higher mapping accuracy (Fig.~\ref{fig:eval}a). It is still the most accurate +even if we skip DP-based alignment (data not shown), suggesting chaining alone +is sufficient to achieve high accuracy for approximate mapping. Minimap2 and +NGMLR provide better mapping quality estimate: they rarely give repetitive hits +high mapping quality (Fig.~\ref{fig:eval}b). Apparently, other aligners may +occasionally miss close suboptimal hits and be overconfident in wrong mappings. +On run time, minialign is slightly faster than minimap2. They are over 30 times +faster than the rest. Minimap2 consumed 6.1GB memory at the peak, more than +BWA-MEM but less than others. + +On real human SMRT reads, the relative performance and sensitivity of +these aligners are broadly similar to those on simulated data. We are unable to +provide a good estimate of mapping error rate due to the lack of the truth. On +ONT ultra-long human reads~\citep{Jain128835}, BWA-MEM failed. Minialign and +minimap2 are over 70 times faster than others. We have also examined tens of +$\ge$100bp INDELs in IGV~\citep{Robinson:2011aa} and can confirm the +observation by~\citet{Sedlazeck169557} that BWA-MEM often breaks them into +shorter gaps. Minimap2 does not have this issue. + +\section{Discussions} + +Minialign and minimap2 are fast because a) with chaining, they can quickly +filter out most false seed hits~\citep{Li:2016aa} and reduce unsuccessful but +costly DP-based alignments; b) they implemented so far the fastest DP-based +alignment algorithm for long sequences~\citep{Suzuki:2016}. It is possible to +further accelerate minimap2 with a few other tweaks such as adaptive +banding~\citep{Suzuki130633} or incremental banding. + +In addition to reference-based read mapping, minimap2 inherits minimap's +ability to search against huge multi-species databases and to find read +overlaps. On a few test data sets, minimap2 appears to yield slightly better +miniasm assembly. Minimap2 can also align closely related genomes, though it +would benefit from more thorough evaluations. Genome alignment is an intricate +topic. + +\section*{Acknowledgements} +We owe a debt of gratitude to Hajime Suzuki for releasing his masterpiece and +insightful notes before formal publication. We thank M. Schatz, P. Rescheneder +and F. Sedlazeck for pointing out the limitation of BWA-MEM. We are also +grateful to early minimap2 testers who have greatly helped to fix various +issues. + +\bibliography{minimap2} + +\pagebreak +\appendix + +\begin{methods} +\section*{Appendix} +A 2-piece gap cost function is +\[ +\gamma(l)=\min\{q+l\cdot e,\tilde{q}+l\cdot\tilde{e}\} +\] +Without losing generality, we assume $q+e\le\tilde{q}+\tilde{e}$. The equation +to compute the optimal alignment under such a gap cost is~\citep{Gotoh:1990aa} +\begin{equation}\label{eq:ae86} +\left\{\begin{array}{l} +H_{ij} = \max\{H_{i-1,j-1}+s(i,j),E_{ij},F_{ij},\tilde{E}_{ij},\tilde{F}_{ij}\}\\ +E_{i+1,j}= \max\{H_{ij}-q,E_{ij}\}-e\\ +F_{i,j+1}= \max\{H_{ij}-q,F_{ij}\}-e\\ +\tilde{E}_{i+1,j}= \max\{H_{ij}-\tilde{q},\tilde{E}_{ij}\}-\tilde{e}\\ +\tilde{F}_{i,j+1}= \max\{H_{ij}-\tilde{q},\tilde{F}_{ij}\}-\tilde{e} +\end{array}\right. +\end{equation} +where $s(i,j)$ is the score between the $i$-th reference base and $j$-th query +base. If we define +\[ +\left\{\begin{array}{ll} +u_{ij}\triangleq H_{ij}-H_{i-1,j} & v_{ij}\triangleq H_{ij}-H_{i,j-1} \\ +x_{ij}\triangleq E_{i+1,j}-H_{ij} & \tilde{x}_{ij}\triangleq \tilde{E}_{i+1,j}-\tilde{H}_{ij} \\ +y_{ij}\triangleq F_{i,j+1}-H_{ij} & \tilde{y}_{ij}\triangleq \tilde{F}_{i,j+1}-\tilde{H}_{ij} +\end{array}\right. +\] +we can transform Eq.~(\ref{eq:ae86}) to +\begin{equation}\label{eq:suzuki} +\left\{\begin{array}{lll} +z_{ij}&=&\max\{s(i,j),x_{i-1,j}+v_{i-1,j},y_{i,j-1}+u_{i,j-1},\\ +&&\tilde{x}_{i-1,j}+v_{i-1,j},\tilde{y}_{i,j-1}+u_{i,j-1}\}\\ +u_{ij}&=&z_{ij}-v_{i-1,j}\\ +v_{ij}&=&z_{ij}-u_{i,j-1}\\ +x_{ij}&=&\max\{0,x_{i-1,j}+v_{i-1,j}-z_{ij}+q\}-q-e\\ +y_{ij}&=&\max\{0,y_{i,j-1}+u_{i,j-1}-z_{ij}+q\}-q-e\\ +\tilde{x}_{ij}&=&\max\{0,\tilde{x}_{i-1,j}+v_{i-1,j}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e}\\ +\tilde{y}_{ij}&=&\max\{0,\tilde{y}_{i,j-1}+u_{i,j-1}-z_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e} +\end{array}\right. +\end{equation} +where $z_{ij}$ is a temporary variable that does not need to be stored. + +All values in Eq.~(\ref{eq:suzuki}) are bounded. To see that, +\[ +x_{ij}=E_{i+1,j}-H_{ij}=\max\{-q,E_{ij}-H_{ij}\}-e +\] +With $E_{ij}\le H_{ij}$, we have +\[ +-q-e\le x_{ij}\le\max\{-q,0\}-e=-e +\] +and similar inequations for $y_{ij}$, $\tilde{x}_{ij}$ and $\tilde{y}_{ij}$. +In addition, +\[ +u_{ij}=z_{ij}-v_{i-1,j}\ge\max\{x_{i-1,j},\tilde{x}_{i-1,j}\}\ge-q-e +\] +As the maximum value of $z_{ij}=H_{ij}-H_{i-1,j-1}$ is $M$, the maximal +matching score, we can derive +\[ +u_{ij}\le M-v_{i-1,j}\le M+q+e +\] +In conclusion, $x$ and $y$ are bounded by $[-q-e,-e]$, $\tilde{x}$ and $\tilde{y}$ by +$[-\tilde{q}-\tilde{e},-\tilde{e}]$, and $u$ and $v$ by $[-q-e,M+q+e]$. When +matching score and gap cost are small, each of them can be stored as a 8-bit +integer. This enables 16-way SSE vectorization regardless of the peak score of +the alignment. + +For a more efficient SSE implementation, we transform the row-column coordinate +to diagonal-anti-diagonal coordinate by letting $r\gets i+j$ and $t\gets i$. +Eq.~(\ref{eq:suzuki}) becomes: +\begin{equation*} +\left\{\begin{array}{lll} +z_{rt}&=&\max\{s(t,r-t),x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}+u_{r-1,t},\\ +&&\tilde{x}_{r-1,t-1}+v_{r-1,t-1},\tilde{y}_{r-1,t}+u_{r-1,t}\}\\ +u_{rt}&=&z_{rt}-v_{r-1,t-1}\\ +v_{rt}&=&z_{rt}-u_{r-1,t}\\ +x_{rt}&=&\max\{0,x_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+q\}-q-e\\ +y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\}-q-e\\ +\tilde{x}_{rt}&=&\max\{0,\tilde{x}_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+\tilde{q}\}-\tilde{q}-\tilde{e}\\ +\tilde{y}_{rt}&=&\max\{0,\tilde{y}_{r-1,t}+u_{r-1,t}-z_{rt}+\tilde{q}\}-\tilde{q}-\tilde{e} +\end{array}\right. +\end{equation*} +In this formulation, cells with the same diagonal index $r$ are independent of +each other. This allows us to fully vectorize the computation of all cells on +the same anti-diagonal in one inner loop. + +On the condition that $q+e<\tilde{q}+\tilde{e}$ and $e>\tilde{e}$, the boundary +condition of this equation in the diagonal-anti-diagonal coordinate is +\[ +\left\{\begin{array}{l} +x_{r-1,-1}=y_{r-1,r}=-q-e\\ +\tilde{x}_{r-1,-1}=\tilde{y}_{r-1,r}=-\tilde{q}-\tilde{e}\\ +u_{r-1,r}=v_{r-1,-1}=\eta(r)\\ +\end{array}\right. +\] +where +\[ +\eta(r)=\left\{\begin{array}{ll} +-q-e & (r=0) \\ +-e & (r<\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\ +r\cdot(e-\tilde{e})-(\tilde{q}-q)-\tilde{e} & (r=\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \\ +-\tilde{e} & (r>\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) +\end{array}\right. +\] + +\citet{Suzuki:2016} first derived a similar set of equations under affine gap +cost but with different notations. +\end{methods} +\end{document} diff --git a/tex/mm2.approx.eval b/tex/mm2.approx.eval new file mode 100644 index 0000000..60b9c9f --- /dev/null +++ b/tex/mm2.approx.eval @@ -0,0 +1,30 @@ +Q 60 32066 0 0.000000000 +Q 40 32 1 0.000031155 +Q 38 19 1 0.000062272 +Q 36 11 1 0.000093376 +Q 35 32 1 0.000124378 +Q 33 15 1 0.000155400 +Q 32 58 1 0.000186145 +Q 27 11 1 0.000217095 +Q 26 80 1 0.000247494 +Q 21 19 2 0.000309186 +Q 20 16 1 0.000339936 +Q 19 19 1 0.000370622 +Q 18 22 2 0.000432099 +Q 17 37 5 0.000585751 +Q 15 24 2 0.000646930 +Q 14 18 3 0.000738939 +Q 13 30 6 0.000922821 +Q 12 18 1 0.000953054 +Q 11 29 2 0.001013638 +Q 10 30 1 0.001043393 +Q 9 20 5 0.001196099 +Q 8 25 8 0.001440348 +Q 7 28 6 0.001622830 +Q 6 35 12 0.001988132 +Q 5 34 12 0.002352725 +Q 4 29 8 0.002594865 +Q 3 36 14 0.003018937 +Q 2 46 15 0.003471482 +Q 1 69 36 0.004558162 +Q 0 167 94 0.007377173 diff --git a/tex/mm2.eval b/tex/mm2.eval new file mode 100644 index 0000000..8f60797 --- /dev/null +++ b/tex/mm2.eval @@ -0,0 +1,17 @@ +Q 60 32072 0 0.000000000 +Q 43 206 1 0.000030981 +Q 27 201 1 0.000061578 +Q 15 59 1 0.000092200 +Q 12 25 1 0.000122839 +Q 11 16 1 0.000153473 +Q 10 24 1 0.000184032 +Q 9 17 2 0.000245248 +Q 8 27 3 0.000336938 +Q 7 23 1 0.000367309 +Q 6 20 1 0.000397675 +Q 5 18 4 0.000519751 +Q 4 17 1 0.000550038 +Q 3 29 5 0.000702204 +Q 2 32 4 0.000823522 +Q 1 54 6 0.001004872 +Q 0 234 106 0.004202697 diff --git a/tex/natbib.bst b/tex/natbib.bst new file mode 100644 index 0000000..a679e1d --- /dev/null +++ b/tex/natbib.bst @@ -0,0 +1,1288 @@ +%% +%% This is file `natbib.bst', generated +%% on <1994/9/16> with the docstrip utility (2.2h). +%% +%% The original source files were: +%% +%% genbst.mbs (with options: `ay,nat,seq-lab,nm-rev,dt-beg,yr-par,vol-bf, +%% volp-com,etal-it') +%% ---------------------------------------- +%% *** Personal bib style, PWD *** +%% +%% (Here are the specifications of the source file) +%% \ProvidesFile{genbst.mbs}[1994/09/16 1.5 (PWD)] +%% For use with BibTeX version 0.99a or later +%% and with LaTeX 2.09 or 2e +%%------------------------------------------------------------------- +%% NOTICE: +%% This file may be used for non-profit purposes. +%% It may not be distributed in exchange for money, +%% other than distribution costs. +%% +%% The author provides it `as is' and does not guarantee it in any way. +%% +%% Copyright (C) 1994 Patrick W. Daly +%% Max-Planck-Institut f\"ur Aeronomie +%% Postfach 20 +%% D-37189 Katlenburg-Lindau +%% Germany +%% +%% E-mail: +%% SPAN-- nsp::linmpi::daly (note nsp also known as ecd1) +%% Internet-- daly@linmpi.dnet.gwdg.de +%%----------------------------------------------------------- +%% \CharacterTable +%% {Upper-case \A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z +%% Lower-case \a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z +%% Digits \0\1\2\3\4\5\6\7\8\9 +%% Exclamation \! Double quote \" Hash (number) \# +%% Dollar \$ Percent \% Ampersand \& +%% Acute accent \' Left paren \( Right paren \) +%% Asterisk \* Plus \+ Comma \, +%% Minus \- Point \. Solidus \/ +%% Colon \: Semicolon \; Less than \< +%% Equals \= Greater than \> Question mark \? +%% Commercial at \@ Left bracket \[ Backslash \\ +%% Right bracket \] Circumflex \^ Underscore \_ +%% Grave accent \` Left brace \{ Vertical bar \| +%% Right brace \} Tilde \~} +%%--------------------------------------------------------------------- + % This is an author-year citation style bibliography. As such, it is + % non-standard LaTeX, and requires a special package file to function properly. + % Such a package is natbib.sty by Patrick W. Daly + % The form of the \bibitem entries is + % \bibitem[Jones et al.(1990)]{key}... + % \bibitem[Jones et al.(1990)Jones, Baker, and Smith]{key}... + % The essential feature is that the label (the part in brackets) consists + % of the author names, as they should appear in the citation, with the year + % in parentheses following. There must be no space before the opening + % parenthesis! + % With natbib v5.3, a full list of authors may also follow the year. + % In natbib.sty, it is possible to define the type of enclosures that is + % really wanted (brackets or parentheses), but in either case, there must + % be parentheses in the label. + % The \cite command functions as follows: + % \cite{key} ==>> Jones et al. (1990) + % \cite[]{key} ==>> (Jones et al., 1990) + % \cite[chap. 2]{key} ==>> (Jones et al., 1990, chap. 2) + % \cite[e.g.][]{key} ==>> (e.g. Jones et al., 1990) + % \cite[e.g.][p. 32]{key} ==>> (e.g. Jones et al., p. 32) + % \citeauthor{key} Jones et al. + % \citefullauthor{key} Jones, Baker, and Smith + % \citeyear{key} 1990 +%%--------------------------------------------------------------------- + +ENTRY + { address + author + booktitle + chapter + edition + editor + howpublished + institution + journal + key + month + note + number + organization + pages + publisher + school + series + title + type + volume + year + } + {} + { label extra.label sort.label } + +INTEGERS { output.state before.all mid.sentence after.sentence after.block } + +FUNCTION {init.state.consts} +{ #0 'before.all := + #1 'mid.sentence := + #2 'after.sentence := + #3 'after.block := +} + +STRINGS { s t } + +FUNCTION {output.nonnull} +{ 's := + output.state mid.sentence = + { ", " * write$ } + { output.state after.block = + { add.period$ write$ + newline$ + "\newblock " write$ + } + { output.state before.all = + 'write$ + { add.period$ " " * write$ } + if$ + } + if$ + mid.sentence 'output.state := + } + if$ + s +} + +FUNCTION {output} +{ duplicate$ empty$ + 'pop$ + 'output.nonnull + if$ +} + +FUNCTION {output.check} +{ 't := + duplicate$ empty$ + { pop$ "empty " t * " in " * cite$ * warning$ } + 'output.nonnull + if$ +} + +FUNCTION {fin.entry} +{ add.period$ + write$ + newline$ +} + +FUNCTION {new.block} +{ output.state before.all = + 'skip$ + { after.block 'output.state := } + if$ +} + +FUNCTION {new.sentence} +{ output.state after.block = + 'skip$ + { output.state before.all = + 'skip$ + { after.sentence 'output.state := } + if$ + } + if$ +} + +FUNCTION {not} +{ { #0 } + { #1 } + if$ +} + +FUNCTION {and} +{ 'skip$ + { pop$ #0 } + if$ +} + +FUNCTION {or} +{ { pop$ #1 } + 'skip$ + if$ +} + +FUNCTION {non.stop} +{ duplicate$ + "}" * add.period$ + #-1 #1 substring$ "." = +} + +FUNCTION {new.block.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.block + if$ +} + +FUNCTION {field.or.null} +{ duplicate$ empty$ + { pop$ "" } + 'skip$ + if$ +} + +FUNCTION {emphasize} +{ duplicate$ empty$ + { pop$ "" } + { "{\em " swap$ * non.stop + { "\/}" * } + { "}" * } + if$ + } + if$ +} + +FUNCTION {bolden} +{ duplicate$ empty$ + { pop$ "" } + { "{\bf " swap$ * "}" * } + if$ +} + +INTEGERS { nameptr namesleft numnames } + +FUNCTION {format.names} +{ 's := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr + "{vv~}{ll}{, jj}{, f.}" format.name$ 't := + nameptr #1 > + { + namesleft #1 > + { ", " * t * } + { + numnames #2 > + { "," * } + 'skip$ + if$ + t "others" = + { " " * "et~al." emphasize * } + { " and " * t * } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {format.names.ed} +{ 's := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr + "{f.~}{vv~}{ll}{, jj}" + format.name$ 't := + nameptr #1 > + { + namesleft #1 > + { ", " * t * } + { + numnames #2 > + { "," * } + 'skip$ + if$ + t "others" = + { " " * "et~al." emphasize * } + { " and " * t * } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {format.key} +{ empty$ + { key field.or.null } + { "" } + if$ +} + +FUNCTION {format.authors} +{ author empty$ + { "" } + { author format.names } + if$ +} + +FUNCTION {format.editors} +{ editor empty$ + { "" } + { editor format.names + editor num.names$ #1 > + { ", editors" * } + { ", editor" * } + if$ + } + if$ +} + +FUNCTION {format.in.editors} +{ editor empty$ + { "" } + { editor format.names.ed + editor num.names$ #1 > + { ", editors" * } + { ", editor" * } + if$ + } + if$ +} + +FUNCTION {format.title} +{ title empty$ + { "" } + { title "t" change.case$ + } + if$ +} + +FUNCTION {format.full.names} +{'s := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr + "{vv~}{ll}" format.name$ 't := + nameptr #1 > + { + namesleft #1 > + { ", " * t * } + { + numnames #2 > + { "," * } + 'skip$ + if$ + t "others" = + { " " * "et~al." emphasize * } + { " and " * t * } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {author.editor.key.full} +{ author empty$ + { editor empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { editor format.full.names } + if$ + } + { author format.full.names } + if$ +} + +FUNCTION {author.key.full} +{ author empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { author format.full.names } + if$ +} + +FUNCTION {editor.key.full} +{ editor empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { editor format.full.names } + if$ +} + +FUNCTION {make.full.names} +{ type$ "book" = + type$ "inbook" = + or + 'author.editor.key.full + { type$ "proceedings" = + 'editor.key.full + 'author.key.full + if$ + } + if$ +} + +FUNCTION {output.bibitem} +{ newline$ + "\bibitem[" write$ + label write$ + ")" make.full.names * "]{" * write$ + cite$ write$ + "}" write$ + newline$ + "" + before.all 'output.state := +} + +FUNCTION {n.dashify} +{ 't := + "" + { t empty$ not } + { t #1 #1 substring$ "-" = + { t #1 #2 substring$ "--" = not + { "--" * + t #2 global.max$ substring$ 't := + } + { { t #1 #1 substring$ "-" = } + { "-" * + t #2 global.max$ substring$ 't := + } + while$ + } + if$ + } + { t #1 #1 substring$ * + t #2 global.max$ substring$ 't := + } + if$ + } + while$ +} + +FUNCTION {word.in} +{ "In " } + +FUNCTION {format.date} +{ year duplicate$ empty$ + { "empty year in " cite$ * "; set to ????" * warning$ + pop$ "????" } + 'skip$ + if$ + before.all 'output.state := + " (" swap$ * extra.label * ")" * +} + +FUNCTION {format.btitle} +{ title emphasize +} + +FUNCTION {tie.or.space.connect} +{ duplicate$ text.length$ #3 < + { "~" } + { " " } + if$ + swap$ * * +} + +FUNCTION {either.or.check} +{ empty$ + 'pop$ + { "can't use both " swap$ * " fields in " * cite$ * warning$ } + if$ +} + +FUNCTION {format.bvolume} +{ volume empty$ + { "" } + { "volume" volume tie.or.space.connect + series empty$ + 'skip$ + { " of " * series emphasize * } + if$ + "volume and number" number either.or.check + } + if$ +} + +FUNCTION {format.number.series} +{ volume empty$ + { number empty$ + { series field.or.null } + { output.state mid.sentence = + { "number" } + { "Number" } + if$ + number tie.or.space.connect + series empty$ + { "there's a number but no series in " cite$ * warning$ } + { " in " * series * } + if$ + } + if$ + } + { "" } + if$ +} + +FUNCTION {format.edition} +{ edition empty$ + { "" } + { output.state mid.sentence = + { edition "l" change.case$ " edition" * } + { edition "t" change.case$ " edition" * } + if$ + } + if$ +} + +INTEGERS { multiresult } + +FUNCTION {multi.page.check} +{ 't := + #0 'multiresult := + { multiresult not + t empty$ not + and + } + { t #1 #1 substring$ + duplicate$ "-" = + swap$ duplicate$ "," = + swap$ "+" = + or or + { #1 'multiresult := } + { t #2 global.max$ substring$ 't := } + if$ + } + while$ + multiresult +} + +FUNCTION {format.pages} +{ pages empty$ + { "" } + { pages multi.page.check + { "pages" pages n.dashify tie.or.space.connect } + { "page" pages tie.or.space.connect } + if$ + } + if$ +} + +FUNCTION {format.vol.num.pages} +{ volume field.or.null + bolden + number empty$ + 'skip$ + { "(" number * ")" * * + volume empty$ + { "there's a number but no volume in " cite$ * warning$ } + 'skip$ + if$ + } + if$ + pages empty$ + 'skip$ + { duplicate$ empty$ + { pop$ format.pages } + { ", " * pages n.dashify * } + if$ + } + if$ +} + +FUNCTION {format.chapter.pages} +{ chapter empty$ + 'format.pages + { type empty$ + { "chapter" } + { type "l" change.case$ } + if$ + chapter tie.or.space.connect + pages empty$ + 'skip$ + { ", " * format.pages * } + if$ + } + if$ +} + +FUNCTION {format.in.ed.booktitle} +{ booktitle empty$ + { "" } + { editor empty$ + { word.in booktitle emphasize * } + { word.in format.in.editors * ", " * booktitle emphasize * } + if$ + } + if$ +} + +FUNCTION {format.thesis.type} +{ type empty$ + 'skip$ + { pop$ + type "t" change.case$ + } + if$ +} + +FUNCTION {format.tr.number} +{ type empty$ + { "Technical Report" } + 'type + if$ + number empty$ + { "t" change.case$ } + { number tie.or.space.connect } + if$ +} + +FUNCTION {format.article.crossref} +{ + word.in + "\cite{" * crossref * "}" * +} + +FUNCTION {format.book.crossref} +{ volume empty$ + { "empty volume in " cite$ * "'s crossref of " * crossref * warning$ + word.in + } + { "Volume" volume tie.or.space.connect + " of " * + } + if$ + "\cite{" * crossref * "}" * +} + +FUNCTION {format.incoll.inproc.crossref} +{ + word.in + "\cite{" * crossref * "}" * +} + +FUNCTION {article} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + new.block + format.title "title" output.check + new.block + crossref missing$ + { journal emphasize "journal" output.check + format.vol.num.pages output + } + { format.article.crossref output.nonnull + format.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {book} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check + editor format.key output + } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + format.date "year" output.check + new.block + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + new.block + format.number.series output + new.sentence + publisher "publisher" output.check + address output + } + { + new.block + format.book.crossref output.nonnull + } + if$ + format.edition output + new.block + note output + fin.entry +} + +FUNCTION {booklet} +{ output.bibitem + format.authors output + author format.key output + format.date "year" output.check + new.block + format.title "title" output.check + new.block + howpublished output + address output + new.block + note output + fin.entry +} + +FUNCTION {inbook} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check + editor format.key output + } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + format.date "year" output.check + new.block + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + format.chapter.pages "chapter and pages" output.check + new.block + format.number.series output + new.sentence + publisher "publisher" output.check + address output + } + { format.chapter.pages "chapter and pages" output.check + new.block + format.book.crossref output.nonnull + } + if$ + format.edition output + new.block + note output + fin.entry +} + +FUNCTION {incollection} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + new.block + format.title "title" output.check + new.block + crossref missing$ + { format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.chapter.pages output + new.sentence + publisher "publisher" output.check + address output + format.edition output + } + { format.incoll.inproc.crossref output.nonnull + format.chapter.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {inproceedings} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + new.block + format.title "title" output.check + new.block + crossref missing$ + { format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.pages output + address output + new.sentence + organization output + publisher output + } + { format.incoll.inproc.crossref output.nonnull + format.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {conference} { inproceedings } + +FUNCTION {manual} +{ output.bibitem + format.authors output + author format.key output + format.date "year" output.check + new.block + format.btitle "title" output.check + organization address new.block.checkb + organization output + address output + format.edition output + new.block + note output + fin.entry +} + +FUNCTION {mastersthesis} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + new.block + format.btitle "title" output.check + new.block + "Master's thesis" format.thesis.type output.nonnull + school "school" output.check + address output + new.block + note output + fin.entry +} + +FUNCTION {misc} +{ output.bibitem + format.authors output + author format.key output + format.date "year" output.check + new.block + format.title output + new.block + howpublished output + new.block + note output + fin.entry +} + +FUNCTION {phdthesis} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + new.block + format.btitle "title" output.check + new.block + "Ph.D. thesis" format.thesis.type output.nonnull + school "school" output.check + address output + new.block + note output + fin.entry +} + +FUNCTION {proceedings} +{ output.bibitem + format.editors output + editor format.key output + format.date "year" output.check + new.block + format.btitle "title" output.check + format.bvolume output + format.number.series output + address output + new.sentence + organization output + publisher output + new.block + note output + fin.entry +} + +FUNCTION {techreport} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + new.block + format.title "title" output.check + new.block + format.tr.number output.nonnull + institution "institution" output.check + address output + new.block + note output + fin.entry +} + +FUNCTION {unpublished} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + new.block + format.title "title" output.check + new.block + note "note" output.check + fin.entry +} + +FUNCTION {default.type} { misc } + +MACRO {jan} {"January"} + +MACRO {feb} {"February"} + +MACRO {mar} {"March"} + +MACRO {apr} {"April"} + +MACRO {may} {"May"} + +MACRO {jun} {"June"} + +MACRO {jul} {"July"} + +MACRO {aug} {"August"} + +MACRO {sep} {"September"} + +MACRO {oct} {"October"} + +MACRO {nov} {"November"} + +MACRO {dec} {"December"} + +MACRO {acmcs} {"ACM Computing Surveys"} + +MACRO {acta} {"Acta Informatica"} + +MACRO {cacm} {"Communications of the ACM"} + +MACRO {ibmjrd} {"IBM Journal of Research and Development"} + +MACRO {ibmsj} {"IBM Systems Journal"} + +MACRO {ieeese} {"IEEE Transactions on Software Engineering"} + +MACRO {ieeetc} {"IEEE Transactions on Computers"} + +MACRO {ieeetcad} + {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"} + +MACRO {ipl} {"Information Processing Letters"} + +MACRO {jacm} {"Journal of the ACM"} + +MACRO {jcss} {"Journal of Computer and System Sciences"} + +MACRO {scp} {"Science of Computer Programming"} + +MACRO {sicomp} {"SIAM Journal on Computing"} + +MACRO {tocs} {"ACM Transactions on Computer Systems"} + +MACRO {tods} {"ACM Transactions on Database Systems"} + +MACRO {tog} {"ACM Transactions on Graphics"} + +MACRO {toms} {"ACM Transactions on Mathematical Software"} + +MACRO {toois} {"ACM Transactions on Office Information Systems"} + +MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"} + +MACRO {tcs} {"Theoretical Computer Science"} + +READ + +FUNCTION {sortify} +{ purify$ + "l" change.case$ +} + +INTEGERS { len } + +FUNCTION {chop.word} +{ 's := + 'len := + s #1 len substring$ = + { s len #1 + global.max$ substring$ } + 's + if$ +} + +FUNCTION {format.lab.names} +{ 's := + s #1 "{vv~}{ll}" format.name$ + s num.names$ duplicate$ + #2 > + { pop$ " " * "et~al." emphasize * } + { #2 < + 'skip$ + { s #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" = + { " " * "et~al." emphasize * } + { " and " * s #2 "{vv~}{ll}" format.name$ * } + if$ + } + if$ + } + if$ +} + +FUNCTION {author.key.label} +{ author empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { author format.lab.names } + if$ +} + +FUNCTION {author.editor.key.label} +{ author empty$ + { editor empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { editor format.lab.names } + if$ + } + { author format.lab.names } + if$ +} + +FUNCTION {editor.key.label} +{ editor empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { editor format.lab.names } + if$ +} + +FUNCTION {calc.label} +{ type$ "book" = + type$ "inbook" = + or + 'author.editor.key.label + { type$ "proceedings" = + 'editor.key.label + 'author.key.label + if$ + } + if$ + "(" + * + year duplicate$ empty$ + { pop$ "????" } + { purify$ #-1 #4 substring$ } + if$ + * + 'label := +} + +FUNCTION {sort.format.names} +{ 's := + #1 'nameptr := + "" + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { nameptr #1 > + { " " * } + 'skip$ + if$ + s nameptr + "{vv{ } }{ll{ }}{ f{ }}{ jj{ }}" + format.name$ 't := + nameptr numnames = t "others" = and + { "et al" * } + { numnames #2 > nameptr #2 = and + { "zzzzzz" * #1 'namesleft := } + { t sortify * } + if$ + } + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {sort.format.title} +{ 't := + "A " #2 + "An " #3 + "The " #4 t chop.word + chop.word + chop.word + sortify + #1 global.max$ substring$ +} + +FUNCTION {author.sort} +{ author empty$ + { key empty$ + { "to sort, need author or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { author sort.format.names } + if$ +} + +FUNCTION {author.editor.sort} +{ author empty$ + { editor empty$ + { key empty$ + { "to sort, need author, editor, or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { editor sort.format.names } + if$ + } + { author sort.format.names } + if$ +} + +FUNCTION {editor.sort} +{ editor empty$ + { key empty$ + { "to sort, need editor or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { editor sort.format.names } + if$ +} + +FUNCTION {presort} +{ calc.label + label sortify + " " + * + type$ "book" = + type$ "inbook" = + or + 'author.editor.sort + { type$ "proceedings" = + 'editor.sort + 'author.sort + if$ + } + if$ + #1 entry.max$ substring$ + 'sort.label := + sort.label + * + " " + * + title field.or.null + sort.format.title + * + #1 entry.max$ substring$ + 'sort.key$ := +} + +ITERATE {presort} + +SORT + +STRINGS { last.label next.extra } + +INTEGERS { last.extra.num } + +FUNCTION {initialize.extra.label.stuff} +{ #0 int.to.chr$ 'last.label := + "" 'next.extra := + #0 'last.extra.num := +} + +FUNCTION {forward.pass} +{ last.label label = + { last.extra.num #1 + 'last.extra.num := + last.extra.num int.to.chr$ 'extra.label := + } + { "a" chr.to.int$ 'last.extra.num := + "" 'extra.label := + label 'last.label := + } + if$ +} + +FUNCTION {reverse.pass} +{ next.extra "b" = + { "a" 'extra.label := } + 'skip$ + if$ + extra.label 'next.extra := + label extra.label * 'label := +} + +EXECUTE {initialize.extra.label.stuff} + +ITERATE {forward.pass} + +REVERSE {reverse.pass} + +FUNCTION {bib.sort.order} +{ sort.label + " " + * + year field.or.null sortify + * + " " + * + title field.or.null + sort.format.title + * + #1 entry.max$ substring$ + 'sort.key$ := +} + +ITERATE {bib.sort.order} + +SORT + +FUNCTION {begin.bib} +{ preamble$ empty$ + 'skip$ + { preamble$ write$ newline$ } + if$ + "\begin{thebibliography}{}" write$ newline$ +} + +EXECUTE {begin.bib} + +EXECUTE {init.state.consts} + +ITERATE {call.type$} + +FUNCTION {end.bib} +{ newline$ + "\end{thebibliography}" write$ newline$ +} + +EXECUTE {end.bib} +%% End of customized bst file + diff --git a/tex/natbib.sty b/tex/natbib.sty new file mode 100644 index 0000000..4c8c948 --- /dev/null +++ b/tex/natbib.sty @@ -0,0 +1,803 @@ +%% +%% This is file `natbib.sty', +%% generated with the docstrip utility. +%% +%% The original source files were: +%% +%% natbib.dtx (with options: `package,all') +%% ============================================= +%% IMPORTANT NOTICE: +%% +%% This program can be redistributed and/or modified under the terms +%% of the LaTeX Project Public License Distributed from CTAN +%% archives in directory macros/latex/base/lppl.txt; either +%% version 1 of the License, or any later version. +%% +%% This is a generated file. +%% It may not be distributed without the original source file natbib.dtx. +%% +%% Full documentation can be obtained by LaTeXing that original file. +%% Only a few abbreviated comments remain here to describe the usage. +%% ============================================= +%% Copyright 1993-2000 Patrick W Daly +%% Max-Planck-Institut f\"ur Aeronomie +%% Max-Planck-Str. 2 +%% D-37191 Katlenburg-Lindau +%% Germany +%% E-mail: daly@linmpi.mpg.de +\NeedsTeXFormat{LaTeX2e}[1995/06/01] +\ProvidesPackage{natbib} + [2000/07/24 7.0a (PWD)] + % This package reimplements the LaTeX \cite command to be used for various + % citation styles, both author-year and numerical. It accepts BibTeX + % output intended for many other packages, and therefore acts as a + % general, all-purpose citation-style interface. + % + % With standard numerical .bst files, only numerical citations are + % possible. With an author-year .bst file, both numerical and + % author-year citations are possible. + % + % If author-year citations are selected, \bibitem must have one of the + % following forms: + % \bibitem[Jones et al.(1990)]{key}... + % \bibitem[Jones et al.(1990)Jones, Baker, and Williams]{key}... + % \bibitem[Jones et al., 1990]{key}... + % \bibitem[\protect\citeauthoryear{Jones, Baker, and Williams}{Jones + % et al.}{1990}]{key}... + % \bibitem[\protect\citeauthoryear{Jones et al.}{1990}]{key}... + % \bibitem[\protect\astroncite{Jones et al.}{1990}]{key}... + % \bibitem[\protect\citename{Jones et al., }1990]{key}... + % \harvarditem[Jones et al.]{Jones, Baker, and Williams}{1990}{key}... + % + % This is either to be made up manually, or to be generated by an + % appropriate .bst file with BibTeX. + % Author-year mode || Numerical mode + % Then, \citet{key} ==>> Jones et al. (1990) || Jones et al. [21] + % \citep{key} ==>> (Jones et al., 1990) || [21] + % Multiple citations as normal: + % \citep{key1,key2} ==>> (Jones et al., 1990; Smith, 1989) || [21,24] + % or (Jones et al., 1990, 1991) || [21,24] + % or (Jones et al., 1990a,b) || [21,24] + % \cite{key} is the equivalent of \citet{key} in author-year mode + % and of \citep{key} in numerical mode + % Full author lists may be forced with \citet* or \citep*, e.g. + % \citep*{key} ==>> (Jones, Baker, and Williams, 1990) + % Optional notes as: + % \citep[chap. 2]{key} ==>> (Jones et al., 1990, chap. 2) + % \citep[e.g.,][]{key} ==>> (e.g., Jones et al., 1990) + % \citep[see][pg. 34]{key}==>> (see Jones et al., 1990, pg. 34) + % (Note: in standard LaTeX, only one note is allowed, after the ref. + % Here, one note is like the standard, two make pre- and post-notes.) + % \citealt{key} ==>> Jones et al. 1990 + % \citealt*{key} ==>> Jones, Baker, and Williams 1990 + % \citealp{key} ==>> Jones et al., 1990 + % \citealp*{key} ==>> Jones, Baker, and Williams, 1990 + % Additional citation possibilities (both author-year and numerical modes) + % \citeauthor{key} ==>> Jones et al. + % \citeauthor*{key} ==>> Jones, Baker, and Williams + % \citeyear{key} ==>> 1990 + % \citeyearpar{key} ==>> (1990) + % \citetext{priv. comm.} ==>> (priv. comm.) + % Note: full author lists depends on whether the bib style supports them; + % if not, the abbreviated list is printed even when full requested. + % + % For names like della Robbia at the start of a sentence, use + % \Citet{dRob98} ==>> Della Robbia (1998) + % \Citep{dRob98} ==>> (Della Robbia, 1998) + % \Citeauthor{dRob98} ==>> Della Robbia + % + % + % Citation aliasing is achieved with + % \defcitealias{key}{text} + % \citetalias{key} ==>> text + % \citepalias{key} ==>> (text) + % + % Defining the citation style of a given bib style: + % Use \bibpunct (in the preamble only) with 6 mandatory arguments: + % 1. opening bracket for citation + % 2. closing bracket + % 3. citation separator (for multiple citations in one \cite) + % 4. the letter n for numerical styles, s for superscripts + % else anything for author-year + % 5. punctuation between authors and date + % 6. punctuation between years (or numbers) when common authors missing + % One optional argument is the character coming before post-notes. It + % appears in square braces before all other arguments. May be left off. + % Example (and default) \bibpunct[, ]{(}{)}{;}{a}{,}{,} + % + % To make this automatic for a given bib style, named newbib, say, make + % a local configuration file, natbib.cfg, with the definition + % \newcommand{\bibstyle@newbib}{\bibpunct...} + % Then the \bibliographystyle{newbib} will cause \bibstyle@newbib to + % be called on THE NEXT LATEX RUN (via the aux file). + % + % Such preprogrammed definitions may be invoked in the text (preamble only) + % by calling \citestyle{newbib}. This is only useful if the style specified + % differs from that in \bibliographystyle. + % + % With \citeindextrue and \citeindexfalse, one can control whether the + % \cite commands make an automatic entry of the citation in the .idx + % indexing file. For this, \makeindex must also be given in the preamble. + % + % LaTeX2e Options: (for selecting punctuation) + % round - round parentheses are used (default) + % square - square brackets are used [option] + % curly - curly braces are used {option} + % angle - angle brackets are used