From 4c0713ee14c56e399eb3934c80dedc2322b9cbcf Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 31 Jul 2017 12:06:49 -0400 Subject: [PATCH 1/6] r235: optionally output tag cs in PAF cs encodes the query, the reference sequence and CIGAR. --- format.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- main.c | 8 ++++--- map.c | 5 ++++- minimap.h | 2 ++ mmpriv.h | 2 +- 5 files changed, 72 insertions(+), 7 deletions(-) diff --git a/format.c b/format.c index a8d681e..4ca7de6 100644 --- a/format.c +++ b/format.c @@ -1,7 +1,9 @@ #include #include #include +#include #include +#include "kalloc.h" #include "mmpriv.h" static inline void str_enlarge(kstring_t *s, int l) @@ -54,6 +56,60 @@ static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...) s->s[s->l] = 0; } +static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r) +{ + extern unsigned char seq_nt4_table[256]; + int i, q_off, t_off; + uint8_t *qseq, *tseq; + char *tmp; + if (r->p == 0) return; + mm_sprintf_lite(s, "\tcs:Z:"); + qseq = (uint8_t*)kmalloc(km, r->qe - r->qs); + tseq = (uint8_t*)kmalloc(km, r->re - r->rs); + tmp = (char*)kmalloc(km, r->re - r->rs > r->qe - r->qs? r->re - r->rs + 1 : r->qe - r->qs + 1); + mm_idx_getseq(mi, r->rid, r->rs, r->re, tseq); + if (!r->rev) { + for (i = r->qs; i < r->qe; ++i) + qseq[i - r->qs] = seq_nt4_table[(uint8_t)t->seq[i]]; + } else { + for (i = r->qs; i < r->qe; ++i) + qseq[r->qe - i - 1] = seq_nt4_table[(uint8_t)t->seq[i]]; + } + for (i = q_off = t_off = 0; i < r->p->n_cigar; ++i) { + int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4; + assert(op >= 0 && op <= 2); + if (op == 0) { + int l_tmp = 0; + for (j = 0; j < len; ++j) { + if (qseq[q_off + j] != tseq[t_off + j]) { + if (l_tmp > 0) { + tmp[l_tmp] = 0; + mm_sprintf_lite(s, "=%s", tmp); + } + mm_sprintf_lite(s, "*%c%c", "ACGTN"[tseq[t_off + j]], "ACGTN"[qseq[q_off + j]]); + } else tmp[l_tmp++] = "ACGTN"[qseq[q_off + j]]; + } + if (l_tmp > 0) { + tmp[l_tmp] = 0; + mm_sprintf_lite(s, "=%s", tmp); + } + q_off += len, t_off += len; + } else if (op == 1) { + for (j = 0, tmp[len] = 0; j < len; ++j) + tmp[j] = "ACGTN"[qseq[q_off + j]]; + mm_sprintf_lite(s, "+%s", tmp); + q_off += len; + } else if (op == 2) { + for (j = 0, tmp[len] = 0; j < len; ++j) + tmp[j] = "ACGTN"[tseq[t_off + j]]; + mm_sprintf_lite(s, "-%s", tmp); + t_off += len; + } + } + assert(t_off == r->re - r->rs && q_off == r->qe - r->qs); + kfree(km, qseq); kfree(km, tseq); kfree(km, tmp); +} + static inline void write_tags(kstring_t *s, const mm_reg1_t *r) { int type = r->inv? 'I' : r->id == r->parent? 'P' : 'S'; @@ -63,7 +119,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r) if (r->p) mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi); } -void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r) +void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag) { s->l = 0; mm_sprintf_lite(s, "%s\t%d\t%d\t%d\t%c\t", t->name, t->l_seq, r->qs, r->qe, "+-"[r->rev]); @@ -74,12 +130,14 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m else mm_sprintf_lite(s, "\t%d\t%d", r->fuzzy_mlen, r->fuzzy_blen); mm_sprintf_lite(s, "\t%d", r->mapq); write_tags(s, r); - if (r->p) { + if (r->p && (opt_flag & MM_F_OUT_CG)) { uint32_t k; mm_sprintf_lite(s, "\tcg:Z:"); for (k = 0; k < r->p->n_cigar; ++k) mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MID"[r->p->cigar[k]&0xf]); } + if (r->p && (opt_flag & MM_F_OUT_CS)) + write_cs(km, s, mi, t, r); } static char comp_tab[] = { diff --git a/main.c b/main.c index c8650d0..e536bbc 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0rc1-r232" +#define MM_VERSION "2.0rc1-r235-dirty" void liftrlimit() { @@ -54,7 +54,7 @@ int main(int argc, char *argv[]) mm_realtime0 = realtime(); mm_mapopt_init(&opt); - while ((c = getopt_long(argc, argv, "aw:k:K:t:r:f:Vv:g:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Q", long_options, &long_idx)) >= 0) { + while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Q", long_options, &long_idx)) >= 0) { if (c == 'w') w = atoi(optarg), idx_par_set = 1; else if (c == 'k') k = atoi(optarg), idx_par_set = 1; else if (c == 'H') is_hpc = 1, idx_par_set = 1; @@ -67,7 +67,8 @@ int main(int argc, char *argv[]) else if (c == 'N') opt.best_n = atoi(optarg); else if (c == 'p') opt.pri_ratio = atof(optarg); else if (c == 'M') opt.mask_level = atof(optarg); - else if (c == 'c') opt.flag |= MM_F_CIGAR; + else if (c == 'c') opt.flag |= MM_F_OUT_CG | MM_F_CIGAR; + else if (c == 'S') opt.flag |= MM_F_OUT_CS | MM_F_CIGAR; else if (c == 'X') opt.flag |= MM_F_AVA | MM_F_NO_SELF; else if (c == 'a') opt.flag |= MM_F_OUT_SAM | MM_F_CIGAR; else if (c == 'Q') opt.flag |= MM_F_NO_QUAL; @@ -166,6 +167,7 @@ int main(int argc, char *argv[]) fprintf(stderr, " -Q ignore base quality in the input\n"); fprintf(stderr, " -a output in the SAM format (PAF by default)\n"); fprintf(stderr, " -c output CIGAR in PAF\n"); + fprintf(stderr, " -S output the cs tag in PAF\n"); fprintf(stderr, " -t INT number of threads [%d]\n", n_threads); fprintf(stderr, " -K NUM minibatch size [200M]\n"); // fprintf(stderr, " -v INT verbose level [%d]\n", mm_verbose); diff --git a/map.c b/map.c index bfb5d3d..4ea111e 100644 --- a/map.c +++ b/map.c @@ -338,16 +338,18 @@ static void *worker_pipeline(void *shared, int step, void *in) kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_seq); return in; } else if (step == 2) { // step 2: output + void *km = 0; step_t *s = (step_t*)in; const mm_idx_t *mi = p->mi; for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]); free(s->buf); + if ((p->opt->flag & MM_F_OUT_CS) && !(mm_dbg_flag & MM_DBG_NO_KALLOC)) km = km_init(); for (i = 0; i < s->n_seq; ++i) { mm_bseq1_t *t = &s->seq[i]; for (j = 0; j < s->n_reg[i]; ++j) { mm_reg1_t *r = &s->reg[i][j]; if (p->opt->flag & MM_F_OUT_SAM) mm_write_sam(&p->str, mi, t, r, s->n_reg[i], s->reg[i]); - else mm_write_paf(&p->str, mi, t, r); + else mm_write_paf(&p->str, mi, t, r, km, p->opt->flag); puts(p->str.s); } if (s->n_reg[i] == 0 && (p->opt->flag & MM_F_OUT_SAM)) { @@ -360,6 +362,7 @@ static void *worker_pipeline(void *shared, int step, void *in) if (s->seq[i].qual) free(s->seq[i].qual); } free(s->reg); free(s->n_reg); free(s->seq); + km_destroy(km); if (mm_verbose >= 3) fprintf(stderr, "[M::%s::%.3f*%.2f] mapped %d sequences\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), s->n_seq); free(s); diff --git a/minimap.h b/minimap.h index 27eb3b4..076017f 100644 --- a/minimap.h +++ b/minimap.h @@ -12,6 +12,8 @@ #define MM_F_CIGAR 0x04 #define MM_F_OUT_SAM 0x08 #define MM_F_NO_QUAL 0x10 +#define MM_F_OUT_CG 0x20 +#define MM_F_OUT_CS 0x40 #define MM_IDX_MAGIC "MMI\2" diff --git a/mmpriv.h b/mmpriv.h index dcb78d6..32b3fd1 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -40,7 +40,7 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end); void radix_sort_64(uint64_t *beg, uint64_t *end); uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); -void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r); +void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag); void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs); int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int64_t n, mm128_t *a, uint64_t **_u, void *km); mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a); From 35f232c3fa17837332a474bc6dd5ecbf9e476f83 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 31 Jul 2017 12:17:48 -0400 Subject: [PATCH 2/6] r236: in cs tag, output differences in lowercase for easy eyeballing --- format.c | 6 +++--- main.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/format.c b/format.c index 4ca7de6..ac25106 100644 --- a/format.c +++ b/format.c @@ -86,7 +86,7 @@ static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_ tmp[l_tmp] = 0; mm_sprintf_lite(s, "=%s", tmp); } - mm_sprintf_lite(s, "*%c%c", "ACGTN"[tseq[t_off + j]], "ACGTN"[qseq[q_off + j]]); + mm_sprintf_lite(s, "*%c%c", "acgtn"[tseq[t_off + j]], "acgtn"[qseq[q_off + j]]); } else tmp[l_tmp++] = "ACGTN"[qseq[q_off + j]]; } if (l_tmp > 0) { @@ -96,12 +96,12 @@ static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_ q_off += len, t_off += len; } else if (op == 1) { for (j = 0, tmp[len] = 0; j < len; ++j) - tmp[j] = "ACGTN"[qseq[q_off + j]]; + tmp[j] = "acgtn"[qseq[q_off + j]]; mm_sprintf_lite(s, "+%s", tmp); q_off += len; } else if (op == 2) { for (j = 0, tmp[len] = 0; j < len; ++j) - tmp[j] = "ACGTN"[tseq[t_off + j]]; + tmp[j] = "acgtn"[tseq[t_off + j]]; mm_sprintf_lite(s, "-%s", tmp); t_off += len; } diff --git a/main.c b/main.c index e536bbc..3a62613 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0rc1-r235-dirty" +#define MM_VERSION "2.0rc1-r236-dirty" void liftrlimit() { From cd105b47f2ffea91a9bfe407affcaee97eaaf004 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 31 Jul 2017 14:49:39 -0400 Subject: [PATCH 3/6] r237: fixed a bug in outputting cs:Z --- format.c | 1 + main.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/format.c b/format.c index ac25106..a84c735 100644 --- a/format.c +++ b/format.c @@ -85,6 +85,7 @@ static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_ if (l_tmp > 0) { tmp[l_tmp] = 0; mm_sprintf_lite(s, "=%s", tmp); + l_tmp = 0; } mm_sprintf_lite(s, "*%c%c", "acgtn"[tseq[t_off + j]], "acgtn"[qseq[q_off + j]]); } else tmp[l_tmp++] = "ACGTN"[qseq[q_off + j]]; diff --git a/main.c b/main.c index 3a62613..f40c00a 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0rc1-r236-dirty" +#define MM_VERSION "2.0rc1-r237-dirty" void liftrlimit() { From 12cea727b8ebb21714b62f7c3f5265fe1e3c2d29 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 1 Aug 2017 10:33:21 -0400 Subject: [PATCH 4/6] r238: bugfix to cs - rev sequence not complemented --- format.c | 6 ++++-- main.c | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/format.c b/format.c index a84c735..fd51c00 100644 --- a/format.c +++ b/format.c @@ -72,8 +72,10 @@ static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_ for (i = r->qs; i < r->qe; ++i) qseq[i - r->qs] = seq_nt4_table[(uint8_t)t->seq[i]]; } else { - for (i = r->qs; i < r->qe; ++i) - qseq[r->qe - i - 1] = seq_nt4_table[(uint8_t)t->seq[i]]; + for (i = r->qs; i < r->qe; ++i) { + uint8_t c = seq_nt4_table[(uint8_t)t->seq[i]]; + qseq[r->qe - i - 1] = c >= 4? 4 : 3 - c; + } } for (i = q_off = t_off = 0; i < r->p->n_cigar; ++i) { int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4; diff --git a/main.c b/main.c index f40c00a..7244265 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0rc1-r237-dirty" +#define MM_VERSION "2.0rc1-r238-dirty" void liftrlimit() { From 71f82759af86522ecc8fd7818b978fb473350539 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 4 Aug 2017 13:05:46 -0400 Subject: [PATCH 5/6] convert paf to maf --- misc/paf2aln.js | 125 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 misc/paf2aln.js diff --git a/misc/paf2aln.js b/misc/paf2aln.js new file mode 100644 index 0000000..cc48ea4 --- /dev/null +++ b/misc/paf2aln.js @@ -0,0 +1,125 @@ +var getopt = function(args, ostr) { + var oli; // option letter list index + if (typeof(getopt.place) == 'undefined') + getopt.ind = 0, getopt.arg = null, getopt.place = -1; + if (getopt.place == -1) { // update scanning pointer + if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') { + getopt.place = -1; + return null; + } + if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--" + ++getopt.ind; + getopt.place = -1; + return null; + } + } + var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity + if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) { + if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null. + if (getopt.place < 0) ++getopt.ind; + return '?'; + } + if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument + getopt.arg = null; + if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1; + } else { // need an argument + if (getopt.place >= 0 && getopt.place < args[getopt.ind].length) + getopt.arg = args[getopt.ind].substr(getopt.place); + else if (args.length <= ++getopt.ind) { // no arg + getopt.place = -1; + if (ostr.length > 0 && ostr.charAt(0) == ':') return ':'; + return '?'; + } else getopt.arg = args[getopt.ind]; // white space + getopt.place = -1; + ++getopt.ind; + } + return optopt; +} + +var c, maf_out = false, line_len = 0; +while ((c = getopt(arguments, "ml:")) != null) { + if (c == 'm') maf_out = true; + else if (c == 'l') line_len = parseInt(getopt.arg); // TODO: not implemented yet +} + +if (getopt.ind == arguments.length) { + print("Usage: k8 paf2aln.js [options] "); + exit(1); +} + +function padding_str(x, len, right) +{ + var s = x.toString(); + if (s.length < len) { + if (right) s += Array(len - s.length + 1).join(" "); + else s = Array(len - s.length + 1).join(" ") + s; + } + return s; +} + +var s_ref = new Bytes(), s_qry = new Bytes(), s_mid = new Bytes(); +var re = /([=\-\+\*])([A-Za-z]+)/g; + +var buf = new Bytes(); +var file = new File(arguments[getopt.ind]); +if (maf_out) print("##maf version=1\n"); +while (file.readline(buf) >= 0) { + var m, line = buf.toString(); + var t = line.split("\t", 12); + if ((m = /\tcs:Z:(\S+)/.exec(line)) == null) continue; + var cs = m[1]; + s_ref.length = s_qry.length = s_mid.length = 0; + if (line_len == 0 || maf_out) { + var ql = 0, tl = 0, al = 0; + var n_mm = 0, n_gap = 0; + while ((m = re.exec(cs)) != null) { + var l = m[2].length; + if (m[1] == '=') { + s_ref.set(m[2]); + s_qry.set(m[2]); + s_mid.set(Array(l+1).join("|")); + } else if (m[1] == '*') { + s_ref.set(m[2].charAt(0)); + s_qry.set(m[2].charAt(1)); + s_mid.set(' '); + ++n_mm; + } else if (m[1] == '+') { + s_ref.set(Array(l+1).join("-")); + s_qry.set(m[2]); + s_mid.set(Array(l+1).join(" ")); + n_gap += l; + } else if (m[1] == '-') { + s_ref.set(m[2]); + s_qry.set(Array(l+1).join("-")); + s_mid.set(Array(l+1).join(" ")); + n_gap += l; + } + } + if (maf_out) { + var score = (m = /\tAS:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : 0; + var len = t[0].length > t[5].length? t[0].length : t[5].length; + print("a " + score); + print(["s", padding_str(t[5], len, true), padding_str(t[7], 10, false), padding_str(parseInt(t[8]) - parseInt(t[7]), 10, false), + "+", padding_str(t[6], 10, false), s_ref.toString()].join(" ")); + var qs, qe, ql = parseInt(t[1]); + if (t[4] == '+') { + qs = parseInt(t[2]); + qe = parseInt(t[3]); + } else { + qs = ql - parseInt(t[3]); + qe = ql - parseInt(t[2]); + } + print(["s", padding_str(t[0], len, true), padding_str(qs, 10, false), padding_str(qe - qs, 10, false), + t[4], padding_str(ql, 10, false), s_qry.toString()].join(" ")); + print(""); + } else { + print(line); + print(n_mm, n_gap); + print(s_ref); print(s_mid); print(s_qry); + } + } +} +file.close(); +buf.destroy(); + +s_ref.destroy(); s_qry.destroy(); s_mid.destroy(); From fd66ab5413f67223f993421ce214395b8d5d362f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 4 Aug 2017 14:31:21 -0400 Subject: [PATCH 6/6] blast-like output working --- misc/paf2aln.js | 108 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 77 insertions(+), 31 deletions(-) diff --git a/misc/paf2aln.js b/misc/paf2aln.js index cc48ea4..eea4e6d 100644 --- a/misc/paf2aln.js +++ b/misc/paf2aln.js @@ -36,14 +36,20 @@ var getopt = function(args, ostr) { return optopt; } -var c, maf_out = false, line_len = 0; +var c, maf_out = false, line_len = 80; while ((c = getopt(arguments, "ml:")) != null) { if (c == 'm') maf_out = true; else if (c == 'l') line_len = parseInt(getopt.arg); // TODO: not implemented yet } +if (line_len == 0) line_len = 0x7fffffff; if (getopt.ind == arguments.length) { print("Usage: k8 paf2aln.js [options] "); + print("Options:"); + print(" -m MAF output (BLAST-like output by default)"); + print(" -l INT line length in BLAST-like output [80]"); + print(""); + print("Note: this script only works when minimap2 is run with option '-S'"); exit(1); } @@ -57,6 +63,42 @@ function padding_str(x, len, right) return s; } +function update_aln(s_ref, s_qry, s_mid, type, seq, slen) +{ + var l = type == '*'? 1 : seq.length; + if (type == '=') { + s_ref.set(seq); + s_qry.set(seq); + s_mid.set(Array(l+1).join("|")); + slen[0] += l, slen[1] += l; + } else if (type == '*') { + s_ref.set(seq.charAt(0)); + s_qry.set(seq.charAt(1)); + s_mid.set(' '); + slen[0] += 1, slen[1] += 1; + } else if (type == '+') { + s_ref.set(Array(l+1).join("-")); + s_qry.set(seq); + s_mid.set(Array(l+1).join(" ")); + slen[1] += l; + } else if (type == '-') { + s_ref.set(seq); + s_qry.set(Array(l+1).join("-")); + s_mid.set(Array(l+1).join(" ")); + slen[0] += l; + } +} + +function print_aln(rs, qs, strand, slen, elen, s_ref, s_qry, s_mid) +{ + print(["Ref+:", padding_str(rs + slen[0] + 1, 10, false), s_ref.toString(), padding_str(rs + elen[0], 10, true)].join(" ")); + print(" " + s_mid.toString()); + var st, en; + if (strand == '+') st = qs + slen[1] + 1, en = qs + elen[1]; + else st = qs - slen[1], en = qs - elen[1] + 1; + print(["Qry" + strand + ":", padding_str(st, 10, false), s_qry.toString(), padding_str(en , 10, true)].join(" ")); +} + var s_ref = new Bytes(), s_qry = new Bytes(), s_mid = new Bytes(); var re = /([=\-\+\*])([A-Za-z]+)/g; @@ -69,32 +111,10 @@ while (file.readline(buf) >= 0) { if ((m = /\tcs:Z:(\S+)/.exec(line)) == null) continue; var cs = m[1]; s_ref.length = s_qry.length = s_mid.length = 0; - if (line_len == 0 || maf_out) { - var ql = 0, tl = 0, al = 0; - var n_mm = 0, n_gap = 0; - while ((m = re.exec(cs)) != null) { - var l = m[2].length; - if (m[1] == '=') { - s_ref.set(m[2]); - s_qry.set(m[2]); - s_mid.set(Array(l+1).join("|")); - } else if (m[1] == '*') { - s_ref.set(m[2].charAt(0)); - s_qry.set(m[2].charAt(1)); - s_mid.set(' '); - ++n_mm; - } else if (m[1] == '+') { - s_ref.set(Array(l+1).join("-")); - s_qry.set(m[2]); - s_mid.set(Array(l+1).join(" ")); - n_gap += l; - } else if (m[1] == '-') { - s_ref.set(m[2]); - s_qry.set(Array(l+1).join("-")); - s_mid.set(Array(l+1).join(" ")); - n_gap += l; - } - } + var slen = [0, 0], elen = [0, 0]; + if (maf_out) { + while ((m = re.exec(cs)) != null) + update_aln(s_ref, s_qry, s_mid, m[1], m[2], elen); if (maf_out) { var score = (m = /\tAS:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : 0; var len = t[0].length > t[5].length? t[0].length : t[5].length; @@ -112,11 +132,37 @@ while (file.readline(buf) >= 0) { print(["s", padding_str(t[0], len, true), padding_str(qs, 10, false), padding_str(qe - qs, 10, false), t[4], padding_str(ql, 10, false), s_qry.toString()].join(" ")); print(""); - } else { - print(line); - print(n_mm, n_gap); - print(s_ref); print(s_mid); print(s_qry); } + } else { + line = line.replace(/\tc[sg]:Z:\S+/g, ""); + print('>' + line); + var rs = parseInt(t[7]), qs = t[4] == '+'? parseInt(t[2]) : parseInt(t[3]); + var n_blocks = 0; + while ((m = re.exec(cs)) != null) { + var start = 0, rest = m[1] == '*'? 1 : m[2].length; + while (rest > 0) { + var l_proc; + if (s_ref.length + rest >= line_len) { + l_proc = line_len - s_ref.length; + update_aln(s_ref, s_qry, s_mid, m[1], m[1] == '*'? m[2] : m[2].substr(start, l_proc), elen); + if (n_blocks > 0) print(""); + print_aln(rs, qs, t[4], slen, elen, s_ref, s_qry, s_mid); + ++n_blocks; + s_ref.length = s_qry.length = s_mid.length = 0; + slen[0] = elen[0], slen[1] = elen[1]; + } else { + l_proc = rest; + update_aln(s_ref, s_qry, s_mid, m[1], m[1] == '*'? m[2] : m[2].substr(start, l_proc), elen); + } + rest -= l_proc, start += l_proc; + } + } + if (s_ref.length > 0) { + if (n_blocks > 0) print(""); + print_aln(rs, qs, t[4], slen, elen, s_ref, s_qry, s_mid); + ++n_blocks; + } + print("//"); } } file.close();