diff --git a/misc/README.md b/misc/README.md index 3f7e728..8c0048b 100644 --- a/misc/README.md +++ b/misc/README.md @@ -125,7 +125,7 @@ k8 ov-eval.js reads-to-ref.paf ovlp.paf ## Calling Variants from Haploid Assemblies -Script [paf2diff.js](paf2diff.js) calls variants from coordinate-sorted +Command `paftools.js call` calls variants from coordinate-sorted assembly-to-reference alignment. It calls variants from the [cs tag][cs] and identifies confident/callable regions as those covered by exactly one contig. Here are example command lines: @@ -133,7 +133,7 @@ Here are example command lines: ```sh minimap2 -cx asm5 -t8 --cs ref.fa asm.fa > asm.paf # keeping this file is recommended; --cs required! sort -k6,6 -k8,8n asm.paf > asm.srt.paf # sort by reference start coordinate -k8 paf2diff.js asm.srt.paf > asm.var.txt +k8 paftools.js call asm.srt.paf > asm.var.txt ``` Here is sample output: diff --git a/misc/gff2bed.js b/misc/gff2bed.js deleted file mode 100755 index 7507ec9..0000000 --- a/misc/gff2bed.js +++ /dev/null @@ -1,152 +0,0 @@ -#!/usr/bin/env k8 - -var getopt = function(args, ostr) { - var oli; // option letter list index - if (typeof(getopt.place) == 'undefined') - getopt.ind = 0, getopt.arg = null, getopt.place = -1; - if (getopt.place == -1) { // update scanning pointer - if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') { - getopt.place = -1; - return null; - } - if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--" - ++getopt.ind; - getopt.place = -1; - return null; - } - } - var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity - if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) { - if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null. - if (getopt.place < 0) ++getopt.ind; - return '?'; - } - if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument - getopt.arg = null; - if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1; - } else { // need an argument - if (getopt.place >= 0 && getopt.place < args[getopt.ind].length) - getopt.arg = args[getopt.ind].substr(getopt.place); - else if (args.length <= ++getopt.ind) { // no arg - getopt.place = -1; - if (ostr.length > 0 && ostr.charAt(0) == ':') return ':'; - return '?'; - } else getopt.arg = args[getopt.ind]; // white space - getopt.place = -1; - ++getopt.ind; - } - return optopt; -} - -var c, fn_ucsc_fai = null, is_short = false; -while ((c = getopt(arguments, "u:s")) != null) { - if (c == 'u') fn_ucsc_fai = getopt.arg; - else if (c == 's') is_short = true; -} - -if (getopt.ind == arguments.length) { - print("Usage: k8 gff2bed.js [-u ucsc-genome.fa.fai] "); - exit(1); -} - -var ens2ucsc = {}; -if (fn_ucsc_fai != null) { - var buf = new Bytes(); - var file = new File(fn_ucsc_fai); - while (file.readline(buf) >= 0) { - var t = buf.toString().split("\t"); - var s = t[0]; - if (/_(random|alt|decoy)$/.test(s)) { - s = s.replace(/_(random|alt|decoy)$/, ''); - s = s.replace(/^chr\S+_/, ''); - } else { - s = s.replace(/^chrUn_/, ''); - } - s = s.replace(/v(\d+)/, ".$1"); - if (s != t[0]) ens2ucsc[s] = t[0]; - } - file.close(); - buf.destroy(); -} - -var colors = { - 'protein_coding':'0,128,255', - 'lincRNA':'0,192,0', - 'snRNA':'0,192,0', - 'miRNA':'0,192,0', - 'misc_RNA':'0,192,0' - }; - -function print_bed12(exons, cds_st, cds_en, is_short) -{ - if (exons.length == 0) return; - var name = is_short? exons[0][7] + "|" + exons[0][5] : exons[0].slice(4, 7).join("|"); - var a = exons.sort(function(a,b) {return a[1]-b[1]}); - var sizes = [], starts = [], st, en; - st = a[0][1]; - en = a[a.length - 1][2]; - if (cds_st == 1<<30) cds_st = st; - if (cds_en == 0) cds_en = en; - if (cds_st < st || cds_en > en) - throw Error("inconsistent thick start or end for transcript " + a[0][4]); - for (var i = 0; i < a.length; ++i) { - sizes.push(a[i][2] - a[i][1]); - starts.push(a[i][1] - st); - } - var color = colors[a[0][5]]; - if (color == null) color = '196,196,196'; - print(a[0][0], st, en, name, 1000, a[0][3], cds_st, cds_en, color, a.length, sizes.join(",") + ",", starts.join(",") + ","); -} - -var re_gtf = /(transcript_id|transcript_type|transcript_biotype|gene_name|transcript_name) "([^"]+)";/g; -var re_gff3 = /(transcript_id|transcript_type|transcript_biotype|gene_name|transcript_name)=([^;]+)/g; -var buf = new Bytes(); -var file = arguments[getopt.ind] == '-'? new File() : new File(arguments[getopt.ind]); - -var exons = [], cds_st = 1<<30, cds_en = 0, last_id = null; -while (file.readline(buf) >= 0) { - var t = buf.toString().split("\t"); - if (t[0].charAt(0) == '#') continue; - if (t[2] != "CDS" && t[2] != "exon") continue; - t[3] = parseInt(t[3]) - 1; - t[4] = parseInt(t[4]); - var id = null, type = "", gname = "N/A", biotype = "", m, tname = "N/A"; - while ((m = re_gtf.exec(t[8])) != null) { - if (m[1] == "transcript_id") id = m[2]; - else if (m[1] == "transcript_type") type = m[2]; - else if (m[1] == "transcript_biotype") biotype = m[2]; - else if (m[1] == "gene_name") name = m[2]; - else if (m[1] == "transcript_name") tname = m[2]; - } - while ((m = re_gff3.exec(t[8])) != null) { - if (m[1] == "transcript_id") id = m[2]; - else if (m[1] == "transcript_type") type = m[2]; - else if (m[1] == "transcript_biotype") biotype = m[2]; - else if (m[1] == "gene_name") name = m[2]; - else if (m[1] == "transcript_name") tname = m[2]; - } - if (type == "" && biotype != "") type = biotype; - if (id == null) throw Error("No transcript_id"); - if (id != last_id) { - print_bed12(exons, cds_st, cds_en, is_short); - exons = [], cds_st = 1<<30, cds_en = 0; - last_id = id; - } - if (t[2] == "CDS") { - cds_st = cds_st < t[3]? cds_st : t[3]; - cds_en = cds_en > t[4]? cds_en : t[4]; - } else if (t[2] == "exon") { - if (fn_ucsc_fai != null) { - if (ens2ucsc[t[0]] != null) - t[0] = ens2ucsc[t[0]]; - else if (/^[A-Z]+\d+\.\d+$/.test(t[0])) - t[0] = t[0].replace(/([A-Z]+\d+)\.(\d+)/, "chrUn_$1v$2"); - } - exons.push([t[0], t[3], t[4], t[6], id, type, name, tname]); - } -} -if (last_id != null) - print_bed12(exons, cds_st, cds_en, is_short); - -file.close(); -buf.destroy(); diff --git a/misc/mapstat.js b/misc/mapstat.js deleted file mode 100755 index 4a11851..0000000 --- a/misc/mapstat.js +++ /dev/null @@ -1,185 +0,0 @@ -#!/usr/bin/env k8 - -var getopt = function(args, ostr) { - var oli; // option letter list index - if (typeof(getopt.place) == 'undefined') - getopt.ind = 0, getopt.arg = null, getopt.place = -1; - if (getopt.place == -1) { // update scanning pointer - if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') { - getopt.place = -1; - return null; - } - if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--" - ++getopt.ind; - getopt.place = -1; - return null; - } - } - var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity - if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) { - if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null. - if (getopt.place < 0) ++getopt.ind; - return '?'; - } - if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument - getopt.arg = null; - if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1; - } else { // need an argument - if (getopt.place >= 0 && getopt.place < args[getopt.ind].length) - getopt.arg = args[getopt.ind].substr(getopt.place); - else if (args.length <= ++getopt.ind) { // no arg - getopt.place = -1; - if (ostr.length > 0 && ostr.charAt(0) == ':') return ':'; - return '?'; - } else getopt.arg = args[getopt.ind]; // white space - getopt.place = -1; - ++getopt.ind; - } - return optopt; -} - -var c, gap_out_len = null; -while ((c = getopt(arguments, "l:")) != null) - if (c == 'l') gap_out_len = parseInt(getopt.arg); - -if (getopt.ind == arguments.length) { - print("Usage: k8 mapstat.js [-l gapOutLen] |"); - exit(1); -} - -var buf = new Bytes(); -var file = new File(arguments[getopt.ind]); -var re = /(\d+)([MIDSHNX=])/g; - -var lineno = 0, n_pri = 0, n_2nd = 0, n_seq = 0, n_cigar_64k = 0, l_tot = 0, l_cov = 0; -var n_gap = [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]; - -function cov_len(regs) -{ - regs.sort(function(a,b) {return a[0]-b[0]}); - var st = regs[0][0], en = regs[0][1], l = 0; - for (var i = 1; i < regs.length; ++i) { - if (regs[i][0] < en) - en = en > regs[i][1]? en : regs[i][1]; - else l += en - st, st = regs[i][0], en = regs[i][1]; - } - l += en - st; - return l; -} - -var last = null, last_qlen = null, regs = []; -while (file.readline(buf) >= 0) { - var line = buf.toString(); - ++lineno; - if (line.charAt(0) != '@') { - var t = line.split("\t", 12); - var m, rs, cigar = null, is_pri = false, is_sam = false, is_rev = false, tname = null; - var atlen = null, aqlen, qs, qe, mapq, ori_qlen; - if (t[4] == '+' || t[4] == '-') { // PAF - if (!/\ts2:i:\d+/.test(line)) { - ++n_2nd; - continue; - } - if ((m = /\tcg:Z:(\S+)/.exec(line)) != null) - cigar = m[1]; - if (cigar == null) { - warn("WARNING: no CIGAR at line " + lineno); - continue; - } - tname = t[5]; - qs = parseInt(t[2]), qe = parseInt(t[3]); - aqlen = qe - qs; - is_rev = t[4] == '+'? false : true; - rs = parseInt(t[7]); - atlen = parseInt(t[8]) - rs; - mapq = parseInt(t[11]); - ori_qlen = parseInt(t[1]); - } else { // SAM - var flag = parseInt(t[1]); - if ((flag & 4) || t[2] == '*' || t[5] == '*') continue; - if (flag & 0x100) { - ++n_2nd; - continue; - } - cigar = t[5]; - tname = t[2]; - rs = parseInt(t[3]) - 1; - mapq = parseInt(t[4]); - aqlen = t[9].length; - is_sam = true; - is_rev = !!(flag&0x10); - } - ++n_pri; - if (last != t[0]) { - if (last != null) { - l_tot += last_qlen; - l_cov += cov_len(regs); - } - regs = []; - ++n_seq, last = t[0]; - } - var M = 0, tl = 0, ql = 0, clip = [0, 0], n_cigar = 0, sclip = 0; - while ((m = re.exec(cigar)) != null) { - var l = parseInt(m[1]); - ++n_cigar; - if (m[2] == 'M' || m[2] == '=' || m[2] == 'X') { - tl += l, ql += l, M += l; - } else if (m[2] == 'I' || m[2] == 'D') { - var type; - if (l < 50) type = 0; - else if (l < 100) type = 1; - else if (l < 300) type = 2; - else if (l < 400) type = 3; - else if (l < 1000) type = 4; - else type = 5; - if (m[2] == 'I') ql += l, ++n_gap[0][type]; - else tl += l, ++n_gap[1][type]; - if (gap_out_len != null && l >= gap_out_len) - print(t[0], ql, is_rev? '-' : '+', tname, rs + tl, m[2], l); - } else if (m[2] == 'N') { - tl += l; - } else if (m[2] == 'S') { - clip[M == 0? 0 : 1] = l, sclip += l; - } else if (m[2] == 'H') { - clip[M == 0? 0 : 1] = l; - } - } - if (n_cigar > 65535) ++n_cigar_64k; - if (ql + sclip != aqlen) - warn("WARNING: aligned query length is inconsistent with CIGAR at line " + lineno + " (" + (ql+sclip) + " != " + aqlen + ")"); - if (atlen != null && atlen != tl) - warn("WARNING: aligned reference length is inconsistent with CIGAR at line " + lineno); - if (is_sam) { - qs = clip[is_rev? 1 : 0], qe = qs + ql; - ori_qlen = clip[0] + ql + clip[1]; - } - regs.push([qs, qe]); - last_qlen = ori_qlen; - } -} -l_tot += last_qlen; -l_cov += cov_len(regs); - -file.close(); -buf.destroy(); - -if (gap_out_len == null) { - print("Number of mapped sequences: " + n_seq); - print("Number of primary alignments: " + n_pri); - print("Number of secondary alignments: " + n_2nd); - print("Number of primary alignments with >65535 CIGAR operations: " + n_cigar_64k); - print("Number of bases in mapped sequences: " + l_tot); - print("Number of mapped bases: " + l_cov); - print("Number of insertions in [0,50): " + n_gap[0][0]); - print("Number of insertions in [50,100): " + n_gap[0][1]); - print("Number of insertions in [100,300): " + n_gap[0][2]); - print("Number of insertions in [300,400): " + n_gap[0][3]); - print("Number of insertions in [400,1000): " + n_gap[0][4]); - print("Number of insertions in [1000,inf): " + n_gap[0][5]); - print("Number of deletions in [0,50): " + n_gap[1][0]); - print("Number of deletions in [50,100): " + n_gap[1][1]); - print("Number of deletions in [100,300): " + n_gap[1][2]); - print("Number of deletions in [300,400): " + n_gap[1][3]); - print("Number of deletions in [400,1000): " + n_gap[1][4]); - print("Number of deletions in [1000,inf): " + n_gap[1][5]); -} diff --git a/misc/paf2diff.js b/misc/paf2diff.js deleted file mode 100755 index 7aeef9a..0000000 --- a/misc/paf2diff.js +++ /dev/null @@ -1,190 +0,0 @@ -#!/usr/bin/env k8 - -var getopt = function(args, ostr) { - var oli; // option letter list index - if (typeof(getopt.place) == 'undefined') - getopt.ind = 0, getopt.arg = null, getopt.place = -1; - if (getopt.place == -1) { // update scanning pointer - if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') { - getopt.place = -1; - return null; - } - if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--" - ++getopt.ind; - getopt.place = -1; - return null; - } - } - var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity - if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) { - if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null. - if (getopt.place < 0) ++getopt.ind; - return '?'; - } - if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument - getopt.arg = null; - if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1; - } else { // need an argument - if (getopt.place >= 0 && getopt.place < args[getopt.ind].length) - getopt.arg = args[getopt.ind].substr(getopt.place); - else if (args.length <= ++getopt.ind) { // no arg - getopt.place = -1; - if (ostr.length > 0 && ostr.charAt(0) == ':') return ':'; - return '?'; - } else getopt.arg = args[getopt.ind]; // white space - getopt.place = -1; - ++getopt.ind; - } - return optopt; -} - -var re_cs = /([:=*+-])(\d+|[A-Za-z]+)/g; -var c, min_cov_len = 10000, min_var_len = 50000, gap_thres = 50, min_mapq = 5; -while ((c = getopt(arguments, "l:L:g:q:")) != null) { - if (c == 'l') min_cov_len = parseInt(getopt.arg); - else if (c == 'L') min_var_len = parseInt(getopt.arg); - else if (c == 'g') gap_thres = parseInt(getopt.arg); - else if (c == 'q') min_mapq = parseInt(getopt.arg); -} - -if (arguments.length == getopt.ind) { - print("Usage: k8 paf2diff.js [options] "); - print("Options:"); - print(" -l INT min alignment length to compute coverage ["+min_cov_len+"]"); - print(" -L INT min alignment length to call variants ["+min_var_len+"]"); - print(" -q INT min mapping quality ["+min_mapq+"]"); - print(" -g INT short/long gap threshold (for statistics only) ["+gap_thres+"]"); - exit(1); -} - -var file = arguments[getopt.ind] == '-'? new File() : new File(arguments[getopt.ind]); -var buf = new Bytes(); -var tot_len = 0, n_sub = [0, 0, 0], n_ins = [0, 0, 0, 0], n_del = [0, 0, 0, 0]; - -function count_var(o) -{ - if (o[3] > 1) return; - if (o[5] == '-' && o[6] == '-') return; - if (o[5] == '-') { // insertion - var l = o[6].length; - if (l == 1) ++n_ins[0]; - else if (l == 2) ++n_ins[1]; - else if (l < gap_thres) ++n_ins[2]; - else ++n_ins[3]; - } else if (o[6] == '-') { // deletion - var l = o[5].length; - if (l == 1) ++n_del[0]; - else if (l == 2) ++n_del[1]; - else if (l < gap_thres) ++n_del[2]; - else ++n_del[3]; - } else { - ++n_sub[0]; - var s = o[5] + o[6]; - if (s == 'ag' || s == 'ga' || s == 'ct' || s == 'tc') - ++n_sub[1]; - else ++n_sub[2]; - } -} - -var a = [], out = []; -var c1_ctg = null, c1_start = 0, c1_end = 0, c1_counted = false, c1_len = 0; -while (file.readline(buf) >= 0) { - var line = buf.toString(); - if (!/\ts2:i:/.test(line)) continue; // skip secondary alignments - var m, t = line.split("\t", 12); - for (var i = 6; i <= 11; ++i) - t[i] = parseInt(t[i]); - if (t[10] < min_cov_len || t[11] < min_mapq) continue; - var ctg = t[5], x = t[7], end = t[8]; - // compute regions covered by 1 contig - if (ctg != c1_ctg || x >= c1_end) { - if (c1_counted && c1_end > c1_start) { - c1_len += c1_end - c1_start; - print('R', c1_ctg, c1_start, c1_end); - } - c1_ctg = ctg, c1_start = x, c1_end = end; - c1_counted = (t[10] >= min_var_len); - } else if (end > c1_end) { // overlap - if (c1_counted && x > c1_start) { - c1_len += x - c1_start; - print('R', c1_ctg, c1_start, x); - } - c1_start = c1_end, c1_end = end; - c1_counted = (t[10] >= min_var_len); - } else { // contained - if (c1_counted && x > c1_start) { - c1_len += x - c1_start; - print('R', c1_ctg, c1_start, x); - } - c1_start = end; - } - // output variants ahead of this alignment - while (out.length) { - if (out[0][0] != ctg || out[0][2] <= x) { - count_var(out[0]); - print('V', out[0].join("\t")); - out.shift(); - } else break; - } - // update coverage - for (var i = 0; i < out.length; ++i) - if (out[i][1] >= x && out[i][2] <= end) - ++out[i][3]; - // drop alignments that don't overlap with the current one - var k = 0; - for (var i = 0; i < a.length; ++i) - if (a[0][0] == ctg && a[0][2] > x) - a[k++] = a[i]; - a.length = k; - // core loop - if (t[10] >= min_var_len) { - if ((m = /\tcs:Z:(\S+)/.exec(line)) == null) continue; // no cs tag - var cs = m[1]; - var blen = 0, n_diff = 0; - tot_len += t[10]; - while ((m = re_cs.exec(cs)) != null) { - var cov = 1; - if (m[1] == '*' || m[1] == '+' || m[1] == '-') - for (var i = 0; i < a.length; ++i) - if (a[0][2] > x) ++cov; - if (m[1] == '=' || m[1] == ':') { - var l = m[1] == '='? m[2].length : parseInt(m[2]); - x += l, blen += l; - } else if (m[1] == '*') { - out.push([t[5], x, x+1, cov, t[11], m[2].charAt(0), m[2].charAt(1)]); - ++x, ++blen, ++n_diff; - } else if (m[1] == '+') { - out.push([t[5], x, x, cov, t[11], '-', m[2]]); - ++blen, ++n_diff; - } else if (m[1] == '-') { - out.push([t[5], x, x + m[2].length, cov, t[11], m[2], '-']); - x += m[2].length, ++blen, ++n_diff; - } - } - } - a.push([t[5], t[7], t[8]]); -} -if (c1_counted && c1_end > c1_start) { - c1_len += c1_end - c1_start; - print('R', c1_ctg, c1_start, c1_end); -} -while (out.length) { - count_var(out[0]); - print('V', out[0].join("\t")); - out.shift(); -} - -//warn(tot_len + " alignment columns considered in calling"); -warn(c1_len + " reference bases covered by exactly one contig"); -warn(n_sub[0] + " substitutions; ts/tv = " + (n_sub[1]/n_sub[2]).toFixed(3)); -warn(n_del[0] + " 1bp deletions"); -warn(n_ins[0] + " 1bp insertions"); -warn(n_del[1] + " 2bp deletions"); -warn(n_ins[1] + " 2bp insertions"); -warn(n_del[2] + " [3,"+gap_thres+") deletions"); -warn(n_ins[2] + " [3,"+gap_thres+") insertions"); -warn(n_del[3] + " >="+gap_thres+" deletions"); -warn(n_ins[3] + " >="+gap_thres+" insertions"); - -buf.destroy(); -file.close(); diff --git a/misc/paftools.js b/misc/paftools.js index 6f51a43..c8c9298 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -329,6 +329,155 @@ function paf_call(args) file.close(); } +function paf_stat(args) +{ + var c, gap_out_len = null; + while ((c = getopt(args, "l:")) != null) + if (c == 'l') gap_out_len = parseInt(getopt.arg); + + if (getopt.ind == args.length) { + print("Usage: paftools.js stat [-l gapOutLen] |"); + exit(1); + } + + var buf = new Bytes(); + var file = new File(args[getopt.ind]); + var re = /(\d+)([MIDSHNX=])/g; + + var lineno = 0, n_pri = 0, n_2nd = 0, n_seq = 0, n_cigar_64k = 0, l_tot = 0, l_cov = 0; + var n_gap = [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]; + + function cov_len(regs) + { + regs.sort(function(a,b) {return a[0]-b[0]}); + var st = regs[0][0], en = regs[0][1], l = 0; + for (var i = 1; i < regs.length; ++i) { + if (regs[i][0] < en) + en = en > regs[i][1]? en : regs[i][1]; + else l += en - st, st = regs[i][0], en = regs[i][1]; + } + l += en - st; + return l; + } + + var last = null, last_qlen = null, regs = []; + while (file.readline(buf) >= 0) { + var line = buf.toString(); + ++lineno; + if (line.charAt(0) != '@') { + var t = line.split("\t", 12); + var m, rs, cigar = null, is_pri = false, is_sam = false, is_rev = false, tname = null; + var atlen = null, aqlen, qs, qe, mapq, ori_qlen; + if (t[4] == '+' || t[4] == '-') { // PAF + if (!/\ts2:i:\d+/.test(line)) { + ++n_2nd; + continue; + } + if ((m = /\tcg:Z:(\S+)/.exec(line)) != null) + cigar = m[1]; + if (cigar == null) { + warn("WARNING: no CIGAR at line " + lineno); + continue; + } + tname = t[5]; + qs = parseInt(t[2]), qe = parseInt(t[3]); + aqlen = qe - qs; + is_rev = t[4] == '+'? false : true; + rs = parseInt(t[7]); + atlen = parseInt(t[8]) - rs; + mapq = parseInt(t[11]); + ori_qlen = parseInt(t[1]); + } else { // SAM + var flag = parseInt(t[1]); + if ((flag & 4) || t[2] == '*' || t[5] == '*') continue; + if (flag & 0x100) { + ++n_2nd; + continue; + } + cigar = t[5]; + tname = t[2]; + rs = parseInt(t[3]) - 1; + mapq = parseInt(t[4]); + aqlen = t[9].length; + is_sam = true; + is_rev = !!(flag&0x10); + } + ++n_pri; + if (last != t[0]) { + if (last != null) { + l_tot += last_qlen; + l_cov += cov_len(regs); + } + regs = []; + ++n_seq, last = t[0]; + } + var M = 0, tl = 0, ql = 0, clip = [0, 0], n_cigar = 0, sclip = 0; + while ((m = re.exec(cigar)) != null) { + var l = parseInt(m[1]); + ++n_cigar; + if (m[2] == 'M' || m[2] == '=' || m[2] == 'X') { + tl += l, ql += l, M += l; + } else if (m[2] == 'I' || m[2] == 'D') { + var type; + if (l < 50) type = 0; + else if (l < 100) type = 1; + else if (l < 300) type = 2; + else if (l < 400) type = 3; + else if (l < 1000) type = 4; + else type = 5; + if (m[2] == 'I') ql += l, ++n_gap[0][type]; + else tl += l, ++n_gap[1][type]; + if (gap_out_len != null && l >= gap_out_len) + print(t[0], ql, is_rev? '-' : '+', tname, rs + tl, m[2], l); + } else if (m[2] == 'N') { + tl += l; + } else if (m[2] == 'S') { + clip[M == 0? 0 : 1] = l, sclip += l; + } else if (m[2] == 'H') { + clip[M == 0? 0 : 1] = l; + } + } + if (n_cigar > 65535) ++n_cigar_64k; + if (ql + sclip != aqlen) + warn("WARNING: aligned query length is inconsistent with CIGAR at line " + lineno + " (" + (ql+sclip) + " != " + aqlen + ")"); + if (atlen != null && atlen != tl) + warn("WARNING: aligned reference length is inconsistent with CIGAR at line " + lineno); + if (is_sam) { + qs = clip[is_rev? 1 : 0], qe = qs + ql; + ori_qlen = clip[0] + ql + clip[1]; + } + regs.push([qs, qe]); + last_qlen = ori_qlen; + } + } + l_tot += last_qlen; + l_cov += cov_len(regs); + + file.close(); + buf.destroy(); + + if (gap_out_len == null) { + print("Number of mapped sequences: " + n_seq); + print("Number of primary alignments: " + n_pri); + print("Number of secondary alignments: " + n_2nd); + print("Number of primary alignments with >65535 CIGAR operations: " + n_cigar_64k); + print("Number of bases in mapped sequences: " + l_tot); + print("Number of mapped bases: " + l_cov); + print("Number of insertions in [0,50): " + n_gap[0][0]); + print("Number of insertions in [50,100): " + n_gap[0][1]); + print("Number of insertions in [100,300): " + n_gap[0][2]); + print("Number of insertions in [300,400): " + n_gap[0][3]); + print("Number of insertions in [400,1000): " + n_gap[0][4]); + print("Number of insertions in [1000,inf): " + n_gap[0][5]); + print("Number of deletions in [0,50): " + n_gap[1][0]); + print("Number of deletions in [50,100): " + n_gap[1][1]); + print("Number of deletions in [100,300): " + n_gap[1][2]); + print("Number of deletions in [300,400): " + n_gap[1][3]); + print("Number of deletions in [400,1000): " + n_gap[1][4]); + print("Number of deletions in [1000,inf): " + n_gap[1][5]); + } +} + /************************** *** Conversion related *** **************************/ @@ -495,6 +644,122 @@ function paf_view(args) s_ref.destroy(); s_qry.destroy(); s_mid.destroy(); } +function paf_gff2bed(args) +{ + var c, fn_ucsc_fai = null, is_short = false; + while ((c = getopt(args, "u:s")) != null) { + if (c == 'u') fn_ucsc_fai = getopt.arg; + else if (c == 's') is_short = true; + } + + if (getopt.ind == args.length) { + print("Usage: paftools.js gff2bed [-u ucsc-genome.fa.fai] "); + exit(1); + } + + var ens2ucsc = {}; + if (fn_ucsc_fai != null) { + var buf = new Bytes(); + var file = new File(fn_ucsc_fai); + while (file.readline(buf) >= 0) { + var t = buf.toString().split("\t"); + var s = t[0]; + if (/_(random|alt|decoy)$/.test(s)) { + s = s.replace(/_(random|alt|decoy)$/, ''); + s = s.replace(/^chr\S+_/, ''); + } else { + s = s.replace(/^chrUn_/, ''); + } + s = s.replace(/v(\d+)/, ".$1"); + if (s != t[0]) ens2ucsc[s] = t[0]; + } + file.close(); + buf.destroy(); + } + + var colors = { + 'protein_coding':'0,128,255', + 'lincRNA':'0,192,0', + 'snRNA':'0,192,0', + 'miRNA':'0,192,0', + 'misc_RNA':'0,192,0' + }; + + function print_bed12(exons, cds_st, cds_en, is_short) + { + if (exons.length == 0) return; + var name = is_short? exons[0][7] + "|" + exons[0][5] : exons[0].slice(4, 7).join("|"); + var a = exons.sort(function(a,b) {return a[1]-b[1]}); + var sizes = [], starts = [], st, en; + st = a[0][1]; + en = a[a.length - 1][2]; + if (cds_st == 1<<30) cds_st = st; + if (cds_en == 0) cds_en = en; + if (cds_st < st || cds_en > en) + throw Error("inconsistent thick start or end for transcript " + a[0][4]); + for (var i = 0; i < a.length; ++i) { + sizes.push(a[i][2] - a[i][1]); + starts.push(a[i][1] - st); + } + var color = colors[a[0][5]]; + if (color == null) color = '196,196,196'; + print(a[0][0], st, en, name, 1000, a[0][3], cds_st, cds_en, color, a.length, sizes.join(",") + ",", starts.join(",") + ","); + } + + var re_gtf = /(transcript_id|transcript_type|transcript_biotype|gene_name|transcript_name) "([^"]+)";/g; + var re_gff3 = /(transcript_id|transcript_type|transcript_biotype|gene_name|transcript_name)=([^;]+)/g; + var buf = new Bytes(); + var file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]); + + var exons = [], cds_st = 1<<30, cds_en = 0, last_id = null; + while (file.readline(buf) >= 0) { + var t = buf.toString().split("\t"); + if (t[0].charAt(0) == '#') continue; + if (t[2] != "CDS" && t[2] != "exon") continue; + t[3] = parseInt(t[3]) - 1; + t[4] = parseInt(t[4]); + var id = null, type = "", gname = "N/A", biotype = "", m, tname = "N/A"; + while ((m = re_gtf.exec(t[8])) != null) { + if (m[1] == "transcript_id") id = m[2]; + else if (m[1] == "transcript_type") type = m[2]; + else if (m[1] == "transcript_biotype") biotype = m[2]; + else if (m[1] == "gene_name") name = m[2]; + else if (m[1] == "transcript_name") tname = m[2]; + } + while ((m = re_gff3.exec(t[8])) != null) { + if (m[1] == "transcript_id") id = m[2]; + else if (m[1] == "transcript_type") type = m[2]; + else if (m[1] == "transcript_biotype") biotype = m[2]; + else if (m[1] == "gene_name") name = m[2]; + else if (m[1] == "transcript_name") tname = m[2]; + } + if (type == "" && biotype != "") type = biotype; + if (id == null) throw Error("No transcript_id"); + if (id != last_id) { + print_bed12(exons, cds_st, cds_en, is_short); + exons = [], cds_st = 1<<30, cds_en = 0; + last_id = id; + } + if (t[2] == "CDS") { + cds_st = cds_st < t[3]? cds_st : t[3]; + cds_en = cds_en > t[4]? cds_en : t[4]; + } else if (t[2] == "exon") { + if (fn_ucsc_fai != null) { + if (ens2ucsc[t[0]] != null) + t[0] = ens2ucsc[t[0]]; + else if (/^[A-Z]+\d+\.\d+$/.test(t[0])) + t[0] = t[0].replace(/([A-Z]+\d+)\.(\d+)/, "chrUn_$1v$2"); + } + exons.push([t[0], t[3], t[4], t[6], id, type, name, tname]); + } + } + if (last_id != null) + print_bed12(exons, cds_st, cds_en, is_short); + + file.close(); + buf.destroy(); +} + function paf_sam2paf(args) { var c, pri_only = false; @@ -1081,7 +1346,9 @@ function main(args) print(" view convert PAF to BLAST-like (for eyeballing) or MAF"); print(" sam2paf convert SAM to PAF"); print(" splice2bed convert spliced alignment in PAF/SAM to BED12"); + print(" gff2bed convert GTF/GFF3 to BED12"); print(""); + print(" stat collect basic mapping information in PAF/SAM"); print(" call call variants from asm-to-ref alignment with the cs tag"); print(""); print(" mapeval evaluate mapping accuracy using mason2/PBSIM-simulated FASTQ"); @@ -1095,6 +1362,8 @@ function main(args) if (cmd == 'view') paf_view(args); else if (cmd == 'sam2paf') paf_sam2paf(args); else if (cmd == 'splice2bed') paf_splice2bed(args); + else if (cmd == 'gff2bed') paf_gff2bed(args); + else if (cmd == 'stat') paf_stat(args); else if (cmd == 'call') paf_call(args); else if (cmd == 'mapeval') paf_mapeval(args); else if (cmd == 'mason2fq') paf_mason2fq(args);