diff --git a/misc/paftools.js b/misc/paftools.js index 6a186b4..75170a2 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -1,6 +1,6 @@ #!/usr/bin/env k8 -var paftools_version = '2.24-r1141-dirty'; +var paftools_version = '2.24-r1151-dirty'; /***************************** ***** Library functions ***** @@ -2554,6 +2554,217 @@ function paf_junceval(args) } } +function paf_exoneval(args) // adapted from paf_junceval() +{ + var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false, chr_only = false, aa = false, is_bed = false, use_cds = false; + while ((c = getopt(args, "l:epcab1d")) != null) { + if (c == 'l') l_fuzzy = parseInt(getopt.arg); + else if (c == 'e') print_err_only = print_ovlp = true; + else if (c == 'p') print_ovlp = true; + else if (c == 'c') chr_only = true; + else if (c == 'a') aa = true; + else if (c == 'b') is_bed = true; + else if (c == '1') first_only = true; + else if (c == 'd') use_cds = true; + } + + if (args.length - getopt.ind < 1) { + print("Usage: paftools.js exoneval [options] "); + print("Options:"); + print(" -l INT tolerance of junction positions (0 for exact) [0]"); + print(" -p print overlapping introns"); + print(" -e print erroreous overlapping introns"); + print(" -c only consider alignments to /^(chr)?([0-9]+|X|Y)$/"); + print(" -a miniprot PAF as input"); + print(" -b BED as input"); + print(" -1 only process the first alignment of each query"); + exit(1); + } + + var file, buf = new Bytes(); + + var tr = {}; + file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]); + while (file.readline(buf) >= 0) { + var m, t = buf.toString().split("\t"); + if (t[0].charAt(0) == '#') continue; + if (use_cds) { + if (t[2] != "cds" && t[2] != "CDS") continue; + } else { + if (t[2] != 'exon') continue; + } + var st = parseInt(t[3]) - 1; + var en = parseInt(t[4]); + if ((m = /transcript_id "(\S+)"/.exec(t[8])) == null) continue; + var tid = m[1]; + if (tr[tid] == null) tr[tid] = [t[0], t[6], 0, 0, []]; + tr[tid][4].push([st, en]); // this keeps transcript + } + file.close(); + + var anno = {}; + for (var tid in tr) { // traverse each transcript + var t = tr[tid]; + Interval.sort(t[4]); + t[2] = t[4][0][0]; + t[3] = t[4][t[4].length - 1][1]; + if (anno[t[0]] == null) anno[t[0]] = []; + var s = t[4]; + for (var i = 0; i < s.length; ++i) // traverse each exon + anno[t[0]].push([s[i][0], s[i][1]]); + } + tr = null; + + for (var chr in anno) { // index exons + var e = anno[chr]; + if (e.length == 0) continue; + Interval.sort(e); + var k = 0; + for (var i = 1; i < e.length; ++i) // dedup + if (e[i][0] != e[k][0] || e[i][1] != e[k][1]) + e[++k] = e[i].slice(0); + e.length = k + 1; + Interval.index_end(e); + } + + var n_pri = 0, n_unmapped = 0, n_mapped = 0; + var n_exon = 0, n_exon_hit = 0, n_exon_novel = 0; + + file = getopt.ind+1 >= args.length || args[getopt.ind+1] == '-'? new File() : new File(args[getopt.ind+1]); + var last_qname = null; + var re_cigar = /(\d+)([MIDNSHP=XFGUV])/g; + while (file.readline(buf) >= 0) { + var m, t = buf.toString().split("\t"); + var ctg_name = null, cigar = null, pos = null, qname; + + if (t[0].charAt(0) == '@') continue; + if (t[0] == "##PAF") t.shift(); + qname = t[0]; + if (is_bed) { + ctg_name = t[0], pos = parseInt(t[1]), cigar == null; + } else if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF + ctg_name = t[5], pos = parseInt(t[7]); + var type = 'P'; + for (i = 12; i < t.length; ++i) { + if ((m = /^(tp:A|cg:Z):(\S+)/.exec(t[i])) != null) { + if (m[1] == 'tp:A') type = m[2]; + else cigar = m[2]; + } + } + if (type == 'S') continue; // secondary + } else { // SAM + ctg_name = t[2], pos = parseInt(t[3]) - 1, cigar = t[5]; + var flag = parseInt(t[1]); + if (flag&0x100) continue; // secondary + } + + if (chr_only && !/^(chr)?([0-9]+|X|Y)$/.test(ctg_name)) continue; + if (first_only && last_qname == qname) continue; + if (ctg_name == '*') { // unmapped + ++n_unmapped; + continue; + } else { + ++n_pri; + if (last_qname != qname) { + ++n_mapped; + last_qname = qname; + } + } + + var exon = []; + if (is_bed) { // BED + exon.push([pos, parseInt(t[2])]); + } else if (aa) { + var tmp_exon = [], tmp = 0, tmp_st = 0; + while ((m = re_cigar.exec(cigar)) != null) { + var len = parseInt(m[1]), op = m[2]; + if (op == 'N') { + tmp_exon.push([tmp_st, tmp]); + tmp_st = tmp + len, tmp += len; + } else if (op == 'U') { + tmp_exon.push([tmp_st, tmp + 1]); + tmp_st = tmp + len - 2, tmp += len; + } else if (op == 'V') { + tmp_exon.push([tmp_st, tmp + 2]); + tmp_st = tmp + len - 1, tmp += len; + } else if (op == 'M' || op == 'X' || op == '=' || op == 'D') { + tmp += len * 3; + } else if (op == 'F' || op == 'G') { + tmp += len; + } + } + tmp_exon.push([tmp_st, tmp]); + if (t[4] == '+') { + for (var i = 0; i < tmp_exon.length; ++i) + exon.push([pos + tmp_exon[i][0], pos + tmp_exon[i][1]]); + } else if (t[4] == '-') { // For protein-to-genome alignment, the coordinates are on the query strand. Need to flip them. + var glen = parseInt(t[8]) - parseInt(t[7]); + for (var i = tmp_exon.length - 1; i >= 0; --i) + exon.push([pos + (glen - tmp_exon[i][1]), pos + (glen - tmp_exon[i][0])]); + } + } else { + var tmp_st = pos; + while ((m = re_cigar.exec(cigar)) != null) { + var len = parseInt(m[1]), op = m[2]; + if (op == 'N') { + exon.push([tmp_st, pos]); + tmp_st = pos + len, pos += len; + } else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len; + } + exon.push([tmp_st, pos]); + } + n_exon += exon.length; + + var chr = anno[ctg_name]; + if (chr != null) { + for (var i = 0; i < exon.length; ++i) { + var o = Interval.find_ovlp(chr, exon[i][0], exon[i][1]); + if (o.length > 0) { + var hit = false; + for (var j = 0; j < o.length; ++j) { + var st_diff = exon[i][0] - o[j][0]; + var en_diff = exon[i][1] - o[j][1]; + if (st_diff < 0) st_diff = -st_diff; + if (en_diff < 0) en_diff = -en_diff; + if (st_diff <= l_fuzzy && en_diff <= l_fuzzy) + ++n_exon_hit, hit = true; + if (hit) break; + } + if (print_ovlp) { + var type = hit? 'C' : 'P'; + if (hit && print_err_only) continue; + var x = '['; + for (var j = 0; j < o.length; ++j) { + if (j) x += ', '; + x += '(' + o[j][0] + "," + o[j][1] + ')'; + } + x += ']'; + print(type, qname, i+1, ctg_name, exon[i][0], exon[i][1], x); + } + } else { + ++n_exon_novel; + if (print_ovlp) + print('N', qname, i+1, ctg_name, exon[i][0], exon[i][1]); + } + } + } else { + n_exon_novel += exon.length; + } + } + file.close(); + + buf.destroy(); + + if (!print_ovlp) { + print("# unmapped reads: " + n_unmapped); + print("# mapped reads: " + n_mapped); + print("# primary alignments: " + n_pri); + print("# predicted exons: " + n_exon); + print("# non-overlapping exons: " + n_exon_novel); + print("# correct exons: " + n_exon_hit + " (" + (n_exon_hit / n_exon * 100).toFixed(2) + "%)"); + } +} + // evaluate overlap sensitivity function paf_ov_eval(args) { @@ -3362,6 +3573,7 @@ function main(args) print(" mason2fq convert mason2-simulated SAM to FASTQ"); print(" pbsim2fq convert PBSIM-simulated MAF to FASTQ"); print(" junceval evaluate splice junction consistency with known annotations"); + print(" exoneval evaluate exon-level consistency with known annotations"); print(" ov-eval evaluate read overlap sensitivity using read-to-ref mapping"); exit(1); } @@ -3386,6 +3598,7 @@ function main(args) else if (cmd == 'mason2fq') paf_mason2fq(args); else if (cmd == 'pbsim2fq') paf_pbsim2fq(args); else if (cmd == 'junceval') paf_junceval(args); + else if (cmd == 'exoneval') paf_exoneval(args); else if (cmd == 'ov-eval') paf_ov_eval(args); else if (cmd == 'vcfstat') paf_vcfstat(args); else if (cmd == 'sveval') paf_sveval(args);