diff --git a/misc/intron-eval.js b/misc/intron-eval.js deleted file mode 100644 index 704b29f..0000000 --- a/misc/intron-eval.js +++ /dev/null @@ -1,267 +0,0 @@ -/******************************* - * Command line option parsing * - *******************************/ - -var getopt = function(args, ostr) { - var oli; // option letter list index - if (typeof(getopt.place) == 'undefined') - getopt.ind = 0, getopt.arg = null, getopt.place = -1; - if (getopt.place == -1) { // update scanning pointer - if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') { - getopt.place = -1; - return null; - } - if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--" - ++getopt.ind; - getopt.place = -1; - return null; - } - } - var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity - if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) { - if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null. - if (getopt.place < 0) ++getopt.ind; - return '?'; - } - if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument - getopt.arg = null; - if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1; - } else { // need an argument - if (getopt.place >= 0 && getopt.place < args[getopt.ind].length) - getopt.arg = args[getopt.ind].substr(getopt.place); - else if (args.length <= ++getopt.ind) { // no arg - getopt.place = -1; - if (ostr.length > 0 && ostr.charAt(0) == ':') return ':'; - return '?'; - } else getopt.arg = args[getopt.ind]; // white space - getopt.place = -1; - ++getopt.ind; - } - return optopt; -} - -/*********************** - * Interval operations * - ***********************/ - -Interval = {}; - -Interval.sort = function(a) -{ - if (typeof a[0] == 'number') - a.sort(function(x, y) { return x - y }); - else a.sort(function(x, y) { return x[0] != y[0]? x[0] - y[0] : x[1] - y[1] }); -} - -Interval.merge = function(a, sorted) -{ - if (typeof sorted == 'undefined') sorted = true; - if (!sorted) Interval.sort(a); - var k = 0; - for (var i = 1; i < a.length; ++i) { - if (a[k][1] >= a[i][0]) - a[k][1] = a[k][1] > a[i][1]? a[k][1] : a[i][1]; - else a[++k] = a[i].slice(0); - } - a.length = k + 1; -} - -Interval.index_end = function(a, sorted) -{ - if (a.length == 0) return; - if (typeof sorted == 'undefined') sorted = true; - if (!sorted) Interval.sort(a); - a[0].push(0); - var k = 0, k_en = a[0][1]; - for (var i = 1; i < a.length; ++i) { - if (k_en <= a[i][0]) { - for (++k; k < i; ++k) - if (a[k][1] > a[i][0]) - break; - k_en = a[k][1]; - } - a[i].push(k); - } -} - -Interval.find_intv = function(a, x) -{ - var left = -1, right = a.length; - if (typeof a[0] == 'number') { - while (right - left > 1) { - var mid = left + ((right - left) >> 1); - if (a[mid] > x) right = mid; - else if (a[mid] < x) left = mid; - else return mid; - } - } else { - while (right - left > 1) { - var mid = left + ((right - left) >> 1); - if (a[mid][0] > x) right = mid; - else if (a[mid][0] < x) left = mid; - else return mid; - } - } - return left; -} - -Interval.find_ovlp = function(a, st, en) -{ - if (a.length == 0 || st >= en) return []; - var l = Interval.find_intv(a, st); - var k = l < 0? 0 : a[l][a[l].length - 1]; - var b = []; - for (var i = k; i < a.length; ++i) { - if (a[i][0] >= en) break; - else if (st < a[i][1]) - b.push(a[i]); - } - return b; -} - -/***************** - * Main function * - *****************/ - -var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false; -while ((c = getopt(arguments, "l:ep")) != null) { - if (c == 'l') l_fuzzy = parseInt(getopt.arg); - else if (c == 'e') print_err_only = print_ovlp = true; - else if (c == 'p') print_ovlp = true; -} - -if (arguments.length - getopt.ind < 2) { - print("Usage: k8 intron-eval.js [options] "); - exit(1); -} - -var file, buf = new Bytes(); - -var tr = {}; -file = new File(arguments[getopt.ind]); -while (file.readline(buf) >= 0) { - var m, t = buf.toString().split("\t"); - if (t[0].charAt(0) == '#') continue; - if (t[2] != 'exon') continue; - var st = parseInt(t[3]) - 1; - var en = parseInt(t[4]); - if ((m = /transcript_id "(\S+)"/.exec(t[8])) == null) continue; - var tid = m[1]; - if (tr[tid] == null) tr[tid] = [t[0], t[6], 0, 0, []]; - tr[tid][4].push([st, en]); -} -file.close(); - -var anno = {}; -for (var tid in tr) { - var t = tr[tid]; - Interval.sort(t[4]); - t[2] = t[4][0][0]; - t[3] = t[4][t[4].length - 1][1]; - if (anno[t[0]] == null) anno[t[0]] = []; - var s = t[4]; - for (var i = 0; i < s.length - 1; ++i) { - if (s[i][1] >= s[i+1][0]) - warn("WARNING: incorrect annotation for transcript "+tid+" ("+s[i][1]+" >= "+s[i+1][0]+")") - anno[t[0]].push([s[i][1], s[i+1][0]]); - } -} -tr = null; - -for (var chr in anno) { - var e = anno[chr]; - if (e.length == 0) continue; - Interval.sort(e); - var k = 0; - for (var i = 1; i < e.length; ++i) // dedup - if (e[i][0] != e[k][0] || e[i][1] != e[k][1]) - e[++k] = e[i].slice(0); - e.length = k + 1; - Interval.index_end(e); -} - -var n_pri = 0, n_unmapped = 0, n_mapped = 0; -var n_sgl = 0, n_splice = 0, n_splice_hit = 0, n_splice_novel = 0; - -file = new File(arguments[getopt.ind+1]); -var last_qname = null; -var re_cigar = /(\d+)([MIDNSHX=])/g; -while (file.readline(buf) >= 0) { - var m, t = buf.toString().split("\t"); - - if (t[0].charAt(0) == '@') continue; - var flag = parseInt(t[1]); - if (flag&0x100) continue; - if (first_only && last_qname == t[0]) continue; - if (t[2] == '*') { - ++n_unmapped; - continue; - } else { - ++n_pri; - if (last_qname != t[0]) ++n_mapped; - } - - var pos = parseInt(t[3]) - 1, intron = []; - while ((m = re_cigar.exec(t[5])) != null) { - var len = parseInt(m[1]), op = m[2]; - if (op == 'N') { - intron.push([pos, pos + len]); - pos += len; - } else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len; - } - if (intron.length == 0) { - ++n_sgl; - continue; - } - n_splice += intron.length; - - var chr = anno[t[2]]; - if (chr != null) { - for (var i = 0; i < intron.length; ++i) { - var o = Interval.find_ovlp(chr, intron[i][0], intron[i][1]); - if (o.length > 0) { - var hit = false; - for (var j = 0; j < o.length; ++j) { - var st_diff = intron[i][0] - o[j][0]; - var en_diff = intron[i][1] - o[j][1]; - if (st_diff < 0) st_diff = -st_diff; - if (en_diff < 0) en_diff = -en_diff; - if (st_diff <= l_fuzzy && en_diff <= l_fuzzy) - ++n_splice_hit, hit = true; - if (hit) break; - } - if (print_ovlp) { - var type = hit? 'C' : 'P'; - if (hit && print_err_only) continue; - var x = '['; - for (var j = 0; j < o.length; ++j) { - if (j) x += ', '; - x += '(' + o[j][0] + "," + o[j][1] + ')'; - } - x += ']'; - print(type, t[0], i+1, t[2], intron[i][0], intron[i][1], x); - } - } else { - ++n_splice_novel; - if (print_ovlp) - print('N', t[0], i+1, t[2], intron[i][0], intron[i][1]); - } - } - } else { - n_splice_novel += intron.length; - } - last_qname = t[0]; -} -file.close(); - -buf.destroy(); - -if (!print_ovlp) { - print("# unmapped reads: " + n_unmapped); - print("# mapped reads: " + n_mapped); - print("# primary alignments: " + n_pri); - print("# singletons: " + n_sgl); - print("# predicted introns: " + n_splice); - print("# non-overlapping introns: " + n_splice_novel); - print("# correct introns: " + n_splice_hit + " (" + (n_splice_hit / n_splice * 100).toFixed(2) + "%)"); -} diff --git a/misc/paftools.js b/misc/paftools.js index c8c9298..903e06e 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -1262,6 +1262,156 @@ function paf_pbsim2fq(args) buf2.destroy(); } +function paf_junceval(args) // FIXME: the reported number of mapped reads is slightly off +{ + var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false; + while ((c = getopt(args, "l:ep")) != null) { + if (c == 'l') l_fuzzy = parseInt(getopt.arg); + else if (c == 'e') print_err_only = print_ovlp = true; + else if (c == 'p') print_ovlp = true; + } + + if (args.length - getopt.ind < 2) { + print("Usage: paftools.js junceval [options] "); + print("Options:"); + print(" -l INT tolerance of junction positions (0 for exact) [0]"); + print(" -p print overlapping introns"); + print(" -e print erroreous overlapping introns"); + exit(1); + } + + var file, buf = new Bytes(); + + var tr = {}; + file = new File(args[getopt.ind]); + while (file.readline(buf) >= 0) { + var m, t = buf.toString().split("\t"); + if (t[0].charAt(0) == '#') continue; + if (t[2] != 'exon') continue; + var st = parseInt(t[3]) - 1; + var en = parseInt(t[4]); + if ((m = /transcript_id "(\S+)"/.exec(t[8])) == null) continue; + var tid = m[1]; + if (tr[tid] == null) tr[tid] = [t[0], t[6], 0, 0, []]; + tr[tid][4].push([st, en]); + } + file.close(); + + var anno = {}; + for (var tid in tr) { + var t = tr[tid]; + Interval.sort(t[4]); + t[2] = t[4][0][0]; + t[3] = t[4][t[4].length - 1][1]; + if (anno[t[0]] == null) anno[t[0]] = []; + var s = t[4]; + for (var i = 0; i < s.length - 1; ++i) { + if (s[i][1] >= s[i+1][0]) + warn("WARNING: incorrect annotation for transcript "+tid+" ("+s[i][1]+" >= "+s[i+1][0]+")") + anno[t[0]].push([s[i][1], s[i+1][0]]); + } + } + tr = null; + + for (var chr in anno) { + var e = anno[chr]; + if (e.length == 0) continue; + Interval.sort(e); + var k = 0; + for (var i = 1; i < e.length; ++i) // dedup + if (e[i][0] != e[k][0] || e[i][1] != e[k][1]) + e[++k] = e[i].slice(0); + e.length = k + 1; + Interval.index_end(e); + } + + var n_pri = 0, n_unmapped = 0, n_mapped = 0; + var n_sgl = 0, n_splice = 0, n_splice_hit = 0, n_splice_novel = 0; + + file = new File(args[getopt.ind+1]); + var last_qname = null; + var re_cigar = /(\d+)([MIDNSHX=])/g; + while (file.readline(buf) >= 0) { + var m, t = buf.toString().split("\t"); + + if (t[0].charAt(0) == '@') continue; + var flag = parseInt(t[1]); + if (flag&0x100) continue; + if (first_only && last_qname == t[0]) continue; + if (t[2] == '*') { + ++n_unmapped; + continue; + } else { + ++n_pri; + if (last_qname != t[0]) ++n_mapped; + } + + var pos = parseInt(t[3]) - 1, intron = []; + while ((m = re_cigar.exec(t[5])) != null) { + var len = parseInt(m[1]), op = m[2]; + if (op == 'N') { + intron.push([pos, pos + len]); + pos += len; + } else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len; + } + if (intron.length == 0) { + ++n_sgl; + continue; + } + n_splice += intron.length; + + var chr = anno[t[2]]; + if (chr != null) { + for (var i = 0; i < intron.length; ++i) { + var o = Interval.find_ovlp(chr, intron[i][0], intron[i][1]); + if (o.length > 0) { + var hit = false; + for (var j = 0; j < o.length; ++j) { + var st_diff = intron[i][0] - o[j][0]; + var en_diff = intron[i][1] - o[j][1]; + if (st_diff < 0) st_diff = -st_diff; + if (en_diff < 0) en_diff = -en_diff; + if (st_diff <= l_fuzzy && en_diff <= l_fuzzy) + ++n_splice_hit, hit = true; + if (hit) break; + } + if (print_ovlp) { + var type = hit? 'C' : 'P'; + if (hit && print_err_only) continue; + var x = '['; + for (var j = 0; j < o.length; ++j) { + if (j) x += ', '; + x += '(' + o[j][0] + "," + o[j][1] + ')'; + } + x += ']'; + print(type, t[0], i+1, t[2], intron[i][0], intron[i][1], x); + } + } else { + ++n_splice_novel; + if (print_ovlp) + print('N', t[0], i+1, t[2], intron[i][0], intron[i][1]); + } + } + } else { + n_splice_novel += intron.length; + } + last_qname = t[0]; + } + file.close(); + + buf.destroy(); + + if (!print_ovlp) { + print("# unmapped reads: " + n_unmapped); + print("# mapped reads: " + n_mapped); + print("# primary alignments: " + n_pri); + print("# singletons: " + n_sgl); + print("# predicted introns: " + n_splice); + print("# non-overlapping introns: " + n_splice_novel); + print("# correct introns: " + n_splice_hit + " (" + (n_splice_hit / n_splice * 100).toFixed(2) + "%)"); + } +} + // evaluate overlap sensitivity function paf_ov_eval(args) { @@ -1354,6 +1504,7 @@ function main(args) print(" mapeval evaluate mapping accuracy using mason2/PBSIM-simulated FASTQ"); print(" mason2fq convert mason2-simulated SAM to FASTQ"); print(" pbsim2fq convert PBSIM-simulated MAF to FASTQ"); + print(" junceval evaluate splice junction consistency with known annotations"); print(" ov-eval evaluate read overlap sensitivity using read-to-ref mapping"); exit(1); } @@ -1368,6 +1519,7 @@ function main(args) else if (cmd == 'mapeval') paf_mapeval(args); else if (cmd == 'mason2fq') paf_mason2fq(args); else if (cmd == 'pbsim2fq') paf_pbsim2fq(args); + else if (cmd == 'junceval') paf_junceval(args); else if (cmd == 'ov-eval') paf_ov_eval(args); else throw Error("unrecognized command: " + cmd); }