diff --git a/misc/cnt-feat.js b/misc/cnt-feat.js deleted file mode 100644 index d0912b7..0000000 --- a/misc/cnt-feat.js +++ /dev/null @@ -1,258 +0,0 @@ -/******************************* - * Command line option parsing * - *******************************/ - -var getopt = function(args, ostr) { - var oli; // option letter list index - if (typeof(getopt.place) == 'undefined') - getopt.ind = 0, getopt.arg = null, getopt.place = -1; - if (getopt.place == -1) { // update scanning pointer - if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') { - getopt.place = -1; - return null; - } - if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--" - ++getopt.ind; - getopt.place = -1; - return null; - } - } - var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity - if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) { - if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null. - if (getopt.place < 0) ++getopt.ind; - return '?'; - } - if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument - getopt.arg = null; - if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1; - } else { // need an argument - if (getopt.place >= 0 && getopt.place < args[getopt.ind].length) - getopt.arg = args[getopt.ind].substr(getopt.place); - else if (args.length <= ++getopt.ind) { // no arg - getopt.place = -1; - if (ostr.length > 0 && ostr.charAt(0) == ':') return ':'; - return '?'; - } else getopt.arg = args[getopt.ind]; // white space - getopt.place = -1; - ++getopt.ind; - } - return optopt; -} - -/*********************** - * Interval operations * - ***********************/ - -Interval = {}; - -Interval.sort = function(a) -{ - if (typeof a[0] == 'number') - a.sort(function(x, y) { return x - y }); - else a.sort(function(x, y) { return x[0] != y[0]? x[0] - y[0] : x[1] - y[1] }); -} - -Interval.merge = function(a, sorted) -{ - if (typeof sorted == 'undefined') sorted = true; - if (!sorted) Interval.sort(a); - var k = 0; - for (var i = 1; i < a.length; ++i) { - if (a[k][1] >= a[i][0]) - a[k][1] = a[k][1] > a[i][1]? a[k][1] : a[i][1]; - else a[++k] = a[i].slice(0); - } - a.length = k + 1; -} - -Interval.dedup = function(a, sorted) -{ - if (typeof sorted == 'undefined') sorted = true; - if (!sorted) Interval.sort(a); - var k = 0; - for (var i = 1; i < a.length; ++i) - if (a[k][0] != a[i][0] || a[k][1] != a[i][1]) - a[++k] = a[i].slice(0); - a.length = k + 1; -} - -Interval.index_end = function(a, sorted) -{ - if (a.length == 0) return; - if (typeof sorted == 'undefined') sorted = true; - if (!sorted) Interval.sort(a); - a[0].push(0); - var k = 0, k_en = a[0][1]; - for (var i = 1; i < a.length; ++i) { - if (k_en <= a[i][0]) { - for (++k; k < i; ++k) - if (a[k][1] > a[i][0]) - break; - k_en = a[k][1]; - } - a[i].push(k); - } -} - -Interval.find_intv = function(a, x) -{ - var left = -1, right = a.length; - if (typeof a[0] == 'number') { - while (right - left > 1) { - var mid = left + ((right - left) >> 1); - if (a[mid] > x) right = mid; - else if (a[mid] < x) left = mid; - else return mid; - } - } else { - while (right - left > 1) { - var mid = left + ((right - left) >> 1); - if (a[mid][0] > x) right = mid; - else if (a[mid][0] < x) left = mid; - else return mid; - } - } - return left; -} - -Interval.find_ovlp = function(a, st, en) -{ - if (a.length == 0 || st >= en) return []; - var l = Interval.find_intv(a, st); - var k = l < 0? 0 : a[l][a[l].length - 1]; - var b = []; - for (var i = k; i < a.length; ++i) { - if (a[i][0] >= en) break; - else if (st < a[i][1]) - b.push(a[i]); - } - return b; -} - -/***************** - * Main function * - *****************/ - -function read_bed(fn, to_merge, to_dedup) -{ - var file = new File(fn); - var buf = new Bytes(); - var h = {}; - while (file.readline(buf) >= 0) { - var t = buf.toString().split("\t"); - if (h[t[0]] == null) - h[t[0]] = []; - var bst = parseInt(t[1]); - var ben = parseInt(t[2]); - if (t.length >= 12 && /^\d+$/.test(t[9])) { - t[9] = parseInt(t[9]); - var sz = t[10].split(","); - var st = t[11].split(","); - for (var i = 0; i < t[9]; ++i) { - st[i] = parseInt(st[i]); - sz[i] = parseInt(sz[i]); - h[t[0]].push([bst + st[i], bst + st[i] + sz[i], 0, 0, 0]); - } - } else { - h[t[0]].push([bst, ben, 0, 0, 0]); - } - } - buf.destroy(); - file.close(); - for (var chr in h) { - if (to_merge) Interval.merge(h[chr], false); - else if (to_dedup) Interval.dedup(h[chr], false); - else Interval.sort(h[chr]); - Interval.index_end(h[chr]); - } - return h; -} - -function main(args) -{ - var c, print_len = false, to_merge = true, to_dedup = false, fn_excl = null; - while ((c = getopt(args, "pde:")) != null) { - if (c == 'p') print_len = true; - else if (c == 'd') to_dedup = true, to_merge = false; - else if (c == 'e') fn_excl = getopt.arg; - } - - if (args.length - getopt.ind < 2) { - print("Usage: k8 cnt-feat.js [options] "); - print("Options:"); - print(" -e FILE exclude features overlapping regions in BED FILE []"); - print(" -p print number of covered bases for each feature"); - exit(1); - } - - var excl = fn_excl != null? read_bed(fn_excl, true, false) : null; - var target = read_bed(args[getopt.ind], to_merge, to_dedup); - - var file, buf = new Bytes(); - var tot_len = 0, hit_len = 0; - file = args[getopt.ind+1] != '-'? new File(args[getopt.ind+1]) : new File(); - while (file.readline(buf) >= 0) { - var t = buf.toString().split("\t"); - var a = []; - var bst = parseInt(t[1]); - var ben = parseInt(t[2]); - if (t.length >= 12 && /^\d+$/.test(t[9])) { // BED12 - t[9] = parseInt(t[9]); - var sz = t[10].split(","); - var st = t[11].split(","); - for (var i = 0; i < t[9]; ++i) { - st[i] = parseInt(st[i]); - sz[i] = parseInt(sz[i]); - a.push([bst + st[i], bst + st[i] + sz[i], false]); - } - } else a.push([bst, ben, false]); // 3-column BED - var feat_len = 0; - for (var i = 0; i < a.length; ++i) { - if (excl != null && excl[t[0]] != null) { - var oe = Interval.find_ovlp(excl[t[0]], a[i][0], a[i][1]); - if (oe.length > 0) - continue; - } - a[i][2] = true; - feat_len += a[i][1] - a[i][0]; - } - tot_len += feat_len; - if (target[t[0]] == null) continue; - var b = []; - for (var i = 0; i < a.length; ++i) { - if (!a[i][2]) continue; - var o = Interval.find_ovlp(target[t[0]], a[i][0], a[i][1]); - for (var j = 0; j < o.length; ++j) { - var max_st = o[j][0] > a[i][0]? o[j][0] : a[i][0]; - var min_en = o[j][1] < a[i][1]? o[j][1] : a[i][1]; - b.push([max_st, min_en]); - o[j][2] += min_en - max_st; - ++o[j][3]; - if (max_st == o[j][0] && min_en == o[j][1]) - ++o[j][4]; - } - } - // find the length covered - var feat_hit_len = 0; - if (b.length > 0) { - b.sort(function(a,b) {return a[0]-b[0]}); - var st = b[0][0], en = b[0][1]; - for (var i = 1; i < b.length; ++i) { - if (b[i][0] <= en) en = en > b[i][1]? en : b[i][1]; - else feat_hit_len += en - st, st = b[i][0], en = b[i][1]; - } - feat_hit_len += en - st; - } - hit_len += feat_hit_len; - if (print_len) print('F', t.slice(0, 4).join("\t"), feat_len, feat_hit_len); - } - file.close(); - - buf.destroy(); - - warn("# feature bases: " + tot_len); - warn("# feature bases overlapping targets: " + hit_len + ' (' + (100.0 * hit_len / tot_len).toFixed(2) + '%)'); -} - -main(arguments); diff --git a/misc/paftools.js b/misc/paftools.js index 9943c09..3700367 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -618,6 +618,127 @@ function paf_stat(args) } } +function paf_bedcov(args) +{ + function read_bed(fn, to_merge, to_dedup) + { + var file = new File(fn); + var buf = new Bytes(); + var h = {}; + while (file.readline(buf) >= 0) { + var t = buf.toString().split("\t"); + if (h[t[0]] == null) + h[t[0]] = []; + var bst = parseInt(t[1]); + var ben = parseInt(t[2]); + if (t.length >= 12 && /^\d+$/.test(t[9])) { + t[9] = parseInt(t[9]); + var sz = t[10].split(","); + var st = t[11].split(","); + for (var i = 0; i < t[9]; ++i) { + st[i] = parseInt(st[i]); + sz[i] = parseInt(sz[i]); + h[t[0]].push([bst + st[i], bst + st[i] + sz[i], 0, 0, 0]); + } + } else { + h[t[0]].push([bst, ben, 0, 0, 0]); + } + } + buf.destroy(); + file.close(); + for (var chr in h) { + if (to_merge) Interval.merge(h[chr], false); + else if (to_dedup) Interval.dedup(h[chr], false); + else Interval.sort(h[chr]); + Interval.index_end(h[chr]); + } + return h; + } + + var c, print_len = false, to_merge = true, to_dedup = false, fn_excl = null; + while ((c = getopt(args, "pde:")) != null) { + if (c == 'p') print_len = true; + else if (c == 'd') to_dedup = true, to_merge = false; + else if (c == 'e') fn_excl = getopt.arg; + } + + if (args.length - getopt.ind < 2) { + print("Usage: paftools.js bedcov [options] "); + print("Options:"); + print(" -e FILE exclude target regions (2nd file) overlapping BED FILE []"); + print(" -p print number of covered bases for each target"); + exit(1); + } + + var excl = fn_excl != null? read_bed(fn_excl, true, false) : null; + var target = read_bed(args[getopt.ind], to_merge, to_dedup); + + var file, buf = new Bytes(); + var tot_len = 0, hit_len = 0; + file = args[getopt.ind+1] != '-'? new File(args[getopt.ind+1]) : new File(); + while (file.readline(buf) >= 0) { + var t = buf.toString().split("\t"); + var a = []; + var bst = parseInt(t[1]); + var ben = parseInt(t[2]); + if (t.length >= 12 && /^\d+$/.test(t[9])) { // BED12 + t[9] = parseInt(t[9]); + var sz = t[10].split(","); + var st = t[11].split(","); + for (var i = 0; i < t[9]; ++i) { + st[i] = parseInt(st[i]); + sz[i] = parseInt(sz[i]); + a.push([bst + st[i], bst + st[i] + sz[i], false]); + } + } else a.push([bst, ben, false]); // 3-column BED + var feat_len = 0; + for (var i = 0; i < a.length; ++i) { + if (excl != null && excl[t[0]] != null) { + var oe = Interval.find_ovlp(excl[t[0]], a[i][0], a[i][1]); + if (oe.length > 0) + continue; + } + a[i][2] = true; + feat_len += a[i][1] - a[i][0]; + } + tot_len += feat_len; + if (target[t[0]] == null) continue; + var b = []; + for (var i = 0; i < a.length; ++i) { + if (!a[i][2]) continue; + var o = Interval.find_ovlp(target[t[0]], a[i][0], a[i][1]); + for (var j = 0; j < o.length; ++j) { + var max_st = o[j][0] > a[i][0]? o[j][0] : a[i][0]; + var min_en = o[j][1] < a[i][1]? o[j][1] : a[i][1]; + b.push([max_st, min_en]); + o[j][2] += min_en - max_st; + ++o[j][3]; + if (max_st == o[j][0] && min_en == o[j][1]) + ++o[j][4]; + } + } + // find the length covered + var feat_hit_len = 0; + if (b.length > 0) { + b.sort(function(a,b) {return a[0]-b[0]}); + var st = b[0][0], en = b[0][1]; + for (var i = 1; i < b.length; ++i) { + if (b[i][0] <= en) en = en > b[i][1]? en : b[i][1]; + else feat_hit_len += en - st, st = b[i][0], en = b[i][1]; + } + feat_hit_len += en - st; + } + hit_len += feat_hit_len; + if (print_len) print('F', t.slice(0, 4).join("\t"), feat_len, feat_hit_len); + } + file.close(); + + buf.destroy(); + + warn("# feature bases: " + tot_len); + warn("# feature bases overlapping targets: " + hit_len + ' (' + (100.0 * hit_len / tot_len).toFixed(2) + '%)'); +} + /************************** *** Conversion related *** **************************/ @@ -1047,7 +1168,7 @@ function paf_splice2bed(args) { var colors = ["0,128,255", "255,0,0", "0,192,0"]; - function print_lines(a, fmt) + function print_lines(a, fmt, keep_multi) { if (a.length == 0) return; if (fmt == "bed") { @@ -1061,6 +1182,7 @@ function paf_splice2bed(args) warn("Warning: " + a[0][3] + " doesn't have a primary alignment"); } for (var i = 0; i < a.length; ++i) { + if (!keep_multi && a[i][8] == 2) continue; a[i][8] = colors[a[i][8]]; print(a[i].join("\t")); } @@ -1069,13 +1191,16 @@ function paf_splice2bed(args) } var re = /(\d+)([MIDNSH])/g; - var c, fmt = "bed", fn_name_conv = null; - while ((c = getopt(args, "f:n:")) != null) { + var c, fmt = "bed", fn_name_conv = null, keep_multi = false; + while ((c = getopt(args, "f:n:m")) != null) { if (c == 'f') fmt = getopt.arg; else if (c == 'n') fn_name_conv = getopt.arg; + else if (c == 'm') keep_multi = true; } if (getopt.ind == args.length) { - warn("Usage: paftools.js splice2bed |"); + print("Usage: paftools.js splice2bed [options] |"); + print("Options:"); + print(" -m keep multiple mappings (SAM flag 0x100)"); exit(1); } @@ -1107,7 +1232,7 @@ function paf_splice2bed(args) if (flag&1) t[0] += '/' + (flag>>6&3); } if (a.length && a[0][3] != t[0]) { - print_lines(a, fmt); + print_lines(a, fmt, keep_multi); a = []; } if (t.length >= 12 && (t[4] == '+' || t[4] == '-')) { // PAF @@ -1148,7 +1273,7 @@ function paf_splice2bed(args) a1.push(is_pri? 0 : 2, bs.length, bl.join(",")+",", bs.join(",")+","); a.push(a1); } - print_lines(a, fmt); + print_lines(a, fmt, keep_multi); buf.destroy(); file.close(); if (conv != null) conv.destroy(); @@ -1707,6 +1832,7 @@ function main(args) print(" stat collect basic mapping information in PAF/SAM"); print(" liftover simplistic liftOver"); print(" call call variants from asm-to-ref alignment with the cs tag"); + print(" bedcov compute the number of bases covered"); print(""); print(" mapeval evaluate mapping accuracy using mason2/PBSIM-simulated FASTQ"); print(" mason2fq convert mason2-simulated SAM to FASTQ"); @@ -1726,6 +1852,7 @@ function main(args) else if (cmd == 'liftover' || cmd == 'liftOver') paf_liftover(args); else if (cmd == 'call') paf_call(args); else if (cmd == 'mapeval') paf_mapeval(args); + else if (cmd == 'bedcov') paf_bedcov(args); else if (cmd == 'mason2fq') paf_mason2fq(args); else if (cmd == 'pbsim2fq') paf_pbsim2fq(args); else if (cmd == 'junceval') paf_junceval(args);