From 8c8d4468201db7eae28a03c002ffd028a462bd63 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 12 Nov 2017 18:41:01 -0500 Subject: [PATCH] find regions covered by one contig --- misc/paf2diff.js | 83 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 20 deletions(-) diff --git a/misc/paf2diff.js b/misc/paf2diff.js index a0f8990..9935ec5 100644 --- a/misc/paf2diff.js +++ b/misc/paf2diff.js @@ -37,11 +37,12 @@ var getopt = function(args, ostr) { } var re_cs = /([:=*+-])(\d+|[A-Za-z]+)/g; -var c, min_cov_len = 10000, min_var_len = 50000, gap_thres = 50; -while ((c = getopt(arguments, "l:L:g:")) != null) { +var c, min_cov_len = 10000, min_var_len = 50000, gap_thres = 50, min_mapq = 5; +while ((c = getopt(arguments, "l:L:g:q:")) != null) { if (c == 'l') min_cov_len = parseInt(getopt.arg); else if (c == 'L') min_var_len = parseInt(optarg.arg); else if (c == 'g') gap_thres = parseInt(optarg.arg); + else if (c == 'q') min_mapq = parseInt(optarg.arg); } if (arguments.length == getopt.ind) { @@ -49,26 +50,70 @@ if (arguments.length == getopt.ind) { print("Options:"); print(" -l INT min alignment length to compute coverage ["+min_cov_len+"]"); print(" -L INT min alignment length to call variants ["+min_var_len+"]"); - print(" -g INT short/long gap threshold ["+gap_thres+"]"); + print(" -q INT min mapping quality ["+min_mapq+"]"); + print(" -g INT short/long gap threshold (for statistics only) ["+gap_thres+"]"); exit(1); } var file = new File(arguments[getopt.ind]); var buf = new Bytes(); +var tot_len = 0, n_sub = [0, 0, 0], n_ins = [0, 0, 0, 0], n_del = [0, 0, 0, 0]; + +function count_var(o) +{ + if (o[3] > 1) return; + if (o[5] == '-' && o[6] == '-') return; + if (o[5] == '-') { // insertion + var l = o[6].length; + if (l == 1) ++n_ins[0]; + else if (l == 2) ++n_ins[1]; + else if (l < gap_thres) ++n_ins[2]; + else ++n_ins[3]; + } else if (o[6] == '-') { // deletion + var l = o[5].length; + if (l == 1) ++n_del[0]; + else if (l == 2) ++n_del[1]; + else if (l < gap_thres) ++n_del[2]; + else ++n_del[3]; + } else { + ++n_sub[0]; + var s = o[5] + o[6]; + if (s == 'ag' || s == 'ga' || s == 'ct' || s == 'tc') + ++n_sub[1]; + else ++n_sub[2]; + } +} var a = [], out = []; -var tot_len = 0, n_sub = 0, n_1_ins = 0, n_1_del = 0, n_short_ins = 0, n_short_del = 0, n_long_ins = 0, n_long_del = 0; +var c1_ctg = null, c1_start = 0, c1_end = 0, c1_counted = false, c1_len = 0; while (file.readline(buf) >= 0) { var line = buf.toString(); if (!/\ts2:i:/.test(line)) continue; // skip secondary alignments var m, t = line.split("\t", 12); for (var i = 6; i <= 11; ++i) t[i] = parseInt(t[i]); - if (t[10] < min_cov_len) continue; + if (t[10] < min_cov_len || t[11] == 0) continue; var ctg = t[5], x = t[7], end = t[8]; + // compute regions covered by 1 contig + if (ctg != c1_ctg || x >= c1_end) { + if (c1_counted) + c1_len += c1_end - c1_start; + c1_ctg = ctg, c1_start = x, c1_end = end; + c1_counted = (t[10] >= min_var_len); + } else if (end > c1_end) { // overlap + if (c1_counted && x > c1_start) + c1_len += x - c1_start; + c1_start = c1_end, c1_end = end; + c1_counted = (t[10] >= min_var_len); + } else { // contained + if (c1_counted && x > c1_start) + c1_len += x - c1_start; + c1_start = end; + } // output variants ahead of this alignment while (out.length) { if (out[0][0] != ctg || out[0][2] <= x) { + count_var(out[0]); print(out[0].join("\t")); out.shift(); } else break; @@ -98,19 +143,12 @@ while (file.readline(buf) >= 0) { var l = m[1] == '='? m[2].length : parseInt(m[2]); x += l, blen += l; } else if (m[1] == '*') { - ++n_sub; out.push([t[5], x, x+1, cov, t[11], m[2].charAt(0), m[2].charAt(1)]); ++x, ++blen, ++n_diff; } else if (m[1] == '+') { - if (m[2].length == 1) ++n_1_ins; - else if (m[2].length >= gap_thres) ++n_long_ins; - else ++n_short_ins; out.push([t[5], x, x, cov, t[11], '-', m[2]]); ++blen, ++n_diff; } else if (m[1] == '-') { - if (m[2].length == 1) ++n_1_del; - else if (m[2].length >= gap_thres) ++n_long_del; - else ++n_short_del; out.push([t[5], x, x + m[2].length, cov, t[11], m[2], '-']); ++blen, ++n_diff; } @@ -118,19 +156,24 @@ while (file.readline(buf) >= 0) { } a.push([t[5], t[7], t[8]]); } +if (c1_counted) c1_len += c1_end - c1_start; while (out.length) { + count_var(out[0]); print(out[0].join("\t")); out.shift(); } -warn(tot_len + " contig bases considered in calling"); -warn(n_sub + " substitutions"); -warn(n_1_del + " 1bp deletions"); -warn(n_1_ins + " 1bp insertions"); -warn(n_short_del + " [2,"+gap_thres+") short deletions"); -warn(n_short_ins + " [2,"+gap_thres+") short insertions"); -warn(n_long_del + " >="+gap_thres+" long deletions"); -warn(n_long_ins + " >="+gap_thres+" long insertions"); +//warn(tot_len + " alignment columns considered in calling"); +warn(c1_len + " reference bases covered by exactly one contig"); +warn(n_sub[0] + " substitutions; ts/tv = " + (n_sub[1]/n_sub[2]).toFixed(3)); +warn(n_del[0] + " 1bp deletions"); +warn(n_ins[0] + " 1bp insertions"); +warn(n_del[1] + " 2bp deletions"); +warn(n_ins[1] + " 2bp insertions"); +warn(n_del[2] + " [3,"+gap_thres+") deletions"); +warn(n_ins[2] + " [3,"+gap_thres+") insertions"); +warn(n_del[3] + " >="+gap_thres+" deletions"); +warn(n_ins[3] + " >="+gap_thres+" insertions"); buf.destroy(); file.close();