diff --git a/bwa-postalt.js b/bwa-postalt.js index 6cf3dc7..96cc6ab 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -52,31 +52,6 @@ var getopt = function(args, ostr) { return optopt; } -// print an object in a format similar to JSON. For debugging only. -function obj2str(o) -{ - if (typeof(o) != 'object') { - return o.toString(); - } else if (o == null) { - return "null"; - } else if (Array.isArray(o)) { - var s = "["; - for (var i = 0; i < o.length; ++i) { - if (i) s += ','; - s += obj2str(o[i]); - } - return s + "]"; - } else { - var i = 0, s = "{"; - for (var key in o) { - if (i++) s += ','; - s += key + ":"; - s += obj2str(o[key]); - } - return s + "}"; - } -} - // reverse a string Bytes.prototype.reverse = function() { @@ -219,32 +194,19 @@ function print_buffer(buf2, fp_hla, hla) function bwa_postalt(args) { - var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, update_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:10, show_ev:false, min_pa_ratio:1 }; + var c, opt = { a:1, b:4, o:6, e:1, min_mapq:10, min_sc:90, max_nm_sc:10, min_pa_ratio:1 }; - while ((c = getopt(args, 'Pqev:p:r:')) != null) { - if (c == 'v') opt.verbose = parseInt(getopt.arg); - else if (c == 'p') opt.pre = getopt.arg; - else if (c == 'P') opt.show_pri = true; - else if (c == 'q') opt.recover_maq = false; - else if (c == 'e') opt.show_ev = true; + while ((c = getopt(args, 'p:r:')) != null) { + if (c == 'p') opt.pre = getopt.arg; else if (c == 'r') opt.min_pa_ratio = parseFloat(getopt.arg); } if (opt.min_pa_ratio > 1.) opt.min_pa_ratio = 1.; - if (opt.show_ev && opt.pre == null) { - warn("ERROR: option '-p' must be specified if '-e' is applied."); - exit(1); - } - if (args.length == getopt.ind) { print(""); print("Usage: k8 bwa-postalt.js [options] [aln.sam]\n"); - print("Options: -p STR prefix of file(s) for additional information [null]"); - print(" PREFIX.ctw - weight of each ALT contig"); - print(" PREFIX.evi - reads supporting ALT contigs (effective with -e)"); - print(" -e show reads supporting ALT contigs into file PREFIX.evi"); + print("Options: -p STR prefix of files matching HLA genes [null]"); print(" -r FLOAT reduce mapQ to 0 if not overlapping lifted best and pa= 0) { var line = buf.toString(); if (line.charAt(0) == '@') continue; var t = line.split("\t"); if (t.length < 11) continue; // incomplete lines + is_alt[t[0]] = true; var pos = parseInt(t[3]) - 1; var flag = parseInt(t[1]); - if ((flag&4) || t[2] == '*') { - idx_un[t[0]] = true; - continue; - } + if ((flag&4) || t[2] == '*') continue; var m, cigar = [], l_qaln = 0, l_tlen = 0, l_qclip = 0; if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(t[0])) != null) { // read HLA contigs if (hla_ctg[m[1]] == null) hla_ctg[m[1]] = 0; @@ -296,10 +255,8 @@ function bwa_postalt(args) } file.close(); var idx_alt = {}, idx_pri = {}; - for (var ctg in intv_alt) - idx_alt[ctg] = intv_ovlp(intv_alt[ctg]); - for (var ctg in intv_pri) - idx_pri[ctg] = intv_ovlp(intv_pri[ctg]); + for (var ctg in intv_alt) idx_alt[ctg] = intv_ovlp(intv_alt[ctg]); + for (var ctg in intv_pri) idx_pri[ctg] = intv_ovlp(intv_pri[ctg]); // initialize the list of HLA contigs var fp_hla = null; @@ -309,18 +266,12 @@ function bwa_postalt(args) fp_hla[h] = new File(opt.pre + '.' + h + '.fq', "w"); } - // initialize the list of ALT contigs - var weight_alt = []; - for (var ctg in idx_alt) - weight_alt[ctg] = [0, 0, 0, 0, 0, 0, intv_alt[ctg][0][3], intv_alt[ctg][0][5], intv_alt[ctg][0][7]]; - for (var ctg in idx_un) - weight_alt[ctg] = [0, 0, 0, 0, 0, 0, '~', 0, 0]; - // process SAM var buf2 = [], hla = {}; file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File(); while (file.readline(buf) >= 0) { var m, line = buf.toString(); + if (line.charAt(0) == '@') { // print and then skip the header line print(line); continue; @@ -362,7 +313,7 @@ function bwa_postalt(args) // check if there are ALT hits var has_alt = false; for (var i = 0; i < hits.length; ++i) - if (weight_alt[hits[i].ctg] != null) { + if (is_alt[hits[i].ctg] != null) { has_alt = true; break; } @@ -429,7 +380,7 @@ function bwa_postalt(args) if (hits[i].g == reported_g) ++n_group0; } else { - if (weight_alt[hits[0].ctg] == null) { // no need to go through the following if the single hit is non-ALT + if (is_alt[hits[0].ctg] == null) { // no need to go through the following if the single hit is non-ALT buf2.push(t); continue; } @@ -455,7 +406,8 @@ function bwa_postalt(args) else mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ; } else mapQ = t[4]; - var pri_ofunc = idx_pri[hits[reported_i].pctg], ovlp_alt = []; + // find out whether the read is overlapping HLA genes + var pri_ofunc = idx_pri[hits[reported_i].pctg]; if (pri_ofunc != null) { var rpt_start = 1<<30, rpt_end = 0; for (var i = 0; i < hits.length; ++i) { @@ -465,70 +417,29 @@ function bwa_postalt(args) rpt_end = rpt_end > h.pend ? rpt_end : h.pend; } } - ovlp_alt = pri_ofunc(rpt_start, rpt_end); + var ovlp_alt = pri_ofunc(rpt_start, rpt_end); for (var i = 0; i < ovlp_alt.length; ++i) if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(ovlp_alt[i][2])) != null) hla[m[1]] = true; } - // ALT genotyping - if (mapQ >= opt.min_mapq && hits[reported_i].score >= opt.min_sc) { - // collect all overlapping ALT contigs - var alts = {}; - for (var i = 0; i < hits.length; ++i) { - var h = hits[i]; - if (h.g == reported_g && weight_alt[h.ctg] != null) - alts[h.ctg] = [h.score, h.NM]; - } - if (ovlp_alt.length > 0) { // add other unreported hits - for (var i = 0; i < ovlp_alt.length; ++i) - if (ovlp_alt[i][0] <= rpt_start && rpt_end <= ovlp_alt[i][1] && alts[ovlp_alt[i][2]] == null) - alts[ovlp_alt[i][2]] = [0, 0]; - } - - // add weight to each ALT contig - var alt_arr = [], max_sc = -1, max_i = -1, sum = 0, min_sc = 1<<30, max_nm = -1; - for (var ctg in alts) - alt_arr.push([ctg, alts[ctg][0], 0, alts[ctg][1]]); - for (var i = 0; i < alt_arr.length; ++i) { - if (max_sc < alt_arr[i][1]) - max_sc = alt_arr[i][1], max_i = i; - min_sc = min_sc < alt_arr[i][1]? min_sc : alt_arr[i][1]; - var nm = alt_arr[i][1] > 0? alt_arr[i][3] : opt.max_nm_sc; - max_nm = max_nm > nm? max_nm : nm; - } - if (max_nm > opt.max_nm_sc) max_nm = opt.max_nm_sc; - if (max_sc > 0 && (alt_arr.length == 1 || min_sc < max_sc)) { - for (var i = 0; i < alt_arr.length; ++i) - sum += (alt_arr[i][2] = Math.pow(10, .6 * (alt_arr[i][1] - max_sc))); - for (var i = 0; i < alt_arr.length; ++i) alt_arr[i][2] /= sum; - for (var i = 0; i < alt_arr.length; ++i) { - var e = [alt_arr[i][0], 1, alt_arr[max_i][2], alt_arr[i][2], max_nm, max_nm * alt_arr[max_i][2], max_nm * alt_arr[i][2]]; - var w = weight_alt[e[0]]; - for (var j = 0; j < 6; ++j) w[j] += e[j+1]; - if (fp_evi) { - e[2] = e[2].toFixed(3); e[3] = e[3].toFixed(3); - e[5] = e[5].toFixed(3); e[6] = e[6].toFixed(3); - fp_evi.write(t[0] + '/' + (t[1]>>6&3) + '\t' + e.join("\t") + '\n'); - } - } - } - } - - // check if the reported hit overlaps a hit to the primary assembly; if so, don't reduce mapping quality - if (opt.update_mapq && mapQ > 0 && n_rpt_lifted <= 1) { + // adjust the mapQ of the primary hits + if (n_rpt_lifted <= 1) { var l = n_rpt_lifted == 1? rpt_lifted : null; for (var i = 0; i < buf2.length; ++i) { var s = buf2[i], is_ovlp = true; if (l != null) { if (l[0] != s[2]) is_ovlp = false; // different chr - if (((s[1]&16) != 0) != l[1]) is_ovlp = false; // different strand - var start = s[3] - 1, end = start; - while ((m = re_cigar.exec(t[5])) != null) - if (m[2] == 'M' || m[2] == 'D' || m[2] == 'N') - end += parseInt(m[1]); - if (!(start < l[3] && l[2] < end)) is_ovlp = false; // no overlap + else if (((s[1]&16) != 0) != l[1]) is_ovlp = false; // different strand + else { + var start = s[3] - 1, end = start; + while ((m = re_cigar.exec(t[5])) != null) + if (m[2] == 'M' || m[2] == 'D' || m[2] == 'N') + end += parseInt(m[1]); + if (!(start < l[3] && l[2] < end)) is_ovlp = false; // no overlap + } } else is_ovlp = false; + // get the "pa" tag if present var om = -1, pa = 10.; for (var j = 11; j < s.length; ++j) if ((m = /^om:i:(\d+)/.exec(s[j])) != null) @@ -555,20 +466,6 @@ function bwa_postalt(args) } } - // generate reversed quality and reverse-complemented sequence if necessary - var rs = null, rq = null; // reversed quality and reverse complement sequence - var need_rev = false; - for (var i = 0; i < hits.length; ++i) { - if (hits[i].g != reported_g || i == reported_i) continue; - if (hits[i].rev != hits[reported_i].rev) - need_rev = true; - } - if (need_rev) { // reverse and reverse complement - aux.length = 0; - aux.set(t[9], 0); aux.revcomp(); rs = aux.toString(); - aux.set(t[10],0); aux.reverse(); rq = aux.toString(); - } - // stage the reported hit t[4] = mapQ; if (n_group0 > 1) t.push("om:i:"+ori_mapQ); @@ -576,18 +473,22 @@ function bwa_postalt(args) buf2.push(t); // stage the hits generated from the XA tag - var cnt = 0; + var cnt = 0, rs = null, rq = null; // rq: reverse quality; rs: reverse complement sequence var rg = (m = /\t(RG:Z:\S+)/.exec(line)) != null? m[1] : null; for (var i = 0; i < hits.length; ++i) { - if (opt.verbose >= 5) print(obj2str(hits[i])); if (hits[i].g != reported_g || i == reported_i) continue; - if (!opt.show_pri && idx_alt[hits[i].ctg] == null) continue; + if (idx_alt[hits[i].ctg] == null) continue; var s = [t[0], 0, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, t[6], t[7], t[8]]; // print sequence/quality and set the rev flag if (hits[i].rev == hits[reported_i].rev) { s.push(t[9], t[10]); s[1] = flag | 0x800; - } else { + } else { // we need to write the reverse sequence + if (rs == null || rq == null) { + aux.length = 0; + aux.set(t[9], 0); aux.revcomp(); rs = aux.toString(); + aux.set(t[10],0); aux.reverse(); rq = aux.toString(); + } s.push(rs, rq); s[1] = (flag ^ 0x10) | 0x800; } @@ -599,35 +500,12 @@ function bwa_postalt(args) } print_buffer(buf2, fp_hla, hla); file.close(); - if (fp_evi != null) fp_evi.close(); if (fp_hla != null) for (var h in fp_hla) fp_hla[h].close(); buf.destroy(); aux.destroy(); - - // print weight of each contig - if (opt.pre != null) { - var fpout = new File(opt.pre + '.ctw', "w"); - var weight_arr = []; - var weight_hla = []; - for (var ctg in weight_alt) { - var w = weight_alt[ctg]; - weight_arr.push([ctg, w[6], w[7], w[8], - w[0], w[1].toFixed(3), w[2].toFixed(3), w[1] > 0? (w[2]/w[1]).toFixed(3) : '0.000', - w[3], w[4].toFixed(3), w[5].toFixed(3), w[4] > 0? (w[5]/w[4]).toFixed(3) : '0.000']); - weight_hla.push([ctg, w[4] > 0? w[5]/w[4] : 0]); - } - weight_arr.sort(function(a,b) { - return a[1] < b[1]? -1 : a[1] > b[1]? 1 : a[2] != b[2]? a[2] - b[2] : a[0] < b[0]? -1 : a[0] > b[0]? 1 : 0; - }); - for (var i = 0; i < weight_arr.length; ++i) { - if (weight_arr[i][1] == '~') weight_arr[i][1] = '*'; - fpout.write(weight_arr[i].join("\t") + '\n'); - } - fpout.close(); - } } bwa_postalt(arguments);