#!/usr/bin/env k8 var getopt = function(args, ostr) { var oli; // option letter list index if (typeof(getopt.place) == 'undefined') getopt.ind = 0, getopt.arg = null, getopt.place = -1; if (getopt.place == -1) { // update scanning pointer if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') { getopt.place = -1; return null; } if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--" ++getopt.ind; getopt.place = -1; return null; } } var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) { if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null. if (getopt.place < 0) ++getopt.ind; return '?'; } if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument getopt.arg = null; if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1; } else { // need an argument if (getopt.place >= 0 && getopt.place < args[getopt.ind].length) getopt.arg = args[getopt.ind].substr(getopt.place); else if (args.length <= ++getopt.ind) { // no arg getopt.place = -1; if (ostr.length > 0 && ostr.charAt(0) == ':') return ':'; return '?'; } else getopt.arg = args[getopt.ind]; // white space getopt.place = -1; ++getopt.ind; } return optopt; } var c, gap_out_len = null; while ((c = getopt(arguments, "l:")) != null) if (c == 'l') gap_out_len = parseInt(getopt.arg); if (getopt.ind == arguments.length) { print("Usage: k8 mapstat.js [-l gapOutLen] |"); exit(1); } var buf = new Bytes(); var file = new File(arguments[getopt.ind]); var re = /(\d+)([MIDSHNX=])/g; var lineno = 0, n_pri = 0, n_2nd = 0, n_seq = 0, n_cigar_64k = 0, l_tot = 0, l_cov = 0; var n_gap = [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]; function cov_len(regs) { regs.sort(function(a,b) {return a[0]-b[0]}); var st = regs[0][0], en = regs[0][1], l = 0; for (var i = 1; i < regs.length; ++i) { if (regs[i][0] < en) en = en > regs[i][1]? en : regs[i][1]; else l += en - st, st = regs[i][0], en = regs[i][1]; } l += en - st; return l; } var last = null, last_qlen = null, regs = []; while (file.readline(buf) >= 0) { var line = buf.toString(); ++lineno; if (line.charAt(0) != '@') { var t = line.split("\t", 12); var m, rs, cigar = null, is_pri = false, is_sam = false, is_rev = false, tname = null; var atlen = null, aqlen, qs, qe, mapq, ori_qlen; if (t[4] == '+' || t[4] == '-') { // PAF if (!/\ts2:i:\d+/.test(line)) { ++n_2nd; continue; } if ((m = /\tcg:Z:(\S+)/.exec(line)) != null) cigar = m[1]; if (cigar == null) { warn("WARNING: no CIGAR at line " + lineno); continue; } tname = t[5]; qs = parseInt(t[2]), qe = parseInt(t[3]); aqlen = qe - qs; is_rev = t[4] == '+'? false : true; rs = parseInt(t[7]); atlen = parseInt(t[8]) - rs; mapq = parseInt(t[11]); ori_qlen = parseInt(t[1]); } else { // SAM var flag = parseInt(t[1]); if ((flag & 4) || t[2] == '*' || t[5] == '*') continue; if (flag & 0x100) { ++n_2nd; continue; } cigar = t[5]; tname = t[2]; rs = parseInt(t[3]) - 1; mapq = parseInt(t[4]); aqlen = t[9].length; is_sam = true; is_rev = !!(flag&0x10); } ++n_pri; if (last != t[0]) { if (last != null) { l_tot += last_qlen; l_cov += cov_len(regs); } regs = []; ++n_seq, last = t[0]; } var M = 0, tl = 0, ql = 0, clip = [0, 0], n_cigar = 0, sclip = 0; while ((m = re.exec(cigar)) != null) { var l = parseInt(m[1]); ++n_cigar; if (m[2] == 'M' || m[2] == '=' || m[2] == 'X') { tl += l, ql += l, M += l; } else if (m[2] == 'I' || m[2] == 'D') { var type; if (l < 50) type = 0; else if (l < 100) type = 1; else if (l < 300) type = 2; else if (l < 400) type = 3; else if (l < 1000) type = 4; else type = 5; if (m[2] == 'I') ql += l, ++n_gap[0][type]; else tl += l, ++n_gap[1][type]; if (gap_out_len != null && l >= gap_out_len) print(t[0], ql, is_rev? '-' : '+', tname, rs + tl, m[2], l); } else if (m[2] == 'N') { tl += l; } else if (m[2] == 'S') { clip[M == 0? 0 : 1] = l, sclip += l; } else if (m[2] == 'H') { clip[M == 0? 0 : 1] = l; } } if (n_cigar > 65535) ++n_cigar_64k; if (ql + sclip != aqlen) warn("WARNING: aligned query length is inconsistent with CIGAR at line " + lineno + " (" + (ql+sclip) + " != " + aqlen + ")"); if (atlen != null && atlen != tl) warn("WARNING: aligned reference length is inconsistent with CIGAR at line " + lineno); if (is_sam) { qs = clip[is_rev? 1 : 0], qe = qs + ql; ori_qlen = clip[0] + ql + clip[1]; } regs.push([qs, qe]); last_qlen = ori_qlen; } } l_tot += last_qlen; l_cov += cov_len(regs); file.close(); buf.destroy(); if (gap_out_len == null) { print("Number of mapped sequences: " + n_seq); print("Number of primary alignments: " + n_pri); print("Number of secondary alignments: " + n_2nd); print("Number of primary alignments with >65535 CIGAR operations: " + n_cigar_64k); print("Number of bases in mapped sequences: " + l_tot); print("Number of mapped bases: " + l_cov); print("Number of insertions in [0,50): " + n_gap[0][0]); print("Number of insertions in [50,100): " + n_gap[0][1]); print("Number of insertions in [100,300): " + n_gap[0][2]); print("Number of insertions in [300,400): " + n_gap[0][3]); print("Number of insertions in [400,1000): " + n_gap[0][4]); print("Number of insertions in [1000,inf): " + n_gap[0][5]); print("Number of deletions in [0,50): " + n_gap[1][0]); print("Number of deletions in [50,100): " + n_gap[1][1]); print("Number of deletions in [100,300): " + n_gap[1][2]); print("Number of deletions in [300,400): " + n_gap[1][3]); print("Number of deletions in [400,1000): " + n_gap[1][4]); print("Number of deletions in [1000,inf): " + n_gap[1][5]); }