diff --git a/bwa-typeHLA.js b/bwa-typeHLA.js index da9ca28..5561952 100644 --- a/bwa-typeHLA.js +++ b/bwa-typeHLA.js @@ -1,6 +1,72 @@ -var fuzzy = 3, min_qal = 100, drop_thres = 7; +/***************************************************************** + * The K8 Javascript interpreter is required to run this script. * + * * + * Source code: https://github.com/attractivechaos/k8 * + * Binary: http://sourceforge.net/projects/lh3/files/k8/ * + *****************************************************************/ -var file = arguments.length == 0? new File () : new File(arguments[0]); +var getopt = function(args, ostr) { + var oli; // option letter list index + if (typeof(getopt.place) == 'undefined') + getopt.ind = 0, getopt.arg = null, getopt.place = -1; + if (getopt.place == -1) { // update scanning pointer + if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') { + getopt.place = -1; + return null; + } + if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--" + ++getopt.ind; + getopt.place = -1; + return null; + } + } + var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity + if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) { + if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null. + if (getopt.place < 0) ++getopt.ind; + return '?'; + } + if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument + getopt.arg = null; + if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1; + } else { // need an argument + if (getopt.place >= 0 && getopt.place < args[getopt.ind].length) + getopt.arg = args[getopt.ind].substr(getopt.place); + else if (args.length <= ++getopt.ind) { // no arg + getopt.place = -1; + if (ostr.length > 0 && ostr.charAt(0) == ':') return ':'; + return '?'; + } else getopt.arg = args[getopt.ind]; // white space + getopt.place = -1; + ++getopt.ind; + } + return optopt; +} + +/********************* + *** Main function *** + *********************/ + +var c, fuzzy = 1, min_qal = 100, drop_thres = 7; + +// parse command line options +while ((c = getopt(arguments, "f:m:d:")) != null) { + if (c == 'f') fuzzy = parseInt(getopt.arg); + else if (c == 'm') min_qal = parseInt(getopt.arg); + else if (c == 'd') drop_thres = parseInt(getopt.arg); +} +if (arguments.length == getopt.ind) { + print(""); + print("Usage: k8 bwa-typeHLA.js [options] \n"); + print("Options: -d INT drop a contig if the edit distance to the closest gene is >INT ["+drop_thres+"]"); + print(" -m INT ignore a hit if the gene sequence in the alignment is = 0) { var m, mm, line = buf.toString(); var t = line.split("\t"); + // SAM header if (t[0].charAt(0) == '@') { if (t[0] == '@SQ' && (m = /LN:(\d+)/.exec(line)) != null && (mm = /SN:(\S+)/.exec(line)) != null) len[mm[1]] = parseInt(m[1]); continue; } + // parse gene name and exon number var gene = null, exon = null; if ((m = /^(HLA-[^\s_]+)_(\d+)/.exec(t[0])) != null) { gene = m[1], exon = parseInt(m[2]) - 1; @@ -20,6 +88,7 @@ while (file.readline(buf) >= 0) { gcnt[exon][gene] = true; } if (gene == null || exon == null || t[2] == '*') continue; + // parse clipping and aligned length var x = 0, ts = parseInt(t[3]) - 1, te = ts, clip = [0, 0]; while ((m = re_cigar.exec(t[5])) != null) { var l = parseInt(m[1]); @@ -33,16 +102,17 @@ while (file.readline(buf) >= 0) { var left = ts < clip[0]? ts : clip[0]; var right = tl - te < clip[1]? tl - te : clip[1]; var nm = (m = /\tNM:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : 0; - list.push([t[2], gene, exon, ts, te, nm, left + right]); + list.push([t[2], gene, exon, ts, te, nm, left + right]); // left+right should be 0 given a prefix-suffix alignment } buf.destroy(); file.close(); -// identify the primary exon(s) +// identify the primary exons, the exons associated with most genes var pri_exon = [], n_pri_exons; { var cnt = [], max = 0; + // count the number of genes per exon and track the max for (var e = 0; e < gcnt.length; ++e) { if (gcnt[e] != null) { var c = 0, h = gcnt[e]; @@ -51,6 +121,7 @@ var pri_exon = [], n_pri_exons; max = max > c? max : c; } else cnt[e] = 0; } + // find primary exons var pri_list = []; for (var e = 0; e < cnt.length; ++e) { if (cnt[e] == max) pri_list.push(e + 1); @@ -76,7 +147,7 @@ for (var i = 0; i < list.length; ++i) { elist[g][list[i][2]] = true; } -// change to the per-exon representation +// reorganize hits to exons var exons = []; for (var i = 0; i < list.length; ++i) { var li = list[i]; @@ -94,10 +165,9 @@ for (var i = 0; i < glist.length; ++i) { // type each exon for (var e = 0; e < exons.length; ++e) { -//for (var e = 1; e < 3; ++e) { if (exons[e] == null) continue; var ee = exons[e]; - // find good contigs and alleles that have good matches + // find contigs and genes associated with the current exon var ch = {}, gh = {}; for (var i = 0; i < ee.length; ++i) { if (elist[ee[i][1]][e] != null) @@ -108,7 +178,7 @@ for (var e = 0; e < exons.length; ++e) { for (var g in gh) ga.push(parseInt(g)); var named_ca = []; for (var i = 0; i < ca.length; ++i) named_ca.push(clist[ca[i]]); - warn("Processing exon "+(e+1)+" (" +ga.length+ " genes; " +ca.length+ " contigs: [" +named_ca.join(",")+ "])..."); + warn("Processing exon "+(e+1)+" (" +ga.length+ " genes; " +ca.length+ " contigs: [" +named_ca.join(", ")+ "])..."); // convert representation again var sc = []; for (var i = 0; i < ee.length; ++i) {