From 77ebd479f4a40b09156994ab9a9695f115f44753 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 12 Mar 2018 21:51:31 -0400 Subject: [PATCH] r743: added VCF output to "call" --- misc/paftools.js | 89 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 81 insertions(+), 8 deletions(-) diff --git a/misc/paftools.js b/misc/paftools.js index 7471910..add795c 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -1,6 +1,6 @@ #!/usr/bin/env k8 -var paftools_version = 'r734'; +var paftools_version = 'r743'; /***************************** ***** Library functions ***** @@ -131,6 +131,40 @@ Interval.find_ovlp = function(a, st, en) * Reverse and reverse complement * **********************************/ +function fasta_read(fn) +{ + var h = {}, gt = '>'.charCodeAt(0); + var file = fn == '-'? new File() : new File(fn); + var buf = new Bytes(), seq = null, name = null, seqlen = []; + while (file.readline(buf) >= 0) { + if (buf[0] == gt) { + if (seq != null && name != null) { + seqlen.push([name, seq.length]); + h[name] = seq; + name = seq = null; + } + var m, line = buf.toString(); + if ((m = /^>(\S+)/.exec(line)) != null) { + name = m[1]; + seq = new Bytes(); + } + } else seq.set(buf); + } + if (seq != null && name != null) { + seqlen.push([name, seq.length]); + h[name] = seq; + } + buf.destroy(); + file.close(); + return [h, seqlen]; +} + +function fasta_free(fa) +{ + for (var name in fa) + fa[name].destroy(); +} + Bytes.prototype.reverse = function() { for (var i = 0; i < this.length>>1; ++i) { @@ -307,12 +341,15 @@ function paf_call(args) { var re_cs = /([:=*+-])(\d+|[A-Za-z]+)/g, re_tag = /\t(\S\S:[AZif]):(\S+)/g; var c, min_cov_len = 10000, min_var_len = 50000, gap_thres = 50, min_mapq = 5; - while ((c = getopt(args, "l:L:g:q:B:")) != null) { + var fa_tmp = null, fa, fa_lens, is_vcf = false; + while ((c = getopt(args, "l:L:g:q:B:f:")) != null) { if (c == 'l') min_cov_len = parseInt(getopt.arg); else if (c == 'L') min_var_len = parseInt(getopt.arg); else if (c == 'g') gap_thres = parseInt(getopt.arg); else if (c == 'q') min_mapq = parseInt(getopt.arg); + else if (c == 'f') fa_tmp = fasta_read(getopt.arg, fa_lens); } + if (fa_tmp != null) fa = fa_tmp[0], fa_lens = fa_tmp[1], is_vcf = true; if (args.length == getopt.ind) { print("Usage: sort -k6,6 -k8,8n | paftools.js call [options] -"); @@ -321,6 +358,7 @@ function paf_call(args) print(" -L INT min alignment length to call variants ["+min_var_len+"]"); print(" -q INT min mapping quality ["+min_mapq+"]"); print(" -g INT short/long gap threshold (for statistics only) ["+gap_thres+"]"); + print(" -f FILE reference sequences (enabling VCF output) [null]"); exit(1); } @@ -328,6 +366,27 @@ function paf_call(args) var buf = new Bytes(); var tot_len = 0, n_sub = [0, 0, 0], n_ins = [0, 0, 0, 0], n_del = [0, 0, 0, 0]; + function print_vcf(o, fa) + { + var v = null; + if (o[3] != 1) return; // coverage is one; skip + if (o[5] == '-' && o[6] == '-') return; + if (o[5] != '-' && o[6] != '-') { // snp + v = [o[0], o[1] + 1, '.', o[5].toUpperCase(), o[6].toUpperCase()]; + } else if (o[1] > 0) { // shouldn't happen in theory + if (fa[o[0]] == null) throw Error('sequence "' + o[0] + '" is absent from the reference FASTA'); + if (o[1] >= fa[o[0]].length) throw Error('position ' + o[1] + ' exceeds the length of sequence "' + o[0] + '"'); + var ref = String.fromCharCode(fa[o[0]][o[1]-1]).toUpperCase(); + if (o[5] == '-') // insertion + v = [o[0], o[1], '.', ref, ref + o[6].toUpperCase()]; + else // deletion + v = [o[0], o[1], '.', ref + o[5].toUpperCase(), ref]; + } + v.push(o[4], '.', 'QNAME=' + o[7] + ';QSTART=' + (o[8]+1) + ';QSTRAND=' + (rev? '-' : '+'), 'GT', '1/1'); + if (v == null) throw Error("unexpected variant: [" + o.join(",") + "]"); + print(v.join("\t")); + } + function count_var(o) { if (o[3] > 1) return; @@ -353,6 +412,17 @@ function paf_call(args) } } + if (is_vcf) { + print('##fileformat=VCFv4.1'); + for (var i = 0; i < fa_lens.length; ++i) + print('##contig='); + print('##INFO='); + print('##INFO='); + print('##INFO='); + print('##FORMAT='); + print('#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample'); + } + var a = [], out = []; var c1_ctg = null, c1_start = 0, c1_end = 0, c1_counted = false, c1_len = 0; while (file.readline(buf) >= 0) { @@ -380,21 +450,21 @@ function paf_call(args) if (ctg != c1_ctg || x >= c1_end) { if (c1_counted && c1_end > c1_start) { c1_len += c1_end - c1_start; - print('R', c1_ctg, c1_start, c1_end); + if (!is_vcf) print('R', c1_ctg, c1_start, c1_end); } c1_ctg = ctg, c1_start = x, c1_end = end; c1_counted = (t[10] >= min_var_len); } else if (end > c1_end) { // overlap if (c1_counted && x > c1_start) { c1_len += x - c1_start; - print('R', c1_ctg, c1_start, x); + if (!is_vcf) print('R', c1_ctg, c1_start, x); } c1_start = c1_end, c1_end = end; c1_counted = (t[10] >= min_var_len); } else if (end > c1_start) { // contained if (c1_counted && x > c1_start) { c1_len += x - c1_start; - print('R', c1_ctg, c1_start, x); + if (!is_vcf) print('R', c1_ctg, c1_start, x); } c1_start = end; } // else, the alignment precedes the cov1 region; do nothing @@ -402,7 +472,8 @@ function paf_call(args) while (out.length) { if (out[0][0] != ctg || out[0][2] <= x) { count_var(out[0]); - print('V', out[0].join("\t")); + if (is_vcf) print_vcf(out[0], fa); + else print('V', out[0].join("\t")); out.shift(); } else break; } @@ -458,11 +529,12 @@ function paf_call(args) } if (c1_counted && c1_end > c1_start) { c1_len += c1_end - c1_start; - print('R', c1_ctg, c1_start, c1_end); + if (!is_vcf) print('R', c1_ctg, c1_start, c1_end); } while (out.length) { count_var(out[0]); - print('V', out[0].join("\t")); + if (is_vcf) print_vcf(out[0], fa); + else print('V', out[0].join("\t")); out.shift(); } @@ -480,6 +552,7 @@ function paf_call(args) buf.destroy(); file.close(); + if (fa != null) fasta_free(fa); } function paf_stat(args)