From eef1cee9b748a81a1b568a917432cb882000de75 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 1 Dec 2018 18:53:03 -0500 Subject: [PATCH] r893: added paftools.js vcfpair --- misc/paftools.js | 102 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 101 insertions(+), 1 deletion(-) diff --git a/misc/paftools.js b/misc/paftools.js index 518102c..744e877 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -1,6 +1,6 @@ #!/usr/bin/env k8 -var paftools_version = '2.14-r887-dirty'; +var paftools_version = '2.14-r893-dirty'; /***************************** ***** Library functions ***** @@ -1197,6 +1197,105 @@ function paf_bedcov(args) warn("# target bases overlapping regions: " + hit_len + ' (' + (100.0 * hit_len / tot_len).toFixed(2) + '%)'); } +function paf_vcfpair(args) +{ + var c, is_male = false, sample = 'syndip', hgver = null; + var PAR = { '37':[[0, 2699520], [154931043, 155260560]] }; + while ((c = getopt(args, "ms:g:")) != null) { + if (c == 'm') is_male = true; + else if (c == 's') sample = getopt.arg; + else if (c == 'g') hgver = getopt.arg; + } + if (is_male && (hgver == null || PAR[hgver] == null)) + throw("for a male, -g must be specified to properly handle PARs on chrX"); + + if (getopt.ind == args.length) { + print("Usage: paftools.js vcfpair [options] "); + print("Options:"); + print(" -m the sample is male"); + print(" -g STR human genome version '37' []"); + print(" -s STR sample name [" + sample + "]"); + exit(1); + } + + var re_ctg = is_male? /^(chr)?([0-9]+|X|Y)$/ : /^(chr)?([0-9]+|X)$/; + var label = ['1', '2']; + var buf = new Bytes(); + var file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]); + while (file.readline(buf) >= 0) { + var m, line = buf.toString(); + if (line.charAt(0) == '#') { + if (/^##(source|reference)=/.test(line)) continue; + if ((m = /^##contig=.*ID=([^\s,]+)/.exec(line)) != null) { + if (!re_ctg.test(m[1])) continue; + } else if (/^#CHROM/.test(line)) { + var t = line.split("\t"); + --t.length; + t[t.length-1] = sample; + line = t.join("\t"); + print('##FILTER='); + print('##FILTER='); + print('##FILTER='); + print('##FILTER='); + } + print(line); + continue; + } + var t = line.split("\t"); + if (!re_ctg.test(t[0])) continue; + var GT = null, AD = null, FILTER = [], HT = [null, null]; + for (var i = 0; i < 2; ++i) { + if ((m = /^(\.|[0-9]+)\/(\.|[0-9]+):(\S+)/.exec(t[9+i])) == null) { + warn(line); + throw Error("malformatted VCF"); + } + var s = m[3].split(","); + if (AD == null) { + AD = []; + for (var j = 0; j < s.length; ++j) + AD[j] = 0; + } + for (var j = 0; j < s.length; ++j) + AD[j] += parseInt(s[j]); + if (m[1] == '.') { + FILTER.push('GAP' + label[i]); + HT[i] = '.'; + } else if (m[1] != m[2]) { + FILTER.push('HET' + label[i]); + HT[i] = '.'; + } else HT[i] = m[1]; + } + --t.length; + // test if this is in a haploid region + var hap = 0, st = parseInt(t[1]), en = st + t[3].length; + if (is_male) { + if (/^(chr)?X/.test(t[0])) { + if (hgver != null && PAR[hgver] != null) { + var r = PAR[hgver], in_par = false; + for (var i = 0; i < r.length; ++i) + if (r[i][0] <= st && en <= r[i][1]) + in_par = true; + hap = in_par? 0 : 2; + } + } else if (/^(chr)?Y/.test(t[0])) { + hap = 1; + } + } + // special treatment for haploid regions + if (hap > 0 && FILTER.length == 1) { + if ((hap == 2 && FILTER[0] == "GAP1") || (hap == 1 && FILTER[0] == "GAP2")) + FILTER.length = 0; + } + // update VCF + t[5] = 30; // fake QUAL + t[6] = FILTER.length? FILTER.join(";") : "."; + t[9] = HT.join("|") + ":" + AD.join(","); + print(t.join("\t")); + } + file.close(); + buf.destroy(); +} + /********************** * Conversion related * **********************/ @@ -2396,6 +2495,7 @@ function main(args) else if (cmd == 'asmstat') paf_asmstat(args); else if (cmd == 'asmgene') paf_asmgene(args); else if (cmd == 'liftover' || cmd == 'liftOver') paf_liftover(args); + else if (cmd == 'vcfpair') paf_vcfpair(args); else if (cmd == 'call') paf_call(args); else if (cmd == 'mapeval') paf_mapeval(args); else if (cmd == 'bedcov') paf_bedcov(args);