From d61a1226a8ac3ad1955954658893655b402360b7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 23 Oct 2014 14:52:48 -0400 Subject: [PATCH 1/9] highlight Liu et al. It seems good. --- README-alt.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README-alt.md b/README-alt.md index 61839a7..35d9f4c 100644 --- a/README-alt.md +++ b/README-alt.md @@ -156,8 +156,8 @@ have ignored these important genes. We recommend to include the genomic regions of classical HLA genes in the BWA index. This way we will be able to get a more complete collection of reads mapped to HLA. We can then isolate these reads with little computational cost -and type HLA genes with another program, such as [Dilthey et al (2014)][hla1] or -one from [this list][hlatools]. +and type HLA genes with another program, such as [Liu et al (2013)][hla2], +[Dilthey et al (2014)][hla1] or one from [this list][hlatools]. If the postprocessing script `bwa-postalt.js` is invoked with `-p prefix`, it will also write the top three alleles to file `prefix.hla`. However, as most HLA @@ -194,3 +194,4 @@ can even get rid of ALT contigs for good. [hla1]: http://biorxiv.org/content/early/2014/07/08/006973 [hlalink]: http://www.hladiseaseassociations.com [hlatools]: https://www.biostars.org/p/93245/ +[hla2]: http://nar.oxfordjournals.org/content/41/14/e142.full.pdf+html From 3a1da27c1cc1fae4517da93a767b77a00ef270ff Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 23 Oct 2014 15:08:18 -0400 Subject: [PATCH 2/9] added Bai et al. Seems good, too. --- README-alt.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/README-alt.md b/README-alt.md index 35d9f4c..772766e 100644 --- a/README-alt.md +++ b/README-alt.md @@ -156,14 +156,16 @@ have ignored these important genes. We recommend to include the genomic regions of classical HLA genes in the BWA index. This way we will be able to get a more complete collection of reads mapped to HLA. We can then isolate these reads with little computational cost -and type HLA genes with another program, such as [Liu et al (2013)][hla2], -[Dilthey et al (2014)][hla1] or one from [this list][hlatools]. +and type HLA genes with another program, such as [Liu et al (2013)][hla2], [Bai +et al (2014)][hla3], [Dilthey et al (2014)][hla1] or one from [this +list][hlatools]. If the postprocessing script `bwa-postalt.js` is invoked with `-p prefix`, it will also write the top three alleles to file `prefix.hla`. However, as most HLA alleles from IMGT/HLA don't have intronic sequences and thus are not included in -the reference genome, we are unable to type HLA genes to high resolution with -the BWA-MEM mapping alone. A dedicated tool is recommended for accurate typing. +the BWA index from option 2, we are unable to type HLA genes to high resolution +with the BWA-MEM mapping alone. A dedicated tool is recommended for accurate +typing. ### Evaluating ALT Mapping @@ -195,3 +197,4 @@ can even get rid of ALT contigs for good. [hlalink]: http://www.hladiseaseassociations.com [hlatools]: https://www.biostars.org/p/93245/ [hla2]: http://nar.oxfordjournals.org/content/41/14/e142.full.pdf+html +[hla3]: http://www.biomedcentral.com/1471-2164/15/325 From 7c362bef69eba18d8064d9ae9f9c7ddded45cc00 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 23 Oct 2014 15:15:20 -0400 Subject: [PATCH 3/9] added another paper --- README-alt.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README-alt.md b/README-alt.md index 772766e..1ee2e9f 100644 --- a/README-alt.md +++ b/README-alt.md @@ -156,9 +156,9 @@ have ignored these important genes. We recommend to include the genomic regions of classical HLA genes in the BWA index. This way we will be able to get a more complete collection of reads mapped to HLA. We can then isolate these reads with little computational cost -and type HLA genes with another program, such as [Liu et al (2013)][hla2], [Bai -et al (2014)][hla3], [Dilthey et al (2014)][hla1] or one from [this -list][hlatools]. +and type HLA genes with another program, such as [Warren et al (2012)][hla4], +[Liu et al (2013)][hla2], [Bai et al (2014)][hla3], [Dilthey et al (2014)][hla1] +or others from [this list][hlatools]. If the postprocessing script `bwa-postalt.js` is invoked with `-p prefix`, it will also write the top three alleles to file `prefix.hla`. However, as most HLA @@ -198,3 +198,4 @@ can even get rid of ALT contigs for good. [hlatools]: https://www.biostars.org/p/93245/ [hla2]: http://nar.oxfordjournals.org/content/41/14/e142.full.pdf+html [hla3]: http://www.biomedcentral.com/1471-2164/15/325 +[hla4]: http://genomemedicine.com/content/4/12/95 From 7a8019e6eeb5a2a842e9e505c26a4c1f7aa83e94 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 24 Oct 2014 14:51:05 -0400 Subject: [PATCH 4/9] removed debugging code --- bwa-postalt.js | 2 -- 1 file changed, 2 deletions(-) diff --git a/bwa-postalt.js b/bwa-postalt.js index 8c4218e..7292856 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -236,8 +236,6 @@ function type_hla(w) r.push([hla[i], k, hla[i]+'*'+max_x, max.toFixed(3), hla[i]+'*'+max_x2, max2.toFixed(3), hla[i]+'*'+max_x3, max3.toFixed(3)]); } } - for (var i = 0; i < r.length; ++i) - print(r[i].join("\t")); return r; } From ba3ab0ca98f99ee2a8461ff070be1e97853ae512 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 24 Oct 2014 14:52:16 -0400 Subject: [PATCH 5/9] removed hla typing - not good enough --- bwa-postalt.js | 42 ------------------------------------------ 1 file changed, 42 deletions(-) diff --git a/bwa-postalt.js b/bwa-postalt.js index 7292856..3a71cde 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -203,42 +203,6 @@ function parse_hit(s, opt) return h; } -function type_hla(w) -{ - var hla = ["A", "B", "C", "DQA1", "DQB1", "DRB1"]; - var hla_hash = {}, a = [], r = []; - for (var i = 0; i < hla.length; ++i) { - hla_hash[hla[i]] = i; - a[i] = []; - } - for (var i = 0; i < w.length; ++i) { - var t = w[i][0].split(/[:\*]/); - var x = hla_hash[t[0]]; - if (x != null) - a[x].push([w[i][0], t[1], t[1] + ':' + t[2], w[i][1]]); - } - for (var k = 1; k <= 2; ++k) { - for (var i = 0; i < hla.length; ++i) { - var ai = a[i], m = {}; - for (var j = 0; j < ai.length; ++j) { - var key = ai[j][k], val = ai[j][3]; - if (m[key] == null) m[key] = [-1, -1.0]; - if (m[key][1] < val) m[key] = [j, val]; - } - var sum = 0; - for (var x in m) sum += m[x][1]; - var max = -1, max2 = -1, max3 = -1, max_x, max_x2, max_x3; - for (var x in m) { - if (max < m[x][1]) max3 = max2, max_x3 = max_x2, max2 = max, max_x2 = max_x, max = m[x][1], max_x = x; - else if (max2 < m[x][1]) max3 = max2, max_x3 = max_x2, max2 = m[x][1], max_x2 = x; - else if (max3 < m[x][1]) max3 = m[x][1], max_x3 = x; - } - r.push([hla[i], k, hla[i]+'*'+max_x, max.toFixed(3), hla[i]+'*'+max_x2, max2.toFixed(3), hla[i]+'*'+max_x3, max3.toFixed(3)]); - } - } - return r; -} - function bwa_postalt(args) { var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, update_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:10, show_ev:false, min_pa_ratio:1 }; @@ -626,12 +590,6 @@ function bwa_postalt(args) fpout.write(weight_arr[i].join("\t") + '\n'); } fpout.close(); - - var r = type_hla(weight_hla); - fpout = new File(opt.pre + '.hla', "w"); - for (var i = 0; i < r.length; ++i) - fpout.write(r[i].join("\t") + '\n'); - fpout.close(); } } From 33938e1ed064d7be0fa97ad97ae19362e5056926 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 24 Oct 2014 16:54:48 -0400 Subject: [PATCH 6/9] save reads mapped to each HLA gene for downstream typing --- bwa-postalt.js | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/bwa-postalt.js b/bwa-postalt.js index 3a71cde..62930dd 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -203,6 +203,24 @@ function parse_hit(s, opt) return h; } +function print_buffer(buf2, fp_hla) +{ + var m, hla = {}; + if (buf2.length == 0) return; + for (var i = 0; i < buf2.length; ++i) { + print(buf2[i].join("\t")); + if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(buf2[i][2])) != null) + hla[m[1]] = true; + } + if (fp_hla != null) { + var name = buf2[0][0] + '/' + (buf2[0][1]>>6&3) + ((buf2[0][1]&16)? '-' : '+'); + for (var x in hla) { + if (fp_hla[x] != null); + fp_hla[x].write('@' + name + '\n' + buf2[0][9] + '\n+\n' + buf2[0][10] + '\n'); + } + } +} + function bwa_postalt(args) { var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, update_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:10, show_ev:false, min_pa_ratio:1 }; @@ -247,7 +265,7 @@ function bwa_postalt(args) var buf = new Bytes(); // read ALT-to-REF alignment - var intv_alt = {}, intv_pri = {}, idx_un = {}; + var intv_alt = {}, intv_pri = {}, idx_un = {}, hla_ctg = {}; var file = new File(args[getopt.ind]); while (file.readline(buf) >= 0) { var line = buf.toString(); @@ -261,6 +279,10 @@ function bwa_postalt(args) continue; } var m, cigar = [], l_qaln = 0, l_tlen = 0, l_qclip = 0; + if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(t[0])) != null) { // read HLA contigs + if (hla_ctg[m[1]] == null) hla_ctg[m[1]] = 0; + ++hla_ctg[m[1]]; + } while ((m = re_cigar.exec(t[5])) != null) { var l = parseInt(m[1]); cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip @@ -283,6 +305,14 @@ function bwa_postalt(args) for (var ctg in intv_pri) idx_pri[ctg] = intv_ovlp(intv_pri[ctg]); + // initialize the list of HLA contigs + var fp_hla = null; + if (opt.pre) { + fp_hla = {}; + for (var h in hla_ctg) + fp_hla[h] = new File(opt.pre + '.' + h + '.fq', "w"); + } + // initialize the list of ALT contigs var weight_alt = []; for (var ctg in idx_alt) @@ -305,8 +335,7 @@ function bwa_postalt(args) // print bufferred reads if (buf2.length && (buf2[0][0] != t[0] || (buf2[0][1]&0xc0) != (t[1]&0xc0))) { - for (var i = 0; i < buf2.length; ++i) - print(buf2[i].join("\t")); + print_buffer(buf2, fp_hla); buf2 = []; } @@ -562,10 +591,12 @@ function bwa_postalt(args) buf2.push(s); } } - for (var i = 0; i < buf2.length; ++i) - print(buf2[i].join("\t")); + print_buffer(buf2, fp_hla); file.close(); if (fp_evi != null) fp_evi.close(); + if (fp_hla != null) + for (var h in fp_hla) + fp_hla[h].close(); buf.destroy(); aux.destroy(); From 7327f8fa10f412c23eaa764444afd55bf521b992 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 25 Oct 2014 12:00:15 -0400 Subject: [PATCH 7/9] output all overlapping HLA hits A contig longer than HLA genes may not get hits to HLA and thus won't be used in typing. --- bwa-postalt.js | 54 ++++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/bwa-postalt.js b/bwa-postalt.js index 62930dd..60d718c 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -203,15 +203,9 @@ function parse_hit(s, opt) return h; } -function print_buffer(buf2, fp_hla) +function print_buffer(buf2, fp_hla, hla) { - var m, hla = {}; if (buf2.length == 0) return; - for (var i = 0; i < buf2.length; ++i) { - print(buf2[i].join("\t")); - if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(buf2[i][2])) != null) - hla[m[1]] = true; - } if (fp_hla != null) { var name = buf2[0][0] + '/' + (buf2[0][1]>>6&3) + ((buf2[0][1]&16)? '-' : '+'); for (var x in hla) { @@ -321,7 +315,7 @@ function bwa_postalt(args) weight_alt[ctg] = [0, 0, 0, 0, 0, 0, '~', 0, 0]; // process SAM - var buf2 = []; + var buf2 = [], hla = {}; file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File(); while (file.readline(buf) >= 0) { var m, line = buf.toString(); @@ -335,8 +329,8 @@ function bwa_postalt(args) // print bufferred reads if (buf2.length && (buf2[0][0] != t[0] || (buf2[0][1]&0xc0) != (t[1]&0xc0))) { - print_buffer(buf2, fp_hla); - buf2 = []; + print_buffer(buf2, fp_hla, hla); + buf2 = [], hla = {}; } // skip unmapped lines @@ -459,27 +453,35 @@ function bwa_postalt(args) else mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ; } else mapQ = t[4]; + var pri_ofunc = idx_pri[hits[reported_i].pctg], ovlp_alt = []; + if (pri_ofunc != null) { + var rpt_start = 1<<30, rpt_end = 0; + for (var i = 0; i < hits.length; ++i) { + var h = hits[i]; + if (h.g == reported_g) { + rpt_start = rpt_start < h.pstart? rpt_start : h.pstart; + rpt_end = rpt_end > h.pend ? rpt_end : h.pend; + } + } + ovlp_alt = pri_ofunc(rpt_start, rpt_end); + for (var i = 0; i < ovlp_alt.length; ++i) + if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(ovlp_alt[i][2])) != null) + hla[m[1]] = true; + } + // ALT genotyping if (mapQ >= opt.min_mapq && hits[reported_i].score >= opt.min_sc) { // collect all overlapping ALT contigs - var hits2 = []; + var alts = {}; for (var i = 0; i < hits.length; ++i) { var h = hits[i]; - if (h.g == reported_g) - hits2.push([h.pctg, h.pstart, h.pend, h.ctg, h.score, h.NM]); + if (h.g == reported_g && weight_alt[h.ctg] != null) + alts[h.ctg] = [h.score, h.NM]; } - var start = hits2[0][1], end = hits2[0][2]; - for (var i = 1; i < hits2.length; ++i) - end = end > hits2[i][2]? end : hits2[i][2]; - var alts = {}; - for (var i = 0; i < hits2.length; ++i) - if (weight_alt[hits2[i][3]] != null) - alts[hits2[i][3]] = [hits2[i][4], hits2[i][5]]; - if (idx_pri[hits2[0][0]] != null) { // add other unreported hits - var ovlp = idx_pri[hits2[0][0]](start, end); - for (var i = 0; i < ovlp.length; ++i) - if (ovlp[i][0] <= start && end <= ovlp[i][1] && alts[ovlp[i][2]] == null) - alts[ovlp[i][2]] = [0, 0]; + if (ovlp_alt.length > 0) { // add other unreported hits + for (var i = 0; i < ovlp_alt.length; ++i) + if (ovlp_alt[i][0] <= rpt_start && rpt_end <= ovlp_alt[i][1]) + alts[ovlp_alt[i][2]] = [0, 0]; } // add weight to each ALT contig @@ -591,7 +593,7 @@ function bwa_postalt(args) buf2.push(s); } } - print_buffer(buf2, fp_hla); + print_buffer(buf2, fp_hla, hla); file.close(); if (fp_evi != null) fp_evi.close(); if (fp_hla != null) From 2bfcc421d92178e4440631bcd0e9f878bc545f07 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 25 Oct 2014 22:13:31 -0400 Subject: [PATCH 8/9] bugfix: alignments not outputted --- bwa-postalt.js | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bwa-postalt.js b/bwa-postalt.js index 60d718c..1e8bd06 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -206,6 +206,8 @@ function parse_hit(s, opt) function print_buffer(buf2, fp_hla, hla) { if (buf2.length == 0) return; + for (var i = 0; i < buf2.length; ++i) + print(buf2[i].join("\t")); if (fp_hla != null) { var name = buf2[0][0] + '/' + (buf2[0][1]>>6&3) + ((buf2[0][1]&16)? '-' : '+'); for (var x in hla) { From 0f8a164bd39824a3cd25a8521e54b3d7b60abada Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 25 Oct 2014 23:08:29 -0400 Subject: [PATCH 9/9] bugfix: .ctw not generated properly --- bwa-postalt.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwa-postalt.js b/bwa-postalt.js index 1e8bd06..b538552 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -482,7 +482,7 @@ function bwa_postalt(args) } if (ovlp_alt.length > 0) { // add other unreported hits for (var i = 0; i < ovlp_alt.length; ++i) - if (ovlp_alt[i][0] <= rpt_start && rpt_end <= ovlp_alt[i][1]) + if (ovlp_alt[i][0] <= rpt_start && rpt_end <= ovlp_alt[i][1] && alts[ovlp_alt[i][2]] == null) alts[ovlp_alt[i][2]] = [0, 0]; }