Merge branch 'dev'

This commit is contained in:
Heng Li 2014-10-25 23:10:25 -04:00
commit dbe74ca9b8
2 changed files with 61 additions and 65 deletions

View File

@ -156,14 +156,16 @@ have ignored these important genes.
We recommend to include the genomic regions of classical HLA genes in the BWA We recommend to include the genomic regions of classical HLA genes in the BWA
index. This way we will be able to get a more complete collection of reads index. This way we will be able to get a more complete collection of reads
mapped to HLA. We can then isolate these reads with little computational cost mapped to HLA. We can then isolate these reads with little computational cost
and type HLA genes with another program, such as [Dilthey et al (2014)][hla1] or and type HLA genes with another program, such as [Warren et al (2012)][hla4],
one from [this list][hlatools]. [Liu et al (2013)][hla2], [Bai et al (2014)][hla3], [Dilthey et al (2014)][hla1]
or others from [this list][hlatools].
If the postprocessing script `bwa-postalt.js` is invoked with `-p prefix`, it If the postprocessing script `bwa-postalt.js` is invoked with `-p prefix`, it
will also write the top three alleles to file `prefix.hla`. However, as most HLA will also write the top three alleles to file `prefix.hla`. However, as most HLA
alleles from IMGT/HLA don't have intronic sequences and thus are not included in alleles from IMGT/HLA don't have intronic sequences and thus are not included in
the reference genome, we are unable to type HLA genes to high resolution with the BWA index from option 2, we are unable to type HLA genes to high resolution
the BWA-MEM mapping alone. A dedicated tool is recommended for accurate typing. with the BWA-MEM mapping alone. A dedicated tool is recommended for accurate
typing.
### Evaluating ALT Mapping ### Evaluating ALT Mapping
@ -194,3 +196,6 @@ can even get rid of ALT contigs for good.
[hla1]: http://biorxiv.org/content/early/2014/07/08/006973 [hla1]: http://biorxiv.org/content/early/2014/07/08/006973
[hlalink]: http://www.hladiseaseassociations.com [hlalink]: http://www.hladiseaseassociations.com
[hlatools]: https://www.biostars.org/p/93245/ [hlatools]: https://www.biostars.org/p/93245/
[hla2]: http://nar.oxfordjournals.org/content/41/14/e142.full.pdf+html
[hla3]: http://www.biomedcentral.com/1471-2164/15/325
[hla4]: http://genomemedicine.com/content/4/12/95

View File

@ -203,42 +203,18 @@ function parse_hit(s, opt)
return h; return h;
} }
function type_hla(w) function print_buffer(buf2, fp_hla, hla)
{ {
var hla = ["A", "B", "C", "DQA1", "DQB1", "DRB1"]; if (buf2.length == 0) return;
var hla_hash = {}, a = [], r = []; for (var i = 0; i < buf2.length; ++i)
for (var i = 0; i < hla.length; ++i) { print(buf2[i].join("\t"));
hla_hash[hla[i]] = i; if (fp_hla != null) {
a[i] = []; var name = buf2[0][0] + '/' + (buf2[0][1]>>6&3) + ((buf2[0][1]&16)? '-' : '+');
} for (var x in hla) {
for (var i = 0; i < w.length; ++i) { if (fp_hla[x] != null);
var t = w[i][0].split(/[:\*]/); fp_hla[x].write('@' + name + '\n' + buf2[0][9] + '\n+\n' + buf2[0][10] + '\n');
var x = hla_hash[t[0]];
if (x != null)
a[x].push([w[i][0], t[1], t[1] + ':' + t[2], w[i][1]]);
}
for (var k = 1; k <= 2; ++k) {
for (var i = 0; i < hla.length; ++i) {
var ai = a[i], m = {};
for (var j = 0; j < ai.length; ++j) {
var key = ai[j][k], val = ai[j][3];
if (m[key] == null) m[key] = [-1, -1.0];
if (m[key][1] < val) m[key] = [j, val];
}
var sum = 0;
for (var x in m) sum += m[x][1];
var max = -1, max2 = -1, max3 = -1, max_x, max_x2, max_x3;
for (var x in m) {
if (max < m[x][1]) max3 = max2, max_x3 = max_x2, max2 = max, max_x2 = max_x, max = m[x][1], max_x = x;
else if (max2 < m[x][1]) max3 = max2, max_x3 = max_x2, max2 = m[x][1], max_x2 = x;
else if (max3 < m[x][1]) max3 = m[x][1], max_x3 = x;
}
r.push([hla[i], k, hla[i]+'*'+max_x, max.toFixed(3), hla[i]+'*'+max_x2, max2.toFixed(3), hla[i]+'*'+max_x3, max3.toFixed(3)]);
} }
} }
for (var i = 0; i < r.length; ++i)
print(r[i].join("\t"));
return r;
} }
function bwa_postalt(args) function bwa_postalt(args)
@ -285,7 +261,7 @@ function bwa_postalt(args)
var buf = new Bytes(); var buf = new Bytes();
// read ALT-to-REF alignment // read ALT-to-REF alignment
var intv_alt = {}, intv_pri = {}, idx_un = {}; var intv_alt = {}, intv_pri = {}, idx_un = {}, hla_ctg = {};
var file = new File(args[getopt.ind]); var file = new File(args[getopt.ind]);
while (file.readline(buf) >= 0) { while (file.readline(buf) >= 0) {
var line = buf.toString(); var line = buf.toString();
@ -299,6 +275,10 @@ function bwa_postalt(args)
continue; continue;
} }
var m, cigar = [], l_qaln = 0, l_tlen = 0, l_qclip = 0; var m, cigar = [], l_qaln = 0, l_tlen = 0, l_qclip = 0;
if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(t[0])) != null) { // read HLA contigs
if (hla_ctg[m[1]] == null) hla_ctg[m[1]] = 0;
++hla_ctg[m[1]];
}
while ((m = re_cigar.exec(t[5])) != null) { while ((m = re_cigar.exec(t[5])) != null) {
var l = parseInt(m[1]); var l = parseInt(m[1]);
cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip
@ -321,6 +301,14 @@ function bwa_postalt(args)
for (var ctg in intv_pri) for (var ctg in intv_pri)
idx_pri[ctg] = intv_ovlp(intv_pri[ctg]); idx_pri[ctg] = intv_ovlp(intv_pri[ctg]);
// initialize the list of HLA contigs
var fp_hla = null;
if (opt.pre) {
fp_hla = {};
for (var h in hla_ctg)
fp_hla[h] = new File(opt.pre + '.' + h + '.fq', "w");
}
// initialize the list of ALT contigs // initialize the list of ALT contigs
var weight_alt = []; var weight_alt = [];
for (var ctg in idx_alt) for (var ctg in idx_alt)
@ -329,7 +317,7 @@ function bwa_postalt(args)
weight_alt[ctg] = [0, 0, 0, 0, 0, 0, '~', 0, 0]; weight_alt[ctg] = [0, 0, 0, 0, 0, 0, '~', 0, 0];
// process SAM // process SAM
var buf2 = []; var buf2 = [], hla = {};
file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File(); file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File();
while (file.readline(buf) >= 0) { while (file.readline(buf) >= 0) {
var m, line = buf.toString(); var m, line = buf.toString();
@ -343,9 +331,8 @@ function bwa_postalt(args)
// print bufferred reads // print bufferred reads
if (buf2.length && (buf2[0][0] != t[0] || (buf2[0][1]&0xc0) != (t[1]&0xc0))) { if (buf2.length && (buf2[0][0] != t[0] || (buf2[0][1]&0xc0) != (t[1]&0xc0))) {
for (var i = 0; i < buf2.length; ++i) print_buffer(buf2, fp_hla, hla);
print(buf2[i].join("\t")); buf2 = [], hla = {};
buf2 = [];
} }
// skip unmapped lines // skip unmapped lines
@ -468,27 +455,35 @@ function bwa_postalt(args)
else mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ; else mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ;
} else mapQ = t[4]; } else mapQ = t[4];
var pri_ofunc = idx_pri[hits[reported_i].pctg], ovlp_alt = [];
if (pri_ofunc != null) {
var rpt_start = 1<<30, rpt_end = 0;
for (var i = 0; i < hits.length; ++i) {
var h = hits[i];
if (h.g == reported_g) {
rpt_start = rpt_start < h.pstart? rpt_start : h.pstart;
rpt_end = rpt_end > h.pend ? rpt_end : h.pend;
}
}
ovlp_alt = pri_ofunc(rpt_start, rpt_end);
for (var i = 0; i < ovlp_alt.length; ++i)
if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(ovlp_alt[i][2])) != null)
hla[m[1]] = true;
}
// ALT genotyping // ALT genotyping
if (mapQ >= opt.min_mapq && hits[reported_i].score >= opt.min_sc) { if (mapQ >= opt.min_mapq && hits[reported_i].score >= opt.min_sc) {
// collect all overlapping ALT contigs // collect all overlapping ALT contigs
var hits2 = []; var alts = {};
for (var i = 0; i < hits.length; ++i) { for (var i = 0; i < hits.length; ++i) {
var h = hits[i]; var h = hits[i];
if (h.g == reported_g) if (h.g == reported_g && weight_alt[h.ctg] != null)
hits2.push([h.pctg, h.pstart, h.pend, h.ctg, h.score, h.NM]); alts[h.ctg] = [h.score, h.NM];
} }
var start = hits2[0][1], end = hits2[0][2]; if (ovlp_alt.length > 0) { // add other unreported hits
for (var i = 1; i < hits2.length; ++i) for (var i = 0; i < ovlp_alt.length; ++i)
end = end > hits2[i][2]? end : hits2[i][2]; if (ovlp_alt[i][0] <= rpt_start && rpt_end <= ovlp_alt[i][1] && alts[ovlp_alt[i][2]] == null)
var alts = {}; alts[ovlp_alt[i][2]] = [0, 0];
for (var i = 0; i < hits2.length; ++i)
if (weight_alt[hits2[i][3]] != null)
alts[hits2[i][3]] = [hits2[i][4], hits2[i][5]];
if (idx_pri[hits2[0][0]] != null) { // add other unreported hits
var ovlp = idx_pri[hits2[0][0]](start, end);
for (var i = 0; i < ovlp.length; ++i)
if (ovlp[i][0] <= start && end <= ovlp[i][1] && alts[ovlp[i][2]] == null)
alts[ovlp[i][2]] = [0, 0];
} }
// add weight to each ALT contig // add weight to each ALT contig
@ -600,10 +595,12 @@ function bwa_postalt(args)
buf2.push(s); buf2.push(s);
} }
} }
for (var i = 0; i < buf2.length; ++i) print_buffer(buf2, fp_hla, hla);
print(buf2[i].join("\t"));
file.close(); file.close();
if (fp_evi != null) fp_evi.close(); if (fp_evi != null) fp_evi.close();
if (fp_hla != null)
for (var h in fp_hla)
fp_hla[h].close();
buf.destroy(); buf.destroy();
aux.destroy(); aux.destroy();
@ -628,12 +625,6 @@ function bwa_postalt(args)
fpout.write(weight_arr[i].join("\t") + '\n'); fpout.write(weight_arr[i].join("\t") + '\n');
} }
fpout.close(); fpout.close();
var r = type_hla(weight_hla);
fpout = new File(opt.pre + '.hla', "w");
for (var i = 0; i < r.length; ++i)
fpout.write(r[i].join("\t") + '\n');
fpout.close();
} }
} }