Merge branch 'dev'

This commit is contained in:
Heng Li 2014-10-25 23:10:25 -04:00
commit dbe74ca9b8
2 changed files with 61 additions and 65 deletions

View File

@ -156,14 +156,16 @@ have ignored these important genes.
We recommend to include the genomic regions of classical HLA genes in the BWA
index. This way we will be able to get a more complete collection of reads
mapped to HLA. We can then isolate these reads with little computational cost
and type HLA genes with another program, such as [Dilthey et al (2014)][hla1] or
one from [this list][hlatools].
and type HLA genes with another program, such as [Warren et al (2012)][hla4],
[Liu et al (2013)][hla2], [Bai et al (2014)][hla3], [Dilthey et al (2014)][hla1]
or others from [this list][hlatools].
If the postprocessing script `bwa-postalt.js` is invoked with `-p prefix`, it
will also write the top three alleles to file `prefix.hla`. However, as most HLA
alleles from IMGT/HLA don't have intronic sequences and thus are not included in
the reference genome, we are unable to type HLA genes to high resolution with
the BWA-MEM mapping alone. A dedicated tool is recommended for accurate typing.
the BWA index from option 2, we are unable to type HLA genes to high resolution
with the BWA-MEM mapping alone. A dedicated tool is recommended for accurate
typing.
### Evaluating ALT Mapping
@ -194,3 +196,6 @@ can even get rid of ALT contigs for good.
[hla1]: http://biorxiv.org/content/early/2014/07/08/006973
[hlalink]: http://www.hladiseaseassociations.com
[hlatools]: https://www.biostars.org/p/93245/
[hla2]: http://nar.oxfordjournals.org/content/41/14/e142.full.pdf+html
[hla3]: http://www.biomedcentral.com/1471-2164/15/325
[hla4]: http://genomemedicine.com/content/4/12/95

View File

@ -203,42 +203,18 @@ function parse_hit(s, opt)
return h;
}
function type_hla(w)
function print_buffer(buf2, fp_hla, hla)
{
var hla = ["A", "B", "C", "DQA1", "DQB1", "DRB1"];
var hla_hash = {}, a = [], r = [];
for (var i = 0; i < hla.length; ++i) {
hla_hash[hla[i]] = i;
a[i] = [];
}
for (var i = 0; i < w.length; ++i) {
var t = w[i][0].split(/[:\*]/);
var x = hla_hash[t[0]];
if (x != null)
a[x].push([w[i][0], t[1], t[1] + ':' + t[2], w[i][1]]);
}
for (var k = 1; k <= 2; ++k) {
for (var i = 0; i < hla.length; ++i) {
var ai = a[i], m = {};
for (var j = 0; j < ai.length; ++j) {
var key = ai[j][k], val = ai[j][3];
if (m[key] == null) m[key] = [-1, -1.0];
if (m[key][1] < val) m[key] = [j, val];
}
var sum = 0;
for (var x in m) sum += m[x][1];
var max = -1, max2 = -1, max3 = -1, max_x, max_x2, max_x3;
for (var x in m) {
if (max < m[x][1]) max3 = max2, max_x3 = max_x2, max2 = max, max_x2 = max_x, max = m[x][1], max_x = x;
else if (max2 < m[x][1]) max3 = max2, max_x3 = max_x2, max2 = m[x][1], max_x2 = x;
else if (max3 < m[x][1]) max3 = m[x][1], max_x3 = x;
}
r.push([hla[i], k, hla[i]+'*'+max_x, max.toFixed(3), hla[i]+'*'+max_x2, max2.toFixed(3), hla[i]+'*'+max_x3, max3.toFixed(3)]);
if (buf2.length == 0) return;
for (var i = 0; i < buf2.length; ++i)
print(buf2[i].join("\t"));
if (fp_hla != null) {
var name = buf2[0][0] + '/' + (buf2[0][1]>>6&3) + ((buf2[0][1]&16)? '-' : '+');
for (var x in hla) {
if (fp_hla[x] != null);
fp_hla[x].write('@' + name + '\n' + buf2[0][9] + '\n+\n' + buf2[0][10] + '\n');
}
}
for (var i = 0; i < r.length; ++i)
print(r[i].join("\t"));
return r;
}
function bwa_postalt(args)
@ -285,7 +261,7 @@ function bwa_postalt(args)
var buf = new Bytes();
// read ALT-to-REF alignment
var intv_alt = {}, intv_pri = {}, idx_un = {};
var intv_alt = {}, intv_pri = {}, idx_un = {}, hla_ctg = {};
var file = new File(args[getopt.ind]);
while (file.readline(buf) >= 0) {
var line = buf.toString();
@ -299,6 +275,10 @@ function bwa_postalt(args)
continue;
}
var m, cigar = [], l_qaln = 0, l_tlen = 0, l_qclip = 0;
if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(t[0])) != null) { // read HLA contigs
if (hla_ctg[m[1]] == null) hla_ctg[m[1]] = 0;
++hla_ctg[m[1]];
}
while ((m = re_cigar.exec(t[5])) != null) {
var l = parseInt(m[1]);
cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip
@ -321,6 +301,14 @@ function bwa_postalt(args)
for (var ctg in intv_pri)
idx_pri[ctg] = intv_ovlp(intv_pri[ctg]);
// initialize the list of HLA contigs
var fp_hla = null;
if (opt.pre) {
fp_hla = {};
for (var h in hla_ctg)
fp_hla[h] = new File(opt.pre + '.' + h + '.fq', "w");
}
// initialize the list of ALT contigs
var weight_alt = [];
for (var ctg in idx_alt)
@ -329,7 +317,7 @@ function bwa_postalt(args)
weight_alt[ctg] = [0, 0, 0, 0, 0, 0, '~', 0, 0];
// process SAM
var buf2 = [];
var buf2 = [], hla = {};
file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File();
while (file.readline(buf) >= 0) {
var m, line = buf.toString();
@ -343,9 +331,8 @@ function bwa_postalt(args)
// print bufferred reads
if (buf2.length && (buf2[0][0] != t[0] || (buf2[0][1]&0xc0) != (t[1]&0xc0))) {
for (var i = 0; i < buf2.length; ++i)
print(buf2[i].join("\t"));
buf2 = [];
print_buffer(buf2, fp_hla, hla);
buf2 = [], hla = {};
}
// skip unmapped lines
@ -468,27 +455,35 @@ function bwa_postalt(args)
else mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ;
} else mapQ = t[4];
var pri_ofunc = idx_pri[hits[reported_i].pctg], ovlp_alt = [];
if (pri_ofunc != null) {
var rpt_start = 1<<30, rpt_end = 0;
for (var i = 0; i < hits.length; ++i) {
var h = hits[i];
if (h.g == reported_g) {
rpt_start = rpt_start < h.pstart? rpt_start : h.pstart;
rpt_end = rpt_end > h.pend ? rpt_end : h.pend;
}
}
ovlp_alt = pri_ofunc(rpt_start, rpt_end);
for (var i = 0; i < ovlp_alt.length; ++i)
if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(ovlp_alt[i][2])) != null)
hla[m[1]] = true;
}
// ALT genotyping
if (mapQ >= opt.min_mapq && hits[reported_i].score >= opt.min_sc) {
// collect all overlapping ALT contigs
var hits2 = [];
var alts = {};
for (var i = 0; i < hits.length; ++i) {
var h = hits[i];
if (h.g == reported_g)
hits2.push([h.pctg, h.pstart, h.pend, h.ctg, h.score, h.NM]);
if (h.g == reported_g && weight_alt[h.ctg] != null)
alts[h.ctg] = [h.score, h.NM];
}
var start = hits2[0][1], end = hits2[0][2];
for (var i = 1; i < hits2.length; ++i)
end = end > hits2[i][2]? end : hits2[i][2];
var alts = {};
for (var i = 0; i < hits2.length; ++i)
if (weight_alt[hits2[i][3]] != null)
alts[hits2[i][3]] = [hits2[i][4], hits2[i][5]];
if (idx_pri[hits2[0][0]] != null) { // add other unreported hits
var ovlp = idx_pri[hits2[0][0]](start, end);
for (var i = 0; i < ovlp.length; ++i)
if (ovlp[i][0] <= start && end <= ovlp[i][1] && alts[ovlp[i][2]] == null)
alts[ovlp[i][2]] = [0, 0];
if (ovlp_alt.length > 0) { // add other unreported hits
for (var i = 0; i < ovlp_alt.length; ++i)
if (ovlp_alt[i][0] <= rpt_start && rpt_end <= ovlp_alt[i][1] && alts[ovlp_alt[i][2]] == null)
alts[ovlp_alt[i][2]] = [0, 0];
}
// add weight to each ALT contig
@ -600,10 +595,12 @@ function bwa_postalt(args)
buf2.push(s);
}
}
for (var i = 0; i < buf2.length; ++i)
print(buf2[i].join("\t"));
print_buffer(buf2, fp_hla, hla);
file.close();
if (fp_evi != null) fp_evi.close();
if (fp_hla != null)
for (var h in fp_hla)
fp_hla[h].close();
buf.destroy();
aux.destroy();
@ -628,12 +625,6 @@ function bwa_postalt(args)
fpout.write(weight_arr[i].join("\t") + '\n');
}
fpout.close();
var r = type_hla(weight_hla);
fpout = new File(opt.pre + '.hla', "w");
for (var i = 0; i < r.length; ++i)
fpout.write(r[i].join("\t") + '\n');
fpout.close();
}
}