ALT genotyping semi-working, but ...

... the result is not good.
This commit is contained in:
Heng Li 2014-10-05 22:18:07 -04:00
parent 2c3619d569
commit 2d7ec6b051
1 changed files with 61 additions and 10 deletions

View File

@ -205,7 +205,7 @@ function parse_hit(s, opt)
function bwa_postalt(args)
{
var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true };
var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true, min_mapq:10, min_sc:90 };
while ((c = getopt(args, 'pqv:')) != null) {
if (c == 'v') opt.verbose = parseInt(getopt.arg);
@ -237,8 +237,9 @@ function bwa_postalt(args)
var file = new File(args[getopt.ind]);
while (file.readline(buf) >= 0) {
var line = buf.toString();
var t = line.split("\t");
if (line.charAt(0) == '@') continue;
var t = line.split("\t");
if (t.length < 11) continue; // incomplete lines
var pos = parseInt(t[3]) - 1;
var flag = parseInt(t[1]);
if ((flag&4) || t[2] == '*') {
@ -249,14 +250,15 @@ function bwa_postalt(args)
while ((m = re_cigar.exec(t[5])) != null) {
var l = parseInt(m[1]);
cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip
if (m[2] == 'M' || m[2] == 'I') l_qaln += l, l_tlen += l;
if (m[2] == 'M') l_qaln += l, l_tlen += l;
else if (m[2] == 'I') l_qaln += l;
else if (m[2] == 'S' || m[2] == 'H') l_qclip += l;
else if (m[2] == 'D' || m[2] == 'N') l_tlen += l;
}
var j = flag&16? cigar.length-1 : 0;
var start = cigar[j][0] == 'S'? cigar[j][1] : 0;
if (intv_alt[t[0]] == null) intv_alt[t[0]] = [];
intv_alt[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, parseInt(t[3]) - 1, cigar]);
intv_alt[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, pos - 1, cigar, pos + l_tlen]);
if (intv_pri[t[2]] == null) intv_pri[t[2]] = [];
intv_pri[t[2]].push([pos, pos + l_tlen, t[0]]);
}
@ -267,6 +269,13 @@ function bwa_postalt(args)
for (var ctg in intv_pri)
idx_pri[ctg] = intv_ovlp(intv_pri[ctg]);
// initialize the list of ALT contigs
var weight_alt = [];
for (var ctg in idx_alt)
weight_alt[ctg] = [0, 0, 0, intv_alt[ctg][0][3], intv_alt[ctg][0][5], intv_alt[ctg][0][7]];
for (var ctg in idx_un)
weight_alt[ctg] = [0, 0, 0, '~', 0, 0];
// process SAM
var buf2 = [];
file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File();
@ -314,7 +323,7 @@ function bwa_postalt(args)
// check if there are ALT hits
var has_alt = false;
for (var i = 0; i < hits.length; ++i)
if (idx_alt[hits[i].ctg] != null || idx_un[hits[i].ctg]) {
if (weight_alt[hits[i].ctg] != null) {
has_alt = true;
break;
}
@ -380,7 +389,13 @@ function bwa_postalt(args)
for (var i = 0; i < hits.length; ++i)
if (hits[i].g == reported_g)
++n_group0;
} else reported_g = reported_i = 0, n_group0 = 1;
} else {
if (weight_alt[hits[0].ctg] == null) { // no need to go through the following if the single hit is non-ALT
buf2.push(t);
continue;
}
reported_g = reported_i = 0, n_group0 = 1;
}
// re-estimate mapping quality if necessary
var mapQ, ori_mapQ = t[4];
@ -402,7 +417,8 @@ function bwa_postalt(args)
} else mapQ = t[4];
// ALT genotyping
{
if (mapQ >= opt.min_mapq && hits[reported_i].score >= opt.min_sc) {
// collect all overlapping ALT contigs
var hits2 = [];
for (var i = 0; i < hits.length; ++i) {
var h = hits[i];
@ -414,14 +430,36 @@ function bwa_postalt(args)
end = end > hits2[i][2]? end : hits2[i][2];
var alts = {};
for (var i = 0; i < hits2.length; ++i)
if (idx_alt[hits2[i][3]] != null || idx_un[hits2[i][3]] != null)
if (weight_alt[hits2[i][3]] != null)
alts[hits2[i][3]] = hits2[i][4];
if (idx_pri[hits2[0][0]] != null) {
var ovlp = idx_pri[hits2[0][0]](start, end);
for (var i = 0; i < ovlp.length; ++i)
for (var i = 0; i < ovlp.length; ++i) // TODO: requiring reasonable overlap
if (alts[ovlp[i][2]] == null) alts[ovlp[i][2]] = 0;
}
print(obj2str(alts));
// add weight to each ALT contig
var alt_arr = [], max_sc = -1, max_i = -1, sum = 0, min_sc = 1<<30;
for (var ctg in alts)
alt_arr.push([ctg, alts[ctg], 0]);
for (var i = 0; i < alt_arr.length; ++i) {
if (max_sc < alt_arr[i][1])
max_sc = alt_arr[i][1], max_i = i;
min_sc = min_sc < alt_arr[i][1]? min_sc : alt_arr[i][1];
}
if (max_sc > 0 && (alt_arr.length == 1 || min_sc < max_sc)) {
for (var i = 0; i < alt_arr.length; ++i) {
alt_arr[i][2] = Math.exp(.1 * (alt_arr[i][1] - max_sc));
//print('YYYYY', alt_arr[i][1], max_sc);
sum += alt_arr[i][2];
}
for (var i = 0; i < alt_arr.length; ++i) alt_arr[i][2] /= sum;
for (var i = 0; i < alt_arr.length; ++i) {
var w = weight_alt[alt_arr[i][0]];
w[0] += max_sc, w[1] += max_sc * alt_arr[max_i][2], w[2] += max_sc * alt_arr[i][2];
//print('XXXXX', alt_arr[i][0], max_sc, max_sc * alt_arr[max_i][2], max_sc * alt_arr[i][2]);
}
}
}
// check if the reported hit overlaps a hit to the primary assembly; if so, don't reduce mapping quality
@ -502,6 +540,19 @@ function bwa_postalt(args)
buf.destroy();
aux.destroy();
// print weight of each contig
var weight_arr = [];
for (var ctg in weight_alt) {
var w = weight_alt[ctg];
w[0] = w[0].toFixed(0), w[1] = w[1].toFixed(0), w[2] = w[2].toFixed(0);
weight_arr.push([ctg, w[0], w[1], w[2], w[1] > 0? (w[2]/w[1]).toFixed(3) : '0.000', w[3], w[4], w[5]]);
}
weight_arr.sort(function(a,b) {return a[5] < b[5]? -1 : a[5] > b[5]? 1 : a[6] - b[6]});
for (var i = 0; i < weight_arr.length; ++i) {
if (weight_arr[i][5] == '~') weight_arr[i][5] = '*';
warn(weight_arr[i].join("\t"));
}
}
bwa_postalt(arguments);