From a03d01f944d2fd0ec5d967cac41065bd84db11d3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 30 Sep 2014 13:50:51 -0400 Subject: [PATCH 01/31] r878: XA is given to the best alignment Non-ALT hits may get ALT hits in the XA tag. This will simplify haplotype assignment. --- bwamem.c | 22 +++++++++++----------- bwamem.h | 2 +- bwamem_extra.c | 32 ++++++++++++++++---------------- main.c | 2 +- 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/bwamem.c b/bwamem.c index 6bbbc6e..7c54b25 100644 --- a/bwamem.c +++ b/bwamem.c @@ -512,38 +512,38 @@ static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t * int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id) { - int i, j, n_pri; + int i, n_pri; int_v z = {0,0,0}; if (n == 0) return 0; for (i = n_pri = 0; i < n; ++i) { - a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_alt = -1, a[i].hash = hash_64(id+i); + a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_all = -1, a[i].hash = hash_64(id+i); if (!a[i].is_alt) ++n_pri; } ks_introsort(mem_ars_hash, n, a); mem_mark_primary_se_core(opt, n, a, &z); for (i = 0; i < n; ++i) { mem_alnreg_t *p = &a[i]; - p->secondary_alt = i; // keep the rank in the first round + p->secondary_all = i; // keep the rank in the first round if (!p->is_alt && p->secondary >= 0 && a[p->secondary].is_alt) p->alt_sc = a[p->secondary].score; } if (n_pri >= 0 && n_pri < n) { kv_resize(int, z, n); if (n_pri > 0) ks_introsort(mem_ars_hash2, n, a); - for (i = 0; i < n; ++i) z.a[a[i].secondary_alt] = i; + for (i = 0; i < n; ++i) z.a[a[i].secondary_all] = i; for (i = 0; i < n; ++i) { - if (a[i].secondary < 0) { - a[i].secondary_alt = -1; - continue; - } - j = z.a[a[i].secondary]; - a[i].secondary_alt = a[j].is_alt? j : -1; - if (a[i].is_alt) a[i].secondary = INT_MAX; + if (a[i].secondary >= 0) { + a[i].secondary_all = z.a[a[i].secondary]; + if (a[i].is_alt) a[i].secondary = INT_MAX; + } else a[i].secondary_all = -1; } if (n_pri > 0) { // mark primary for hits to the primary assembly only for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1; mem_mark_primary_se_core(opt, n_pri, a, &z); } + } else { + for (i = 0; i < n; ++i) + a[i].secondary_all = a[i].secondary; } free(z.a); return n_pri; diff --git a/bwamem.h b/bwamem.h index 6079e5a..ae53601 100644 --- a/bwamem.h +++ b/bwamem.h @@ -69,7 +69,7 @@ typedef struct { int w; // actual band width used in extension int seedcov; // length of regions coverged by seeds int secondary; // index of the parent hit shadowing the current hit; <0 if primary - int secondary_alt; + int secondary_all; int seedlen0; // length of the starting seed int n_comp:30, is_alt:2; // number of sub-alignments chained together float frac_rep; diff --git a/bwamem_extra.c b/bwamem_extra.c index eea5ac7..09faaa2 100644 --- a/bwamem_extra.c +++ b/bwamem_extra.c @@ -112,34 +112,35 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, s->sam = str.s; } -static inline void get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i, int r[2]) +static inline int get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i) { - int j = a[i].secondary, k = a[i].secondary_alt; - r[0] = r[1] = -1; - if (j >= 0 && j < INT_MAX && !a[j].is_alt && !a[i].is_alt && a[i].score >= a[j].score * XA_drop_ratio) r[0] = j; - if (k >= 0 && a[k].is_alt && a[i].score >= a[k].score * XA_drop_ratio) r[1] = k; + int k = a[i].secondary_all; + if (k >= 0 && a[i].score >= a[k].score * XA_drop_ratio) return k; + return -1; } // Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup. char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se() { - int i, k, *cnt, tot, r[2]; + int i, k, r, *cnt, tot; kstring_t *aln = 0, str = {0,0,0}; - char **XA = 0; + char **XA = 0, *has_alt; cnt = calloc(a->n, sizeof(int)); + has_alt = calloc(a->n, 1); for (i = 0, tot = 0; i < a->n; ++i) { - get_pri_idx(opt->XA_drop_ratio, a->a, i, r); - if (r[0] >= 0) ++cnt[r[0]], ++tot; - if (r[1] >= 0) ++cnt[r[1]], ++tot; + r = get_pri_idx(opt->XA_drop_ratio, a->a, i); + if (r >= 0) { + ++cnt[r], ++tot; + if (a->a[i].is_alt) has_alt[r] = 1; + } } if (tot == 0) goto end_gen_alt; aln = calloc(a->n, sizeof(kstring_t)); for (i = 0; i < a->n; ++i) { mem_aln_t t; - get_pri_idx(opt->XA_drop_ratio, a->a, i, r); - if (r[0] < 0 && r[1] < 0) continue; - if ((r[0] >= 0 && cnt[r[0]] > opt->max_XA_hits) || (r[1] >= 0 && cnt[r[1]] > opt->max_XA_hits_alt)) continue; + if ((r = get_pri_idx(opt->XA_drop_ratio, a->a, i)) < 0) continue; + if (cnt[r] > opt->max_XA_hits_alt || (!has_alt[r] && cnt[r] > opt->max_XA_hits)) continue; t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]); str.l = 0; kputs(bns->anns[t.rid].name, &str); @@ -152,14 +153,13 @@ char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac kputc(',', &str); kputw(t.NM, &str); kputc(';', &str); free(t.cigar); - if (r[0] >= 0 && cnt[r[0]] <= opt->max_XA_hits) kputsn(str.s, str.l, &aln[r[0]]); - if (r[1] >= 0 && cnt[r[1]] <= opt->max_XA_hits_alt) kputsn(str.s, str.l, &aln[r[1]]); + kputsn(str.s, str.l, &aln[r]); } XA = calloc(a->n, sizeof(char*)); for (k = 0; k < a->n; ++k) XA[k] = aln[k].s; end_gen_alt: - free(cnt); free(aln); free(str.s); + free(has_alt); free(cnt); free(aln); free(str.s); return XA; } diff --git a/main.c b/main.c index 4b39d6e..b162ab0 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r876-dirty" +#define PACKAGE_VERSION "0.7.10-r878-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From d802cfb7f59a561b2101f0a1f2cbd99f6f853f43 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 30 Sep 2014 20:50:26 -0400 Subject: [PATCH 02/31] code backup --- bwa-postalt.js | 75 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 50 insertions(+), 25 deletions(-) diff --git a/bwa-postalt.js b/bwa-postalt.js index 5e6b7c7..c50ae26 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -52,7 +52,7 @@ var getopt = function(args, ostr) { return optopt; } -// print an object in a format similar to JSON. For debugging. +// print an object in a format similar to JSON. For debugging only. function obj2str(o) { if (typeof(o) != 'object') { @@ -206,34 +206,39 @@ function parse_hit(s, opt) // read the ALT-to-REF alignment and generate the index function read_ALT_sam(fn) { - var intv = {}; + var intv_alt = {}, intv_pri = {}; var file = new File(fn); var buf = new Bytes(); while (file.readline(buf) >= 0) { var line = buf.toString(); var t = line.split("\t"); if (line.charAt(0) == '@') continue; + var pos = parseInt(t[3]) - 1; var flag = parseInt(t[1]); - var m, cigar = [], l_qaln = 0, l_qclip = 0; + var m, cigar = [], l_qaln = 0, l_tlen = 0, l_qclip = 0; while ((m = re_cigar.exec(t[5])) != null) { var l = parseInt(m[1]); cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip - if (m[2] == 'M' || m[2] == 'I') l_qaln += l; + if (m[2] == 'M' || m[2] == 'I') l_qaln += l, l_tlen += l; else if (m[2] == 'S' || m[2] == 'H') l_qclip += l; + else if (m[2] == 'D' || m[2] == 'N') l_tlen += l; } var j = flag&16? cigar.length-1 : 0; var start = cigar[j][0] == 'S'? cigar[j][1] : 0; - if (intv[t[0]] == null) intv[t[0]] = []; - intv[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, parseInt(t[3]) - 1, cigar]); - //print(start, start + l_qaln, t[2], flag&16? true : false, parseInt(t[3]), cigar); + if (intv_alt[t[0]] == null) intv_alt[t[0]] = []; + intv_alt[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, parseInt(t[3]) - 1, cigar]); + if (intv_pri[t[2]] == null) intv_pri[t[2]] = []; + intv_pri[t[2]].push([pos, pos + l_tlen, t[0]]); } buf.destroy(); file.close(); // create index for intervals on ALT contigs - var idx = {}; - for (var ctg in intv) - idx[ctg] = intv_ovlp(intv[ctg]); - return idx; + var idx_alt = {}, idx_pri = {}; + for (var ctg in intv_alt) + idx_alt[ctg] = intv_ovlp(intv_alt[ctg]); + for (var ctg in intv_pri) + idx_pri[ctg] = intv_ovlp(intv_pri[ctg]); + return [idx_alt, idx_pri]; } function bwa_postalt(args) @@ -264,8 +269,9 @@ function bwa_postalt(args) var file, buf = new Bytes(); var aux = new Bytes(); // used for reverse and reverse complement - var idx = read_ALT_sam(args[getopt.ind]); - var buf2 = []; + var idx_pair = read_ALT_sam(args[getopt.ind]); + var idx_alt = idx_pair[0], idx_pri = idx_pair[1]; + var buf2 = [], buf3 = []; // process SAM file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File(); @@ -286,6 +292,21 @@ function bwa_postalt(args) buf2 = []; } + // test primary and if so whether it overlaps with ALT regions + if (idx_pri[t[2]] != null) { + var start = t[3], end = start; + while ((m = re_cigar.exec(t[5])) != null) + if (m[2] == 'M' || m[2] == 'D' || m[2] == 'N') + end += parseInt(m[1]); + var ovlp = idx_pri[t[2]](start, end); + if (ovlp.length > 0) { + var score = (m = /\tAS:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : 1; + for (var i = 0; i < ovlp.length; ++i) + buf3.push([t[2], start, end, ovlp[i][2]]); + } + } + + // parse the XA tag if ((m = /\tXA:Z:(\S+)/.exec(line)) == null) { buf2.push(t); continue; @@ -309,12 +330,12 @@ function bwa_postalt(args) hits.push(parse_hit(XA_strs[i].split(","), opt)); // lift mapping positions to the primary assembly - var n_lifted = 0, n_rpt_lifted = 0; + var n_lifted = 0, n_rpt_lifted = 0, rpt_lifted = null; for (var i = 0; i < hits.length; ++i) { var h = hits[i]; - if (idx[h.ctg] == null) continue; - var a = idx[h.ctg](h.start, h.end); + if (idx_alt[h.ctg] == null) continue; + var a = idx_alt[h.ctg](h.start, h.end); if (a == null || a.length == 0) continue; // find the approximate position on the primary assembly @@ -333,6 +354,7 @@ function bwa_postalt(args) lifted.push([a[j][3], (h.rev!=a[j][4]), s, e]); if (i == 0) ++n_rpt_lifted; } + if (i == 0 && n_rpt_lifted == 1) rpt_lifted = lifted[0].slice(0); if (lifted.length) ++n_lifted, hits[i].lifted = lifted; } if (n_lifted == 0) { @@ -384,7 +406,7 @@ function bwa_postalt(args) // check if the reported hit overlaps a hit to the primary assembly; if so, don't reduce mapping quality if (opt.recover_mapq && n_rpt_lifted == 1 && mapQ > 0) { - var l = lifted[0]; + var l = rpt_lifted; for (var i = 0; i < buf2.length; ++i) { var s = buf2[i]; if (l[0] != s[2]) continue; // different chr @@ -423,6 +445,7 @@ function bwa_postalt(args) need_rev = true; } if (need_rev) { // reverse and reverse complement + aux.length = 0; aux.set(t[9], 0); aux.revcomp(); rs = aux.toString(); aux.set(t[10],0); aux.reverse(); rq = aux.toString(); } @@ -438,14 +461,16 @@ function bwa_postalt(args) for (var i = 0; i < hits.length; ++i) { if (opt.verbose >= 5) print(obj2str(hits[i])); if (hits[i].g != reported_g || i == reported_i) continue; - if (!opt.show_pri && idx[hits[i].ctg] == null) continue; - var s = [t[0], flag&0xf10, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, '*', 0, 0]; - // update name - if (flag&0x40) s[0] += "/1"; - if (flag&0x80) s[0] += "/2"; - s[0] += "_" + (++cnt); - if (hits[i].rev == hits[reported_i].rev) s.push(t[9], t[10]); - else s.push(rs, rq); + if (!opt.show_pri && idx_alt[hits[i].ctg] == null) continue; + var s = [t[0], 0, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, '*', 0, 0]; + // print sequence/quality and set the rev flag + if (hits[i].rev == hits[reported_i].rev) { + s.push(t[9], t[10]); + s[1] = flag & 0x10; + } else { + s.push(rs, rq); + s[1] = (flag & 0x10) ^ 0x10; + } s.push("NM:i:" + hits[i].NM); if (hits[i].lifted_str) s.push("lt:Z:" + hits[i].lifted_str); buf2.push(s); From 7b62fbb4ba82eb68edb15d59f3d2c3af0a24e040 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 2 Oct 2014 15:34:49 -0400 Subject: [PATCH 03/31] r880: bug in writing .ann file --- bntseq.c | 2 +- bwa-postalt.js | 31 +++++++++---------------------- main.c | 2 +- 3 files changed, 11 insertions(+), 24 deletions(-) diff --git a/bntseq.c b/bntseq.c index 93cdba1..75e5813 100644 --- a/bntseq.c +++ b/bntseq.c @@ -226,7 +226,7 @@ static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_ } p = bns->anns + bns->n_seqs; p->name = strdup((char*)seq->name.s); - p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)"); + p->anno = seq->comment.l > 0? strdup((char*)seq->comment.s) : strdup("(null)"); p->gi = 0; p->len = seq->seq.l; p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len; p->n_ambs = 0; diff --git a/bwa-postalt.js b/bwa-postalt.js index c50ae26..ab29c9e 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -289,30 +289,9 @@ function bwa_postalt(args) if (buf2.length && (buf2[0][0] != t[0] || (buf2[0][1]&0xc0) != (t[1]&0xc0))) { for (var i = 0; i < buf2.length; ++i) print(buf2[i].join("\t")); - buf2 = []; + buf2 = []; buf3 = []; } - // test primary and if so whether it overlaps with ALT regions - if (idx_pri[t[2]] != null) { - var start = t[3], end = start; - while ((m = re_cigar.exec(t[5])) != null) - if (m[2] == 'M' || m[2] == 'D' || m[2] == 'N') - end += parseInt(m[1]); - var ovlp = idx_pri[t[2]](start, end); - if (ovlp.length > 0) { - var score = (m = /\tAS:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : 1; - for (var i = 0; i < ovlp.length; ++i) - buf3.push([t[2], start, end, ovlp[i][2]]); - } - } - - // parse the XA tag - if ((m = /\tXA:Z:(\S+)/.exec(line)) == null) { - buf2.push(t); - continue; - } - var XA_strs = m[1].split(";"); - // parse the reported hit var hits = []; var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1]; @@ -322,6 +301,13 @@ function bwa_postalt(args) buf2.push(t); continue; } + + // parse the XA tag + if ((m = /\tXA:Z:(\S+)/.exec(line)) == null) { + buf2.push(t); + continue; + } + var XA_strs = m[1].split(";"); hits.push(h); // parse hits in the XA tag @@ -356,6 +342,7 @@ function bwa_postalt(args) } if (i == 0 && n_rpt_lifted == 1) rpt_lifted = lifted[0].slice(0); if (lifted.length) ++n_lifted, hits[i].lifted = lifted; + buf3.push(hits[i]); } if (n_lifted == 0) { buf2.push(t); diff --git a/main.c b/main.c index b162ab0..a680b92 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r878-dirty" +#define PACKAGE_VERSION "0.7.10-r880-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From 0a526c698a6df2db5672613bd95cadc4bc88295c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 2 Oct 2014 16:23:55 -0400 Subject: [PATCH 04/31] code backup --- bwa-postalt.js | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/bwa-postalt.js b/bwa-postalt.js index ab29c9e..0f4b57e 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -206,7 +206,7 @@ function parse_hit(s, opt) // read the ALT-to-REF alignment and generate the index function read_ALT_sam(fn) { - var intv_alt = {}, intv_pri = {}; + var intv_alt = {}; var file = new File(fn); var buf = new Bytes(); while (file.readline(buf) >= 0) { @@ -227,18 +227,14 @@ function read_ALT_sam(fn) var start = cigar[j][0] == 'S'? cigar[j][1] : 0; if (intv_alt[t[0]] == null) intv_alt[t[0]] = []; intv_alt[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, parseInt(t[3]) - 1, cigar]); - if (intv_pri[t[2]] == null) intv_pri[t[2]] = []; - intv_pri[t[2]].push([pos, pos + l_tlen, t[0]]); } buf.destroy(); file.close(); // create index for intervals on ALT contigs - var idx_alt = {}, idx_pri = {}; + var idx_alt = {} for (var ctg in intv_alt) idx_alt[ctg] = intv_ovlp(intv_alt[ctg]); - for (var ctg in intv_pri) - idx_pri[ctg] = intv_ovlp(intv_pri[ctg]); - return [idx_alt, idx_pri]; + return idx_alt; } function bwa_postalt(args) @@ -269,8 +265,7 @@ function bwa_postalt(args) var file, buf = new Bytes(); var aux = new Bytes(); // used for reverse and reverse complement - var idx_pair = read_ALT_sam(args[getopt.ind]); - var idx_alt = idx_pair[0], idx_pri = idx_pair[1]; + var idx_alt = read_ALT_sam(args[getopt.ind]); var buf2 = [], buf3 = []; // process SAM @@ -292,6 +287,13 @@ function bwa_postalt(args) buf2 = []; buf3 = []; } + // parse the XA tag + if ((m = /\tXA:Z:(\S+)/.exec(line)) == null) { + buf2.push(t); + continue; + } + var XA_strs = m[1].split(";"); + // parse the reported hit var hits = []; var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1]; @@ -301,13 +303,6 @@ function bwa_postalt(args) buf2.push(t); continue; } - - // parse the XA tag - if ((m = /\tXA:Z:(\S+)/.exec(line)) == null) { - buf2.push(t); - continue; - } - var XA_strs = m[1].split(";"); hits.push(h); // parse hits in the XA tag @@ -388,7 +383,8 @@ function bwa_postalt(args) mapQ = group_max.length == 1? 60 : 6 * (group_max[0][0] - group_max[1][0]); } else mapQ = 0; mapQ = mapQ < 60? mapQ : 60; - mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ; + if (idx_alt[t[2]] == null) mapQ = mapQ < ori_mapQ? mapQ : ori_mapQ; + else mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ; } else mapQ = t[4]; // check if the reported hit overlaps a hit to the primary assembly; if so, don't reduce mapping quality @@ -453,10 +449,10 @@ function bwa_postalt(args) // print sequence/quality and set the rev flag if (hits[i].rev == hits[reported_i].rev) { s.push(t[9], t[10]); - s[1] = flag & 0x10; + s[1] = (flag & 0x10) | 0x800; } else { s.push(rs, rq); - s[1] = (flag & 0x10) ^ 0x10; + s[1] = ((flag & 0x10) ^ 0x10) | 0x800; } s.push("NM:i:" + hits[i].NM); if (hits[i].lifted_str) s.push("lt:Z:" + hits[i].lifted_str); From c4bdf1678dc45e0ce263a2eae3355e0927f13c3c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 2 Oct 2014 17:11:37 -0400 Subject: [PATCH 05/31] code backup --- bwa-postalt.js | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/bwa-postalt.js b/bwa-postalt.js index 0f4b57e..ec3ed37 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -206,7 +206,7 @@ function parse_hit(s, opt) // read the ALT-to-REF alignment and generate the index function read_ALT_sam(fn) { - var intv_alt = {}; + var intv_alt = {}, idx_un = {}; var file = new File(fn); var buf = new Bytes(); while (file.readline(buf) >= 0) { @@ -215,6 +215,10 @@ function read_ALT_sam(fn) if (line.charAt(0) == '@') continue; var pos = parseInt(t[3]) - 1; var flag = parseInt(t[1]); + if ((flag&4) || t[2] == '*') { + idx_un[t[0]] = true; + continue; + } var m, cigar = [], l_qaln = 0, l_tlen = 0, l_qclip = 0; while ((m = re_cigar.exec(t[5])) != null) { var l = parseInt(m[1]); @@ -231,10 +235,10 @@ function read_ALT_sam(fn) buf.destroy(); file.close(); // create index for intervals on ALT contigs - var idx_alt = {} + var idx_alt = {}; for (var ctg in intv_alt) idx_alt[ctg] = intv_ovlp(intv_alt[ctg]); - return idx_alt; + return [idx_alt, idx_un]; } function bwa_postalt(args) @@ -265,7 +269,8 @@ function bwa_postalt(args) var file, buf = new Bytes(); var aux = new Bytes(); // used for reverse and reverse complement - var idx_alt = read_ALT_sam(args[getopt.ind]); + var idx_pair = read_ALT_sam(args[getopt.ind]); + var idx_alt = idx_pair[0], idx_un = idx_pair[1]; var buf2 = [], buf3 = []; // process SAM @@ -282,17 +287,18 @@ function bwa_postalt(args) // print bufferred reads if (buf2.length && (buf2[0][0] != t[0] || (buf2[0][1]&0xc0) != (t[1]&0xc0))) { + for (var i = 0; i < buf3.length; ++i) print(obj2str(buf3[i])); for (var i = 0; i < buf2.length; ++i) print(buf2[i].join("\t")); buf2 = []; buf3 = []; } // parse the XA tag - if ((m = /\tXA:Z:(\S+)/.exec(line)) == null) { + if ((m = /\tXA:Z:(\S+)/.exec(line)) == null && idx_un[t[2]] == null) { buf2.push(t); continue; } - var XA_strs = m[1].split(";"); + var XA_strs = m != null? m[1].split(";") : []; // parse the reported hit var hits = []; @@ -311,13 +317,14 @@ function bwa_postalt(args) hits.push(parse_hit(XA_strs[i].split(","), opt)); // lift mapping positions to the primary assembly - var n_lifted = 0, n_rpt_lifted = 0, rpt_lifted = null; + var n_rpt_lifted = 0, rpt_lifted = null; for (var i = 0; i < hits.length; ++i) { - var h = hits[i]; + var a, h = hits[i]; - if (idx_alt[h.ctg] == null) continue; - var a = idx_alt[h.ctg](h.start, h.end); - if (a == null || a.length == 0) continue; + if (idx_alt[h.ctg] == null || (a = idx_alt[h.ctg](h.start, h.end)) == null || a.length == 0) { + if (idx_un[h.ctg] != null || idx_alt[h.ctg] != null) buf3.push(hits[i]); + continue; + } // find the approximate position on the primary assembly var lifted = []; @@ -336,13 +343,9 @@ function bwa_postalt(args) if (i == 0) ++n_rpt_lifted; } if (i == 0 && n_rpt_lifted == 1) rpt_lifted = lifted[0].slice(0); - if (lifted.length) ++n_lifted, hits[i].lifted = lifted; + if (lifted.length) hits[i].lifted = lifted; buf3.push(hits[i]); } - if (n_lifted == 0) { - buf2.push(t); - continue; - } // group hits based on the lifted positions on the primary assembly for (var i = 0; i < hits.length; ++i) { // set keys for sorting @@ -459,6 +462,7 @@ function bwa_postalt(args) buf2.push(s); } } + for (var i = 0; i < buf3.length; ++i) print(obj2str(buf3[i])); for (var i = 0; i < buf2.length; ++i) print(buf2[i].join("\t")); file.close(); From cd44ac48cd4744fab0cb750b17e53ead46cff7ef Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 2 Oct 2014 18:04:36 -0400 Subject: [PATCH 06/31] code backup --- bwa-postalt.js | 82 +++++++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 35 deletions(-) diff --git a/bwa-postalt.js b/bwa-postalt.js index ec3ed37..1989f66 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -271,7 +271,7 @@ function bwa_postalt(args) var aux = new Bytes(); // used for reverse and reverse complement var idx_pair = read_ALT_sam(args[getopt.ind]); var idx_alt = idx_pair[0], idx_un = idx_pair[1]; - var buf2 = [], buf3 = []; + var buf2 = []; // process SAM file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File(); @@ -287,44 +287,51 @@ function bwa_postalt(args) // print bufferred reads if (buf2.length && (buf2[0][0] != t[0] || (buf2[0][1]&0xc0) != (t[1]&0xc0))) { - for (var i = 0; i < buf3.length; ++i) print(obj2str(buf3[i])); for (var i = 0; i < buf2.length; ++i) print(buf2[i].join("\t")); - buf2 = []; buf3 = []; + buf2 = []; } - // parse the XA tag - if ((m = /\tXA:Z:(\S+)/.exec(line)) == null && idx_un[t[2]] == null) { + // skip unmapped lines + if (t[1]&4) { buf2.push(t); continue; } - var XA_strs = m != null? m[1].split(";") : []; // parse the reported hit - var hits = []; var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1]; var flag = t[1]; var h = parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt); - if (h.hard) { // don't process lines with hard clips + if (h.hard) { // the following does not work with hard clipped alignments buf2.push(t); continue; } - hits.push(h); + var hits = [h]; // parse hits in the XA tag - for (var i = 0; i < XA_strs.length; ++i) - if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string - hits.push(parse_hit(XA_strs[i].split(","), opt)); + if ((m = /\tXA:Z:(\S+)/.exec(line)) != null) { + var XA_strs = m[1].split(";"); + for (var i = 0; i < XA_strs.length; ++i) + if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string + hits.push(parse_hit(XA_strs[i].split(","), opt)); + } + + // check if there are ALT hits + var has_alt = false; + for (var i = 0; i < hits.length; ++i) + if (idx_alt[hits[i].ctg] != null || idx_un[hits[i].ctg]) { + has_alt = true; + break; + } + if (!has_alt) continue; // lift mapping positions to the primary assembly var n_rpt_lifted = 0, rpt_lifted = null; for (var i = 0; i < hits.length; ++i) { var a, h = hits[i]; - if (idx_alt[h.ctg] == null || (a = idx_alt[h.ctg](h.start, h.end)) == null || a.length == 0) { - if (idx_un[h.ctg] != null || idx_alt[h.ctg] != null) buf3.push(hits[i]); + if (idx_alt[h.ctg] == null || (a = idx_alt[h.ctg](h.start, h.end)) == null || a.length == 0) continue; - } // find the approximate position on the primary assembly var lifted = []; @@ -344,32 +351,38 @@ function bwa_postalt(args) } if (i == 0 && n_rpt_lifted == 1) rpt_lifted = lifted[0].slice(0); if (lifted.length) hits[i].lifted = lifted; - buf3.push(hits[i]); } - // group hits based on the lifted positions on the primary assembly + // prepare for hits grouping for (var i = 0; i < hits.length; ++i) { // set keys for sorting - if (hits[i].lifted && hits[i].lifted.length) // TODO: only the first element in lifted[] is used + if (hits[i].lifted != null) // TODO: only the first element in lifted[] is used hits[i].pctg = hits[i].lifted[0][0], hits[i].pstart = hits[i].lifted[0][2], hits[i].pend = hits[i].lifted[0][3]; else hits[i].pctg = hits[i].ctg, hits[i].pstart = hits[i].start, hits[i].pend = hits[i].end; hits[i].i = i; // keep the original index } - hits.sort(function(a,b) { return a.pctg != b.pctg? (a.pctg < b.pctg? -1 : 1) : a.pstart - b.pstart }); - var last_chr = null, end = 0, g = -1; - for (var i = 0; i < hits.length; ++i) { - if (last_chr != hits[i].pctg) ++g, last_chr = hits[i].pctg, end = 0; - else if (hits[i].pstart >= end) ++g; - hits[i].g = g; - end = end > hits[i].pend? end : hits[i].pend; - } - var reported_g = null, reported_i = null; - for (var i = 0; i < hits.length; ++i) - if (hits[i].i == 0) - reported_g = hits[i].g, reported_i = i; - var n_group0 = 0; // #hits overlapping the reported hit - for (var i = 0; i < hits.length; ++i) - if (hits[i].g == reported_g) - ++n_group0; + + // group hits based on the lifted positions on non-ALT sequences + if (hits.length > 1) { + hits.sort(function(a,b) { return a.pctg != b.pctg? (a.pctg < b.pctg? -1 : 1) : a.pstart - b.pstart }); + var last_chr = null, end = 0, g = -1; + for (var i = 0; i < hits.length; ++i) { + if (last_chr != hits[i].pctg) ++g, last_chr = hits[i].pctg, end = 0; + else if (hits[i].pstart >= end) ++g; + hits[i].g = g; + end = end > hits[i].pend? end : hits[i].pend; + } + } else hits[0].g = 0; + + // find the index and group id of the reported hit; find the size of the reported group + var reported_g = null, reported_i = null, n_group0 = 0; + if (hits.length > 1) { + for (var i = 0; i < hits.length; ++i) + if (hits[i].i == 0) + reported_g = hits[i].g, reported_i = i; + for (var i = 0; i < hits.length; ++i) + if (hits[i].g == reported_g) + ++n_group0; + } else reported_g = reported_i = 0, n_group0 = 1; // re-estimate mapping quality if necessary var mapQ, ori_mapQ = t[4]; @@ -462,7 +475,6 @@ function bwa_postalt(args) buf2.push(s); } } - for (var i = 0; i < buf3.length; ++i) print(obj2str(buf3[i])); for (var i = 0; i < buf2.length; ++i) print(buf2[i].join("\t")); file.close(); From 7026b3ec68db1abf3782a69be9ee754d7a7cc854 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 2 Oct 2014 21:49:29 -0400 Subject: [PATCH 07/31] code backup --- bwa-postalt.js | 86 ++++++++++++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 44 deletions(-) diff --git a/bwa-postalt.js b/bwa-postalt.js index 1989f66..2598e84 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -203,44 +203,6 @@ function parse_hit(s, opt) return h; } -// read the ALT-to-REF alignment and generate the index -function read_ALT_sam(fn) -{ - var intv_alt = {}, idx_un = {}; - var file = new File(fn); - var buf = new Bytes(); - while (file.readline(buf) >= 0) { - var line = buf.toString(); - var t = line.split("\t"); - if (line.charAt(0) == '@') continue; - var pos = parseInt(t[3]) - 1; - var flag = parseInt(t[1]); - if ((flag&4) || t[2] == '*') { - idx_un[t[0]] = true; - continue; - } - var m, cigar = [], l_qaln = 0, l_tlen = 0, l_qclip = 0; - while ((m = re_cigar.exec(t[5])) != null) { - var l = parseInt(m[1]); - cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip - if (m[2] == 'M' || m[2] == 'I') l_qaln += l, l_tlen += l; - else if (m[2] == 'S' || m[2] == 'H') l_qclip += l; - else if (m[2] == 'D' || m[2] == 'N') l_tlen += l; - } - var j = flag&16? cigar.length-1 : 0; - var start = cigar[j][0] == 'S'? cigar[j][1] : 0; - if (intv_alt[t[0]] == null) intv_alt[t[0]] = []; - intv_alt[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, parseInt(t[3]) - 1, cigar]); - } - buf.destroy(); - file.close(); - // create index for intervals on ALT contigs - var idx_alt = {}; - for (var ctg in intv_alt) - idx_alt[ctg] = intv_ovlp(intv_alt[ctg]); - return [idx_alt, idx_un]; -} - function bwa_postalt(args) { var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true }; @@ -267,13 +229,46 @@ function bwa_postalt(args) exit(1); } - var file, buf = new Bytes(); var aux = new Bytes(); // used for reverse and reverse complement - var idx_pair = read_ALT_sam(args[getopt.ind]); - var idx_alt = idx_pair[0], idx_un = idx_pair[1]; - var buf2 = []; + var buf = new Bytes(); + + // read ALT-to-REF alignment + var intv_alt = {}, intv_pri = {}, idx_un = {}; + var file = new File(args[getopt.ind]); + while (file.readline(buf) >= 0) { + var line = buf.toString(); + var t = line.split("\t"); + if (line.charAt(0) == '@') continue; + var pos = parseInt(t[3]) - 1; + var flag = parseInt(t[1]); + if ((flag&4) || t[2] == '*') { + idx_un[t[0]] = true; + continue; + } + var m, cigar = [], l_qaln = 0, l_tlen = 0, l_qclip = 0; + while ((m = re_cigar.exec(t[5])) != null) { + var l = parseInt(m[1]); + cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip + if (m[2] == 'M' || m[2] == 'I') l_qaln += l, l_tlen += l; + else if (m[2] == 'S' || m[2] == 'H') l_qclip += l; + else if (m[2] == 'D' || m[2] == 'N') l_tlen += l; + } + var j = flag&16? cigar.length-1 : 0; + var start = cigar[j][0] == 'S'? cigar[j][1] : 0; + if (intv_alt[t[0]] == null) intv_alt[t[0]] = []; + intv_alt[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, parseInt(t[3]) - 1, cigar]); + if (intv_pri[t[2]] == null) intv_pri[t[2]] = []; + intv_pri[t[2]].push([pos, pos + l_tlen, t[0]]); + } + file.close(); + var idx_alt = {}, idx_pri = {}; + for (var ctg in intv_alt) + idx_alt[ctg] = intv_ovlp(intv_alt[ctg]); + for (var ctg in intv_pri) + idx_pri[ctg] = intv_ovlp(intv_pri[ctg]); // process SAM + var buf2 = []; file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File(); while (file.readline(buf) >= 0) { var m, line = buf.toString(); @@ -323,7 +318,10 @@ function bwa_postalt(args) has_alt = true; break; } - if (!has_alt) continue; + if (!has_alt) { + buf2.push(t); + continue; + } // lift mapping positions to the primary assembly var n_rpt_lifted = 0, rpt_lifted = null; @@ -479,8 +477,8 @@ function bwa_postalt(args) print(buf2[i].join("\t")); file.close(); - aux.destroy(); buf.destroy(); + aux.destroy(); } bwa_postalt(arguments); From 2c3619d5698cc261ec1a22db20f01e2af9e8b3ea Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 2 Oct 2014 23:48:03 -0400 Subject: [PATCH 08/31] code backup --- bwa-postalt.js | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/bwa-postalt.js b/bwa-postalt.js index 2598e84..d56835f 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -401,6 +401,29 @@ function bwa_postalt(args) else mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ; } else mapQ = t[4]; + // ALT genotyping + { + var hits2 = []; + for (var i = 0; i < hits.length; ++i) { + var h = hits[i]; + if (h.g == reported_g) + hits2.push([h.pctg, h.pstart, h.pend, h.ctg, h.score]); + } + var start = hits2[0][1], end = hits2[0][2]; + for (var i = 1; i < hits2.length; ++i) + end = end > hits2[i][2]? end : hits2[i][2]; + var alts = {}; + for (var i = 0; i < hits2.length; ++i) + if (idx_alt[hits2[i][3]] != null || idx_un[hits2[i][3]] != null) + alts[hits2[i][3]] = hits2[i][4]; + if (idx_pri[hits2[0][0]] != null) { + var ovlp = idx_pri[hits2[0][0]](start, end); + for (var i = 0; i < ovlp.length; ++i) + if (alts[ovlp[i][2]] == null) alts[ovlp[i][2]] = 0; + } + print(obj2str(alts)); + } + // check if the reported hit overlaps a hit to the primary assembly; if so, don't reduce mapping quality if (opt.recover_mapq && n_rpt_lifted == 1 && mapQ > 0) { var l = rpt_lifted; From 2d7ec6b0510949ae1fe16cf8fb7d9c9238d3108c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 5 Oct 2014 22:18:07 -0400 Subject: [PATCH 09/31] ALT genotyping semi-working, but ... ... the result is not good. --- bwa-postalt.js | 71 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 61 insertions(+), 10 deletions(-) diff --git a/bwa-postalt.js b/bwa-postalt.js index d56835f..f9e7223 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -205,7 +205,7 @@ function parse_hit(s, opt) function bwa_postalt(args) { - var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true }; + var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true, min_mapq:10, min_sc:90 }; while ((c = getopt(args, 'pqv:')) != null) { if (c == 'v') opt.verbose = parseInt(getopt.arg); @@ -237,8 +237,9 @@ function bwa_postalt(args) var file = new File(args[getopt.ind]); while (file.readline(buf) >= 0) { var line = buf.toString(); - var t = line.split("\t"); if (line.charAt(0) == '@') continue; + var t = line.split("\t"); + if (t.length < 11) continue; // incomplete lines var pos = parseInt(t[3]) - 1; var flag = parseInt(t[1]); if ((flag&4) || t[2] == '*') { @@ -249,14 +250,15 @@ function bwa_postalt(args) while ((m = re_cigar.exec(t[5])) != null) { var l = parseInt(m[1]); cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip - if (m[2] == 'M' || m[2] == 'I') l_qaln += l, l_tlen += l; + if (m[2] == 'M') l_qaln += l, l_tlen += l; + else if (m[2] == 'I') l_qaln += l; else if (m[2] == 'S' || m[2] == 'H') l_qclip += l; else if (m[2] == 'D' || m[2] == 'N') l_tlen += l; } var j = flag&16? cigar.length-1 : 0; var start = cigar[j][0] == 'S'? cigar[j][1] : 0; if (intv_alt[t[0]] == null) intv_alt[t[0]] = []; - intv_alt[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, parseInt(t[3]) - 1, cigar]); + intv_alt[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, pos - 1, cigar, pos + l_tlen]); if (intv_pri[t[2]] == null) intv_pri[t[2]] = []; intv_pri[t[2]].push([pos, pos + l_tlen, t[0]]); } @@ -267,6 +269,13 @@ function bwa_postalt(args) for (var ctg in intv_pri) idx_pri[ctg] = intv_ovlp(intv_pri[ctg]); + // initialize the list of ALT contigs + var weight_alt = []; + for (var ctg in idx_alt) + weight_alt[ctg] = [0, 0, 0, intv_alt[ctg][0][3], intv_alt[ctg][0][5], intv_alt[ctg][0][7]]; + for (var ctg in idx_un) + weight_alt[ctg] = [0, 0, 0, '~', 0, 0]; + // process SAM var buf2 = []; file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File(); @@ -314,7 +323,7 @@ function bwa_postalt(args) // check if there are ALT hits var has_alt = false; for (var i = 0; i < hits.length; ++i) - if (idx_alt[hits[i].ctg] != null || idx_un[hits[i].ctg]) { + if (weight_alt[hits[i].ctg] != null) { has_alt = true; break; } @@ -380,7 +389,13 @@ function bwa_postalt(args) for (var i = 0; i < hits.length; ++i) if (hits[i].g == reported_g) ++n_group0; - } else reported_g = reported_i = 0, n_group0 = 1; + } else { + if (weight_alt[hits[0].ctg] == null) { // no need to go through the following if the single hit is non-ALT + buf2.push(t); + continue; + } + reported_g = reported_i = 0, n_group0 = 1; + } // re-estimate mapping quality if necessary var mapQ, ori_mapQ = t[4]; @@ -402,7 +417,8 @@ function bwa_postalt(args) } else mapQ = t[4]; // ALT genotyping - { + if (mapQ >= opt.min_mapq && hits[reported_i].score >= opt.min_sc) { + // collect all overlapping ALT contigs var hits2 = []; for (var i = 0; i < hits.length; ++i) { var h = hits[i]; @@ -414,14 +430,36 @@ function bwa_postalt(args) end = end > hits2[i][2]? end : hits2[i][2]; var alts = {}; for (var i = 0; i < hits2.length; ++i) - if (idx_alt[hits2[i][3]] != null || idx_un[hits2[i][3]] != null) + if (weight_alt[hits2[i][3]] != null) alts[hits2[i][3]] = hits2[i][4]; if (idx_pri[hits2[0][0]] != null) { var ovlp = idx_pri[hits2[0][0]](start, end); - for (var i = 0; i < ovlp.length; ++i) + for (var i = 0; i < ovlp.length; ++i) // TODO: requiring reasonable overlap if (alts[ovlp[i][2]] == null) alts[ovlp[i][2]] = 0; } - print(obj2str(alts)); + + // add weight to each ALT contig + var alt_arr = [], max_sc = -1, max_i = -1, sum = 0, min_sc = 1<<30; + for (var ctg in alts) + alt_arr.push([ctg, alts[ctg], 0]); + for (var i = 0; i < alt_arr.length; ++i) { + if (max_sc < alt_arr[i][1]) + max_sc = alt_arr[i][1], max_i = i; + min_sc = min_sc < alt_arr[i][1]? min_sc : alt_arr[i][1]; + } + if (max_sc > 0 && (alt_arr.length == 1 || min_sc < max_sc)) { + for (var i = 0; i < alt_arr.length; ++i) { + alt_arr[i][2] = Math.exp(.1 * (alt_arr[i][1] - max_sc)); + //print('YYYYY', alt_arr[i][1], max_sc); + sum += alt_arr[i][2]; + } + for (var i = 0; i < alt_arr.length; ++i) alt_arr[i][2] /= sum; + for (var i = 0; i < alt_arr.length; ++i) { + var w = weight_alt[alt_arr[i][0]]; + w[0] += max_sc, w[1] += max_sc * alt_arr[max_i][2], w[2] += max_sc * alt_arr[i][2]; + //print('XXXXX', alt_arr[i][0], max_sc, max_sc * alt_arr[max_i][2], max_sc * alt_arr[i][2]); + } + } } // check if the reported hit overlaps a hit to the primary assembly; if so, don't reduce mapping quality @@ -502,6 +540,19 @@ function bwa_postalt(args) buf.destroy(); aux.destroy(); + + // print weight of each contig + var weight_arr = []; + for (var ctg in weight_alt) { + var w = weight_alt[ctg]; + w[0] = w[0].toFixed(0), w[1] = w[1].toFixed(0), w[2] = w[2].toFixed(0); + weight_arr.push([ctg, w[0], w[1], w[2], w[1] > 0? (w[2]/w[1]).toFixed(3) : '0.000', w[3], w[4], w[5]]); + } + weight_arr.sort(function(a,b) {return a[5] < b[5]? -1 : a[5] > b[5]? 1 : a[6] - b[6]}); + for (var i = 0; i < weight_arr.length; ++i) { + if (weight_arr[i][5] == '~') weight_arr[i][5] = '*'; + warn(weight_arr[i].join("\t")); + } } bwa_postalt(arguments); From 4356fd485cddbda9068e2631c9bab8257f03b34a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 10 Oct 2014 13:41:53 -0400 Subject: [PATCH 10/31] changed weighting --- bwa-postalt.js | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/bwa-postalt.js b/bwa-postalt.js index f9e7223..cde1639 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -205,7 +205,7 @@ function parse_hit(s, opt) function bwa_postalt(args) { - var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true, min_mapq:10, min_sc:90 }; + var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:100 }; while ((c = getopt(args, 'pqv:')) != null) { if (c == 'v') opt.verbose = parseInt(getopt.arg); @@ -423,7 +423,7 @@ function bwa_postalt(args) for (var i = 0; i < hits.length; ++i) { var h = hits[i]; if (h.g == reported_g) - hits2.push([h.pctg, h.pstart, h.pend, h.ctg, h.score]); + hits2.push([h.pctg, h.pstart, h.pend, h.ctg, h.score, h.NM]); } var start = hits2[0][1], end = hits2[0][2]; for (var i = 1; i < hits2.length; ++i) @@ -431,33 +431,32 @@ function bwa_postalt(args) var alts = {}; for (var i = 0; i < hits2.length; ++i) if (weight_alt[hits2[i][3]] != null) - alts[hits2[i][3]] = hits2[i][4]; + alts[hits2[i][3]] = [hits2[i][4], hits2[i][5]]; if (idx_pri[hits2[0][0]] != null) { var ovlp = idx_pri[hits2[0][0]](start, end); for (var i = 0; i < ovlp.length; ++i) // TODO: requiring reasonable overlap - if (alts[ovlp[i][2]] == null) alts[ovlp[i][2]] = 0; + if (alts[ovlp[i][2]] == null) alts[ovlp[i][2]] = [0, 0]; } // add weight to each ALT contig - var alt_arr = [], max_sc = -1, max_i = -1, sum = 0, min_sc = 1<<30; + var alt_arr = [], max_sc = -1, max_i = -1, sum = 0, min_sc = 1<<30, max_nm = -1; for (var ctg in alts) - alt_arr.push([ctg, alts[ctg], 0]); + alt_arr.push([ctg, alts[ctg][0], 0, alts[ctg][1]]); for (var i = 0; i < alt_arr.length; ++i) { if (max_sc < alt_arr[i][1]) max_sc = alt_arr[i][1], max_i = i; min_sc = min_sc < alt_arr[i][1]? min_sc : alt_arr[i][1]; + var nm = alt_arr[i][1] > 0? alt_arr[i][3] : opt.max_nm_sc; + max_nm = max_nm > nm? max_nm : nm; } + if (max_nm > opt.max_nm_sc) max_nm = opt.max_nm_sc; if (max_sc > 0 && (alt_arr.length == 1 || min_sc < max_sc)) { - for (var i = 0; i < alt_arr.length; ++i) { - alt_arr[i][2] = Math.exp(.1 * (alt_arr[i][1] - max_sc)); - //print('YYYYY', alt_arr[i][1], max_sc); - sum += alt_arr[i][2]; - } + for (var i = 0; i < alt_arr.length; ++i) + sum += (alt_arr[i][2] = Math.pow(10, .6 * (alt_arr[i][1] - max_sc))); for (var i = 0; i < alt_arr.length; ++i) alt_arr[i][2] /= sum; for (var i = 0; i < alt_arr.length; ++i) { var w = weight_alt[alt_arr[i][0]]; - w[0] += max_sc, w[1] += max_sc * alt_arr[max_i][2], w[2] += max_sc * alt_arr[i][2]; - //print('XXXXX', alt_arr[i][0], max_sc, max_sc * alt_arr[max_i][2], max_sc * alt_arr[i][2]); + w[0] += max_nm, w[1] += max_nm * alt_arr[max_i][2], w[2] += max_nm * alt_arr[i][2]; } } } @@ -548,7 +547,9 @@ function bwa_postalt(args) w[0] = w[0].toFixed(0), w[1] = w[1].toFixed(0), w[2] = w[2].toFixed(0); weight_arr.push([ctg, w[0], w[1], w[2], w[1] > 0? (w[2]/w[1]).toFixed(3) : '0.000', w[3], w[4], w[5]]); } - weight_arr.sort(function(a,b) {return a[5] < b[5]? -1 : a[5] > b[5]? 1 : a[6] - b[6]}); + weight_arr.sort(function(a,b) { + return a[5] < b[5]? -1 : a[5] > b[5]? 1 : a[6] != b[6]? a[6] - b[6] : a[0] < b[0]? -1 : a[0] > b[0]? 1 : 0; + }); for (var i = 0; i < weight_arr.length; ++i) { if (weight_arr[i][5] == '~') weight_arr[i][5] = '*'; warn(weight_arr[i].join("\t")); From bf08f76a3c46373a30838f2ba91c8811584616be Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 12 Oct 2014 17:17:29 -0400 Subject: [PATCH 11/31] requiring full overlap --- bwa-postalt.js | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/bwa-postalt.js b/bwa-postalt.js index cde1639..0b6c086 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -205,12 +205,14 @@ function parse_hit(s, opt) function bwa_postalt(args) { - var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:100 }; + var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:100, show_ev:false, wei1:false }; - while ((c = getopt(args, 'pqv:')) != null) { + while ((c = getopt(args, '1pqev:')) != null) { if (c == 'v') opt.verbose = parseInt(getopt.arg); else if (c == 'p') opt.show_pri = true; else if (c == 'q') opt.recover_maq = false; + else if (c == 'e') opt.show_ev = true; + else if (c == '1') opt.wei1 = true; } if (args.length == getopt.ind) { @@ -435,7 +437,8 @@ function bwa_postalt(args) if (idx_pri[hits2[0][0]] != null) { var ovlp = idx_pri[hits2[0][0]](start, end); for (var i = 0; i < ovlp.length; ++i) // TODO: requiring reasonable overlap - if (alts[ovlp[i][2]] == null) alts[ovlp[i][2]] = [0, 0]; + if (start <= ovlp[i][0] && ovlp[i][1] <= end && alts[ovlp[i][2]] == null) + alts[ovlp[i][2]] = [0, 0]; } // add weight to each ALT contig @@ -455,8 +458,11 @@ function bwa_postalt(args) sum += (alt_arr[i][2] = Math.pow(10, .6 * (alt_arr[i][1] - max_sc))); for (var i = 0; i < alt_arr.length; ++i) alt_arr[i][2] /= sum; for (var i = 0; i < alt_arr.length; ++i) { - var w = weight_alt[alt_arr[i][0]]; - w[0] += max_nm, w[1] += max_nm * alt_arr[max_i][2], w[2] += max_nm * alt_arr[i][2]; + if (opt.wei1) max_nm = 1; + var e = [alt_arr[i][0], max_nm, max_nm * alt_arr[max_i][2], max_nm * alt_arr[i][2]]; + var w = weight_alt[e[0]]; + w[0] += e[1], w[1] += e[2], w[2] += e[3]; + if (opt.show_ev) warn(t[0] + '/' + (t[1]>>6&3), e.join("\t")); } } } @@ -544,7 +550,7 @@ function bwa_postalt(args) var weight_arr = []; for (var ctg in weight_alt) { var w = weight_alt[ctg]; - w[0] = w[0].toFixed(0), w[1] = w[1].toFixed(0), w[2] = w[2].toFixed(0); + w[0] = w[0].toFixed(4), w[1] = w[1].toFixed(4), w[2] = w[2].toFixed(4); weight_arr.push([ctg, w[0], w[1], w[2], w[1] > 0? (w[2]/w[1]).toFixed(3) : '0.000', w[3], w[4], w[5]]); } weight_arr.sort(function(a,b) { From 2485a3ca0276b95a62899b0b4d22e6fa35f3f111 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 12 Oct 2014 18:24:40 -0400 Subject: [PATCH 12/31] bug in the last commit --- bwa-postalt.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwa-postalt.js b/bwa-postalt.js index 0b6c086..2d64193 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -437,7 +437,7 @@ function bwa_postalt(args) if (idx_pri[hits2[0][0]] != null) { var ovlp = idx_pri[hits2[0][0]](start, end); for (var i = 0; i < ovlp.length; ++i) // TODO: requiring reasonable overlap - if (start <= ovlp[i][0] && ovlp[i][1] <= end && alts[ovlp[i][2]] == null) + if (ovlp[i][0] <= start && end <= ovlp[i][1] && alts[ovlp[i][2]] == null) alts[ovlp[i][2]] = [0, 0]; } From df20911110bc5d537b018a6299fd891d93f07028 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 14 Oct 2014 14:49:53 -0400 Subject: [PATCH 13/31] r890: bns_intv2rid() may wrongly return -1 --- bntseq.c | 3 ++- main.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bntseq.c b/bntseq.c index 75e5813..6589434 100644 --- a/bntseq.c +++ b/bntseq.c @@ -358,8 +358,9 @@ int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re) { int is_rev, rid_b, rid_e; if (rb < bns->l_pac && re > bns->l_pac) return -2; + assert(rb <= re); rid_b = bns_pos2rid(bns, bns_depos(bns, rb, &is_rev)); - rid_e = bns_pos2rid(bns, bns_depos(bns, re, &is_rev) - 1); + rid_e = rb < re? bns_pos2rid(bns, bns_depos(bns, re - 1, &is_rev)) : rid_b; return rid_b == rid_e? rid_b : -1; } diff --git a/main.c b/main.c index a680b92..5a2af4c 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r880-dirty" +#define PACKAGE_VERSION "0.7.10-r890-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From 15becdd3df2d9ffcc65fe84a9d0c4c6de2fc94ea Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 14 Oct 2014 14:50:37 -0400 Subject: [PATCH 14/31] use weight1 by default --- bwa-postalt.js | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bwa-postalt.js b/bwa-postalt.js index 2d64193..d443254 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -205,14 +205,14 @@ function parse_hit(s, opt) function bwa_postalt(args) { - var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:100, show_ev:false, wei1:false }; + var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:100, show_ev:false, wei1:true }; - while ((c = getopt(args, '1pqev:')) != null) { + while ((c = getopt(args, 'wpqev:')) != null) { if (c == 'v') opt.verbose = parseInt(getopt.arg); else if (c == 'p') opt.show_pri = true; else if (c == 'q') opt.recover_maq = false; else if (c == 'e') opt.show_ev = true; - else if (c == '1') opt.wei1 = true; + else if (c == 'w') opt.wei1 = false; } if (args.length == getopt.ind) { @@ -434,9 +434,9 @@ function bwa_postalt(args) for (var i = 0; i < hits2.length; ++i) if (weight_alt[hits2[i][3]] != null) alts[hits2[i][3]] = [hits2[i][4], hits2[i][5]]; - if (idx_pri[hits2[0][0]] != null) { + if (idx_pri[hits2[0][0]] != null) { // add other unreported hits var ovlp = idx_pri[hits2[0][0]](start, end); - for (var i = 0; i < ovlp.length; ++i) // TODO: requiring reasonable overlap + for (var i = 0; i < ovlp.length; ++i) if (ovlp[i][0] <= start && end <= ovlp[i][1] && alts[ovlp[i][2]] == null) alts[ovlp[i][2]] = [0, 0]; } From c79427a22d126b3e7e157b37ec09d1afcd3885d4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 14 Oct 2014 16:31:40 -0400 Subject: [PATCH 15/31] improved CLI --- bwa-postalt.js | 68 +++++++++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 25 deletions(-) diff --git a/bwa-postalt.js b/bwa-postalt.js index d443254..d4d959e 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -205,21 +205,29 @@ function parse_hit(s, opt) function bwa_postalt(args) { - var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:100, show_ev:false, wei1:true }; + var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:10, show_ev:false }; - while ((c = getopt(args, 'wpqev:')) != null) { + while ((c = getopt(args, 'Pqev:p:')) != null) { if (c == 'v') opt.verbose = parseInt(getopt.arg); - else if (c == 'p') opt.show_pri = true; + else if (c == 'p') opt.pre = getopt.arg; + else if (c == 'P') opt.show_pri = true; else if (c == 'q') opt.recover_maq = false; else if (c == 'e') opt.show_ev = true; - else if (c == 'w') opt.wei1 = false; + } + + if (opt.show_ev && opt.pre == null) { + warn("ERROR: option '-p' must be specified if '-e' is applied."); + exit(1); } if (args.length == getopt.ind) { print(""); - print("Usage: k8 bwa-postalt.js [-p] [aln.sam]\n"); - print("Options: -p output lifted non-ALT hit in a SAM line (for ALT-unware alignments)"); - print(" -q don't recover mapQ for non-ALTs hit overlapping lifted ALT"); + print("Usage: k8 bwa-postalt.js [options] [aln.sam]\n"); + print("Options: -p STR prefix of file(s) for additional information [null]"); + print(" PREFIX.ctw - weight of each ALT contig"); + print(" PREFIX.evi - reads supporting ALT contigs (effective with -e)"); + print(" -q don't modify mapQ for non-ALTs hit overlapping lifted ALT"); + print(" -e show reads supporting ALT contigs into file PREFIX.evi"); print(""); print("Note: This script inspects the XA tag, lifts the mapping positions of ALT hits to"); print(" the primary assembly, groups them and then estimates mapQ across groups. If"); @@ -231,6 +239,7 @@ function bwa_postalt(args) exit(1); } + var fp_evi = opt.show_ev && opt.pre? new File(opt.pre + '.evi', "w") : null; var aux = new Bytes(); // used for reverse and reverse complement var buf = new Bytes(); @@ -274,9 +283,9 @@ function bwa_postalt(args) // initialize the list of ALT contigs var weight_alt = []; for (var ctg in idx_alt) - weight_alt[ctg] = [0, 0, 0, intv_alt[ctg][0][3], intv_alt[ctg][0][5], intv_alt[ctg][0][7]]; + weight_alt[ctg] = [0, 0, 0, 0, 0, 0, intv_alt[ctg][0][3], intv_alt[ctg][0][5], intv_alt[ctg][0][7]]; for (var ctg in idx_un) - weight_alt[ctg] = [0, 0, 0, '~', 0, 0]; + weight_alt[ctg] = [0, 0, 0, 0, 0, 0, '~', 0, 0]; // process SAM var buf2 = []; @@ -458,11 +467,14 @@ function bwa_postalt(args) sum += (alt_arr[i][2] = Math.pow(10, .6 * (alt_arr[i][1] - max_sc))); for (var i = 0; i < alt_arr.length; ++i) alt_arr[i][2] /= sum; for (var i = 0; i < alt_arr.length; ++i) { - if (opt.wei1) max_nm = 1; - var e = [alt_arr[i][0], max_nm, max_nm * alt_arr[max_i][2], max_nm * alt_arr[i][2]]; + var e = [alt_arr[i][0], 1, alt_arr[max_i][2], alt_arr[i][2], max_nm, max_nm * alt_arr[max_i][2], max_nm * alt_arr[i][2]]; var w = weight_alt[e[0]]; - w[0] += e[1], w[1] += e[2], w[2] += e[3]; - if (opt.show_ev) warn(t[0] + '/' + (t[1]>>6&3), e.join("\t")); + for (var j = 0; j < 6; ++j) w[j] += e[j+1]; + if (fp_evi) { + e[2] = e[2].toFixed(3); e[3] = e[3].toFixed(3); + e[5] = e[5].toFixed(3); e[6] = e[6].toFixed(3); + fp_evi.write(t[0] + '/' + (t[1]>>6&3) + '\t' + e.join("\t") + '\n'); + } } } } @@ -542,23 +554,29 @@ function bwa_postalt(args) for (var i = 0; i < buf2.length; ++i) print(buf2[i].join("\t")); file.close(); + if (fp_evi != null) fp_evi.close(); buf.destroy(); aux.destroy(); // print weight of each contig - var weight_arr = []; - for (var ctg in weight_alt) { - var w = weight_alt[ctg]; - w[0] = w[0].toFixed(4), w[1] = w[1].toFixed(4), w[2] = w[2].toFixed(4); - weight_arr.push([ctg, w[0], w[1], w[2], w[1] > 0? (w[2]/w[1]).toFixed(3) : '0.000', w[3], w[4], w[5]]); - } - weight_arr.sort(function(a,b) { - return a[5] < b[5]? -1 : a[5] > b[5]? 1 : a[6] != b[6]? a[6] - b[6] : a[0] < b[0]? -1 : a[0] > b[0]? 1 : 0; - }); - for (var i = 0; i < weight_arr.length; ++i) { - if (weight_arr[i][5] == '~') weight_arr[i][5] = '*'; - warn(weight_arr[i].join("\t")); + if (opt.pre != null) { + var fpout = new File(opt.pre + '.ctw', "w"); + var weight_arr = []; + for (var ctg in weight_alt) { + var w = weight_alt[ctg]; + weight_arr.push([ctg, w[6], w[7], w[8], + w[0], w[1].toFixed(3), w[2].toFixed(3), w[1] > 0? (w[2]/w[1]).toFixed(3) : '0.000', + w[3], w[4].toFixed(3), w[5].toFixed(3), w[4] > 0? (w[5]/w[4]).toFixed(3) : '0.000']); + } + weight_arr.sort(function(a,b) { + return a[1] < b[1]? -1 : a[1] > b[1]? 1 : a[2] != b[2]? a[2] - b[2] : a[0] < b[0]? -1 : a[0] > b[0]? 1 : 0; + }); + for (var i = 0; i < weight_arr.length; ++i) { + if (weight_arr[i][1] == '~') weight_arr[i][1] = '*'; + fpout.write(weight_arr[i].join("\t") + '\n'); + } + fpout.close(); } } From 647b0e828a1297c2559c25c12cb948ca0a2481bb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 14 Oct 2014 16:57:57 -0400 Subject: [PATCH 16/31] wrong flag in the SAM output --- bwa-postalt.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bwa-postalt.js b/bwa-postalt.js index d4d959e..411cfdd 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -541,10 +541,10 @@ function bwa_postalt(args) // print sequence/quality and set the rev flag if (hits[i].rev == hits[reported_i].rev) { s.push(t[9], t[10]); - s[1] = (flag & 0x10) | 0x800; + s[1] = flag | 0x800; } else { s.push(rs, rq); - s[1] = ((flag & 0x10) ^ 0x10) | 0x800; + s[1] = (flag ^ 0x10) | 0x800; } s.push("NM:i:" + hits[i].NM); if (hits[i].lifted_str) s.push("lt:Z:" + hits[i].lifted_str); From 2a18fa114fbd78c018ccc343898c789c5ce86793 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 14 Oct 2014 16:58:42 -0400 Subject: [PATCH 17/31] r895: increase the default max_XA_hits_alt to 200 Because there are >100 HLA haplotypes --- bwamem.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 7c54b25..79785f4 100644 --- a/bwamem.c +++ b/bwamem.c @@ -73,7 +73,7 @@ mem_opt_t *mem_opt_init() o->chunk_size = 10000000; o->n_threads = 1; o->max_XA_hits = 5; - o->max_XA_hits_alt = 50; + o->max_XA_hits_alt = 200; o->max_matesw = 50; o->mask_level_redun = 0.95; o->min_chain_weight = 0; diff --git a/main.c b/main.c index 5a2af4c..db3c34f 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r890-dirty" +#define PACKAGE_VERSION "0.7.10-r895-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From 71277f0fea035b580877bae63431ef1aff5e663f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 14 Oct 2014 23:37:24 -0400 Subject: [PATCH 18/31] r896: more flexible ALT reading --- bntseq.c | 18 +++++++++++++----- bwa.c | 5 +++++ main.c | 2 +- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/bntseq.c b/bntseq.c index 6589434..465e383 100644 --- a/bntseq.c +++ b/bntseq.c @@ -179,17 +179,25 @@ bntseq_t *bns_restore(const char *prefix) if ((fp = fopen(strcat(strcpy(alt_filename, prefix), ".alt"), "r")) != 0) { // read .alt file if present char str[1024]; khash_t(str) *h; - int i, absent; + int c, i, absent; khint_t k; h = kh_init(str); for (i = 0; i < bns->n_seqs; ++i) { k = kh_put(str, h, bns->anns[i].name, &absent); kh_val(h, k) = i; } - while (fscanf(fp, "%s", str) == 1) { - k = kh_get(str, h, str); - if (k != kh_end(h)) - bns->anns[kh_val(h, k)].is_alt = 1; + i = 0; + while ((c = fgetc(fp)) != EOF) { + if (c == '\t' || c == '\n' || c == '\r') { + str[i] = 0; + if (str[0] != '@') { + k = kh_get(str, h, str); + if (k != kh_end(h)) + bns->anns[kh_val(h, k)].is_alt = 1; + } + while (c != '\n' && c != EOF) c = fgetc(fp); + i = 0; + } else str[i++] = c; // FIXME: potential segfault here } kh_destroy(str, h); fclose(fp); diff --git a/bwa.c b/bwa.c index 43d8a58..495b197 100644 --- a/bwa.c +++ b/bwa.c @@ -239,7 +239,12 @@ bwaidx_t *bwa_idx_load(const char *hint, int which) idx = calloc(1, sizeof(bwaidx_t)); if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint); if (which & BWA_IDX_BNS) { + int i, c; idx->bns = bns_restore(prefix); + for (i = c = 0; i < idx->bns->n_seqs; ++i) + if (idx->bns->anns[i].is_alt) ++c; + if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] read %d ALT contigs\n", __func__, c); if (which & BWA_IDX_PAC) { idx->pac = calloc(idx->bns->l_pac/4+1, 1); err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence diff --git a/main.c b/main.c index db3c34f..77cace1 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r895-dirty" +#define PACKAGE_VERSION "0.7.10-r896-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From c44357926f79fa3aca409779d13aa9f957989442 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 14 Oct 2014 23:46:59 -0400 Subject: [PATCH 19/31] updated README, a bit --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3472e91..250a009 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,6 @@ do not have plan to submit it to a peer-reviewed journal in the near future. 3. [Does BWA work on reference sequences longer than 4GB in total?](#4gb) 4. [Why can one read in a pair has high mapping quality but the other has zero?](#pe0) 5. [How can a BWA-backtrack alignment stands out of the end of a chromosome?](#endref) -6. [How to map sequences to GRCh38 with ALT contigs?](#h38) ####1. What types of data does BWA work with? @@ -93,6 +92,10 @@ settings: bwa mem -x pacbio ref.fa reads.fq > aln.sam +* Oxford Nanopore reads to a reference genome: + + bwa mem -x ont2d ref.fa reads.fq > aln.sam + BWA-MEM is recommended for query sequences longer than ~70bp for a variety of error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with errors given longer query sequences as the chance of missing all seeds is small. From c5e859b49f9e999a64a511585e06001d4e6c07b0 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 15 Oct 2014 12:27:45 -0400 Subject: [PATCH 20/31] r898: read the index into a single memory block Prepare for shared memory. Not used now. --- README.md | 15 +++++------ bwa.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- bwa.h | 5 ++++ main.c | 2 +- 4 files changed, 83 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 250a009..cfc84c6 100644 --- a/README.md +++ b/README.md @@ -71,11 +71,11 @@ algorithm and setting may vary. The following list gives the recommended settings: * Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly - contigs up to a few megabases mapped to a close related reference genome: + contigs up to a few megabases mapped to a closely related reference genome: bwa mem ref.fa reads.fq > aln.sam -* Illumina single-end reads no longer than ~70bp: +* Illumina single-end reads shorter than ~70bp: bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam @@ -83,24 +83,21 @@ settings: bwa mem ref.fa read1.fq read2.fq > aln-pe.sam -* Illumina paired-end reads no longer than ~70bp: +* Illumina paired-end reads shorter than ~70bp: bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam -* PacBio subreads to a reference genome: +* PacBio subreads or Oxford Nanopore reads to a reference genome: bwa mem -x pacbio ref.fa reads.fq > aln.sam - -* Oxford Nanopore reads to a reference genome: - bwa mem -x ont2d ref.fa reads.fq > aln.sam BWA-MEM is recommended for query sequences longer than ~70bp for a variety of error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with errors given longer query sequences as the chance of missing all seeds is small. -As is shown above, with non-default settings, BWA-MEM works with PacBio subreads -with a sequencing error rate as high as ~15%. +As is shown above, with non-default settings, BWA-MEM works with Oxford Nanopore +reads with a sequencing error rate over 20%. ####2. Why does a read appear multiple times in the output SAM? diff --git a/bwa.c b/bwa.c index 495b197..e2e1376 100644 --- a/bwa.c +++ b/bwa.c @@ -259,12 +259,80 @@ bwaidx_t *bwa_idx_load(const char *hint, int which) void bwa_idx_destroy(bwaidx_t *idx) { if (idx == 0) return; - if (idx->bwt) bwt_destroy(idx->bwt); - if (idx->bns) bns_destroy(idx->bns); - if (idx->pac) free(idx->pac); + if (idx->mem == 0) { + if (idx->bwt) bwt_destroy(idx->bwt); + if (idx->bns) bns_destroy(idx->bns); + if (idx->pac) free(idx->pac); + } else free((uint8_t*)idx->mem); free(idx); } +int bwa_mem2idx(int64_t l_mem, const uint8_t *mem, bwaidx_t *idx) +{ + int64_t k = 0, x; + int i; + + // generate idx->bwt + x = sizeof(bwt_t); idx->bwt = (bwt_t*)(mem + k); k += x; + x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x; + x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x; + + // generate idx->bns and idx->pac + x = sizeof(bntseq_t); idx->bns = (bntseq_t*)(mem + k); k += x; + x = idx->bns->n_holes * sizeof(bntamb1_t); idx->bns->ambs = (bntamb1_t*)(mem + k); k += x; + x = idx->bns->n_seqs * sizeof(bntann1_t); idx->bns->anns = (bntann1_t*)(mem + k); k += x; + for (i = 0; i < idx->bns->n_seqs; ++i) { + idx->bns->anns[i].name = (char*)(mem + k); k += strlen(idx->bns->anns[i].name) + 1; + idx->bns->anns[i].anno = (char*)(mem + k); k += strlen(idx->bns->anns[i].anno) + 1; + } + idx->pac = (uint8_t*)(mem + k); + + idx->l_mem = k; idx->mem = mem; + return 0; +} + +int bwa_idx2mem(bwaidx_t *idx) +{ + int i; + int64_t k = 0, x, tmp; + uint8_t *mem = 0; + + // copy idx->bwt + mem = malloc(sizeof(bwt_t) + idx->bwt->n_sa * sizeof(bwtint_t)); + x = sizeof(bwt_t); memcpy(mem + k, idx->bwt, x); k += x; + x = idx->bwt->n_sa * sizeof(bwtint_t); memcpy(mem + k, idx->bwt->sa, x); k += x; + free(idx->bwt->sa); + mem = realloc(mem, k + idx->bwt->bwt_size * 4); + x = idx->bwt->bwt_size * 4; memcpy(mem + k, idx->bwt->bwt, x); k += x; + free(idx->bwt->bwt); + free(idx->bwt); idx->bwt = 0; + + // copy idx->bns + tmp = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t); + for (i = 0; i < idx->bns->n_seqs; ++i) // compute the size of heap-allocated memory + tmp += strlen(idx->bns->anns[i].name) + strlen(idx->bns->anns[i].anno) + 2; + mem = realloc(mem, k + sizeof(bntseq_t) + tmp); + x = sizeof(bntseq_t); memcpy(mem + k, idx->bns, x); k += x; + x = idx->bns->n_holes * sizeof(bntamb1_t); memcpy(mem + k, idx->bns->ambs, x); k += x; + free(idx->bns->ambs); + x = idx->bns->n_seqs * sizeof(bntann1_t); memcpy(mem + k, idx->bns->anns, x); k += x; + for (i = 0; i < idx->bns->n_seqs; ++i) { + x = strlen(idx->bns->anns[i].name) + 1; memcpy(mem + k, idx->bns->anns[i].name, x); k += x; + x = strlen(idx->bns->anns[i].anno) + 1; memcpy(mem + k, idx->bns->anns[i].anno, x); k += x; + free(idx->bns->anns[i].name); free(idx->bns->anns[i].anno); + } + free(idx->bns->anns); + + // copy idx->pac + x = idx->bns->l_pac/4+1; + mem = realloc(mem, k + x); + memcpy(mem + k, idx->pac, x); k += x; + free(idx->bns); idx->bns = 0; + free(idx->pac); idx->pac = 0; + + return bwa_mem2idx(k, mem, idx); +} + /*********************** * SAM header routines * ***********************/ diff --git a/bwa.h b/bwa.h index bbc2525..7e2c153 100644 --- a/bwa.h +++ b/bwa.h @@ -14,6 +14,9 @@ typedef struct { bwt_t *bwt; // FM-index bntseq_t *bns; // information on the reference sequences uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base + + int64_t l_mem; + const uint8_t *mem; } bwaidx_t; typedef struct { @@ -39,6 +42,8 @@ extern "C" { bwaidx_t *bwa_idx_load(const char *hint, int which); void bwa_idx_destroy(bwaidx_t *idx); + int bwa_idx2mem(bwaidx_t *idx); + int bwa_mem2idx(int64_t l_mem, const uint8_t *mem, bwaidx_t *idx); void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line); char *bwa_set_rg(const char *s); diff --git a/main.c b/main.c index 77cace1..9108c49 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r896-dirty" +#define PACKAGE_VERSION "0.7.10-r898-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From 6a0952948dd30bdc8151d4950ec51f5740c3cdbe Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 15 Oct 2014 14:44:08 -0400 Subject: [PATCH 21/31] shared memory --- Makefile | 2 +- bwa.c | 4 ++-- bwa.h | 8 ++++++-- main.c | 3 +++ 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 9bccbf6..8a10df6 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ CFLAGS= -g -Wall -Wno-unused-function -O2 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS AR= ar DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) -LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o +LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwashm.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o kopen.o pemerge.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ diff --git a/bwa.c b/bwa.c index e2e1376..65e3c16 100644 --- a/bwa.c +++ b/bwa.c @@ -263,11 +263,11 @@ void bwa_idx_destroy(bwaidx_t *idx) if (idx->bwt) bwt_destroy(idx->bwt); if (idx->bns) bns_destroy(idx->bns); if (idx->pac) free(idx->pac); - } else free((uint8_t*)idx->mem); + } else if (!idx->is_shm) free(idx->mem); free(idx); } -int bwa_mem2idx(int64_t l_mem, const uint8_t *mem, bwaidx_t *idx) +int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx) { int64_t k = 0, x; int i; diff --git a/bwa.h b/bwa.h index 7e2c153..094d79b 100644 --- a/bwa.h +++ b/bwa.h @@ -10,13 +10,17 @@ #define BWA_IDX_PAC 0x4 #define BWA_IDX_ALL 0x7 +#define BWA_SHM_KEY 5291 +#define BWA_SHM_SIZE 0x10000 + typedef struct { bwt_t *bwt; // FM-index bntseq_t *bns; // information on the reference sequences uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base + int is_shm; int64_t l_mem; - const uint8_t *mem; + uint8_t *mem; } bwaidx_t; typedef struct { @@ -43,7 +47,7 @@ extern "C" { bwaidx_t *bwa_idx_load(const char *hint, int which); void bwa_idx_destroy(bwaidx_t *idx); int bwa_idx2mem(bwaidx_t *idx); - int bwa_mem2idx(int64_t l_mem, const uint8_t *mem, bwaidx_t *idx); + int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx); void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line); char *bwa_set_rg(const char *s); diff --git a/main.c b/main.c index 9108c49..30ac54d 100644 --- a/main.c +++ b/main.c @@ -22,6 +22,7 @@ int bwa_bwtsw2(int argc, char *argv[]); int main_fastmap(int argc, char *argv[]); int main_mem(int argc, char *argv[]); +int main_shm(int argc, char *argv[]); int main_pemerge(int argc, char *argv[]); @@ -43,6 +44,7 @@ static int usage() fprintf(stderr, " sampe generate alignment (paired ended)\n"); fprintf(stderr, " bwasw BWA-SW for long queries\n"); fprintf(stderr, "\n"); + fprintf(stderr, " shm manage indices in shared memory\n"); fprintf(stderr, " fa2pac convert FASTA to PAC format\n"); fprintf(stderr, " pac2bwt generate BWT from PAC\n"); fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n"); @@ -81,6 +83,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1); else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1); + else if (strcmp(argv[1], "shm") == 0) ret = main_shm(argc-1, argv+1); else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1); else { fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); From bfd5e1840fa275fdf8abad6f397c35283d52ad0d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 15 Oct 2014 15:44:06 -0400 Subject: [PATCH 22/31] shm works on small files, but not large ones I don't know why. SHMMAX, SHMALL and SHMMNI are large enough. --- bwa.c | 21 +++++++++++++++------ bwa.h | 2 ++ fastmap.c | 6 +++++- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/bwa.c b/bwa.c index 65e3c16..431cc49 100644 --- a/bwa.c +++ b/bwa.c @@ -227,7 +227,7 @@ bwt_t *bwa_idx_load_bwt(const char *hint) return bwt; } -bwaidx_t *bwa_idx_load(const char *hint, int which) +bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which) { bwaidx_t *idx; char *prefix; @@ -256,6 +256,11 @@ bwaidx_t *bwa_idx_load(const char *hint, int which) return idx; } +bwaidx_t *bwa_idx_load(const char *hint, int which) +{ + return bwa_idx_load_from_disk(hint, which); +} + void bwa_idx_destroy(bwaidx_t *idx) { if (idx == 0) return; @@ -263,7 +268,10 @@ void bwa_idx_destroy(bwaidx_t *idx) if (idx->bwt) bwt_destroy(idx->bwt); if (idx->bns) bns_destroy(idx->bns); if (idx->pac) free(idx->pac); - } else if (!idx->is_shm) free(idx->mem); + } else { + free(idx->bwt); free(idx->bns->anns); free(idx->bns); + if (!idx->is_shm) free(idx->mem); + } free(idx); } @@ -273,19 +281,20 @@ int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx) int i; // generate idx->bwt - x = sizeof(bwt_t); idx->bwt = (bwt_t*)(mem + k); k += x; + x = sizeof(bwt_t); idx->bwt = malloc(x); memcpy(idx->bwt, mem + k, x); k += x; x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x; x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x; // generate idx->bns and idx->pac - x = sizeof(bntseq_t); idx->bns = (bntseq_t*)(mem + k); k += x; + x = sizeof(bntseq_t); idx->bns = malloc(x); memcpy(idx->bns, mem + k, x); k += x; x = idx->bns->n_holes * sizeof(bntamb1_t); idx->bns->ambs = (bntamb1_t*)(mem + k); k += x; - x = idx->bns->n_seqs * sizeof(bntann1_t); idx->bns->anns = (bntann1_t*)(mem + k); k += x; + x = idx->bns->n_seqs * sizeof(bntann1_t); idx->bns->anns = malloc(x); memcpy(idx->bns->anns, mem + k, x); k += x; for (i = 0; i < idx->bns->n_seqs; ++i) { idx->bns->anns[i].name = (char*)(mem + k); k += strlen(idx->bns->anns[i].name) + 1; idx->bns->anns[i].anno = (char*)(mem + k); k += strlen(idx->bns->anns[i].anno) + 1; } - idx->pac = (uint8_t*)(mem + k); + idx->pac = (uint8_t*)(mem + k); k += idx->bns->l_pac/4+1; + assert(k == l_mem); idx->l_mem = k; idx->mem = mem; return 0; diff --git a/bwa.h b/bwa.h index 094d79b..5d40aae 100644 --- a/bwa.h +++ b/bwa.h @@ -44,6 +44,8 @@ extern "C" { char *bwa_idx_infer_prefix(const char *hint); bwt_t *bwa_idx_load_bwt(const char *hint); + bwaidx_t *bwa_idx_load_from_shm(const char *hint); + bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which); bwaidx_t *bwa_idx_load(const char *hint, int which); void bwa_idx_destroy(bwaidx_t *idx); int bwa_idx2mem(bwaidx_t *idx); diff --git a/fastmap.c b/fastmap.c index 43ddfd3..e949da3 100644 --- a/fastmap.c +++ b/fastmap.c @@ -230,7 +230,11 @@ int main_mem(int argc, char *argv[]) } else update_a(opt, &opt0); bwa_fill_scmat(opt->a, opt->b, opt->mat); - if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak + idx = bwa_idx_load_from_shm(argv[optind]); + if (idx == 0) { + if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak + } else if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] load the bwa index from shared memory\n", __func__); if (ignore_alt) for (i = 0; i < idx->bns->n_seqs; ++i) idx->bns->anns[i].is_alt = 0; From 6804f17d38eb64b8d2f9b7b694e6baa7f1cc93be Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 15 Oct 2014 23:06:03 -0400 Subject: [PATCH 23/31] replaced Sys V shm with POSIX shm --- bwa.h | 3 +- bwashm.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 170 insertions(+), 2 deletions(-) create mode 100644 bwashm.c diff --git a/bwa.h b/bwa.h index 5d40aae..6fcc82e 100644 --- a/bwa.h +++ b/bwa.h @@ -10,8 +10,7 @@ #define BWA_IDX_PAC 0x4 #define BWA_IDX_ALL 0x7 -#define BWA_SHM_KEY 5291 -#define BWA_SHM_SIZE 0x10000 +#define BWA_CTL_SIZE 0x10000 typedef struct { bwt_t *bwt; // FM-index diff --git a/bwashm.c b/bwashm.c new file mode 100644 index 0000000..683d608 --- /dev/null +++ b/bwashm.c @@ -0,0 +1,169 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bwa.h" + +int bwa_shm_stage(bwaidx_t *idx, const char *hint) +{ + const char *name; + uint8_t *shm, *shm_idx; + uint16_t *cnt, i; + int shmid, to_init = 0, l; + char path[PATH_MAX + 1], *p; + + if (hint == 0 || hint[0] == 0) return -1; + for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name); + ++name; + + if ((shmid = shm_open("/bwactl", O_RDWR, 0444)) < 0) { + shmid = shm_open("/bwactl", O_CREAT|O_RDWR|O_EXCL, 0644); + to_init = 1; + } + if (shmid < 0) return -1; + ftruncate(shmid, BWA_CTL_SIZE); + shm = mmap(0, BWA_CTL_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0); + cnt = (uint16_t*)shm; + if (to_init) { + memset(shm, 0, BWA_CTL_SIZE); + cnt[1] = 4; + } else { + for (i = 0, p = (char*)shm + 4; i < cnt[0]; ++i) { + if (strcmp(p + 8, name) == 0) break; + p += 9 + strlen(p + 8); + } + if (i < cnt[0]) { + fprintf(stderr, "[W::%s] index '%s' is already in shared memory\n", __func__, name); + return -1; + } + } + + if (idx->mem == 0) bwa_idx2mem(idx); + + strcat(strcpy(path, "/bwaidx-"), name); + l = 8 + strlen(name) + 1; + if (cnt[1] + l > BWA_CTL_SIZE) return -1; + memcpy(shm + cnt[1], &idx->l_mem, 8); + memcpy(shm + cnt[1] + 8, name, l - 8); + if ((shmid = shm_open(path, O_CREAT|O_RDWR|O_EXCL, 0644)) < 0) { + shm_unlink(path); + perror("shm_open()"); + return -1; + } + cnt[1] += l; ++cnt[0]; + ftruncate(shmid, idx->l_mem); + shm_idx = mmap(0, idx->l_mem, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0); + memcpy(shm_idx, idx->mem, idx->l_mem); + free(idx->mem); + idx->mem = shm_idx; + idx->is_shm = 1; + return 0; +} + +bwaidx_t *bwa_idx_load_from_shm(const char *hint) +{ + const char *name; + uint8_t *shm, *shm_idx; + uint16_t *cnt, i; + char *p, path[PATH_MAX + 1]; + int shmid; + int64_t l_mem; + bwaidx_t *idx; + + if (hint == 0 || hint[0] == 0) return 0; + for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name); + ++name; + if ((shmid = shm_open("/bwactl", O_RDONLY, 0444)) < 0) return 0; + shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0); + cnt = (uint16_t*)shm; + if (cnt[0] == 0) return 0; + for (i = 0, p = (char*)(shm + 4); i < cnt[0]; ++i) { + memcpy(&l_mem, p, 8); p += 8; + if (strcmp(p, name) == 0) break; + p += strlen(p) + 1; + } + if (i == cnt[0]) return 0; + + strcat(strcpy(path, "/bwaidx-"), name); + if ((shmid = shm_open(path, O_RDONLY, 0444)) < 0) return 0; + shm_idx = mmap(0, l_mem, PROT_READ, MAP_SHARED, shmid, 0); + idx = calloc(1, sizeof(bwaidx_t)); + bwa_mem2idx(l_mem, shm_idx, idx); + idx->is_shm = 1; + return idx; +} + +int bwa_shm_list(void) +{ + int shmid; + uint16_t *cnt, i; + char *p, *shm; + if ((shmid = shm_open("/bwactl", O_RDONLY, 0444)) < 0) return -1; + shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0); + cnt = (uint16_t*)shm; + for (i = 0, p = shm + 4; i < cnt[0]; ++i) { + int64_t l_mem; + memcpy(&l_mem, p, 8); p += 8; + printf("%s\t%ld\n", p, (long)l_mem); + p += strlen(p) + 1; + } + return 0; +} + +int bwa_shm_destroy(void) +{ + int shmid; + uint16_t *cnt, i; + char *p, *shm; + char path[PATH_MAX + 1]; + + if ((shmid = shm_open("/bwactl", O_RDONLY, 0444)) < 0) return -1; + shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0); + cnt = (uint16_t*)shm; + for (i = 0, p = shm + 4; i < cnt[0]; ++i) { + int64_t l_mem; + memcpy(&l_mem, p, 8); p += 8; + strcat(strcpy(path, "/bwaidx-"), p); + shm_unlink(path); + p += strlen(p) + 1; + } + munmap(shm, BWA_CTL_SIZE); + shm_unlink("/bwactl"); + return 0; +} + +int main_shm(int argc, char *argv[]) +{ + int c, to_list = 0, to_drop = 0, ret = 0; + while ((c = getopt(argc, argv, "ld")) >= 0) { + if (c == 'l') to_list = 1; + else if (c == 'd') to_drop = 1; + } + if (optind == argc && !to_list && !to_drop) { + fprintf(stderr, "\nUsage: bwa shm [-d|-l] [idxbase]\n\n"); + fprintf(stderr, "Options: -d destroy all indices in shared memory\n"); + fprintf(stderr, " -l list names of indices in shared memory\n\n"); + return 1; + } + if (optind < argc && (to_list || to_drop)) { + fprintf(stderr, "[E::%s] open -l or -d cannot be used when 'idxbase' is present\n", __func__); + return 1; + } + if (optind < argc) { + bwaidx_t *idx; + idx = bwa_idx_load_from_disk(argv[optind], BWA_IDX_ALL); + if (bwa_shm_stage(idx, argv[optind]) < 0) { + fprintf(stderr, "[E::%s] failed to stage the index in shared memory\n", __func__); + ret = 1; + } + bwa_idx_destroy(idx); + } + if (to_list) bwa_shm_list(); + if (to_drop) bwa_shm_destroy(); + return ret; +} From 57436ab40517462c72cd06a4914dc4e9957e22d0 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 15 Oct 2014 23:23:01 -0400 Subject: [PATCH 24/31] added -lrt on Linux --- Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8a10df6..d0d50cf 100644 --- a/Makefile +++ b/Makefile @@ -14,6 +14,10 @@ INCLUDES= LIBS= -lm -lz -lpthread SUBDIRS= . +ifeq ($(shell uname -s),Linux) + LIBS += -lrt +endif + .SUFFIXES:.c .o .cc .c.o: @@ -52,6 +56,7 @@ bwape.o: ksw.h khash.h bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h bwase.o: bwa.h ksw.h bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h +bwashm.o: bwa.h bntseq.h bwt.h bwt.o: utils.h bwt.h kvec.h malloc_wrap.h bwt_gen.o: QSufSort.h malloc_wrap.h bwt_lite.o: bwt_lite.h malloc_wrap.h @@ -66,7 +71,6 @@ bwtsw2_core.o: khash.h ksort.h bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h bwtsw2_pair.o: malloc_wrap.h ksw.h -cutvar.o: bwa.h bntseq.h bwt.h kvec.h malloc_wrap.h example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h is.o: malloc_wrap.h From 97a3102c89dc180aa088ae1c96ba05fe80131530 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 16 Oct 2014 10:26:25 -0400 Subject: [PATCH 25/31] r903: updated revision number --- main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.c b/main.c index 30ac54d..ef6b66d 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r898-dirty" +#define PACKAGE_VERSION "0.7.10-r903-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From ad0da1418fe4181ce85a8ef117c47ca53eb6e954 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 16 Oct 2014 10:52:59 -0400 Subject: [PATCH 26/31] r904: optionally create tmp files for shm staging --- bwashm.c | 51 +++++++++++++++++++++++++++++++++++++++++---------- main.c | 2 +- 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/bwashm.c b/bwashm.c index 683d608..4d60350 100644 --- a/bwashm.c +++ b/bwashm.c @@ -9,13 +9,13 @@ #include #include "bwa.h" -int bwa_shm_stage(bwaidx_t *idx, const char *hint) +int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn) { const char *name; uint8_t *shm, *shm_idx; uint16_t *cnt, i; int shmid, to_init = 0, l; - char path[PATH_MAX + 1], *p; + char path[PATH_MAX + 1], *p, *tmpfn = (char*)_tmpfn; if (hint == 0 || hint[0] == 0) return -1; for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name); @@ -45,6 +45,22 @@ int bwa_shm_stage(bwaidx_t *idx, const char *hint) if (idx->mem == 0) bwa_idx2mem(idx); + if (tmpfn) { + FILE *fp; + if ((fp = fopen(tmpfn, "wb")) != 0) { + int64_t rest = idx->l_mem; + while (rest > 0) { + int64_t l = rest < 0x1000000? rest : 0x1000000; + rest -= fwrite(&idx->mem[idx->l_mem - rest], 1, l, fp); + } + fclose(fp); + free(idx->mem); idx->mem = 0; + } else { + fprintf(stderr, "[W::%s] fail to create the temporary file. Option '-f' is ignored.\n", __func__); + tmpfn = 0; + } + } + strcat(strcpy(path, "/bwaidx-"), name); l = 8 + strlen(name) + 1; if (cnt[1] + l > BWA_CTL_SIZE) return -1; @@ -58,9 +74,21 @@ int bwa_shm_stage(bwaidx_t *idx, const char *hint) cnt[1] += l; ++cnt[0]; ftruncate(shmid, idx->l_mem); shm_idx = mmap(0, idx->l_mem, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0); - memcpy(shm_idx, idx->mem, idx->l_mem); - free(idx->mem); - idx->mem = shm_idx; + if (tmpfn) { + FILE *fp; + fp = fopen(tmpfn, "rb"); + int64_t rest = idx->l_mem; + while (rest > 0) { + int64_t l = rest < 0x1000000? rest : 0x1000000; + rest -= fread(&shm_idx[idx->l_mem - rest], 1, l, fp); + } + fclose(fp); + unlink(tmpfn); + } else { + memcpy(shm_idx, idx->mem, idx->l_mem); + free(idx->mem); + } + bwa_mem2idx(idx->l_mem, shm_idx, idx); idx->is_shm = 1; return 0; } @@ -140,14 +168,17 @@ int bwa_shm_destroy(void) int main_shm(int argc, char *argv[]) { int c, to_list = 0, to_drop = 0, ret = 0; - while ((c = getopt(argc, argv, "ld")) >= 0) { + char *tmpfn = 0; + while ((c = getopt(argc, argv, "ldf:")) >= 0) { if (c == 'l') to_list = 1; else if (c == 'd') to_drop = 1; + else if (c == 'f') tmpfn = optarg; } if (optind == argc && !to_list && !to_drop) { - fprintf(stderr, "\nUsage: bwa shm [-d|-l] [idxbase]\n\n"); - fprintf(stderr, "Options: -d destroy all indices in shared memory\n"); - fprintf(stderr, " -l list names of indices in shared memory\n\n"); + fprintf(stderr, "\nUsage: bwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n"); + fprintf(stderr, "Options: -d destroy all indices in shared memory\n"); + fprintf(stderr, " -l list names of indices in shared memory\n"); + fprintf(stderr, " -f FILE temporary file to reduce peak memory\n\n"); return 1; } if (optind < argc && (to_list || to_drop)) { @@ -157,7 +188,7 @@ int main_shm(int argc, char *argv[]) if (optind < argc) { bwaidx_t *idx; idx = bwa_idx_load_from_disk(argv[optind], BWA_IDX_ALL); - if (bwa_shm_stage(idx, argv[optind]) < 0) { + if (bwa_shm_stage(idx, argv[optind], tmpfn) < 0) { fprintf(stderr, "[E::%s] failed to stage the index in shared memory\n", __func__); ret = 1; } diff --git a/main.c b/main.c index ef6b66d..994b6ba 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r903-dirty" +#define PACKAGE_VERSION "0.7.10-r904-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From e318d8e7e5c86988f2d928f7196ba96c0e453305 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 16 Oct 2014 11:22:09 -0400 Subject: [PATCH 27/31] r905: lower peak RAM for "shm -f" --- bwa.c | 17 ++++++++--------- main.c | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/bwa.c b/bwa.c index 431cc49..30e5284 100644 --- a/bwa.c +++ b/bwa.c @@ -282,8 +282,8 @@ int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx) // generate idx->bwt x = sizeof(bwt_t); idx->bwt = malloc(x); memcpy(idx->bwt, mem + k, x); k += x; - x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x; x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x; + x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x; // generate idx->bns and idx->pac x = sizeof(bntseq_t); idx->bns = malloc(x); memcpy(idx->bns, mem + k, x); k += x; @@ -303,17 +303,16 @@ int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx) int bwa_idx2mem(bwaidx_t *idx) { int i; - int64_t k = 0, x, tmp; - uint8_t *mem = 0; + int64_t k, x, tmp; + uint8_t *mem; // copy idx->bwt - mem = malloc(sizeof(bwt_t) + idx->bwt->n_sa * sizeof(bwtint_t)); - x = sizeof(bwt_t); memcpy(mem + k, idx->bwt, x); k += x; - x = idx->bwt->n_sa * sizeof(bwtint_t); memcpy(mem + k, idx->bwt->sa, x); k += x; + x = idx->bwt->bwt_size * 4; + mem = realloc(idx->bwt->bwt, sizeof(bwt_t) + x); idx->bwt->bwt = 0; + memmove(mem + sizeof(bwt_t), mem, x); + memcpy(mem, idx->bwt, sizeof(bwt_t)); k = sizeof(bwt_t) + x; + x = idx->bwt->n_sa * sizeof(bwtint_t); mem = realloc(mem, k + x); memcpy(mem + k, idx->bwt->sa, x); k += x; free(idx->bwt->sa); - mem = realloc(mem, k + idx->bwt->bwt_size * 4); - x = idx->bwt->bwt_size * 4; memcpy(mem + k, idx->bwt->bwt, x); k += x; - free(idx->bwt->bwt); free(idx->bwt); idx->bwt = 0; // copy idx->bns diff --git a/main.c b/main.c index 994b6ba..6350849 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r904-dirty" +#define PACKAGE_VERSION "0.7.10-r905-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From d8d8b230d182974e59cead37127716f378f0da69 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 16 Oct 2014 15:15:23 -0400 Subject: [PATCH 28/31] r906: don't reduce non-ALT mapQ by default --- bwamem.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 79785f4..08588d5 100644 --- a/bwamem.c +++ b/bwamem.c @@ -79,7 +79,7 @@ mem_opt_t *mem_opt_init() o->min_chain_weight = 0; o->max_chain_extend = 1<<30; o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len); - o->min_pa_ratio = 0.8; + o->min_pa_ratio = 0; bwa_fill_scmat(o->a, o->b, o->mat); return o; } diff --git a/main.c b/main.c index 6350849..9eb3dbc 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r905-dirty" +#define PACKAGE_VERSION "0.7.10-r906-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From 76a365a95fe0c1b79d5b3fc46489b4c31e28ec31 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 16 Oct 2014 15:56:33 -0400 Subject: [PATCH 29/31] r907: revert to -g.8 by default --- bwamem.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 08588d5..79785f4 100644 --- a/bwamem.c +++ b/bwamem.c @@ -79,7 +79,7 @@ mem_opt_t *mem_opt_init() o->min_chain_weight = 0; o->max_chain_extend = 1<<30; o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len); - o->min_pa_ratio = 0; + o->min_pa_ratio = 0.8; bwa_fill_scmat(o->a, o->b, o->mat); return o; } diff --git a/main.c b/main.c index 9eb3dbc..313278c 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r906-dirty" +#define PACKAGE_VERSION "0.7.10-r907-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From fab1f2b561ef67670116010236013d945ef343ba Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 16 Oct 2014 17:15:39 -0400 Subject: [PATCH 30/31] README on ALT mapping --- README-alt.md | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 README-alt.md diff --git a/README-alt.md b/README-alt.md new file mode 100644 index 0000000..7a578f9 --- /dev/null +++ b/README-alt.md @@ -0,0 +1,123 @@ +## Getting Started + +Since version 0.7.11, BWA-MEM supports read mapping against a reference genome +with long alternative haplotypes present in separate ALT contigs. To use the +ALT-aware mode, users need to provide pairwise ALT-to-reference alignment in the +SAM format and rename the file to ""*idxbase*.alt". For GRCh38, this alignment is +available from the [BWA resource bundle for GRCh38][res]. + +### Option 1: Mapping to the official GRCh38 with ALT contigs + +Construct the index: +```sh +wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz +gzip -d GCA_000001405.15_GRCh38_full_analysis_set.fna.gz +mv GCA_000001405.15_GRCh38_full_analysis_set.fna hs38a.fa +bwa index hs38a.fa +cp bwa-hs38-res/hs38d4.fa.alt hs38a.fa.alt +``` + +Perform mapping: +```sh +bwa mem hs38a.fa read1.fq read2.fq \ + | samblaster \ + | bwa-hs38-res/k8-linux bwa-postalt.js hs38a.fa.alt \ + | samtools view -bS - > aln.unsrt.bam +``` +For short reads, the postprocessing script `bwa-postalt.js` runs at about the +same speed as BAM compression. + +### Option 2: Mapping to the collection of GRCh38, decoy and HLA genes + +Construct the index: +```sh +cat hs38a.fa bwa-hs38-res/hs38d4-extra.fa > hs38d4.fa +bwa index hs38d4.fa +cp bwa-hs38-res/hs38d4.fa.alt . +``` +Perform mapping: +```sh +bwa mem -g.8 hs38d4.fa read1.fq read2.fq \ + | samblaster \ + | bwa-hs38-res/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \ + | samtools view -bS - > aln.unsrt.bam +``` +This command line generates `postinfo.ctw` which loosely evaluates the presence +of an ALT contig with an empirical score at the last column. + +## Background + +GRCh38 ALT contigs are totaled 109Mb in length, spanning 60Mbp genomic regions. +However, sequences that are highly diverged from the primary assembly only +contribute a few million bp. Most subsequences of ALT contigs are highly similar +or identical to the primary assembly. If we align sequence reads to GRCh38+ALT +treating ALT equal to the primary assembly, we will get many reads with zero +mapping quality and lose variants on them. It is crucial to make the mapper +aware of ALTs. + +BWA-MEM is designed to minimize the interference of ALT contigs such that on the +primary assembly, the ALT-aware alignment is highly similar to the alignment +without using ALT contigs in the index. This design choice makes it almost +always safe to map reads to GRCh38+ALT. Although we don't know yet how much +variations on ALT contigs contribute to phenotypes, we would not get the answer +without mapping large cohorts to these extra sequences. We hope our current +implementation encourages researchers to use ALT contigs soon and often. + +## Methods + +As of now, ALT mapping is done in two separate steps: BWA-MEM mapping and +postprocessing. + +### Step 1: BWA-MEM mapping + +At this step, BWA-MEM reads the ALT contig names from "*idxbase*.alt", ignoring +the ALT-to-ref alignment, and labels a potential hit as *ALT* or *non-ALT*, +depending on whether the hit lands on an ALT contig or not. BWA-MEM then reports +alignments and assigns mapQ following these two rules: + + * The original mapQ of a non-ALT hit is computed across non-ALT hits only. + The reported mapQ of an ALT hit is computed across all hits. + + * An ALT hit is only reported if its score is strictly better than all + overlapping non-ALT hits. A reported ALT hit is flagged with 0x800 + (supplementary) unless there are no non-ALT hits. + +When option `-g FLOAT` is in use (which is the default), a third rule kicks in: + + * The mapQ of a non-ALT hit is reduced to zero if its score is less than FLOAT + times the score of an overlapping ALT hit. In this case, the original mapQ is + moved to the `om` tag. + +If we don't care about ALT hits, we may actually skip postprocessing (step 2). +Nonetheless, postprocessing is recommended as it improves mapQ and gives more +information about ALT hits. + +### Step 2: Postprocessing + +Postprocessing is done with a separate script `bwa-postalt.js`. It reads all +potential hits reported in the XA tag, lifts ALT hits to the chromosomal +positions using the ALT-to-ref alignment, groups them after lifting and then +reassigns mapQ based on the best scoring hit in each group with all the hits in +a group get the same mapQ. Knowing the ALT-to-ref alignment, this script can +greatly improve mapQ of ALT hits and occasionally improve mapQ of non-ALT hits. + +The script also measures the presence of each ALT contig. For a group of +overlapping ALT contigs c_1, ..., c_m, the weight for c_k equals `\frac{\sum_j +P(c_k|r_j)}{\sum_j\max_i P(c_i|r_j)}`, where `P(c_k|r)=\frac{pow(4,s_k)}{\sum_i +pow(4,s_i)}` is the posterior of c_k given a read r mapped to it with a +Smith-Waterman score s_k. This weight is reported in `postinfo.ctw` in the +option 2 above. + +## Problems and Future Development + +There are some uncertainties about ALT mappings - we are not sure whether they +help biological discovery and don't know the best way to analyze them. Without +clear demand from downstream analyses, it is very difficult to design the +optimal mapping strategy. The current BWA-MEM method is just a start. If it +turns out to be useful in research, we will probably rewrite bwa-postalt.js in C +for performance; if not, we will try new designs. It is also possible that we +may make breakthrough on the representation of multiple genomes, in which case, +we can even get rid of ALT contigs once for all. + +[res]: https://sourceforge.net/projects/bio-bwa/files/ +[sb]: https://github.com/GregoryFaust/samblaster From 67c2d69218dcaae81b1d7c66a1b8d63dab0a5c5b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 16 Oct 2014 17:17:02 -0400 Subject: [PATCH 31/31] made the header smaller --- README-alt.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README-alt.md b/README-alt.md index 7a578f9..8dac185 100644 --- a/README-alt.md +++ b/README-alt.md @@ -6,7 +6,7 @@ ALT-aware mode, users need to provide pairwise ALT-to-reference alignment in the SAM format and rename the file to ""*idxbase*.alt". For GRCh38, this alignment is available from the [BWA resource bundle for GRCh38][res]. -### Option 1: Mapping to the official GRCh38 with ALT contigs +#### Option 1: Mapping to the official GRCh38 with ALT contigs Construct the index: ```sh @@ -27,7 +27,7 @@ bwa mem hs38a.fa read1.fq read2.fq \ For short reads, the postprocessing script `bwa-postalt.js` runs at about the same speed as BAM compression. -### Option 2: Mapping to the collection of GRCh38, decoy and HLA genes +#### Option 2: Mapping to the collection of GRCh38, decoy and HLA genes Construct the index: ```sh @@ -68,7 +68,7 @@ implementation encourages researchers to use ALT contigs soon and often. As of now, ALT mapping is done in two separate steps: BWA-MEM mapping and postprocessing. -### Step 1: BWA-MEM mapping +#### Step 1: BWA-MEM mapping At this step, BWA-MEM reads the ALT contig names from "*idxbase*.alt", ignoring the ALT-to-ref alignment, and labels a potential hit as *ALT* or *non-ALT*, @@ -92,7 +92,7 @@ If we don't care about ALT hits, we may actually skip postprocessing (step 2). Nonetheless, postprocessing is recommended as it improves mapQ and gives more information about ALT hits. -### Step 2: Postprocessing +#### Step 2: Postprocessing Postprocessing is done with a separate script `bwa-postalt.js`. It reads all potential hits reported in the XA tag, lifts ALT hits to the chromosomal