From 6f37c14f26dc7015fcb3e4ee7e5f0aa8efcfefbc Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 16 Sep 2014 18:52:49 -0400 Subject: [PATCH 1/3] r848: tag alignments with primary ALT --- bwamem.c | 4 ++++ bwamem.h | 2 +- main.c | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 64c26fa..1699e97 100644 --- a/bwamem.c +++ b/bwamem.c @@ -880,11 +880,13 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq for (i = 0; i < n; ++i) if (i != which && !(list[i].flag&0x100)) break; if (i < n) { // there are other primary hits; output them + int has_pri_alt = 0; kputsn("\tSA:Z:", 6, str); for (i = 0; i < n; ++i) { const mem_aln_t *r = &list[i]; int k; if (i == which || (list[i].flag&0x100)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit + if (list[i].is_alt) has_pri_alt = 1; kputs(bns->anns[r->rid].name, str); kputc(',', str); kputl(r->pos+1, str); kputc(',', str); kputc("+-"[r->is_rev], str); kputc(',', str); @@ -895,6 +897,7 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq kputc(',', str); kputw(r->NM, str); kputc(';', str); } + if (has_pri_alt) kputsn("\tpa:A:Y", 7, str); } } if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); } @@ -1094,6 +1097,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * assert(a.rid == ar->rid); a.pos = pos - bns->anns[a.rid].offset; a.score = ar->score; a.sub = ar->sub > ar->csub? ar->sub : ar->csub; + a.is_alt = ar->is_alt; free(query); return a; } diff --git a/bwamem.h b/bwamem.h index 8b8ec0d..20e8d05 100644 --- a/bwamem.h +++ b/bwamem.h @@ -85,7 +85,7 @@ typedef struct { // This struct is only used for the convenience of API. int64_t pos; // forward strand 5'-end mapping position int rid; // reference sequence index in bntseq_t; <0 for unmapped int flag; // extra flag - uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance + uint32_t is_rev:1, is_alt:1, mapq:8, NM:22; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance int n_cigar; // number of CIGAR operations uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234 char *XA; // alternative mappings diff --git a/main.c b/main.c index 35e598b..ca71750 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r845-dirty" +#define PACKAGE_VERSION "0.7.10-r848-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From 825ae92e58527c8eff4523f1cb970dd08459a3dd Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 17 Sep 2014 13:05:35 -0400 Subject: [PATCH 2/3] r849: the pa tag now gives a number ... which is the ratio of this hit to the best ALT hit. --- NEWS.md | 6 ++++-- bwamem.c | 7 ++++--- bwamem_extra.c | 8 +------- main.c | 2 +- 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/NEWS.md b/NEWS.md index cc7946c..095b2de 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,8 +12,10 @@ having ALT contigs almost has not effect on alignments to the primary assembly (seeding may be affected in rare corner cases). At the same time, users may get a primary alignment to ALT contigs (no 0x800 flag) if there are no good hits to the primary assembly, or get a supplementary alignment to ALT contigs -if it is better than hits to the primary assembly. Since this release, it is -recommended to include ALT contigs. +if it is better than hits to the primary assembly. In the latter case, the +`pa:f` tag shows the ratio of the primary hit score to the best ALT hit score, +which is no greater than 1. Since this release, it is recommended to include +ALT contigs. Users may consider to use ALT contigs from GRCh38. I am also constructing a non-redundant and more complete set of sequences missing from GRCh38. diff --git a/bwamem.c b/bwamem.c index 1699e97..533565b 100644 --- a/bwamem.c +++ b/bwamem.c @@ -880,13 +880,13 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq for (i = 0; i < n; ++i) if (i != which && !(list[i].flag&0x100)) break; if (i < n) { // there are other primary hits; output them - int has_pri_alt = 0; + int pri_alt_sc = -1; kputsn("\tSA:Z:", 6, str); for (i = 0; i < n; ++i) { const mem_aln_t *r = &list[i]; int k; if (i == which || (list[i].flag&0x100)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit - if (list[i].is_alt) has_pri_alt = 1; + if (list[i].is_alt) pri_alt_sc = pri_alt_sc > r->score? pri_alt_sc : r->score; kputs(bns->anns[r->rid].name, str); kputc(',', str); kputl(r->pos+1, str); kputc(',', str); kputc("+-"[r->is_rev], str); kputc(',', str); @@ -897,7 +897,8 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq kputc(',', str); kputw(r->NM, str); kputc(';', str); } - if (has_pri_alt) kputsn("\tpa:A:Y", 7, str); + if (pri_alt_sc > 0) + ksprintf(str, "\tpa:f:%.3f", (double)p->score / pri_alt_sc); } } if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); } diff --git a/bwamem_extra.c b/bwamem_extra.c index 45289ea..9e9122d 100644 --- a/bwamem_extra.c +++ b/bwamem_extra.c @@ -59,19 +59,13 @@ void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv) const bwtintv_v *smem_next(smem_i *itr) { - int i, max, max_i, ori_start; + int ori_start; itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0; if (itr->start >= itr->len || itr->start < 0) return 0; while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases if (itr->start == itr->len) return 0; ori_start = itr->start; itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_len, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM - if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here - for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match - bwtintv_t *p = &itr->matches->a[i]; - int len = (uint32_t)p->info - (p->info>>32); - if (max < len) max = len, max_i = i; - } return itr->matches; } diff --git a/main.c b/main.c index ca71750..123f0d4 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r848-dirty" +#define PACKAGE_VERSION "0.7.10-r849-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From f5a0e576e6a40a8ec6fcb65b4829c5d999d18927 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 17 Sep 2014 14:18:40 -0400 Subject: [PATCH 3/3] added postalt to bwa-helper for post processing --- bwa-helper.js | 223 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 156 insertions(+), 67 deletions(-) diff --git a/bwa-helper.js b/bwa-helper.js index fc738e9..daa1dd8 100644 --- a/bwa-helper.js +++ b/bwa-helper.js @@ -103,6 +103,8 @@ Bytes.prototype.revcomp = function() this[this.length>>1] = Bytes.rctab[this[this.length>>1]]; } +var re_cigar = /(\d+)([MIDSHN])/g; + /************************ *** command markovlp *** ************************/ @@ -432,80 +434,65 @@ function intv_ovlp(intv, bits) // interval index } } -function bwa_genalt(args) +function cigar2pos(cigar, pos) // given a pos on ALT and the ALT-to-REF CIGAR, find the pos on REF { - var re_cigar = /(\d+)([MIDSHN])/g; - - function cigar2pos(cigar, pos) // given a pos on ALT and the ALT-to-REF CIGAR, find the pos on REF - { - var x = 0, y = 0; - for (var i = 0; i < cigar.length; ++i) { - var op = cigar[i][0], len = cigar[i][1]; - if (op == 'M') { - if (y <= pos && pos < y + len) - return x + (pos - y); - x += len, y += len; - } else if (op == 'D') { - x += len; - } else if (op == 'I') { - if (y <= pos && pos < y + len) - return x; - y += len; - } else if (op == 'S' || op == 'H') { - if (y <= pos && pos < y + len) - return -1; - y += len; - } + var x = 0, y = 0; + for (var i = 0; i < cigar.length; ++i) { + var op = cigar[i][0], len = cigar[i][1]; + if (op == 'M') { + if (y <= pos && pos < y + len) + return x + (pos - y); + x += len, y += len; + } else if (op == 'D') { + x += len; + } else if (op == 'I') { + if (y <= pos && pos < y + len) + return x; + y += len; + } else if (op == 'S' || op == 'H') { + if (y <= pos && pos < y + len) + return -1; + y += len; } - return -1; } + return -1; +} - function parse_hit(s, opt) // parse a hit. s looks something like ["chr1", "+12345", "100M", 5] - { - var h = {}; - h.ctg = s[0]; - h.start = parseInt(s[1].substr(1)) - 1; - h.rev = (s[1].charAt(0) == '-'); - h.cigar = s[2]; - h.NM = parseInt(s[3]); - h.hard = false; - var m, l_ins, n_ins, l_del, n_del, l_match, l_skip, l_clip; - l_ins = l_del = n_ins = n_del = l_match = l_skip = l_clip = 0; - while ((m = re_cigar.exec(h.cigar)) != null) { - var l = parseInt(m[1]); - if (m[2] == 'M') l_match += l; - else if (m[2] == 'D') ++n_del, l_del += l; - else if (m[2] == 'I') ++n_ins, l_ins += l; - else if (m[2] == 'N') l_skip += l; - else if (m[2] == 'H' || m[2] == 'S') { - l_clip += l; - if (m[2] == 'H') h.hard = true; - } +function parse_hit(s, opt) // parse a hit. s looks something like ["chr1", "+12345", "100M", 5] +{ + var h = {}; + h.ctg = s[0]; + h.start = parseInt(s[1].substr(1)) - 1; + h.rev = (s[1].charAt(0) == '-'); + h.cigar = s[2]; + h.NM = parseInt(s[3]); + h.hard = false; + var m, l_ins, n_ins, l_del, n_del, l_match, l_skip, l_clip; + l_ins = l_del = n_ins = n_del = l_match = l_skip = l_clip = 0; + while ((m = re_cigar.exec(h.cigar)) != null) { + var l = parseInt(m[1]); + if (m[2] == 'M') l_match += l; + else if (m[2] == 'D') ++n_del, l_del += l; + else if (m[2] == 'I') ++n_ins, l_ins += l; + else if (m[2] == 'N') l_skip += l; + else if (m[2] == 'H' || m[2] == 'S') { + l_clip += l; + if (m[2] == 'H') h.hard = true; } - h.end = h.start + l_match + l_del + l_skip; - h.NM = h.NM > l_del + l_ins? h.NM : l_del + l_ins; - h.score = Math.floor((opt.a * l_match - (opt.a + opt.b) * (h.NM - l_del - l_ins) - opt.o * (n_del + n_ins) - opt.e * (l_del + l_ins)) / opt.a + .499); - h.l_query = l_match + l_ins + l_clip; - return h; } + h.end = h.start + l_match + l_del + l_skip; + h.NM = h.NM > l_del + l_ins? h.NM : l_del + l_ins; + h.score = Math.floor((opt.a * l_match - (opt.a + opt.b) * (h.NM - l_del - l_ins) - opt.o * (n_del + n_ins) - opt.e * (l_del + l_ins)) / opt.a + .499); + h.l_query = l_match + l_ins + l_clip; + return h; +} - var c, opt = { a:1, b:4, o:6, e:1, verbose:3 }; - - while ((c = getopt(args, 'v:')) != null) { - if (c == 'v') opt.verbose = parseInt(getopt.arg); - } - - if (args.length == getopt.ind) { - print("Usage: k8 bwa-helper.js genalt [aln.sam]"); - exit(1); - } - - var file, buf = new Bytes(); - var aux = new Bytes(); // used for reverse and reverse complement - - // read the ALT-to-REF alignment and generate the index +// read the ALT-to-REF alignment and generate the index +function read_ALT_sam(fn) +{ var intv = {}; - file = new File(args[getopt.ind]); + var file = new File(fn); + var buf = new Bytes(); while (file.readline(buf) >= 0) { var line = buf.toString(); var t = line.split("\t"); @@ -524,11 +511,31 @@ function bwa_genalt(args) intv[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, parseInt(t[3]) - 1, cigar]); //print(start, start + l_qaln, t[2], flag&16? true : false, parseInt(t[3]), cigar); } + buf.destroy(); file.close(); // create the interval index var idx = {}; for (var ctg in intv) idx[ctg] = intv_ovlp(intv[ctg]); + return idx; +} + +function bwa_genalt(args) +{ + var c, opt = { a:1, b:4, o:6, e:1, verbose:3 }; + + while ((c = getopt(args, 'v:')) != null) { + if (c == 'v') opt.verbose = parseInt(getopt.arg); + } + + if (args.length == getopt.ind) { + print("Usage: k8 bwa-helper.js genalt [aln.sam]"); + exit(1); + } + + var file, buf = new Bytes(); + var aux = new Bytes(); // used for reverse and reverse complement + var idx = read_ALT_sam(args[getopt.ind]); // process SAM file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File(); @@ -682,6 +689,86 @@ function bwa_genalt(args) buf.destroy(); } +// This is in effect a simplified version of bwa_genalt(). +function bwa_postalt(args) +{ + var c, idx = {}, opt = { verbose:3, min_pa:0.8 }; + while ((c = getopt(args, 'v:r:a:')) != null) { + if (c == 'v') opt.verbose = parseInt(getopt.arg); + else if (c == 'r') opt.min_pa = parseFloat(getopt.arg); + else if (c == 'a') idx = read_ALT_sam(getopt.arg); + } + + if (args.length == getopt.ind) { + print("Usage: k8 bwa-helper.js postalt [-r "+opt.min_pa+"] [-a alt.sam] [aln.sam]"); + exit(1); + } + + // process SAM + var file = args.length - getopt.ind >= 1? new File(args[getopt.ind]) : new File(); + var buf = new Bytes(); + while (file.readline(buf) >= 0) { + var m, line = buf.toString(); + if (line.charAt(0) == '@') { + print(line); + continue; + } + + var t = line.split("\t"); + var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1]; + var flag = parseInt(t[1]); + var h = parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt); + + // lift mapping positions to coordinates on the primary assembly + { + var a = null; + if (idx[h.ctg] != null) + a = idx[h.ctg](h.start, h.end); + if (a == null) a = []; + + // find the approximate position on the primary assembly + var lifted = []; + for (var j = 0; j < a.length; ++j) { + var s, e; + if (!a[j][4]) { // ALT is mapped to the forward strand of the primary assembly + s = cigar2pos(a[j][6], h.start); + e = cigar2pos(a[j][6], h.end - 1) + 1; + } else { + s = cigar2pos(a[j][6], a[j][2] - h.end); + e = cigar2pos(a[j][6], a[j][2] - h.start - 1) + 1; + } + if (s < 0 || e < 0) continue; // read is mapped to clippings in the ALT-to-chr alignment + s += a[j][5]; e += a[j][5]; + lifted.push([a[j][3], (h.rev!=a[j][4]), s, e]); + } + h.lifted = lifted; + } + + // update mapQ if necessary + var ori_mapQ = null; + if ((m = /\tpa:f:(\d+\.\d+)/.exec(line)) != null) { + if (parseFloat(m[1]) < opt.min_pa) + ori_mapQ = t[4], t[4] = 0; + } + + // generate lifted_str + if (h.lifted && h.lifted.length) { + var lifted = h.lifted; + var u = ''; + for (var j = 0; j < lifted.length; ++j) + u += lifted[j][0] + "," + lifted[j][2] + "," + lifted[j][3] + "," + (lifted[j][1]?'-':'+') + ";"; + h.lifted_str = u; + } else h.lifted_str = null; + + // print + if (ori_mapQ != null) t.push("om:i:"+ori_mapQ); + if (h.lifted_str) t.push("lt:Z:" + h.lifted_str); + print(t.join("\t")); + } + buf.destroy(); + file.close(); +} + /********************* *** Main function *** *********************/ @@ -690,7 +777,8 @@ function main(args) { if (args.length == 0) { print("\nUsage: k8 bwa-helper.js [arguments]\n"); - print("Commands: genalt generate ALT alignments"); + print("Commands: postalt post process ALT-aware alignments"); + print(" genalt generate ALT alignments for ALT-unaware alignments"); print(" sam2pas convert SAM to pairwise alignment summary format (PAS)"); print(" pas2reg extract covered regions"); print(" reg2cut regions to extract for the 2nd round bwa-mem"); @@ -708,6 +796,7 @@ function main(args) else if (cmd == 'pas2reg') bwa_pas2reg(args); else if (cmd == 'reg2cut') bwa_reg2cut(args); else if (cmd == 'genalt') bwa_genalt(args); + else if (cmd == 'postalt') bwa_postalt(args); else if (cmd == 'shortname') bwa_shortname(args); else warn("Unrecognized command"); }