From e5355fe3a055307aaf8f6868770573a6c6aef0b4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 14 Mar 2013 22:01:26 -0400 Subject: [PATCH 1/4] r364: bug in mem pairing (no effect with -A=1) Forgot to adjust for matching score. This bug has no effect when -A takes the default value. --- bwamem_pair.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 51a844b..23c99ef 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -201,7 +201,7 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ if (dist > pes[dir].high) break; if (dist < pes[dir].low) continue; ns = (dist - pes[dir].avg) / pes[dir].std; - q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. * erfc(fabs(ns) * M_SQRT1_2)) + .499); // .721 = 1/log(4) + q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. * erfc(fabs(ns) * M_SQRT1_2)) * opt->a + .499); // .721 = 1/log(4) if (q < 0) q = 0; p = kv_pushp(pair64_t, u); p->y = (uint64_t)k<<32 | i; diff --git a/main.c b/main.c index c5b84be..905161f 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.2-r363-beta" +#define PACKAGE_VERSION "0.7.2-r364-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From dd511778374b100a98be761e698b2ba5be2bfa4b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 15 Mar 2013 11:59:05 -0400 Subject: [PATCH 2/4] r365: bugfix - wrong alignment (right mapping) The bug only happens when there is a 1bp del and 1bp ins which are close to the end and there are no other substitutions or indels. In this case, bwa mem gave a wrong band width. --- bwamem.c | 6 +++--- main.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bwamem.c b/bwamem.c index b6a32f7..da9f2be 100644 --- a/bwamem.c +++ b/bwamem.c @@ -556,7 +556,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int int prev = a->score; aw[0] = opt->w << i; a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); - if (bwa_verbose >= 4) printf("L\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); + if (bwa_verbose >= 4) { printf("L\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); } if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; } // check whether we prefer to reach the end of the query @@ -574,7 +574,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int int prev = a->score; aw[1] = opt->w << i; a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]); - if (bwa_verbose >= 4) printf("R\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); + if (bwa_verbose >= 4) { printf("R\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); } if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; } // similar to the above @@ -601,7 +601,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int static inline int infer_bw(int l1, int l2, int score, int a, int q, int r) { int w; - if (l1 == l2 && l1 * a - score < (q + r)<<1) return 0; // to get equal alignment length, we need at least two gaps + if (l1 == l2 && l1 * a - score < (q + r - a)<<1) return 0; // to get equal alignment length, we need at least two gaps w = ((double)((l1 < l2? l1 : l2) * a - score - q) / r + 1.); if (w < abs(l1 - l2)) w = abs(l1 - l2); return w; diff --git a/main.c b/main.c index 905161f..4761bc9 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.2-r364-beta" +#define PACKAGE_VERSION "0.7.2-r365-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 7dec00c217268d1eebf9dd0fc42c73aa91800695 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 15 Mar 2013 12:51:53 -0400 Subject: [PATCH 3/4] Release BWA-0.7.3-r366 --- NEWS | 31 +++++++++++++++++++++++++++++++ bwa.1 | 4 ++-- main.c | 2 +- 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index 7474726..788ae40 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,34 @@ +Release 0.7.3 (15 March, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Changes to BWA-MEM: + + * Bugfix: pairing score is inaccurate when option -A does not take the default + value. This is a very minor issue even if it happens. + + * Bugfix: occasionally wrong CIGAR. This happens when in the alignment there + is a 1bp deletion and a 1bp insertion which are close to the end of the + reads, and there are no other substitutions or indels. BWA-MEM would not do + a gapped alignment due to the bug. + + * New feature: output other non-overlapping alignments in the XP tag such that + we can see the entire picture of alignment from one SAM line. XP gives the + position, CIGAR, NM and mapQ of each aligned subsequence of the query. + +BWA-MEM has been used to align ~300Gbp 100-700bp SE/PE reads. SNP/indel calling +has also been evaluated on part of these data. BWA-MEM generally gives better +pre-filtered SNP calls than BWA. No significant issues have been observed since +0.7.2, though minor improvements or bugs (e.g. the bug fixed in this release) +are still possible. If you find potential issues, please send bug reports to + (free registration required). + +In addition, more detailed description of the BWA-MEM algorithm can be found at +. + +(0.7.3: 15 March 2013, r366) + + + Release 0.7.2 (9 March, 2013) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bwa.1 b/bwa.1 index f01dce1..a02aebe 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "13 March 2013" "bwa-0.7.3" "Bioinformatics tools" +.TH bwa 1 "15 March 2013" "bwa-0.7.3" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool @@ -580,7 +580,7 @@ XS Suboptimal alignment score XF Support from forward/reverse alignment XE Number of supporting seeds _ -XP Alt primary hits; format: /(chr,pos,CIGAR;mapQ,NM;)+/ +XP Alt primary hits; format: /(chr,pos,CIGAR,mapQ,NM;)+/ .TE .PP diff --git a/main.c b/main.c index 4761bc9..a03c49f 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.2-r365-beta" +#define PACKAGE_VERSION "0.7.3-r366" #endif int bwa_fa2pac(int argc, char *argv[]); From 9346acde1b2c930e13948b7cb43076f2ebc63dfc Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 15 Mar 2013 21:26:37 -0400 Subject: [PATCH 4/4] Release bwa-0.7.3a-r367 In 0.7.3, the wrong CIGAR bug was only fixed in one scenario, but not fixed in another corner case. --- NEWS | 10 ++++++++++ bwa.1 | 2 +- bwamem.c | 24 +++++++++++++++++------- bwamem.h | 3 ++- main.c | 2 +- 5 files changed, 31 insertions(+), 10 deletions(-) diff --git a/NEWS b/NEWS index 788ae40..0cf0591 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,13 @@ +Release 0.7.3a (15 March, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In 0.7.3, the wrong CIGAR bug was only fixed in one scenario, but not fixed +in another corner case. + +(0.7.3a: 15 March 2013, r367) + + + Release 0.7.3 (15 March, 2013) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bwa.1 b/bwa.1 index a02aebe..1edbf12 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "15 March 2013" "bwa-0.7.3" "Bioinformatics tools" +.TH bwa 1 "15 March 2013" "bwa-0.7.3a" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool diff --git a/bwamem.c b/bwamem.c index da9f2be..82dbe45 100644 --- a/bwamem.c +++ b/bwamem.c @@ -542,7 +542,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a = kv_pushp(mem_alnreg_t, *av); memset(a, 0, sizeof(mem_alnreg_t)); a->w = aw[0] = aw[1] = opt->w; - a->score = -1; + a->score = a->truesc = -1; if (s->qbeg) { // left extension uint8_t *rs, *qs; @@ -560,10 +560,15 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; } // check whether we prefer to reach the end of the query - if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; // local hits - else a->qb = 0, a->rb = s->rbeg - gtle; // reach the end + if (gscore <= 0 || gscore <= a->score - opt->pen_clip) { // local extension + a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; + a->truesc = a->score; + } else { // to-end extension + a->qb = 0, a->rb = s->rbeg - gtle; + a->truesc = gscore; + } free(qs); free(rs); - } else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; + } else a->score = a->truesc = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; if (s->qbeg + s->len != l_query) { // right extension int qle, tle, qe, re, gtle, gscore, sc0 = a->score; @@ -578,8 +583,13 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; } // similar to the above - if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle; - else a->qe = l_query, a->re = rmax[0] + re + gtle; + if (gscore <= 0 || gscore <= a->score - opt->pen_clip) { // local extension + a->qe = qe + qle, a->re = rmax[0] + re + tle; + a->truesc += a->score - sc0; + } else { // to-end extension + a->qe = l_query, a->re = rmax[0] + re + gtle; + a->truesc += gscore - sc0; + } } else a->qe = l_query, a->re = s->rbeg + s->len; if (bwa_verbose >= 4) { printf("[%d]\taw={%d,%d}\tscore=%d\t[%d,%d) <=> [%ld,%ld)\n", k, aw[0], aw[1], a->score, a->qb, a->qe, (long)a->rb, (long)a->re); fflush(stdout); } @@ -839,7 +849,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; if (ar->secondary >= 0) a.flag |= 0x20000; bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re); - w2 = infer_bw(qe - qb, re - rb, ar->score, opt->a, opt->q, opt->r); + w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->q, opt->r); w2 = w2 < opt->w? w2 : opt->w; a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); a.NM = NM; diff --git a/bwamem.h b/bwamem.h index ae201e6..b7a4e79 100644 --- a/bwamem.h +++ b/bwamem.h @@ -43,7 +43,8 @@ typedef struct { typedef struct { int64_t rb, re; // [rb,re): reference sequence in the alignment int qb, qe; // [qb,qe): query sequence in the alignment - int score; // best SW score + int score; // best local SW score + int truesc; // actual score corresponding to the aligned region; possibly smaller than $score int sub; // 2nd best SW score int csub; // SW score of a tandem hit int sub_n; // approximate number of suboptimal hits diff --git a/main.c b/main.c index a03c49f..d9e75e2 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r366" +#define PACKAGE_VERSION "0.7.3a-r367" #endif int bwa_fa2pac(int argc, char *argv[]);