From 017be4540734273d60e33caa38161b35c872bd2a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Mar 2013 12:06:45 -0500 Subject: [PATCH 01/10] r342: bugfix in bwasw - AS is off by one but I do not understand why the old code does not have the same problem. --- Makefile | 5 +---- bwtsw2_aux.c | 2 +- main.c | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 93e3266..b660557 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,7 @@ CC= gcc CFLAGS= -g -Wall -O2 -CXXFLAGS= $(CFLAGS) AR= ar -DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 +DFLAGS= -DHAVE_PTHREAD LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o kopen.o pemerge.o \ @@ -17,8 +16,6 @@ SUBDIRS= . .c.o: $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ -.cc.o: - $(CXX) -c $(CXXFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ all:$(PROG) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 6527495..a84d7e0 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -153,7 +153,7 @@ void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; - score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, 1, &qle, &tle, 0, 0, 0); + score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, 1, &qle, &tle, 0, 0, 0) - 1; // if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G); if (score >= p->G) { p->G = score; diff --git a/main.c b/main.c index a772e3b..4240ac7 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r341-beta" +#define PACKAGE_VERSION "0.7.0-r342-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 274c0ac96c749b352144bdc539b058b703489cb1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Mar 2013 12:40:31 -0500 Subject: [PATCH 02/10] r343: bugfix in mem - wrong mate info for unmap SAM generation is always among the nastiest bits. I would need to refactor at some point (hardly happening). --- bwamem.c | 13 +++++++++++-- bwamem_pair.c | 6 +++++- main.c | 2 +- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/bwamem.c b/bwamem.c index 7a6d175..766227d 100644 --- a/bwamem.c +++ b/bwamem.c @@ -670,9 +670,13 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons if (mid == rid) { int64_t p0 = p->rb < bns->l_pac? p->rb : (bns->l_pac<<1) - 1 - p->rb; int64_t p1 = m->rb < bns->l_pac? m->rb : (bns->l_pac<<1) - 1 - m->rb; - kputw(p0 - p1 + (p0 > p1? 1 : -1), str); + kputw(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0), str); } else kputw(0, str); kputc('\t', str); + } else if (m && is_mapped(p)) { // then copy the position + kputsn("\t=\t", 3, str); + kputuw(pos - bns->anns[rid].offset + 1, str); + kputsn("\t0\t", 3, str); } else kputsn("\t*\t0\t0\t", 7, str); if (p->flag&0x100) { // for secondary alignments, don't write SEQ and QUAL kputsn("*\t*", 3, str); @@ -760,7 +764,12 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b else if (h.qual > mapq0) h.qual = mapq0; bwa_hit2sam(&str, opt->mat, opt->q, opt->r, p->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); } - } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP, m); + } else { + bwahit_t h; + memset(&h, 0, sizeof(bwahit_t)); + h.rb = h.re = -1; h.flag = extra_flag; + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); + } s->sam = str.s; } diff --git a/bwamem_pair.c b/bwamem_pair.c index 49e9ad2..bbbbe02 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -306,7 +306,11 @@ no_pairing: if (a[i].n && a[i].a[0].score >= opt->T) { mem_alnreg2hit(&a[i].a[0], &h[i]); bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re); - } else h[i].rb = h[i].re = -1; + } else { + memset(&h[i], 0, sizeof(bwahit_t)); + h[i].rb = h[i].re = -1; + h[i].flag = 1<<(6+i) | 1; + } } mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41, &h[1]); mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81, &h[0]); diff --git a/main.c b/main.c index 4240ac7..1304a35 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r342-beta" +#define PACKAGE_VERSION "0.7.0-r343-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From af7b4d89808f1c48f320f4a2f76d3466d6ff50bc Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Mar 2013 12:45:50 -0500 Subject: [PATCH 03/10] gcc wrongly thinks a variable may be uninitialized It should always be initialized. To avoid a warning, made a change. --- bwamem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 766227d..40b481a 100644 --- a/bwamem.c +++ b/bwamem.c @@ -612,7 +612,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons #define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1) int score, n_cigar, is_rev = 0, rid, mid, copy_mate = 0, NM = -1; uint32_t *cigar = 0; - int64_t pos; + int64_t pos = -1; bwahit_t ptmp, *p = &ptmp; if (!p_) { // in this case, generate an unmapped alignment From 66c9783dafccd0b202eef2c331c4a9dc8d44fcba Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Mar 2013 13:15:43 -0500 Subject: [PATCH 04/10] r345: bugfix in mem - wrong mate strand for unmap Received a clean bill from Picard --- bwamem.c | 1 + main.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 40b481a..224309f 100644 --- a/bwamem.c +++ b/bwamem.c @@ -628,6 +628,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons } p->flag |= p->rb >= bns->l_pac? 0x10 : 0; // is reverse strand p->flag |= m && m->rb >= bns->l_pac? 0x20 : 0; // is mate on reverse strand + if (is_mapped(p) && m && !is_mapped(m) && (p->flag&0x10)) p->flag |= 0x20; // if mate is unmapped, it takes the strand of the current read kputs(s->name, str); kputc('\t', str); if (is_mapped(p)) { // has a coordinate, no matter whether it is mapped or copied from the mate int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag diff --git a/main.c b/main.c index 1304a35..3ae3f28 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r343-beta" +#define PACKAGE_VERSION "0.7.0-r345-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 5370bb23a3f9b263aa8c41e9610ad937094863bd Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Mar 2013 14:14:42 -0500 Subject: [PATCH 05/10] Updated NEWS; added stddef.h for size_t I thought size_t is defined in stdlib.h, but it is not always. --- NEWS | 50 +++++++++++++++++++++++++++++++++++++++++++------- bwt.h | 1 + 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/NEWS b/NEWS index 35202f1..c94b28e 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,39 @@ +Release 0.7.1 (8 March, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Changes to BWA-MEM: + + * Bugfix: rare segmentation fault caused by a partial hit to the end of the + last sequence. + + * Bugfix: occasional mis-pairing given an interleaved fastq. + + * Bugfix: wrong mate information when the mate is unmapped. SAM generated by + BWA-MEM can now be validated with Picard. + + * Improved the performance and accuracy for ultra-long query sequences. + Short-read alignment is not affected. + +Changes to other components: + + * In BWA-backtrack and BWA-SW, replaced the code for global alignment, + Smith-Waterman and SW extension. The performance and accuracy of the two + algorithms stay the same. + + * Added an experimental subcommand to merge overlapping paired ends. The + algorithm is very conservative: it may miss true overlaps but rarely makes + mistakes. + +An important note is that like BWA-SW, BWA-MEM may output multiple primary +alignments for a read, which may cause problems to some tools. For aligning +sequence reads, it is advised to use `-M' to flag extra hits as secondary. This +option is not the default because multiple primary alignments are theoretically +possible in sequence alignment. + +(0.7.1: 8 March 2013, r347) + + + Beta Release 0.7.0 (28 Feburary, 2013) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -16,14 +52,14 @@ In addition to the algorithmic improvements, BWA-MEM also implements a few handy features in practical aspects: 1. BWA-MEM automatically switches between local and glocal (global wrt reads; - local wrt reference) alignment. It reports the end-to-end glocal alignment - if the glocal alignment is not much worse than the optimal local alignment. - Glocal alignment reduces reference bias. + local wrt reference) alignment. It reports the end-to-end glocal alignment + if the glocal alignment is not much worse than the optimal local alignment. + Glocal alignment reduces reference bias. 2. BWA-MEM automatically infers pair orientation from a batch of single-end alignments. It allows more than one orientations if there are sufficient - supporting reads. This feature has not been tested on reads from Illumina - jumping library yet. (EXPERIMENTAL) + supporting reads. This feature has not been tested on reads from Illumina + jumping library yet. (EXPERIMENTAL) 3. BWA-MEM optionally takes one interleaved fastq for paired-end mapping. It is possible to convert a name-sorted BAM to an interleaved fastq on the fly @@ -37,8 +73,8 @@ handy features in practical aspects: files without replying on bash features. 6. BWA-MEM provides a few basic APIs for single-end mapping. The `example.c' - program in the source code directory implements a full single-end mapper in - 50 lines of code. + program in the source code directory implements a full single-end mapper in + 50 lines of code. The BWA-MEM algorithm is in the beta phase. It is not advised to use BWA-MEM for production use yet. However, when the implementation becomes stable after a diff --git a/bwt.h b/bwt.h index e7b0f97..c36bf9b 100644 --- a/bwt.h +++ b/bwt.h @@ -29,6 +29,7 @@ #define BWA_BWT_H #include +#include // requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line because some part of the code assume OCC_INTERVAL=0x80 #define OCC_INTV_SHIFT 7 From 1d132a546de2beafa01bd072c1dea4008e594797 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Mar 2013 15:30:06 -0500 Subject: [PATCH 06/10] Release 0.7.1-r347 --- bwa.1 | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwa.1 b/bwa.1 index b1bbb2a..495df69 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "10 March 2013" "bwa-0.7.1" "Bioinformatics tools" +.TH bwa 1 "8 March 2013" "bwa-0.7.1" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool diff --git a/main.c b/main.c index 3ae3f28..505cfbd 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r345-beta" +#define PACKAGE_VERSION "0.7.1-r347" #endif int bwa_fa2pac(int argc, char *argv[]); From 9ea7f83974498297b9876f9c76e93ef42e76d2a2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 9 Mar 2013 18:03:15 -0500 Subject: [PATCH 07/10] Emergent bugfix: wrong TLEN sign It is interesting that Picard did not find the issue. --- bwamem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 224309f..9350943 100644 --- a/bwamem.c +++ b/bwamem.c @@ -671,7 +671,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons if (mid == rid) { int64_t p0 = p->rb < bns->l_pac? p->rb : (bns->l_pac<<1) - 1 - p->rb; int64_t p1 = m->rb < bns->l_pac? m->rb : (bns->l_pac<<1) - 1 - m->rb; - kputw(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0), str); + kputw(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0)), str); } else kputw(0, str); kputc('\t', str); } else if (m && is_mapped(p)) { // then copy the position From 740d2c131494fcc57a8084c2f6e5f51ad846d284 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 9 Mar 2013 18:03:57 -0500 Subject: [PATCH 08/10] Match to 'N' costs -1, instead of 0. This is to prevent alignment through 'N'. --- bwa.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bwa.c b/bwa.c index 991b23a..08d96b8 100644 --- a/bwa.c +++ b/bwa.c @@ -75,9 +75,9 @@ void bwa_fill_scmat(int a, int b, int8_t mat[25]) for (i = k = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) mat[k++] = i == j? a : -b; - mat[k++] = 0; // ambiguous base + mat[k++] = -1; // ambiguous base } - for (j = 0; j < 5; ++j) mat[k++] = 0; + for (j = 0; j < 5; ++j) mat[k++] = -1; } // Generate CIGAR when the alignment end points are known From 2d01a297fbccfcd81f306f202bec91dbafa89b03 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 9 Mar 2013 18:05:50 -0500 Subject: [PATCH 09/10] Improving 'properly paired' flag. If one end has a low quality tail that happens to have a score-20 hit, the pair won't be flagged as properly paired because bwa-mem thought it has multiple hits. By filtering with -T, we won't have this problem. --- bwa.1 | 2 +- bwamem_pair.c | 2 +- main.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bwa.1 b/bwa.1 index 495df69..3e92456 100644 --- a/bwa.1 +++ b/bwa.1 @@ -235,7 +235,7 @@ attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'. .BI -T \ INT Don't output alignment with score lower than .IR INT . -This option only affects output. [30] +This option affects output and occasionally SAM flag 2. [30] .TP .B -a Output all found alignments for single-end or unpaired paired-end reads. These diff --git a/bwamem_pair.c b/bwamem_pair.c index bbbbe02..b9a68f1 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -260,7 +260,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co // check if an end has multiple hits even after mate-SW for (i = 0; i < 2; ++i) { for (j = 1; j < a[i].n; ++j) - if (a[i].a[j].secondary < 0) break; + if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) break; is_multi[i] = j < a[i].n? 1 : 0; } if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score diff --git a/main.c b/main.c index 505cfbd..a49bdf8 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.1-r347" +#define PACKAGE_VERSION "0.7.1-r348-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 5581cb9152876913685fae153dad976c24552ca2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 9 Mar 2013 18:15:41 -0500 Subject: [PATCH 10/10] Release bwa-0.7.2-r351 For the TLEN sign fix. Sorry for the significant bug in 0.7.0/0.7.1 --- NEWS | 10 ++++++++++ bwa.1 | 2 +- main.c | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/NEWS b/NEWS index c94b28e..7474726 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,13 @@ +Release 0.7.2 (9 March, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Emergent bug fix: 0.7.0 and 0.7.1 give a wrong sign to TLEN. In addition, +flagging `properly paired' also gets improved a little. + +(0.7.2: 9 March 2013, r351) + + + Release 0.7.1 (8 March, 2013) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bwa.1 b/bwa.1 index 3e92456..8e90848 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "8 March 2013" "bwa-0.7.1" "Bioinformatics tools" +.TH bwa 1 "9 March 2013" "bwa-0.7.2" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool diff --git a/main.c b/main.c index a49bdf8..b0874f0 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.1-r348-beta" +#define PACKAGE_VERSION "0.7.2-r351" #endif int bwa_fa2pac(int argc, char *argv[]);