diff --git a/Makefile b/Makefile index 5aa07c5..ce2a397 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,7 @@ CC= gcc -CFLAGS= -g -Wall -O2 -msse2 -CXXFLAGS= $(CFLAGS) +CFLAGS= -g -Wall -O2 AR= ar -DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 +DFLAGS= -DHAVE_PTHREAD LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o kopen.o pemerge.o \ @@ -17,8 +16,6 @@ SUBDIRS= . .c.o: $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ -.cc.o: - $(CXX) -c $(CXXFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ all:$(PROG) diff --git a/NEWS b/NEWS index 35202f1..7474726 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,49 @@ +Release 0.7.2 (9 March, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Emergent bug fix: 0.7.0 and 0.7.1 give a wrong sign to TLEN. In addition, +flagging `properly paired' also gets improved a little. + +(0.7.2: 9 March 2013, r351) + + + +Release 0.7.1 (8 March, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Changes to BWA-MEM: + + * Bugfix: rare segmentation fault caused by a partial hit to the end of the + last sequence. + + * Bugfix: occasional mis-pairing given an interleaved fastq. + + * Bugfix: wrong mate information when the mate is unmapped. SAM generated by + BWA-MEM can now be validated with Picard. + + * Improved the performance and accuracy for ultra-long query sequences. + Short-read alignment is not affected. + +Changes to other components: + + * In BWA-backtrack and BWA-SW, replaced the code for global alignment, + Smith-Waterman and SW extension. The performance and accuracy of the two + algorithms stay the same. + + * Added an experimental subcommand to merge overlapping paired ends. The + algorithm is very conservative: it may miss true overlaps but rarely makes + mistakes. + +An important note is that like BWA-SW, BWA-MEM may output multiple primary +alignments for a read, which may cause problems to some tools. For aligning +sequence reads, it is advised to use `-M' to flag extra hits as secondary. This +option is not the default because multiple primary alignments are theoretically +possible in sequence alignment. + +(0.7.1: 8 March 2013, r347) + + + Beta Release 0.7.0 (28 Feburary, 2013) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -16,14 +62,14 @@ In addition to the algorithmic improvements, BWA-MEM also implements a few handy features in practical aspects: 1. BWA-MEM automatically switches between local and glocal (global wrt reads; - local wrt reference) alignment. It reports the end-to-end glocal alignment - if the glocal alignment is not much worse than the optimal local alignment. - Glocal alignment reduces reference bias. + local wrt reference) alignment. It reports the end-to-end glocal alignment + if the glocal alignment is not much worse than the optimal local alignment. + Glocal alignment reduces reference bias. 2. BWA-MEM automatically infers pair orientation from a batch of single-end alignments. It allows more than one orientations if there are sufficient - supporting reads. This feature has not been tested on reads from Illumina - jumping library yet. (EXPERIMENTAL) + supporting reads. This feature has not been tested on reads from Illumina + jumping library yet. (EXPERIMENTAL) 3. BWA-MEM optionally takes one interleaved fastq for paired-end mapping. It is possible to convert a name-sorted BAM to an interleaved fastq on the fly @@ -37,8 +83,8 @@ handy features in practical aspects: files without replying on bash features. 6. BWA-MEM provides a few basic APIs for single-end mapping. The `example.c' - program in the source code directory implements a full single-end mapper in - 50 lines of code. + program in the source code directory implements a full single-end mapper in + 50 lines of code. The BWA-MEM algorithm is in the beta phase. It is not advised to use BWA-MEM for production use yet. However, when the implementation becomes stable after a diff --git a/bwa.1 b/bwa.1 index b1bbb2a..8e90848 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "10 March 2013" "bwa-0.7.1" "Bioinformatics tools" +.TH bwa 1 "9 March 2013" "bwa-0.7.2" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool @@ -235,7 +235,7 @@ attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'. .BI -T \ INT Don't output alignment with score lower than .IR INT . -This option only affects output. [30] +This option affects output and occasionally SAM flag 2. [30] .TP .B -a Output all found alignments for single-end or unpaired paired-end reads. These diff --git a/bwa.c b/bwa.c index 722ca4d..a0ed448 100644 --- a/bwa.c +++ b/bwa.c @@ -75,9 +75,9 @@ void bwa_fill_scmat(int a, int b, int8_t mat[25]) for (i = k = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) mat[k++] = i == j? a : -b; - mat[k++] = 0; // ambiguous base + mat[k++] = -1; // ambiguous base } - for (j = 0; j < 5; ++j) mat[k++] = 0; + for (j = 0; j < 5; ++j) mat[k++] = -1; } // Generate CIGAR when the alignment end points are known diff --git a/bwamem.c b/bwamem.c index 78d77d0..a564b86 100644 --- a/bwamem.c +++ b/bwamem.c @@ -612,7 +612,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons #define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1) int score, n_cigar, is_rev = 0, rid, mid, copy_mate = 0, NM = -1; uint32_t *cigar = 0; - int64_t pos; + int64_t pos = -1; bwahit_t ptmp, *p = &ptmp; if (!p_) { // in this case, generate an unmapped alignment @@ -628,6 +628,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons } p->flag |= p->rb >= bns->l_pac? 0x10 : 0; // is reverse strand p->flag |= m && m->rb >= bns->l_pac? 0x20 : 0; // is mate on reverse strand + if (is_mapped(p) && m && !is_mapped(m) && (p->flag&0x10)) p->flag |= 0x20; // if mate is unmapped, it takes the strand of the current read kputs(s->name, str); kputc('\t', str); if (is_mapped(p)) { // has a coordinate, no matter whether it is mapped or copied from the mate int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag @@ -670,9 +671,13 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons if (mid == rid) { int64_t p0 = p->rb < bns->l_pac? p->rb : (bns->l_pac<<1) - 1 - p->rb; int64_t p1 = m->rb < bns->l_pac? m->rb : (bns->l_pac<<1) - 1 - m->rb; - kputw(p0 - p1 + (p0 > p1? 1 : -1), str); + kputw(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0)), str); } else kputw(0, str); kputc('\t', str); + } else if (m && is_mapped(p)) { // then copy the position + kputsn("\t=\t", 3, str); + kputuw(pos - bns->anns[rid].offset + 1, str); + kputsn("\t0\t", 3, str); } else kputsn("\t*\t0\t0\t", 7, str); if (p->flag&0x100) { // for secondary alignments, don't write SEQ and QUAL kputsn("*\t*", 3, str); @@ -760,7 +765,12 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b else if (h.qual > mapq0) h.qual = mapq0; bwa_hit2sam(&str, opt->mat, opt->q, opt->r, p->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); } - } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP, m); + } else { + bwahit_t h; + memset(&h, 0, sizeof(bwahit_t)); + h.rb = h.re = -1; h.flag = extra_flag; + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); + } s->sam = str.s; } diff --git a/bwamem_pair.c b/bwamem_pair.c index 837666b..3b70cef 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -260,7 +260,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co // check if an end has multiple hits even after mate-SW for (i = 0; i < 2; ++i) { for (j = 1; j < a[i].n; ++j) - if (a[i].a[j].secondary < 0) break; + if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) break; is_multi[i] = j < a[i].n? 1 : 0; } if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score @@ -306,7 +306,11 @@ no_pairing: if (a[i].n && a[i].a[0].score >= opt->T) { mem_alnreg2hit(&a[i].a[0], &h[i]); bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re); - } else h[i].rb = h[i].re = -1; + } else { + memset(&h[i], 0, sizeof(bwahit_t)); + h[i].rb = h[i].re = -1; + h[i].flag = 1<<(6+i) | 1; + } } mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41, &h[1]); mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81, &h[0]); diff --git a/bwt.h b/bwt.h index e7b0f97..c36bf9b 100644 --- a/bwt.h +++ b/bwt.h @@ -29,6 +29,7 @@ #define BWA_BWT_H #include +#include // requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line because some part of the code assume OCC_INTERVAL=0x80 #define OCC_INTV_SHIFT 7 diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index bcb0bcb..88c13c7 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -153,7 +153,7 @@ void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; - score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, 1, &qle, &tle, 0, 0, 0); + score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, 1, &qle, &tle, 0, 0, 0) - 1; // if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G); if (score >= p->G) { p->G = score; diff --git a/main.c b/main.c index 20f0a02..090c194 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r341-beta" +#define PACKAGE_VERSION "0.7.2-r351" #endif int bwa_fa2pac(int argc, char *argv[]);