Merge branch 'master' into master_fixes

Conflicts:
	Makefile
This commit is contained in:
Rob Davies 2013-03-11 13:50:49 +00:00
commit 9228e48efd
9 changed files with 81 additions and 23 deletions

View File

@ -1,8 +1,7 @@
CC= gcc CC= gcc
CFLAGS= -g -Wall -O2 -msse2 CFLAGS= -g -Wall -O2
CXXFLAGS= $(CFLAGS)
AR= ar AR= ar
DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 DFLAGS= -DHAVE_PTHREAD
LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o
AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
is.o bwtindex.o bwape.o kopen.o pemerge.o \ is.o bwtindex.o bwape.o kopen.o pemerge.o \
@ -17,8 +16,6 @@ SUBDIRS= .
.c.o: .c.o:
$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
.cc.o:
$(CXX) -c $(CXXFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
all:$(PROG) all:$(PROG)

46
NEWS
View File

@ -1,3 +1,49 @@
Release 0.7.2 (9 March, 2013)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Emergent bug fix: 0.7.0 and 0.7.1 give a wrong sign to TLEN. In addition,
flagging `properly paired' also gets improved a little.
(0.7.2: 9 March 2013, r351)
Release 0.7.1 (8 March, 2013)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Changes to BWA-MEM:
* Bugfix: rare segmentation fault caused by a partial hit to the end of the
last sequence.
* Bugfix: occasional mis-pairing given an interleaved fastq.
* Bugfix: wrong mate information when the mate is unmapped. SAM generated by
BWA-MEM can now be validated with Picard.
* Improved the performance and accuracy for ultra-long query sequences.
Short-read alignment is not affected.
Changes to other components:
* In BWA-backtrack and BWA-SW, replaced the code for global alignment,
Smith-Waterman and SW extension. The performance and accuracy of the two
algorithms stay the same.
* Added an experimental subcommand to merge overlapping paired ends. The
algorithm is very conservative: it may miss true overlaps but rarely makes
mistakes.
An important note is that like BWA-SW, BWA-MEM may output multiple primary
alignments for a read, which may cause problems to some tools. For aligning
sequence reads, it is advised to use `-M' to flag extra hits as secondary. This
option is not the default because multiple primary alignments are theoretically
possible in sequence alignment.
(0.7.1: 8 March 2013, r347)
Beta Release 0.7.0 (28 Feburary, 2013) Beta Release 0.7.0 (28 Feburary, 2013)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

4
bwa.1
View File

@ -1,4 +1,4 @@
.TH bwa 1 "10 March 2013" "bwa-0.7.1" "Bioinformatics tools" .TH bwa 1 "9 March 2013" "bwa-0.7.2" "Bioinformatics tools"
.SH NAME .SH NAME
.PP .PP
bwa - Burrows-Wheeler Alignment Tool bwa - Burrows-Wheeler Alignment Tool
@ -235,7 +235,7 @@ attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'.
.BI -T \ INT .BI -T \ INT
Don't output alignment with score lower than Don't output alignment with score lower than
.IR INT . .IR INT .
This option only affects output. [30] This option affects output and occasionally SAM flag 2. [30]
.TP .TP
.B -a .B -a
Output all found alignments for single-end or unpaired paired-end reads. These Output all found alignments for single-end or unpaired paired-end reads. These

4
bwa.c
View File

@ -75,9 +75,9 @@ void bwa_fill_scmat(int a, int b, int8_t mat[25])
for (i = k = 0; i < 4; ++i) { for (i = k = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j) for (j = 0; j < 4; ++j)
mat[k++] = i == j? a : -b; mat[k++] = i == j? a : -b;
mat[k++] = 0; // ambiguous base mat[k++] = -1; // ambiguous base
} }
for (j = 0; j < 5; ++j) mat[k++] = 0; for (j = 0; j < 5; ++j) mat[k++] = -1;
} }
// Generate CIGAR when the alignment end points are known // Generate CIGAR when the alignment end points are known

View File

@ -612,7 +612,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons
#define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1) #define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1)
int score, n_cigar, is_rev = 0, rid, mid, copy_mate = 0, NM = -1; int score, n_cigar, is_rev = 0, rid, mid, copy_mate = 0, NM = -1;
uint32_t *cigar = 0; uint32_t *cigar = 0;
int64_t pos; int64_t pos = -1;
bwahit_t ptmp, *p = &ptmp; bwahit_t ptmp, *p = &ptmp;
if (!p_) { // in this case, generate an unmapped alignment if (!p_) { // in this case, generate an unmapped alignment
@ -628,6 +628,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons
} }
p->flag |= p->rb >= bns->l_pac? 0x10 : 0; // is reverse strand p->flag |= p->rb >= bns->l_pac? 0x10 : 0; // is reverse strand
p->flag |= m && m->rb >= bns->l_pac? 0x20 : 0; // is mate on reverse strand p->flag |= m && m->rb >= bns->l_pac? 0x20 : 0; // is mate on reverse strand
if (is_mapped(p) && m && !is_mapped(m) && (p->flag&0x10)) p->flag |= 0x20; // if mate is unmapped, it takes the strand of the current read
kputs(s->name, str); kputc('\t', str); kputs(s->name, str); kputc('\t', str);
if (is_mapped(p)) { // has a coordinate, no matter whether it is mapped or copied from the mate if (is_mapped(p)) { // has a coordinate, no matter whether it is mapped or copied from the mate
int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag
@ -670,9 +671,13 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons
if (mid == rid) { if (mid == rid) {
int64_t p0 = p->rb < bns->l_pac? p->rb : (bns->l_pac<<1) - 1 - p->rb; int64_t p0 = p->rb < bns->l_pac? p->rb : (bns->l_pac<<1) - 1 - p->rb;
int64_t p1 = m->rb < bns->l_pac? m->rb : (bns->l_pac<<1) - 1 - m->rb; int64_t p1 = m->rb < bns->l_pac? m->rb : (bns->l_pac<<1) - 1 - m->rb;
kputw(p0 - p1 + (p0 > p1? 1 : -1), str); kputw(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0)), str);
} else kputw(0, str); } else kputw(0, str);
kputc('\t', str); kputc('\t', str);
} else if (m && is_mapped(p)) { // then copy the position
kputsn("\t=\t", 3, str);
kputuw(pos - bns->anns[rid].offset + 1, str);
kputsn("\t0\t", 3, str);
} else kputsn("\t*\t0\t0\t", 7, str); } else kputsn("\t*\t0\t0\t", 7, str);
if (p->flag&0x100) { // for secondary alignments, don't write SEQ and QUAL if (p->flag&0x100) { // for secondary alignments, don't write SEQ and QUAL
kputsn("*\t*", 3, str); kputsn("*\t*", 3, str);
@ -760,7 +765,12 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b
else if (h.qual > mapq0) h.qual = mapq0; else if (h.qual > mapq0) h.qual = mapq0;
bwa_hit2sam(&str, opt->mat, opt->q, opt->r, p->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); bwa_hit2sam(&str, opt->mat, opt->q, opt->r, p->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m);
} }
} else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP, m); } else {
bwahit_t h;
memset(&h, 0, sizeof(bwahit_t));
h.rb = h.re = -1; h.flag = extra_flag;
bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m);
}
s->sam = str.s; s->sam = str.s;
} }

View File

@ -260,7 +260,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
// check if an end has multiple hits even after mate-SW // check if an end has multiple hits even after mate-SW
for (i = 0; i < 2; ++i) { for (i = 0; i < 2; ++i) {
for (j = 1; j < a[i].n; ++j) for (j = 1; j < a[i].n; ++j)
if (a[i].a[j].secondary < 0) break; if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) break;
is_multi[i] = j < a[i].n? 1 : 0; is_multi[i] = j < a[i].n? 1 : 0;
} }
if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score
@ -306,7 +306,11 @@ no_pairing:
if (a[i].n && a[i].a[0].score >= opt->T) { if (a[i].n && a[i].a[0].score >= opt->T) {
mem_alnreg2hit(&a[i].a[0], &h[i]); mem_alnreg2hit(&a[i].a[0], &h[i]);
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re); bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re);
} else h[i].rb = h[i].re = -1; } else {
memset(&h[i], 0, sizeof(bwahit_t));
h[i].rb = h[i].re = -1;
h[i].flag = 1<<(6+i) | 1;
}
} }
mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41, &h[1]); mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41, &h[1]);
mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81, &h[0]); mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81, &h[0]);

1
bwt.h
View File

@ -29,6 +29,7 @@
#define BWA_BWT_H #define BWA_BWT_H
#include <stdint.h> #include <stdint.h>
#include <stddef.h>
// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line because some part of the code assume OCC_INTERVAL=0x80 // requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line because some part of the code assume OCC_INTERVAL=0x80
#define OCC_INTV_SHIFT 7 #define OCC_INTV_SHIFT 7

View File

@ -153,7 +153,7 @@ void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq,
for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k)
target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
lt = j; lt = j;
score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, 1, &qle, &tle, 0, 0, 0); score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, 1, &qle, &tle, 0, 0, 0) - 1;
// if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G); // if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G);
if (score >= p->G) { if (score >= p->G) {
p->G = score; p->G = score;

2
main.c
View File

@ -3,7 +3,7 @@
#include "utils.h" #include "utils.h"
#ifndef PACKAGE_VERSION #ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.0-r341-beta" #define PACKAGE_VERSION "0.7.2-r351"
#endif #endif
int bwa_fa2pac(int argc, char *argv[]); int bwa_fa2pac(int argc, char *argv[]);