Merge branch 'master' into master_fixes

Conflicts:
	Makefile
This commit is contained in:
Rob Davies 2013-03-11 13:50:49 +00:00
commit 9228e48efd
9 changed files with 81 additions and 23 deletions

View File

@ -1,8 +1,7 @@
CC= gcc
CFLAGS= -g -Wall -O2 -msse2
CXXFLAGS= $(CFLAGS)
CFLAGS= -g -Wall -O2
AR= ar
DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64
DFLAGS= -DHAVE_PTHREAD
LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o
AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
is.o bwtindex.o bwape.o kopen.o pemerge.o \
@ -17,8 +16,6 @@ SUBDIRS= .
.c.o:
$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
.cc.o:
$(CXX) -c $(CXXFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
all:$(PROG)

60
NEWS
View File

@ -1,3 +1,49 @@
Release 0.7.2 (9 March, 2013)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Emergent bug fix: 0.7.0 and 0.7.1 give a wrong sign to TLEN. In addition,
flagging `properly paired' also gets improved a little.
(0.7.2: 9 March 2013, r351)
Release 0.7.1 (8 March, 2013)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Changes to BWA-MEM:
* Bugfix: rare segmentation fault caused by a partial hit to the end of the
last sequence.
* Bugfix: occasional mis-pairing given an interleaved fastq.
* Bugfix: wrong mate information when the mate is unmapped. SAM generated by
BWA-MEM can now be validated with Picard.
* Improved the performance and accuracy for ultra-long query sequences.
Short-read alignment is not affected.
Changes to other components:
* In BWA-backtrack and BWA-SW, replaced the code for global alignment,
Smith-Waterman and SW extension. The performance and accuracy of the two
algorithms stay the same.
* Added an experimental subcommand to merge overlapping paired ends. The
algorithm is very conservative: it may miss true overlaps but rarely makes
mistakes.
An important note is that like BWA-SW, BWA-MEM may output multiple primary
alignments for a read, which may cause problems to some tools. For aligning
sequence reads, it is advised to use `-M' to flag extra hits as secondary. This
option is not the default because multiple primary alignments are theoretically
possible in sequence alignment.
(0.7.1: 8 March 2013, r347)
Beta Release 0.7.0 (28 Feburary, 2013)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -16,14 +62,14 @@ In addition to the algorithmic improvements, BWA-MEM also implements a few
handy features in practical aspects:
1. BWA-MEM automatically switches between local and glocal (global wrt reads;
local wrt reference) alignment. It reports the end-to-end glocal alignment
if the glocal alignment is not much worse than the optimal local alignment.
Glocal alignment reduces reference bias.
local wrt reference) alignment. It reports the end-to-end glocal alignment
if the glocal alignment is not much worse than the optimal local alignment.
Glocal alignment reduces reference bias.
2. BWA-MEM automatically infers pair orientation from a batch of single-end
alignments. It allows more than one orientations if there are sufficient
supporting reads. This feature has not been tested on reads from Illumina
jumping library yet. (EXPERIMENTAL)
supporting reads. This feature has not been tested on reads from Illumina
jumping library yet. (EXPERIMENTAL)
3. BWA-MEM optionally takes one interleaved fastq for paired-end mapping. It
is possible to convert a name-sorted BAM to an interleaved fastq on the fly
@ -37,8 +83,8 @@ handy features in practical aspects:
files without replying on bash features.
6. BWA-MEM provides a few basic APIs for single-end mapping. The `example.c'
program in the source code directory implements a full single-end mapper in
50 lines of code.
program in the source code directory implements a full single-end mapper in
50 lines of code.
The BWA-MEM algorithm is in the beta phase. It is not advised to use BWA-MEM
for production use yet. However, when the implementation becomes stable after a

4
bwa.1
View File

@ -1,4 +1,4 @@
.TH bwa 1 "10 March 2013" "bwa-0.7.1" "Bioinformatics tools"
.TH bwa 1 "9 March 2013" "bwa-0.7.2" "Bioinformatics tools"
.SH NAME
.PP
bwa - Burrows-Wheeler Alignment Tool
@ -235,7 +235,7 @@ attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'.
.BI -T \ INT
Don't output alignment with score lower than
.IR INT .
This option only affects output. [30]
This option affects output and occasionally SAM flag 2. [30]
.TP
.B -a
Output all found alignments for single-end or unpaired paired-end reads. These

4
bwa.c
View File

@ -75,9 +75,9 @@ void bwa_fill_scmat(int a, int b, int8_t mat[25])
for (i = k = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
mat[k++] = i == j? a : -b;
mat[k++] = 0; // ambiguous base
mat[k++] = -1; // ambiguous base
}
for (j = 0; j < 5; ++j) mat[k++] = 0;
for (j = 0; j < 5; ++j) mat[k++] = -1;
}
// Generate CIGAR when the alignment end points are known

View File

@ -612,7 +612,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons
#define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1)
int score, n_cigar, is_rev = 0, rid, mid, copy_mate = 0, NM = -1;
uint32_t *cigar = 0;
int64_t pos;
int64_t pos = -1;
bwahit_t ptmp, *p = &ptmp;
if (!p_) { // in this case, generate an unmapped alignment
@ -628,6 +628,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons
}
p->flag |= p->rb >= bns->l_pac? 0x10 : 0; // is reverse strand
p->flag |= m && m->rb >= bns->l_pac? 0x20 : 0; // is mate on reverse strand
if (is_mapped(p) && m && !is_mapped(m) && (p->flag&0x10)) p->flag |= 0x20; // if mate is unmapped, it takes the strand of the current read
kputs(s->name, str); kputc('\t', str);
if (is_mapped(p)) { // has a coordinate, no matter whether it is mapped or copied from the mate
int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag
@ -670,9 +671,13 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons
if (mid == rid) {
int64_t p0 = p->rb < bns->l_pac? p->rb : (bns->l_pac<<1) - 1 - p->rb;
int64_t p1 = m->rb < bns->l_pac? m->rb : (bns->l_pac<<1) - 1 - m->rb;
kputw(p0 - p1 + (p0 > p1? 1 : -1), str);
kputw(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0)), str);
} else kputw(0, str);
kputc('\t', str);
} else if (m && is_mapped(p)) { // then copy the position
kputsn("\t=\t", 3, str);
kputuw(pos - bns->anns[rid].offset + 1, str);
kputsn("\t0\t", 3, str);
} else kputsn("\t*\t0\t0\t", 7, str);
if (p->flag&0x100) { // for secondary alignments, don't write SEQ and QUAL
kputsn("*\t*", 3, str);
@ -760,7 +765,12 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b
else if (h.qual > mapq0) h.qual = mapq0;
bwa_hit2sam(&str, opt->mat, opt->q, opt->r, p->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m);
}
} else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP, m);
} else {
bwahit_t h;
memset(&h, 0, sizeof(bwahit_t));
h.rb = h.re = -1; h.flag = extra_flag;
bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m);
}
s->sam = str.s;
}

View File

@ -260,7 +260,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
// check if an end has multiple hits even after mate-SW
for (i = 0; i < 2; ++i) {
for (j = 1; j < a[i].n; ++j)
if (a[i].a[j].secondary < 0) break;
if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) break;
is_multi[i] = j < a[i].n? 1 : 0;
}
if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score
@ -306,7 +306,11 @@ no_pairing:
if (a[i].n && a[i].a[0].score >= opt->T) {
mem_alnreg2hit(&a[i].a[0], &h[i]);
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re);
} else h[i].rb = h[i].re = -1;
} else {
memset(&h[i], 0, sizeof(bwahit_t));
h[i].rb = h[i].re = -1;
h[i].flag = 1<<(6+i) | 1;
}
}
mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41, &h[1]);
mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81, &h[0]);

1
bwt.h
View File

@ -29,6 +29,7 @@
#define BWA_BWT_H
#include <stdint.h>
#include <stddef.h>
// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line because some part of the code assume OCC_INTERVAL=0x80
#define OCC_INTV_SHIFT 7

View File

@ -153,7 +153,7 @@ void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq,
for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k)
target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
lt = j;
score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, 1, &qle, &tle, 0, 0, 0);
score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, 1, &qle, &tle, 0, 0, 0) - 1;
// if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G);
if (score >= p->G) {
p->G = score;

2
main.c
View File

@ -3,7 +3,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.0-r341-beta"
#define PACKAGE_VERSION "0.7.2-r351"
#endif
int bwa_fa2pac(int argc, char *argv[]);