Merge branch 'master' into master_fixes

Conflicts:
	Makefile
	bwape.c
	bwase.c
	bwtsw2_aux.c
	stdaln.c
This commit is contained in:
Rob Davies 2013-03-08 16:46:45 +00:00
commit aabd990e8f
21 changed files with 482 additions and 1461 deletions

View File

@ -4,8 +4,8 @@ CXXFLAGS= $(CFLAGS)
AR= ar
DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64
LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o
AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
is.o bwtindex.o bwape.o kopen.o \
AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
is.o bwtindex.o bwape.o kopen.o pemerge.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
PROG= bwa bwamem-lite
@ -23,10 +23,10 @@ SUBDIRS= .
all:$(PROG)
bwa:libbwa.a $(AOBJS) main.o
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
bwamem-lite:libbwa.a example.o
$(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ $(LIBS) -L. -lbwa
$(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ -L. -lbwa $(LIBS)
libbwa.a:$(LOBJS)
$(AR) -csru $@ $(LOBJS)
@ -41,33 +41,33 @@ depend:
QSufSort.o: QSufSort.h
bamlite.o: utils.h bamlite.h
bntseq.o: bntseq.h main.h utils.h kseq.h
bntseq.o: bntseq.h utils.h kseq.h
bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kseq.h
bwamem.o: kstring.h utils.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h
bwamem.o: ksort.h kbtree.h
bwamem_pair.o: kstring.h utils.h bwamem.h bwt.h bntseq.h bwa.h kvec.h ksw.h
bwape.o: bwtaln.h bwt.h stdaln.h kvec.h bntseq.h utils.h bwase.h bwa.h
bwape.o: khash.h
bwase.o: stdaln.h bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h bwa.h
bwaseqio.o: bwtaln.h bwt.h stdaln.h utils.h bamlite.h kseq.h
bwape.o: bwtaln.h bwt.h kvec.h bntseq.h utils.h bwase.h bwa.h ksw.h khash.h
bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h bwa.h ksw.h
bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h kseq.h
bwt.o: utils.h bwt.h kvec.h
bwt_gen.o: QSufSort.h utils.h
bwt_lite.o: bwt_lite.h utils.h
bwtaln.o: bwtaln.h bwt.h stdaln.h bwtgap.h utils.h bwa.h bntseq.h
bwtgap.o: bwtgap.h bwt.h bwtaln.h stdaln.h utils.h
bwtindex.o: bntseq.h bwt.h main.h utils.h
bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h stdaln.h kstring.h
bwtsw2_aux.o: bwa.h kseq.h ksort.h
bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h
bwtgap.o: bwtgap.h bwt.h bwtaln.h utils.h
bwtindex.o: bntseq.h bwt.h utils.h
bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h bwa.h
bwtsw2_aux.o: ksw.h kseq.h ksort.h
bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h utils.h ksort.h
bwtsw2_core.o: bwt_lite.h bwtsw2.h bntseq.h bwt.h kvec.h utils.h khash.h
bwtsw2_core.o: ksort.h
bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h
bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h ksw.h
example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h utils.h
fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h utils.h kseq.h
is.o: utils.h
kopen.o: utils.h
kstring.o: kstring.h utils.h
ksw.o: ksw.h utils.h
main.o: main.h utils.h
stdaln.o: stdaln.h utils.h
main.o: utils.h
pemerge.o: ksw.h kseq.h utils.h kstring.h bwa.h bntseq.h bwt.h
utils.o: utils.h ksort.h kseq.h

View File

@ -32,7 +32,6 @@
#include <unistd.h>
#include <errno.h>
#include "bntseq.h"
#include "main.h"
#include "utils.h"
#include "kseq.h"

24
bwa.1
View File

@ -1,4 +1,4 @@
.TH bwa 1 "27 Feburary 2013" "bwa-0.7.0" "Bioinformatics tools"
.TH bwa 1 "10 March 2013" "bwa-0.7.1" "Bioinformatics tools"
.SH NAME
.PP
bwa - Burrows-Wheeler Alignment Tool
@ -81,6 +81,8 @@ genome.
.IR minSeedLen ]
.RB [ -w
.IR bandWidth ]
.RB [ -d
.IR zDropoff ]
.RB [ -r
.IR seedSplitRatio ]
.RB [ -c
@ -163,6 +165,21 @@ Band width. Essentially, gaps longer than
will not be found. Note that the maximum gap length is also affected by the
scoring matrix and the hit length, not solely determined by this option. [100]
.TP
.BI -d \ INT
Off-diagonal X-dropoff (Z-dropoff). Stop extension when the difference between
the best and the current extension score is above
.RI | i - j |* A + INT ,
where
.I i
and
.I j
are the current positions of the query and reference, respectively, and
.I A
is the matching score. Z-dropoff is similar to BLAST's X-dropoff except that it
doesn't penalize gaps in one of the sequences in the alignment. Z-dropoff not
only avoids unnecessary extension, but also reduces poor alignments inside a
long good alignment. [100]
.TP
.BI -r \ FLOAT
Trigger re-seeding for a MEM longer than
.IR minSeedLen * FLOAT .
@ -215,6 +232,11 @@ and will be converted to a TAB in the output SAM. The read group ID will be
attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'.
[null]
.TP
.BI -T \ INT
Don't output alignment with score lower than
.IR INT .
This option only affects output. [30]
.TP
.B -a
Output all found alignments for single-end or unpaired paired-end reads. These
alignments will be flagged as secondary alignments.

13
bwa.c
View File

@ -55,7 +55,7 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
kseq2bseq1(ks2, &seqs[n]);
size += seqs[n++].l_seq;
}
if (size >= chunk_size) break;
if (size >= chunk_size && (n&1) == 0) break;
}
if (size == 0) { // test if the 2nd file is finished
if (ks2 && kseq_read(ks2) >= 0)
@ -69,6 +69,17 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
* CIGAR related *
*****************/
void bwa_fill_scmat(int a, int b, int8_t mat[25])
{
int i, j, k;
for (i = k = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
mat[k++] = i == j? a : -b;
mat[k++] = 0; // ambiguous base
}
for (j = 0; j < 5; ++j) mat[k++] = 0;
}
// Generate CIGAR when the alignment end points are known
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
{

1
bwa.h
View File

@ -30,6 +30,7 @@ extern "C" {
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
void bwa_fill_scmat(int a, int b, int8_t mat[25]);
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re);

View File

@ -44,6 +44,7 @@ mem_opt_t *mem_opt_init()
o = xcalloc(1, sizeof(mem_opt_t));
o->flag = 0;
o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100;
o->T = 30;
o->zdrop = 100;
o->pen_unpaired = 9;
o->pen_clip = 5;
@ -58,21 +59,10 @@ mem_opt_t *mem_opt_init()
o->chunk_size = 10000000;
o->n_threads = 1;
o->max_matesw = 100;
mem_fill_scmat(o->a, o->b, o->mat);
bwa_fill_scmat(o->a, o->b, o->mat);
return o;
}
void mem_fill_scmat(int a, int b, int8_t mat[25])
{
int i, j, k;
for (i = k = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
mat[k++] = i == j? a : -b;
mat[k++] = 0; // ambiguous base
}
for (j = 0; j < 5; ++j) mat[k++] = 0;
}
/***************************
* SMEM iterator interface *
***************************/
@ -753,11 +743,12 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b
int k;
kstring_t str;
str.l = str.m = 0; str.s = 0;
if (a->n > 0) {
if (a->n > 0 && a->a[0].score >= opt->T) {
int mapq0 = -1;
for (k = 0; k < a->n; ++k) {
bwahit_t h;
mem_alnreg_t *p = &a->a[k];
if (p->score < opt->T) continue;
if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue;
if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue;
mem_alnreg2hit(p, &h);

View File

@ -24,6 +24,7 @@ typedef struct {
int w; // band width
int zdrop; // Z-dropoff
int T; // output score threshold; only affecting output
int flag; // see MEM_F_* macros
int min_seed_len; // minimum seed length
float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor

View File

@ -303,7 +303,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
no_pairing:
for (i = 0; i < 2; ++i) {
if (a[i].n) {
if (a[i].n && a[i].a[0].score >= opt->T) {
mem_alnreg2hit(&a[i].a[0], &h[i]);
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re);
} else h[i].rb = h[i].re = -1;

41
bwape.c
View File

@ -8,9 +8,9 @@
#include "kvec.h"
#include "bntseq.h"
#include "utils.h"
#include "stdaln.h"
#include "bwase.h"
#include "bwa.h"
#include "ksw.h"
typedef struct {
int n;
@ -397,16 +397,17 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
#define SW_MIN_MAPQ 17
// cnt = n_mm<<16 | n_gapo<<8 | n_gape
bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen,
int *n_cigar, uint32_t *_cnt)
bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, int *n_cigar, uint32_t *_cnt)
{
kswr_t r;
uint32_t *cigar32 = 0;
bwa_cigar_t *cigar = 0;
ubyte_t *ref_seq;
bwtint_t k, x, y, l;
int path_len, ret, subo;
AlnParam ap = aln_param_bwa;
path_t *path, *p;
int xtra;
int8_t mat[25];
bwa_fill_scmat(1, 3, mat);
// check whether there are too many N's
if (reglen < SW_MIN_MATCH_LEN || (int64_t)l_pac - *beg < len) return 0;
for (k = 0, x = 0; k < len; ++k)
@ -417,15 +418,19 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u
ref_seq = (ubyte_t*)xcalloc(reglen, 1);
for (k = *beg, l = 0; l < reglen && k < l_pac; ++k)
ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
path = (path_t*)xcalloc(l+len, sizeof(path_t));
// do alignment
ret = aln_local_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len, 1, &subo);
if (ret < 0 || subo == ret) { // no hit or tandem hits
free(path); free(cigar); free(ref_seq); *n_cigar = 0;
xtra = KSW_XSUBO | KSW_XSTART | (len < 250? KSW_XBYTE : 0);
r = ksw_align(len, (uint8_t*)seq, l, ref_seq, 5, mat, 5, 1, xtra, 0);
ksw_global(r.qe - r.qb + 1, &seq[r.qb], r.te - r.tb + 1, &ref_seq[r.tb], 5, mat, 5, 1, 50, n_cigar, &cigar32);
cigar = (bwa_cigar_t*)cigar32;
for (k = 0; k < *n_cigar; ++k)
cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4));
if (r.score < SW_MIN_MATCH_LEN || r.score2 == r.score) { // poor hit or tandem hits
free(cigar); free(ref_seq); *n_cigar = 0;
return 0;
}
cigar = bwa_aln_path2cigar(path, path_len, n_cigar);
// check whether the alignment is good enough
for (k = 0, x = y = 0; k < *n_cigar; ++k) {
@ -435,17 +440,14 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u
else y += __cigar_len(c);
}
if (x < SW_MIN_MATCH_LEN || y < SW_MIN_MATCH_LEN) { // not good enough
free(path); free(cigar); free(ref_seq);
free(cigar); free(ref_seq);
*n_cigar = 0;
return 0;
}
{ // update cigar and coordinate;
int start, end;
p = path + path_len - 1;
*beg += (p->i? p->i : 1) - 1;
start = (p->j? p->j : 1) - 1;
end = path->j;
int start = r.qb, end = r.qe + 1;
*beg += r.tb;
cigar = (bwa_cigar_t*)xrealloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2));
if (start) {
memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar));
@ -462,8 +464,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u
{ // set *cnt
int n_mm, n_gapo, n_gape;
n_mm = n_gapo = n_gape = 0;
p = path + path_len - 1;
x = p->i? p->i - 1 : 0; y = p->j? p->j - 1 : 0;
x = r.tb; y = r.qb;
for (k = 0; k < *n_cigar; ++k) {
bwa_cigar_t c = cigar[k];
if (__cigar_op(c) == FROM_M) {
@ -479,7 +480,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u
*_cnt = (uint32_t)n_mm<<16 | n_gapo<<8 | n_gape;
}
free(ref_seq); free(path);
free(ref_seq);
return cigar;
}

100
bwase.c
View File

@ -4,13 +4,13 @@
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include "stdaln.h"
#include "bwase.h"
#include "bwtaln.h"
#include "bntseq.h"
#include "utils.h"
#include "kstring.h"
#include "bwa.h"
#include "ksw.h"
int g_log_n[256];
@ -156,57 +156,58 @@ void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_se
bwt_destroy(bwt);
}
/* is_end_correct == 1 if (*pos+len) gives the correct coordinate on
* forward strand. This happens when p->pos is calculated by
* bwa_cal_pac_pos(). is_end_correct==0 if (*pos) gives the correct
* coordinate. This happens only for color-converted alignment. */
bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, bwtint_t *_pos,
int ext, int *n_cigar, int is_end_correct)
#define SW_BW 50
bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, bwtint_t *_pos, int *n_cigar, int is_rev)
{
bwa_cigar_t *cigar = 0;
ubyte_t *ref_seq;
int l = 0, path_len, ref_len;
AlnParam ap = aln_param_bwa;
path_t *path;
int64_t k, __pos = *_pos;
uint32_t *cigar32 = 0;
ubyte_t *rseq;
int tle, qle, gtle, gscore;
int64_t k, rb, re, rlen;
int8_t mat[25];
ref_len = len + abs(ext);
if (ext > 0) {
ref_seq = (ubyte_t*)xcalloc(ref_len, 1);
for (k = __pos; k < __pos + ref_len && k < l_pac; ++k)
ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
} else {
int64_t x = __pos + (is_end_correct? len : ref_len);
ref_seq = (ubyte_t*)xcalloc(ref_len, 1);
for (l = 0, k = x - ref_len > 0? x - ref_len : 0; k < x && k < l_pac; ++k)
ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
}
path = (path_t*)xcalloc(l+len, sizeof(path_t));
aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len);
cigar = bwa_aln_path2cigar(path, path_len, n_cigar);
if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped to the forward strand
for (l = k = 0; k < *n_cigar; ++k) {
if (__cigar_op(cigar[k]) == FROM_D) l -= __cigar_len(cigar[k]);
else if (__cigar_op(cigar[k]) == FROM_I) l += __cigar_len(cigar[k]);
bwa_fill_scmat(1, 3, mat);
if (!is_rev) { // forward strand, the end position is correct
re = *_pos + len;
if (re > l_pac) re = l_pac;
rb = re - (len + SW_BW);
if (rb < 0) rb = 0;
rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen);
seq_reverse(len, seq, 0); // as we need to do left extension, we have to reverse both query and reference sequences
seq_reverse(rlen, rseq, 0);
ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, &gtle, &gscore, 0);
if (gscore > 0) tle = gtle, qle = len;
rb = re - tle; rlen = tle;
seq_reverse(len, seq, 0);
seq_reverse(rlen, rseq, 0);
ksw_global(qle, &seq[len-qle], rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32);
if (qle < len) { // write soft clip
cigar = xrealloc(cigar, (*n_cigar + 1) * 4);
memmove(cigar + 1, cigar, *n_cigar * 4);
cigar[0] = (len - qle)<<4 | FROM_S;
++(*n_cigar);
}
} else { // reverse strand, the start position is correct
rb = *_pos; re = rb + len + SW_BW;
if (re > l_pac) re = l_pac;
rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen);
ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, &gtle, &gscore, 0);
if (gscore > 0) tle = gtle, qle = len;
re = rb + tle; rlen = tle;
ksw_global(qle, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); // right extension
if (qle < len) {
cigar = xrealloc(cigar, (*n_cigar + 1) * 4);
cigar[*n_cigar - 1] = (len - qle)<<4 | FROM_S;
++(*n_cigar);
}
__pos += l;
}
*_pos = rb;
if (__cigar_op(cigar[0]) == FROM_D) { // deletion at the 5'-end
__pos += __cigar_len(cigar[0]);
for (k = 0; k < *n_cigar - 1; ++k) cigar[k] = cigar[k+1];
--(*n_cigar);
}
if (__cigar_op(cigar[*n_cigar-1]) == FROM_D) --(*n_cigar); // deletion at the 3'-end
// change "I" at either end of the read to S. just in case. This should rarely happen...
if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(3, (__cigar_len(cigar[*n_cigar-1])));
if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(3, (__cigar_len(cigar[0])));
*_pos = (bwtint_t)__pos;
free(ref_seq); free(path);
cigar = (bwa_cigar_t*)cigar32;
for (k = 0; k < *n_cigar; ++k)
cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4));
free(rseq);
return cigar;
}
@ -314,13 +315,11 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t
bwt_multi1_t *q = s->multi + j;
int n_cigar;
if (q->gap == 0) continue;
q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos,
(q->strand? 1 : -1) * q->gap, &n_cigar, 1);
q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos, &n_cigar, q->strand);
q->n_cigar = n_cigar;
}
if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue;
s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos,
(s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1);
s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, &s->n_cigar, s->strand);
}
// generate MD tag
str = (kstring_t*)xcalloc(1, sizeof(kstring_t));
@ -606,5 +605,6 @@ int bwa_sai2sam_se(int argc, char *argv[])
return 0;
}
bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line);
free(prefix);
return 0;
}

View File

@ -312,18 +312,3 @@ int bwa_aln(int argc, char *argv[])
free(opt); free(prefix);
return 0;
}
/* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t,
__cigar_op and __cigar_len while keeping stdaln stand alone */
bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar)
{
uint32_t *cigar32;
bwa_cigar_t *cigar;
int i;
cigar32 = aln_path2cigar32((path_t*) path, path_len, n_cigar);
cigar = (bwa_cigar_t*)cigar32;
for (i = 0; i < *n_cigar; ++i)
cigar[i] = __cigar_create( (cigar32[i]&0xf), (cigar32[i]>>4) );
return cigar;
}

View File

@ -28,6 +28,11 @@
#define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3)
#endif
#define FROM_M 0
#define FROM_I 1
#define FROM_D 2
#define FROM_S 3
typedef struct {
bwtint_t w;
int bid;
@ -138,13 +143,6 @@ extern "C" {
void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac);
/* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t,
__cigar_op and __cigar_len while keeping stdaln stand alone */
#include "stdaln.h"
bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar);
#ifdef __cplusplus
}
#endif

View File

@ -33,7 +33,6 @@
#include <zlib.h>
#include "bntseq.h"
#include "bwt.h"
#include "main.h"
#include "utils.h"
#ifdef _DIVBWT

View File

@ -11,9 +11,9 @@
#include "bwt_lite.h"
#include "utils.h"
#include "bwtsw2.h"
#include "stdaln.h"
#include "kstring.h"
#include "bwa.h"
#include "ksw.h"
#include "kseq.h"
KSEQ_DECLARE(gzFile)
@ -94,13 +94,12 @@ bwtsw2_t *bsw2_dup_no_cigar(const bwtsw2_t *b)
void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem)
{
int i, matrix[25];
int i;
bwtint_t k;
uint8_t *target = 0, *query;
AlnParam par;
int8_t mat[25];
par.matrix = matrix;
__gen_ap(par, opt);
bwa_fill_scmat(opt->a, opt->b, mat);
query = xcalloc(lq, 1);
// sort according to the descending order of query end
ks_introsort(hit, b->n, b->hits);
@ -111,8 +110,7 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq
for (i = 0; i < b->n; ++i) {
bsw2hit_t *p = b->hits + i;
int lt = ((p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq;
int score, j;
path_t path;
int score, j, qle, tle;
p->n_seeds = 1;
if (p->l || p->k == 0) continue;
for (j = score = 0; j < i; ++j) {
@ -127,12 +125,12 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq
for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered!
target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
lt = j;
score = aln_extend_core(target, lt, query + lq - p->beg, p->beg, &par, &path, 0, p->G, _mem);
score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, p->G, &qle, &tle, 0, 0, 0);
if (score > p->G) { // extensible
p->G = score;
p->len += path.i;
p->beg -= path.j;
p->k -= path.i;
p->k -= tle;
p->len += tle;
p->beg -= qle;
}
}
free(query); free(target);
@ -140,62 +138,49 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq
void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem)
{
int i, matrix[25];
int i;
bwtint_t k;
uint8_t *target;
AlnParam par;
par.matrix = matrix;
__gen_ap(par, opt);
int8_t mat[25];
bwa_fill_scmat(opt->a, opt->b, mat);
target = xcalloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1);
for (i = 0; i < b->n; ++i) {
bsw2hit_t *p = b->hits + i;
int lt = ((lq - p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq;
int j, score;
path_t path;
int j, score, qle, tle;
if (p->l) continue;
for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k)
target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
lt = j;
score = aln_extend_core(target, lt, query + p->beg, lq - p->beg, &par, &path, 0, 1, _mem);
score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, 1, &qle, &tle, 0, 0, 0);
// if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G);
if (score >= p->G) {
p->G = score;
p->len = path.i;
p->end = path.j + p->beg;
p->len = tle;
p->end = p->beg + qle;
}
}
free(target);
}
/* generate CIGAR array(s) in b->cigar[] */
static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b, const char *name)
static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], int64_t l_pac, const uint8_t *pac, bwtsw2_t *b, const char *name)
{
uint8_t *target;
int i, matrix[25];
AlnParam par;
path_t *path;
int i;
int8_t mat[25];
par.matrix = matrix;
__gen_ap(par, opt);
i = ((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq; // maximum possible target length
target = xcalloc(i, 1);
path = xcalloc(i + lq, sizeof(path_t));
// generate CIGAR
bwa_fill_scmat(opt->a, opt->b, mat);
for (i = 0; i < b->n; ++i) {
bsw2hit_t *p = b->hits + i;
bsw2aux_t *q = b->aux + i;
uint8_t *query;
bwtint_t k;
int path_len, beg, end;
int beg, end, score;
if (p->l) continue;
beg = (p->flag & 0x10)? lq - p->end : p->beg;
end = (p->flag & 0x10)? lq - p->beg : p->end;
query = seq[(p->flag & 0x10)? 1 : 0] + beg;
for (k = p->k; k < p->k + p->len; ++k) // in principle, no out-of-boundary here
target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3;
aln_global_core(target, p->len, query, end - beg, &par, path, &path_len);
q->cigar = aln_path2cigar32(path, path_len, &q->n_cigar);
q->cigar = bwa_gen_cigar(mat, opt->q, opt->r, opt->bw, l_pac, pac, end - beg, query, p->k, p->k + p->len, &score, &q->n_cigar, &q->nm);
#if 0
if (name && score != p->G) { // debugging only
int j, glen = 0;
@ -206,7 +191,7 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8
__func__, name, score, p->G, lq, end - beg, p->len, glen, opt->bw);
}
#endif
if (beg != 0 || end < lq) { // write soft clipping
if (q->cigar && (beg != 0 || end < lq)) { // write soft clipping
q->cigar = xrealloc(q->cigar, 4 * (q->n_cigar + 2));
if (beg != 0) {
memmove(q->cigar + 1, q->cigar, q->n_cigar * 4);
@ -219,7 +204,6 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8
}
}
}
free(target); free(path);
}
/* this is for the debugging purpose only */
@ -407,27 +391,6 @@ static int fix_cigar(const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *c
return n_cigar;
}
static int compute_nm(bsw2hit_t *p, int n_cigar, const uint32_t *cigar, const uint8_t *pac, const uint8_t *seq)
{
int k, x, n_mm = 0, i, n_gap = 0;
bwtint_t y;
x = 0; y = p->k;
for (k = 0; k < n_cigar; ++k) {
int op = cigar[k]&0xf;
int len = cigar[k]>>4;
if (op == 0) { // match
for (i = 0; i < len; ++i) {
int ref = pac[(y+i)>>2] >> (~(y+i)&3)*2 & 0x3;
if (seq[x + i] != ref) ++n_mm;
}
x += len; y += len;
} else if (op == 1) x += len, n_gap += len;
else if (op == 2) y += len, n_gap += len;
else if (op == 4) x += len;
}
return n_mm + n_gap;
}
static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b, const char *name)
{
int i;
@ -439,7 +402,7 @@ static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8
}
b->aux = xcalloc(b->n, sizeof(bsw2aux_t));
// generate CIGAR
gen_cigar(opt, qlen, seq, pac, b, name);
gen_cigar(opt, qlen, seq, bns->l_pac, pac, b, name);
// fix CIGAR, generate mapQ, and write chromosomal position
for (i = 0; i < b->n; ++i) {
bsw2hit_t *p = &b->hits[i];
@ -451,8 +414,6 @@ static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8
int subo;
// fix out-of-boundary CIGAR
q->n_cigar = fix_cigar(bns, p, q->n_cigar, q->cigar);
// compute the NM tag
q->nm = compute_nm(p, q->n_cigar, q->cigar, pac, seq[p->is_rev]);
// compute mapQ
subo = p->G2 > opt->t? p->G2 : opt->t;
if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5;
@ -527,9 +488,10 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks
bsw2aux_t *q = b->aux + i;
int j, beg, end, type = 0;
// print mandatory fields before SEQ
if (q->cigar == 0) q->flag |= 0x4;
ksprintf(&str, "%s\t%d", ks->name, q->flag | (opt->multi_2nd && i? 0x100 : 0));
ksprintf(&str, "\t%s\t%ld", q->chr>=0? bns->anns[q->chr].name : "*", (long)q->pos + 1);
if (p->l == 0) { // not a repetitive hit
if (p->l == 0 && q->cigar) { // not a repetitive hit
ksprintf(&str, "\t%d\t", q->pqual);
for (k = 0; k < q->n_cigar; ++k)
ksprintf(&str, "%d%c", q->cigar[k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[q->cigar[k]&0xf]);
@ -538,7 +500,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks
else ksprintf(&str, "\t%s\t%d\t%d\t", q->mchr==q->chr? "=" : (q->mchr<0? "*" : bns->anns[q->mchr].name), q->mpos+1, q->isize);
// get the sequence begin and end
beg = 0; end = ks->l;
if (opt->hard_clip) {
if (opt->hard_clip && q->cigar) {
if ((q->cigar[0]&0xf) == 4) beg += q->cigar[0]>>4;
if ((q->cigar[q->n_cigar-1]&0xf) == 4) end -= q->cigar[q->n_cigar-1]>>4;
}

View File

@ -27,13 +27,14 @@ int main_mem(int argc, char *argv[])
void *ko = 0, *ko2 = 0;
opt = mem_opt_init();
while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:")) >= 0) {
while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:")) >= 0) {
if (c == 'k') opt->min_seed_len = atoi(optarg);
else if (c == 'w') opt->w = atoi(optarg);
else if (c == 'A') opt->a = atoi(optarg);
else if (c == 'B') opt->b = atoi(optarg);
else if (c == 'O') opt->q = atoi(optarg);
else if (c == 'E') opt->r = atoi(optarg);
else if (c == 'T') opt->T = atoi(optarg);
else if (c == 'L') opt->pen_clip = atoi(optarg);
else if (c == 'U') opt->pen_unpaired = atoi(optarg);
else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
@ -59,7 +60,7 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len);
fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
fprintf(stderr, " -d INT off-diagnal X-dropoff [%d]\n", opt->zdrop);
fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop);
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
@ -75,6 +76,7 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
fprintf(stderr, "\n");
fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T);
fprintf(stderr, " -a output all alignments for SE or unpaired PE\n");
fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n");
fprintf(stderr, " -H hard clipping\n");
@ -85,7 +87,7 @@ int main_mem(int argc, char *argv[])
return 1;
}
mem_fill_scmat(opt->a, opt->b, opt->mat);
bwa_fill_scmat(opt->a, opt->b, opt->mat);
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
bwa_print_sam_hdr(idx->bns, rg_line);
@ -105,6 +107,11 @@ int main_mem(int argc, char *argv[])
}
while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) {
int64_t size = 0;
if ((opt->flag & MEM_F_PE) && (n&1) == 1) {
if (bwa_verbose >= 2)
fprintf(stderr, "[W::%s] odd number of reads in the PE mode; last read dropped\n", __func__);
n = n>>1<<1;
}
if (!copy_comment)
for (i = 0; i < n; ++i) {
free(seqs[i].comment); seqs[i].comment = 0;

2
ksw.c
View File

@ -427,7 +427,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
max_ie = gscore > h1? max_ie : i;
gscore = gscore > h1? gscore : h1;
}
if (m == 0 || max - m - abs((i - max_i) - (j - max_j)) * gape > zdrop) break; // drop to zero, or below Z-dropoff
if (m == 0 || (zdrop > 0 && max - m - abs((i - max_i) - (j - max_j)) * gape > zdrop)) break; // drop to zero, or below Z-dropoff
if (m > max) {
max = m, max_i = i, max_j = mj;
max_off = max_off > abs(mj - i)? max_off : abs(mj - i);

27
main.c
View File

@ -1,12 +1,29 @@
#include <stdio.h>
#include <string.h>
#include "main.h"
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.0-r324-beta"
#define PACKAGE_VERSION "0.7.0-r341-beta"
#endif
int bwa_fa2pac(int argc, char *argv[]);
int bwa_pac2bwt(int argc, char *argv[]);
int bwa_bwtupdate(int argc, char *argv[]);
int bwa_bwt2sa(int argc, char *argv[]);
int bwa_index(int argc, char *argv[]);
int bwt_bwtgen_main(int argc, char *argv[]);
int bwa_aln(int argc, char *argv[]);
int bwa_sai2sam_se(int argc, char *argv[]);
int bwa_sai2sam_pe(int argc, char *argv[]);
int bwa_bwtsw2(int argc, char *argv[]);
int main_fastmap(int argc, char *argv[]);
int main_mem(int argc, char *argv[]);
int main_pemerge(int argc, char *argv[]);
static int usage()
{
fprintf(stderr, "\n");
@ -15,12 +32,13 @@ static int usage()
fprintf(stderr, "Contact: Heng Li <lh3@sanger.ac.uk>\n\n");
fprintf(stderr, "Usage: bwa <command> [options]\n\n");
fprintf(stderr, "Command: index index sequences in the FASTA format\n");
fprintf(stderr, " mem BWA-MEM algorithm\n");
fprintf(stderr, " fastmap identify super-maximal exact matches\n");
fprintf(stderr, " pemerge merge overlapping paired ends (EXPERIMENTAL)\n");
fprintf(stderr, " aln gapped/ungapped alignment\n");
fprintf(stderr, " samse generate alignment (single ended)\n");
fprintf(stderr, " sampe generate alignment (paired ended)\n");
fprintf(stderr, " bwasw BWA-SW for long queries\n");
fprintf(stderr, " fastmap identify super-maximal exact matches\n");
fprintf(stderr, " mem BWA-MEM algorithm\n");
fprintf(stderr, "\n");
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
@ -56,6 +74,7 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1);
else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1);
else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1);
else {
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
return 1;

28
main.h
View File

@ -1,28 +0,0 @@
#ifndef BWA_MAIN_H
#define BWA_MAIN_H
#ifdef __cplusplus
extern "C" {
#endif
int bwa_fa2pac(int argc, char *argv[]);
int bwa_pac2bwt(int argc, char *argv[]);
int bwa_bwtupdate(int argc, char *argv[]);
int bwa_bwt2sa(int argc, char *argv[]);
int bwa_index(int argc, char *argv[]);
int bwa_aln(int argc, char *argv[]);
int bwt_bwtgen_main(int argc, char *argv[]);
int bwa_sai2sam_se(int argc, char *argv[]);
int bwa_sai2sam_pe(int argc, char *argv[]);
int bwa_bwtsw2(int argc, char *argv[]);
int main_fastmap(int argc, char *argv[]);
int main_mem(int argc, char *argv[]);
#ifdef __cplusplus
}
#endif
#endif

286
pemerge.c 100644
View File

@ -0,0 +1,286 @@
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <zlib.h>
#include <pthread.h>
#include <errno.h>
#include "ksw.h"
#include "kseq.h"
#include "kstring.h"
#include "bwa.h"
#include "utils.h"
KSEQ_DECLARE(gzFile)
#define MAX_SCORE_RATIO 0.9f
#define MAX_ERR 8
static const char *err_msg[MAX_ERR+1] = {
"successful merges",
"low-scoring pairs",
"pairs where the best SW alignment is not an overlap (long left end)",
"pairs where the best SW alignment is not an overlap (long right end)",
"pairs with large 2nd best SW score",
"pairs with gapped overlap",
"pairs where the end-to-end alignment is inconsistent with SW",
"pairs potentially with tandem overlaps",
"pairs with high sum of errors"
};
typedef struct {
int a, b, q, r, w;
int q_def, q_thres;
int T;
int chunk_size;
int n_threads;
int flag; // bit 1: print merged; 2: print unmerged
int8_t mat[25];
} pem_opt_t;
pem_opt_t *pem_opt_init()
{
pem_opt_t *opt;
opt = xcalloc(1, sizeof(pem_opt_t));
opt->a = 5; opt->b = 4; opt->q = 2, opt->r = 17; opt->w = 20;
opt->T = opt->a * 10;
opt->q_def = 20;
opt->q_thres = 70;
opt->chunk_size = 10000000;
opt->n_threads = 1;
opt->flag = 3;
bwa_fill_scmat(opt->a, opt->b, opt->mat);
return opt;
}
int bwa_pemerge(const pem_opt_t *opt, bseq1_t x[2])
{
uint8_t *s[2], *q[2], *seq, *qual;
int i, xtra, l, l_seq, sum_q, ret = 0;
kswr_t r;
s[0] = xmalloc(x[0].l_seq); q[0] = xmalloc(x[0].l_seq);
s[1] = xmalloc(x[1].l_seq); q[1] = xmalloc(x[1].l_seq);
for (i = 0; i < x[0].l_seq; ++i) {
int c = x[0].seq[i];
s[0][i] = c < 0 || c > 127? 4 : c <= 4? c : nst_nt4_table[c];
q[0][i] = x[0].qual? x[0].qual[i] - 33 : opt->q_def;
}
for (i = 0; i < x[1].l_seq; ++i) {
int c = x[1].seq[x[1].l_seq - 1 - i];
c = c < 0 || c > 127? 4 : c < 4? c : nst_nt4_table[c];
s[1][i] = c < 4? 3 - c : 4;
q[1][i] = x[1].qual? x[1].qual[x[1].l_seq - 1 - i] - 33 : opt->q_def;
}
xtra = KSW_XSTART | KSW_XSUBO;
r = ksw_align(x[1].l_seq, s[1], x[0].l_seq, s[0], 5, opt->mat, opt->q, opt->r, xtra, 0);
++r.qe; ++r.te; // change to the half-close-half-open coordinates
if (r.score < opt->T) { ret = -1; goto pem_ret; } // poor alignment
if (r.tb < r.qb) { ret = -2; goto pem_ret; } // no enough space for the left end
if (x[0].l_seq - r.te > x[1].l_seq - r.qe) { ret = -3; goto pem_ret; } // no enough space for the right end
if ((double)r.score2 / r.score >= MAX_SCORE_RATIO) { ret = -4; goto pem_ret; } // the second best score is too large
if (r.qe - r.qb != r.te - r.tb) { ret = -5; goto pem_ret; } // we do not allow gaps
{ // test tandem match; O(n^2)
int max_m, max_m2, min_l, max_l, max_l2;
max_m = max_m2 = 0; max_l = max_l2 = 0;
min_l = x[0].l_seq < x[1].l_seq? x[0].l_seq : x[1].l_seq;
for (l = 1; l < min_l; ++l) {
int m = 0, o = x[0].l_seq - l;
uint8_t *s0o = &s[0][o], *s1 = s[1];
for (i = 0; i < l; ++i) // TODO: in principle, this can be done with SSE2. It is the bottleneck!
m += opt->mat[(s1[i]<<2) + s1[i] + s0o[i]]; // equivalent to s[1][i]*5 + s[0][o+i]
if (m > max_m) max_m2 = max_m, max_m = m, max_l2 = max_l, max_l = l;
else if (m > max_m2) max_m2 = m, max_l2 = l;
}
if (max_m < opt->T || max_l != x[0].l_seq - (r.tb - r.qb)) { ret = -6; goto pem_ret; }
if (max_l2 < max_l && max_m2 >= opt->T && (double)(max_m2 + (max_l - max_l2) * opt->a) / max_m >= MAX_SCORE_RATIO) {
ret = -7; goto pem_ret;
}
if (max_l2 > max_l && (double)max_m2 / max_m >= MAX_SCORE_RATIO) { ret = -7; goto pem_ret; }
}
l = x[0].l_seq - (r.tb - r.qb); // length to merge
l_seq = x[0].l_seq + x[1].l_seq - l;
seq = xmalloc(l_seq + 1);
qual = xmalloc(l_seq + 1);
memcpy(seq, s[0], x[0].l_seq); memcpy(seq + x[0].l_seq, &s[1][l], x[1].l_seq - l);
memcpy(qual, q[0], x[0].l_seq); memcpy(qual + x[0].l_seq, &q[1][l], x[1].l_seq - l);
for (i = 0, sum_q = 0; i < l; ++i) {
int k = x[0].l_seq - l + i;
if (s[0][k] == 4) { // ambiguous
seq[k] = s[1][i];
qual[k] = q[1][i];
} else if (s[1][i] == 4) { // do nothing
} else if (s[0][k] == s[1][i]) {
qual[k] = qual[k] > q[1][i]? qual[k] : q[1][i];
} else { // s[0][k] != s[1][i] and neither is N
int qq = q[0][k] < q[1][i]? q[0][k] : q[1][i];
sum_q += qq >= 3? qq<<1 : 1;
seq[k] = q[0][k] > q[1][i]? s[0][k] : s[1][i];
qual[k] = abs((int)q[0][k] - (int)q[1][i]);
}
}
if (sum_q>>1 > opt->q_thres) { // too many mismatches
free(seq); free(qual);
ret = -8; goto pem_ret;
}
for (i = 0; i < l_seq; ++i) seq[i] = "ACGTN"[(int)seq[i]], qual[i] += 33;
seq[l_seq] = qual[l_seq] = 0;
free(x[1].name); free(x[1].seq); free(x[1].qual); free(x[1].comment);
memset(&x[1], 0, sizeof(bseq1_t));
free(x[0].seq); free(x[0].qual);
x[0].l_seq = l_seq; x[0].seq = (char*)seq; x[0].qual = (char*)qual;
pem_ret:
free(s[0]); free(s[1]); free(q[0]); free(q[1]);
return ret;
}
static inline void print_bseq(const bseq1_t *s, int rn)
{
err_putchar(s->qual? '@' : '>');
err_fputs(s->name, stdout);
if (rn == 1 || rn == 2) {
err_putchar('/'); err_putchar('0' + rn); err_putchar('\n');
} else err_puts(" merged");
err_puts(s->seq);
if (s->qual) {
err_puts("+"); err_puts(s->qual);
}
}
typedef struct {
int n, start;
bseq1_t *seqs;
int64_t cnt[MAX_ERR+1];
const pem_opt_t *opt;
} worker_t;
void *worker(void *data)
{
worker_t *w = (worker_t*)data;
int i;
for (i = w->start; i < w->n>>1; i += w->opt->n_threads)
++w->cnt[-bwa_pemerge(w->opt, &w->seqs[i<<1])];
return 0;
}
static void process_seqs(const pem_opt_t *opt, int n_, bseq1_t *seqs, int64_t cnt[MAX_ERR+1])
{
int i, j, n = n_>>1<<1;
worker_t *w;
w = xcalloc(opt->n_threads, sizeof(worker_t));
for (i = 0; i < opt->n_threads; ++i) {
worker_t *p = &w[i];
p->start = i; p->n = n;
p->opt = opt;
p->seqs = seqs;
}
if (opt->n_threads == 1) {
worker(w);
} else {
pthread_t *tid;
tid = (pthread_t*)xcalloc(opt->n_threads, sizeof(pthread_t));
for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker, &w[i]);
for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0);
free(tid);
}
for (i = 0; i < opt->n_threads; ++i) {
worker_t *p = &w[i];
for (j = 0; j <= MAX_ERR; ++j) cnt[j] += p->cnt[j];
}
free(w);
for (i = 0; i < n>>1; ++i) {
if (seqs[i<<1|1].l_seq != 0) {
if (opt->flag&2) {
print_bseq(&seqs[i<<1|0], 1);
print_bseq(&seqs[i<<1|1], 2);
}
} else if (opt->flag&1)
print_bseq(&seqs[i<<1|0], 0);
}
for (i = 0; i < n; ++i) {
bseq1_t *s = &seqs[i];
free(s->name); free(s->seq); free(s->qual); free(s->comment);
}
}
int main_pemerge(int argc, char *argv[])
{
int c, flag = 0, i, n, min_ovlp = 10;
int64_t cnt[MAX_ERR+1];
bseq1_t *bseq;
gzFile fp, fp2 = 0;
kseq_t *ks, *ks2 = 0;
pem_opt_t *opt;
opt = pem_opt_init();
while ((c = getopt(argc, argv, "muQ:t:T:")) >= 0) {
if (c == 'm') flag |= 1;
else if (c == 'u') flag |= 2;
else if (c == 'Q') opt->q_thres = atoi(optarg);
else if (c == 't') opt->n_threads = atoi(optarg);
else if (c == 'T') min_ovlp = atoi(optarg);
}
if (flag == 0) flag = 3;
opt->flag = flag;
opt->T = opt->a * min_ovlp;
if (optind == argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: bwa pemerge [-mu] <read1.fq> [read2.fq]\n\n");
fprintf(stderr, "Options: -m output merged reads only\n");
fprintf(stderr, " -u output unmerged reads only\n");
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
fprintf(stderr, " -T INT minimum end overlap [%d]\n", min_ovlp);
fprintf(stderr, " -Q INT max sum of errors [%d]\n", opt->q_thres);
fprintf(stderr, "\n");
free(opt);
return 1;
}
fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
if (NULL == fp) {
fprintf(stderr, "Couldn't open %s : %s\n",
strcmp(argv[optind], "-") ? argv[optind] : "stdin",
errno ? strerror(errno) : "Out of memory");
exit(EXIT_FAILURE);
}
ks = kseq_init(fp);
if (optind + 1 < argc) {
fp2 = strcmp(argv[optind+1], "-")? gzopen(argv[optind+1], "r") : gzdopen(fileno(stdin), "r");
if (NULL == fp) {
fprintf(stderr, "Couldn't open %s : %s\n",
strcmp(argv[optind+1], "-") ? argv[optind+1] : "stdin",
errno ? strerror(errno) : "Out of memory");
exit(EXIT_FAILURE);
}
ks2 = kseq_init(fp2);
}
memset(cnt, 0, 8 * (MAX_ERR+1));
while ((bseq = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) {
process_seqs(opt, n, bseq, cnt);
free(bseq);
}
fprintf(stderr, "%12ld %s\n", (long)cnt[0], err_msg[0]);
for (i = 1; i <= MAX_ERR; ++i)
fprintf(stderr, "%12ld %s\n", (long)cnt[i], err_msg[i]);
kseq_destroy(ks);
err_gzclose(fp);
if (ks2) {
kseq_destroy(ks2);
err_gzclose(fp2);
}
free(opt);
err_fflush(stdout);
return 0;
}

1071
stdaln.c

File diff suppressed because it is too large Load Diff

162
stdaln.h
View File

@ -1,162 +0,0 @@
/* The MIT License
Copyright (c) 2003-2006, 2008, by Heng Li <lh3lh3@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/*
2009-07-23, 0.10.0
- Use 32-bit to store CIGAR
- Report suboptimal aligments
- Implemented half-fixed-half-open DP
2009-04-26, 0.9.10
- Allow to set a threshold for local alignment
2009-02-18, 0.9.9
- Fixed a bug when no residue matches
2008-08-04, 0.9.8
- Fixed the wrong declaration of aln_stdaln_aux()
- Avoid 0 coordinate for global alignment
2008-08-01, 0.9.7
- Change gap_end penalty to 5 in aln_param_bwa
- Add function to convert path_t to the CIGAR format
2008-08-01, 0.9.6
- The first gap now costs (gap_open+gap_ext), instead of
gap_open. Scoring systems are modified accordingly.
- Gap end is now correctly handled. Previously it is not correct.
- Change license to MIT.
*/
#ifndef LH3_STDALN_H_
#define LH3_STDALN_H_
#define STDALN_VERSION 0.11.0
#include <stdint.h>
#define FROM_M 0
#define FROM_I 1
#define FROM_D 2
#define FROM_S 3
#define ALN_TYPE_LOCAL 0
#define ALN_TYPE_GLOBAL 1
#define ALN_TYPE_EXTEND 2
/* This is the smallest integer. It might be CPU-dependent in very RARE cases. */
#define MINOR_INF -1073741823
typedef struct
{
int gap_open;
int gap_ext;
int gap_end;
int *matrix;
int row;
int band_width;
} AlnParam;
typedef struct
{
int i, j;
unsigned char ctype;
} path_t;
typedef struct
{
path_t *path; /* for advanced users... :-) */
int path_len; /* for advanced users... :-) */
int start1, end1; /* start and end of the first sequence, coordinations are 1-based */
int start2, end2; /* start and end of the second sequence, coordinations are 1-based */
int score, subo; /* score */
char *out1, *out2; /* print them, and then you will know */
char *outm;
int n_cigar;
uint32_t *cigar32;
} AlnAln;
#ifdef __cplusplus
extern "C" {
#endif
AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap,
int type, int do_align, int len1, int len2);
AlnAln *aln_stdaln(const char *seq1, const char *seq2, const AlnParam *ap, int type, int do_align);
void aln_free_AlnAln(AlnAln *aa);
int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap,
path_t *path, int *path_len);
int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap,
path_t *path, int *path_len, int _thres, int *_subo);
int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap,
path_t *path, int *path_len, int G0, uint8_t *_mem);
uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar);
uint32_t *aln_path2cigar32(const path_t *path, int path_len, int *n_cigar);
#ifdef __cplusplus
}
#endif
/********************
* global variables *
********************/
extern AlnParam aln_param_bwa; /* = { 37, 9, 0, aln_sm_maq, 5, 50 }; */
extern AlnParam aln_param_blast; /* = { 5, 2, 2, aln_sm_blast, 5, 50 }; */
extern AlnParam aln_param_nt2nt; /* = { 10, 2, 2, aln_sm_nt, 16, 75 }; */
extern AlnParam aln_param_aa2aa; /* = { 20, 19, 19, aln_sm_read, 16, 75 }; */
extern AlnParam aln_param_rd2rd; /* = { 12, 2, 2, aln_sm_blosum62, 22, 50 }; */
/* common nucleotide score matrix for 16 bases */
extern int aln_sm_nt[], aln_sm_bwa[];
/* BLOSUM62 and BLOSUM45 */
extern int aln_sm_blosum62[], aln_sm_blosum45[];
/* common read for 16 bases. note that read alignment is quite different from common nucleotide alignment */
extern int aln_sm_read[];
/* human-mouse score matrix for 4 bases */
extern int aln_sm_hs[];
#endif