added bidirectional bwt; seems buggy

This commit is contained in:
Heng Li 2011-10-25 00:22:28 -04:00
parent 7b4266a6e5
commit 22c2252e15
6 changed files with 180 additions and 1 deletions

View File

@ -7,7 +7,7 @@ OBJS= QSufSort.o bwt_gen.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o \
is.o bntseq.o bwtmisc.o bwtindex.o stdaln.o simple_dp.o \
bwaseqio.o bwase.o bwape.o kstring.o cs2nt.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o bamlite.o
bwtsw2_chain.o bamlite.o fastmap.o
PROG= bwa
INCLUDES=
LIBS= -lm -lz -lpthread

109
bwt.c
View File

@ -32,6 +32,7 @@
#include <stdint.h>
#include "utils.h"
#include "bwt.h"
#include "kvec.h"
void bwt_gen_cnt_table(bwt_t *bwt)
{
@ -237,3 +238,111 @@ int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t
*k0 = k; *l0 = l;
return l - k + 1;
}
/*********************
* Bidirectional BWT *
*********************/
void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back)
{
bwtint_t tk[4], tl[4];
int i;
bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl);
for (i = 0; i != 4; ++i) {
ok[i].x[!is_back] = bwt->L2[i] + tk[i] + 1;
ok[i].x[2] = (tl[i] -= tk[i]);
}
ok[3].x[is_back] = ik->x[is_back];
ok[2].x[is_back] = ok[3].x[is_back] + tl[3];
ok[1].x[is_back] = ok[2].x[is_back] + tl[2];
ok[0].x[is_back] = ok[1].x[is_back] + tl[1];
}
static void bwt_reverse_intvs(bwtintv_v *p)
{
if (p->n > 1) {
int j;
for (j = 0; j < p->n>>1; ++j) {
bwtintv_t tmp = p->a[p->n - 1 - j];
p->a[p->n - 1 - j] = p->a[j];
p->a[j] = tmp;
}
}
}
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2])
{
int i, j, c, ret;
bwtintv_t ik, ok[4];
bwtintv_v a[2], *prev, *curr, *swap;
mem->n = 0;
if (q[x] > 3) return x + 1;
kv_init(a[0]); kv_init(a[1]);
prev = tmpvec[0]? tmpvec[0] : &a[0];
curr = tmpvec[1]? tmpvec[1] : &a[1];
bwt_set_intv(bwt, q[x], ik);
ik.info = x + 1;
for (i = x + 1; i < len; ++i) { // forward search
if (q[i] > 3) break;
c = 3 - q[i];
bwt_extend(bwt, &ik, ok, 0);
if (ok[c].x[2] != ik.x[2]) // change of the interval size
kv_push(bwtintv_t, *curr, ik);
if (ok[c].x[2] == 0) break; // cannot be extended
ik = ok[c]; ik.info = i + 1;
}
if (i == len) kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end
bwt_reverse_intvs(curr); // s.t. smaller intervals visited first
ret = curr->a[0].info; // this will be the returned value
swap = curr; curr = prev; prev = swap;
for (i = x - 1; i >= -1; --i) { // backward search for MEMs
if (q[i] > 3) break;
c = i < 0? 0 : q[i];
for (j = 0, curr->n = 0; j < prev->n; ++j) {
bwtintv_t *p = &prev->a[j];
bwt_extend(bwt, p, ok, 1);
if (ok[c].x[2] == 0 || i == -1) { // keep the hit if reaching the beginning or not extended further
if (curr->n == 0) { // curr->n to make sure there is no longer matches
if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
ik = *p; ik.info |= (uint64_t)(i + 1)<<32;
kv_push(bwtintv_t, *mem, ik);
}
} // otherwise the match is contained in another longer match
}
if (ok[c].x[2] && (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2])) {
ok[c].info = p->info;
kv_push(bwtintv_t, *curr, ok[c]);
}
}
if (curr->n == 0) break;
swap = curr; curr = prev; prev = swap;
}
bwt_reverse_intvs(mem); // s.t. sorted by the start coordinate
if (tmpvec[0] == 0) free(a[0].a);
if (tmpvec[1] == 0) free(a[1].a);
return ret;
}
int bwt_smem(const bwt_t *bwt, int len, const uint8_t *q, bwtintv_v *mem, bwtintv_v *tmpvec[3])
{
int x = 0, i;
bwtintv_v a[3], *tvec[2], *mem1;
kv_init(a[0]); kv_init(a[1]); kv_init(a[2]); // no memory allocation here
tvec[0] = tmpvec[0]? tmpvec[0] : &a[0];
tvec[1] = tmpvec[1]? tmpvec[1] : &a[1];
mem1 = tmpvec[2]? tmpvec[2] : &a[2];
mem->n = 0;
do {
x = bwt_smem1(bwt, len, q, x, mem1, tvec);
for (i = 0; i < mem1->n; ++i)
kv_push(bwtintv_t, *mem, mem1->a[i]);
} while (x < len);
if (tmpvec[0] == 0) free(a[0].a);
if (tmpvec[1] == 0) free(a[1].a);
if (tmpvec[2] == 0) free(a[2].a);
return mem->n;
}

11
bwt.h
View File

@ -54,6 +54,12 @@ typedef struct {
bwtint_t *sa;
} bwt_t;
typedef struct {
bwtint_t x[3], info;
} bwtintv_t;
typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v;
/* For general OCC_INTERVAL, the following is correct:
#define bwt_bwt(b, k) ((b)->bwt[(k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) + sizeof(bwtint_t)/4*4 + (k)%OCC_INTERVAL/16])
#define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4)
@ -75,6 +81,8 @@ typedef struct {
(bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k)) \
: (bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1)))
#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)], (ik).x[2] = (bwt)->L2[(int)(c)+1] - (bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)], (ik).info = 0)
#ifdef __cplusplus
extern "C" {
#endif
@ -104,6 +112,9 @@ extern "C" {
int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end);
int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0);
void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back);
int bwt_smem(const bwt_t *bwt, int len, const uint8_t *q, bwtintv_v *mem, bwtintv_v *tmpvec[3]);
#ifdef __cplusplus
}
#endif

56
fastmap.c 100644
View File

@ -0,0 +1,56 @@
#include <zlib.h>
#include <unistd.h>
#include <stdio.h>
#include "bwt.h"
#include "kvec.h"
#include "kseq.h"
KSEQ_INIT(gzFile, gzread)
extern unsigned char nst_nt4_table[256];
int main_fastmap(int argc, char *argv[])
{
int c, i;
kseq_t *seq;
gzFile fp;
bwt_t *bwt;
bwtintv_v a[3], mem, *tvec[3];
while ((c = getopt(argc, argv, "")) >= 0) {
switch (c) {
}
}
if (optind + 1 >= argc) {
fprintf(stderr, "bwa fastmap <idxbase> <in.fq>\n");
return 1;
}
fp = gzopen(argv[optind + 1], "r");
seq = kseq_init(fp);
{ // load the BWT
char *tmp = calloc(strlen(argv[optind]) + 5, 1);
strcat(strcpy(tmp, argv[optind]), ".bwt");
bwt = bwt_restore_bwt(tmp);
free(tmp);
}
for (i = 0; i < 3; ++i) {
kv_init(a[i]);
tvec[i] = &a[i];
}
kv_init(mem);
while (kseq_read(seq) >= 0) {
for (i = 0; i < seq->seq.l; ++i)
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
bwt_smem(bwt, seq->seq.l, (uint8_t*)seq->seq.s, &mem, tvec);
printf(">%s\t%ld\n", seq->name.s, mem.n);
for (i = 0; i < mem.n; ++i) {
bwtintv_t *p = &mem.a[i];
printf("%d\t%d\t%ld\n", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]);
}
puts("//");
}
free(mem.a);
for (i = 0; i < 3; ++i) free(a[i].a);
bwt_destroy(bwt);
kseq_destroy(seq);
gzclose(fp);
return 0;
}

1
main.c
View File

@ -54,6 +54,7 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "bwtsw2") == 0) return bwa_bwtsw2(argc-1, argv+1);
else if (strcmp(argv[1], "dbwtsw") == 0) return bwa_bwtsw2(argc-1, argv+1);
else if (strcmp(argv[1], "bwasw") == 0) return bwa_bwtsw2(argc-1, argv+1);
else if (strcmp(argv[1], "fastmap") == 0) return main_fastmap(argc-1, argv+1);
else {
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
return 1;

2
main.h
View File

@ -21,6 +21,8 @@ extern "C" {
int bwa_bwtsw2(int argc, char *argv[]);
int main_fastmap(int argc, char *argv[]);
#ifdef __cplusplus
}
#endif