From 22c2252e156b089bae0b154d51b1db777b1be34a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 25 Oct 2011 00:22:28 -0400 Subject: [PATCH] added bidirectional bwt; seems buggy --- Makefile | 2 +- bwt.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ bwt.h | 11 ++++++ fastmap.c | 56 ++++++++++++++++++++++++++++ main.c | 1 + main.h | 2 + 6 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 fastmap.c diff --git a/Makefile b/Makefile index c9588f2..2035050 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ OBJS= QSufSort.o bwt_gen.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o \ is.o bntseq.o bwtmisc.o bwtindex.o stdaln.o simple_dp.o \ bwaseqio.o bwase.o bwape.o kstring.o cs2nt.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ - bwtsw2_chain.o bamlite.o + bwtsw2_chain.o bamlite.o fastmap.o PROG= bwa INCLUDES= LIBS= -lm -lz -lpthread diff --git a/bwt.c b/bwt.c index 038a8a3..e7da788 100644 --- a/bwt.c +++ b/bwt.c @@ -32,6 +32,7 @@ #include #include "utils.h" #include "bwt.h" +#include "kvec.h" void bwt_gen_cnt_table(bwt_t *bwt) { @@ -237,3 +238,111 @@ int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0 = k; *l0 = l; return l - k + 1; } + +/********************* + * Bidirectional BWT * + *********************/ + +void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back) +{ + bwtint_t tk[4], tl[4]; + int i; + bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); + for (i = 0; i != 4; ++i) { + ok[i].x[!is_back] = bwt->L2[i] + tk[i] + 1; + ok[i].x[2] = (tl[i] -= tk[i]); + } + ok[3].x[is_back] = ik->x[is_back]; + ok[2].x[is_back] = ok[3].x[is_back] + tl[3]; + ok[1].x[is_back] = ok[2].x[is_back] + tl[2]; + ok[0].x[is_back] = ok[1].x[is_back] + tl[1]; +} + +static void bwt_reverse_intvs(bwtintv_v *p) +{ + if (p->n > 1) { + int j; + for (j = 0; j < p->n>>1; ++j) { + bwtintv_t tmp = p->a[p->n - 1 - j]; + p->a[p->n - 1 - j] = p->a[j]; + p->a[j] = tmp; + } + } +} + +int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]) +{ + int i, j, c, ret; + bwtintv_t ik, ok[4]; + bwtintv_v a[2], *prev, *curr, *swap; + + mem->n = 0; + if (q[x] > 3) return x + 1; + kv_init(a[0]); kv_init(a[1]); + prev = tmpvec[0]? tmpvec[0] : &a[0]; + curr = tmpvec[1]? tmpvec[1] : &a[1]; + bwt_set_intv(bwt, q[x], ik); + + ik.info = x + 1; + for (i = x + 1; i < len; ++i) { // forward search + if (q[i] > 3) break; + c = 3 - q[i]; + bwt_extend(bwt, &ik, ok, 0); + if (ok[c].x[2] != ik.x[2]) // change of the interval size + kv_push(bwtintv_t, *curr, ik); + if (ok[c].x[2] == 0) break; // cannot be extended + ik = ok[c]; ik.info = i + 1; + } + if (i == len) kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end + bwt_reverse_intvs(curr); // s.t. smaller intervals visited first + ret = curr->a[0].info; // this will be the returned value + swap = curr; curr = prev; prev = swap; + + for (i = x - 1; i >= -1; --i) { // backward search for MEMs + if (q[i] > 3) break; + c = i < 0? 0 : q[i]; + for (j = 0, curr->n = 0; j < prev->n; ++j) { + bwtintv_t *p = &prev->a[j]; + bwt_extend(bwt, p, ok, 1); + if (ok[c].x[2] == 0 || i == -1) { // keep the hit if reaching the beginning or not extended further + if (curr->n == 0) { // curr->n to make sure there is no longer matches + if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches + ik = *p; ik.info |= (uint64_t)(i + 1)<<32; + kv_push(bwtintv_t, *mem, ik); + } + } // otherwise the match is contained in another longer match + } + if (ok[c].x[2] && (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2])) { + ok[c].info = p->info; + kv_push(bwtintv_t, *curr, ok[c]); + } + } + if (curr->n == 0) break; + swap = curr; curr = prev; prev = swap; + } + bwt_reverse_intvs(mem); // s.t. sorted by the start coordinate + + if (tmpvec[0] == 0) free(a[0].a); + if (tmpvec[1] == 0) free(a[1].a); + return ret; +} + +int bwt_smem(const bwt_t *bwt, int len, const uint8_t *q, bwtintv_v *mem, bwtintv_v *tmpvec[3]) +{ + int x = 0, i; + bwtintv_v a[3], *tvec[2], *mem1; + kv_init(a[0]); kv_init(a[1]); kv_init(a[2]); // no memory allocation here + tvec[0] = tmpvec[0]? tmpvec[0] : &a[0]; + tvec[1] = tmpvec[1]? tmpvec[1] : &a[1]; + mem1 = tmpvec[2]? tmpvec[2] : &a[2]; + mem->n = 0; + do { + x = bwt_smem1(bwt, len, q, x, mem1, tvec); + for (i = 0; i < mem1->n; ++i) + kv_push(bwtintv_t, *mem, mem1->a[i]); + } while (x < len); + if (tmpvec[0] == 0) free(a[0].a); + if (tmpvec[1] == 0) free(a[1].a); + if (tmpvec[2] == 0) free(a[2].a); + return mem->n; +} diff --git a/bwt.h b/bwt.h index 2f12be6..e635b13 100644 --- a/bwt.h +++ b/bwt.h @@ -54,6 +54,12 @@ typedef struct { bwtint_t *sa; } bwt_t; +typedef struct { + bwtint_t x[3], info; +} bwtintv_t; + +typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v; + /* For general OCC_INTERVAL, the following is correct: #define bwt_bwt(b, k) ((b)->bwt[(k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) + sizeof(bwtint_t)/4*4 + (k)%OCC_INTERVAL/16]) #define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) @@ -75,6 +81,8 @@ typedef struct { (bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k)) \ : (bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1))) +#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)], (ik).x[2] = (bwt)->L2[(int)(c)+1] - (bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)], (ik).info = 0) + #ifdef __cplusplus extern "C" { #endif @@ -104,6 +112,9 @@ extern "C" { int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end); int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0); + void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back); + int bwt_smem(const bwt_t *bwt, int len, const uint8_t *q, bwtintv_v *mem, bwtintv_v *tmpvec[3]); + #ifdef __cplusplus } #endif diff --git a/fastmap.c b/fastmap.c new file mode 100644 index 0000000..ee8e9b4 --- /dev/null +++ b/fastmap.c @@ -0,0 +1,56 @@ +#include +#include +#include +#include "bwt.h" +#include "kvec.h" +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +extern unsigned char nst_nt4_table[256]; + +int main_fastmap(int argc, char *argv[]) +{ + int c, i; + kseq_t *seq; + gzFile fp; + bwt_t *bwt; + bwtintv_v a[3], mem, *tvec[3]; + while ((c = getopt(argc, argv, "")) >= 0) { + switch (c) { + } + } + if (optind + 1 >= argc) { + fprintf(stderr, "bwa fastmap \n"); + return 1; + } + fp = gzopen(argv[optind + 1], "r"); + seq = kseq_init(fp); + { // load the BWT + char *tmp = calloc(strlen(argv[optind]) + 5, 1); + strcat(strcpy(tmp, argv[optind]), ".bwt"); + bwt = bwt_restore_bwt(tmp); + free(tmp); + } + for (i = 0; i < 3; ++i) { + kv_init(a[i]); + tvec[i] = &a[i]; + } + kv_init(mem); + while (kseq_read(seq) >= 0) { + for (i = 0; i < seq->seq.l; ++i) + seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; + bwt_smem(bwt, seq->seq.l, (uint8_t*)seq->seq.s, &mem, tvec); + printf(">%s\t%ld\n", seq->name.s, mem.n); + for (i = 0; i < mem.n; ++i) { + bwtintv_t *p = &mem.a[i]; + printf("%d\t%d\t%ld\n", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); + } + puts("//"); + } + free(mem.a); + for (i = 0; i < 3; ++i) free(a[i].a); + bwt_destroy(bwt); + kseq_destroy(seq); + gzclose(fp); + return 0; +} diff --git a/main.c b/main.c index 8d663b6..11de84e 100644 --- a/main.c +++ b/main.c @@ -54,6 +54,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "bwtsw2") == 0) return bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "dbwtsw") == 0) return bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "bwasw") == 0) return bwa_bwtsw2(argc-1, argv+1); + else if (strcmp(argv[1], "fastmap") == 0) return main_fastmap(argc-1, argv+1); else { fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; diff --git a/main.h b/main.h index 15ec189..026a80b 100644 --- a/main.h +++ b/main.h @@ -21,6 +21,8 @@ extern "C" { int bwa_bwtsw2(int argc, char *argv[]); + int main_fastmap(int argc, char *argv[]); + #ifdef __cplusplus } #endif