diff --git a/bwt.c b/bwt.c index 689d8f8..fe8007f 100644 --- a/bwt.c +++ b/bwt.c @@ -337,3 +337,44 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a); return ret; } + +/*************************** + * SMEM iterator interface * + ***************************/ + +smem_i *smem_itr_init(const bwt_t *bwt) +{ + smem_i *itr; + itr = calloc(1, sizeof(smem_i)); + itr->bwt = bwt; + itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); + itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); + itr->matches = calloc(1, sizeof(bwtintv_v)); + return itr; +} + +void smem_itr_destroy(smem_i *itr) +{ + free(itr->tmpvec[0]->a); + free(itr->tmpvec[1]->a); + free(itr->matches->a); + free(itr); +} + +void smem_set_query(smem_i *itr, int min_intv, int len, const uint8_t *query) +{ + itr->query = query; + itr->start = 0; + itr->len = len; + itr->min_intv = min_intv; +} + +int smem_next(smem_i *itr) +{ + itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = 0; + if (itr->start >= itr->len || itr->start < 0) return -1; + while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases + if (itr->start == itr->len) return -1; + itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, itr->min_intv, itr->matches, itr->tmpvec); + return itr->start; +} diff --git a/bwt.h b/bwt.h index 1eeaceb..67a256d 100644 --- a/bwt.h +++ b/bwt.h @@ -60,6 +60,13 @@ typedef struct { typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v; +typedef struct { + const bwt_t *bwt; + const uint8_t *query; + int start, len, min_intv; + bwtintv_v *tmpvec[2], *matches; +} smem_i; + /* For general OCC_INTERVAL, the following is correct: #define bwt_bwt(b, k) ((b)->bwt[(k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) + sizeof(bwtint_t)/4*4 + (k)%OCC_INTERVAL/16]) #define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) @@ -123,6 +130,13 @@ extern "C" { */ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); + // SMEM iterator interface + + smem_i *smem_itr_init(const bwt_t *bwt); + void smem_itr_destroy(smem_i *itr); + void smem_set_query(smem_i *itr, int min_intv, int len, const uint8_t *query); + int smem_next(smem_i *itr); + #ifdef __cplusplus } #endif diff --git a/fastmap.c b/fastmap.c index 2c0a823..17db06d 100644 --- a/fastmap.c +++ b/fastmap.c @@ -10,50 +10,6 @@ KSEQ_INIT(gzFile, gzread) extern unsigned char nst_nt4_table[256]; -typedef struct { - const bwt_t *bwt; - const uint8_t *query; - int start, len, min_intv; - bwtintv_v *tmpvec[2], *matches; -} smem_i; - -smem_i *smem_iter_init(const bwt_t *bwt, int min_intv) -{ - smem_i *iter; - iter = calloc(1, sizeof(smem_i)); - iter->bwt = bwt; - iter->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); - iter->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); - iter->matches = calloc(1, sizeof(bwtintv_v)); - iter->min_intv = min_intv > 0? min_intv : 1; - return iter; -} - -void smem_iter_destroy(smem_i *iter) -{ - free(iter->tmpvec[0]->a); - free(iter->tmpvec[1]->a); - free(iter->matches->a); - free(iter); -} - -void smem_set_query(smem_i *iter, int len, const uint8_t *query) -{ - iter->query = query; - iter->start = 0; - iter->len = len; -} - -int smem_next(smem_i *iter) -{ - iter->tmpvec[0]->n = iter->tmpvec[1]->n = iter->matches->n = 0; - if (iter->start >= iter->len || iter->start < 0) return -1; - while (iter->start < iter->len && iter->query[iter->start] > 3) ++iter->start; // skip ambiguous bases - if (iter->start == iter->len) return -1; - iter->start = bwt_smem1(iter->bwt, iter->len, iter->query, iter->start, iter->min_intv, iter->matches, iter->tmpvec); - return iter->start; -} - int main_fastmap(int argc, char *argv[]) { int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1; @@ -62,7 +18,7 @@ int main_fastmap(int argc, char *argv[]) gzFile fp; bwt_t *bwt; bntseq_t *bns; - smem_i *iter; + smem_i *itr; while ((c = getopt(argc, argv, "w:l:sm:")) >= 0) { switch (c) { @@ -88,7 +44,7 @@ int main_fastmap(int argc, char *argv[]) free(tmp); bns = bns_restore(argv[optind]); } - iter = smem_iter_init(bwt, min_intv); + itr = smem_itr_init(bwt); while (kseq_read(seq) >= 0) { printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); if (print_seq) { @@ -97,10 +53,10 @@ int main_fastmap(int argc, char *argv[]) } else putchar('\n'); for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; - smem_set_query(iter, seq->seq.l, (uint8_t*)seq->seq.s); - while (smem_next(iter) > 0) { - for (i = 0; i < iter->matches->n; ++i) { - bwtintv_t *p = &iter->matches->a[i]; + smem_set_query(itr, min_intv, seq->seq.l, (uint8_t*)seq->seq.s); + while (smem_next(itr) > 0) { + for (i = 0; i < itr->matches->n; ++i) { + bwtintv_t *p = &itr->matches->a[i]; if ((uint32_t)p->info - (p->info>>32) < min_len) continue; printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); if (p->x[2] <= min_iwidth) { @@ -120,7 +76,7 @@ int main_fastmap(int argc, char *argv[]) puts("//"); } - smem_iter_destroy(iter); + smem_itr_destroy(itr); bns_destroy(bns); bwt_destroy(bwt); kseq_destroy(seq);