From 6641788d38ef874d09f8a88a3eac3d9df5ea2aa3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 31 Jan 2013 11:42:31 -0500 Subject: [PATCH] preparation for further changes --- bwt.c | 27 ++++++++++++++------------- bwt.h | 2 +- fastmap.c | 16 +++++++++------- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/bwt.c b/bwt.c index 966b718..e46c125 100644 --- a/bwt.c +++ b/bwt.c @@ -277,7 +277,7 @@ static void bwt_reverse_intvs(bwtintv_v *p) } } -int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]) +int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) { int i, j, c, ret; bwtintv_t ik, ok[4]; @@ -285,37 +285,38 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem mem->n = 0; if (q[x] > 3) return x + 1; + if (min_intv < 1) min_intv = 1; // the interval size should be at least 1 kv_init(a[0]); kv_init(a[1]); - prev = tmpvec[0]? tmpvec[0] : &a[0]; - curr = tmpvec[1]? tmpvec[1] : &a[1]; - bwt_set_intv(bwt, q[x], ik); + prev = tmpvec && tmpvec[0]? tmpvec[0] : &a[0]; // use the temporary vector if provided + curr = tmpvec && tmpvec[1]? tmpvec[1] : &a[1]; + bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base ik.info = x + 1; for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search - if (q[i] < 4) { - c = 3 - q[i]; + if (q[i] < 4) { // an A/C/G/T base + c = 3 - q[i]; // complement of q[i] bwt_extend(bwt, &ik, ok, 0); if (ok[c].x[2] != ik.x[2]) // change of the interval size kv_push(bwtintv_t, *curr, ik); - if (ok[c].x[2] == 0) break; // cannot be extended + if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further ik = ok[c]; ik.info = i + 1; } else { // an ambiguous base kv_push(bwtintv_t, *curr, ik); - break; // cannot be extended; in this case, ia[0].info; // this will be the returned value swap = curr; curr = prev; prev = swap; for (i = x - 1; i >= -1; --i) { // backward search for MEMs - if (q[i] > 3) break; + if (i >= 0 && q[i] > 3) break; // always stop at an ambiguous base as the FM-index does not have any. c = i < 0? 0 : q[i]; for (j = 0, curr->n = 0; j < prev->n; ++j) { bwtintv_t *p = &prev->a[j]; bwt_extend(bwt, p, ok, 1); - if (ok[c].x[2] == 0 || i == -1) { // keep the hit if reaching the beginning or not extended further + if (ok[c].x[2] < min_intv || i == -1) { // keep the hit if reaching the beginning or not extended further if (curr->n == 0) { // curr->n to make sure there is no longer matches if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches ik = *p; ik.info |= (uint64_t)(i + 1)<<32; @@ -333,7 +334,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem } bwt_reverse_intvs(mem); // s.t. sorted by the start coordinate - if (tmpvec[0] == 0) free(a[0].a); - if (tmpvec[1] == 0) free(a[1].a); + if (tmpvec == 0 || tmpvec[0] == 0) free(a[0].a); + if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a); return ret; } diff --git a/bwt.h b/bwt.h index 5823f82..1eeaceb 100644 --- a/bwt.h +++ b/bwt.h @@ -121,7 +121,7 @@ extern "C" { * Given a query _q_, collect potential SMEMs covering position _x_ and store them in _mem_. * Return the end of the longest exact match starting from _x_. */ - int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]); + int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); #ifdef __cplusplus } diff --git a/fastmap.c b/fastmap.c index 4d7a675..2c0a823 100644 --- a/fastmap.c +++ b/fastmap.c @@ -13,11 +13,11 @@ extern unsigned char nst_nt4_table[256]; typedef struct { const bwt_t *bwt; const uint8_t *query; - int start, len; + int start, len, min_intv; bwtintv_v *tmpvec[2], *matches; } smem_i; -smem_i *smem_iter_init(const bwt_t *bwt) +smem_i *smem_iter_init(const bwt_t *bwt, int min_intv) { smem_i *iter; iter = calloc(1, sizeof(smem_i)); @@ -25,6 +25,7 @@ smem_i *smem_iter_init(const bwt_t *bwt) iter->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); iter->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); iter->matches = calloc(1, sizeof(bwtintv_v)); + iter->min_intv = min_intv > 0? min_intv : 1; return iter; } @@ -49,13 +50,13 @@ int smem_next(smem_i *iter) if (iter->start >= iter->len || iter->start < 0) return -1; while (iter->start < iter->len && iter->query[iter->start] > 3) ++iter->start; // skip ambiguous bases if (iter->start == iter->len) return -1; - iter->start = bwt_smem1(iter->bwt, iter->len, iter->query, iter->start, iter->matches, iter->tmpvec); + iter->start = bwt_smem1(iter->bwt, iter->len, iter->query, iter->start, iter->min_intv, iter->matches, iter->tmpvec); return iter->start; } int main_fastmap(int argc, char *argv[]) { - int c, i, min_iwidth = 20, min_len = 17, print_seq = 0; + int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1; kseq_t *seq; bwtint_t k; gzFile fp; @@ -63,15 +64,16 @@ int main_fastmap(int argc, char *argv[]) bntseq_t *bns; smem_i *iter; - while ((c = getopt(argc, argv, "w:l:s")) >= 0) { + while ((c = getopt(argc, argv, "w:l:sm:")) >= 0) { switch (c) { case 's': print_seq = 1; break; case 'w': min_iwidth = atoi(optarg); break; case 'l': min_len = atoi(optarg); break; + case 'm': min_intv = atoi(optarg); break; } } if (optind + 1 >= argc) { - fprintf(stderr, "Usage: bwa fastmap [-s] [-l minLen=%d] [-w maxSaSize=%d] \n", min_len, min_iwidth); + fprintf(stderr, "Usage: bwa fastmap [-s] [-l minLen=%d] [-w maxSaSize=%d] [-m minIntv=%d] \n", min_len, min_iwidth, min_intv); return 1; } @@ -86,7 +88,7 @@ int main_fastmap(int argc, char *argv[]) free(tmp); bns = bns_restore(argv[optind]); } - iter = smem_iter_init(bwt); + iter = smem_iter_init(bwt, min_intv); while (kseq_read(seq) >= 0) { printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); if (print_seq) {