From 038af2a55132da0b052f8dd7299872d3bc83f0a7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 20 Oct 2014 17:00:31 -0400 Subject: [PATCH] r929: added simplified LAST-like seeding --- bwamem.c | 21 ++++++++++++++++++--- bwamem_extra.c | 2 +- bwt.c | 44 ++++++++++++++++++++++++++++++++------------ bwt.h | 4 ++-- main.c | 2 +- 5 files changed, 54 insertions(+), 19 deletions(-) diff --git a/bwamem.c b/bwamem.c index 08588d5..95478a6 100644 --- a/bwamem.c +++ b/bwamem.c @@ -121,7 +121,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co // first pass: find all SMEMs while (x < len) { if (seq[x] < 4) { - x = bwt_smem1a(bwt, len, seq, x, start_width, split_len, opt->max_mem_intv, &a->mem1, a->tmpv); + x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv); for (i = 0; i < a->mem1.n; ++i) { bwtintv_t *p = &a->mem1.a[i]; int slen = (uint32_t)p->info - (p->info>>32); // seed length @@ -131,7 +131,6 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co } else ++x; } // second pass: find MEMs inside a long SMEM - if (opt->max_mem_intv > 0) goto sort_intv; old_n = a->mem.n; for (k = 0; k < old_n; ++k) { bwtintv_t *p = &a->mem.a[k]; @@ -142,8 +141,24 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info>>32) >= opt->min_seed_len) kv_push(bwtintv_t, a->mem, a->mem1.a[i]); } + // third pass: LAST-like + if (opt->max_mem_intv > 0) { + x = 0; + while (x < len) { + if (seq[x] < 4) { + if (1) { + bwtintv_t m; + x = bwt_seed_strategy1(bwt, len, seq, x, opt->max_mem_intv, &m); + if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m); + } else { // for now, we never come to this block which is slower + x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv); + for (i = 0; i < a->mem1.n; ++i) + kv_push(bwtintv_t, a->mem, a->mem1.a[i]); + } + } else ++x; + } + } // sort -sort_intv: ks_introsort(mem_intv, a->mem.n, a->mem.a); } diff --git a/bwamem_extra.c b/bwamem_extra.c index 09faaa2..02e817a 100644 --- a/bwamem_extra.c +++ b/bwamem_extra.c @@ -65,7 +65,7 @@ const bwtintv_v *smem_next(smem_i *itr) while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases if (itr->start == itr->len) return 0; ori_start = itr->start; - itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_len, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM + itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM return itr->matches; } diff --git a/bwt.c b/bwt.c index 833e37d..a97b60c 100644 --- a/bwt.c +++ b/bwt.c @@ -285,8 +285,8 @@ static void bwt_reverse_intvs(bwtintv_v *p) } } } - -int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int max_len, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) +// NOTE: $max_intv is not currently used in BWA-MEM +int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) { int i, j, c, ret; bwtintv_t ik, ok[4]; @@ -302,10 +302,12 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, ik.info = x + 1; for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search - if (q[i] < 4) { // an A/C/G/T base + if (ik.x[2] < max_intv) { // an interval small enough + kv_push(bwtintv_t, *curr, ik); + break; + } else if (q[i] < 4) { // an A/C/G/T base c = 3 - q[i]; // complement of q[i] bwt_extend(bwt, &ik, ok, 0); - if (ik.x[2] < max_intv && (i - x > max_len || ik.x[2] == 1)) break; if (ok[c].x[2] != ik.x[2]) { // change of the interval size kv_push(bwtintv_t, *curr, ik); if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further @@ -325,13 +327,8 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base for (j = 0, curr->n = 0; j < prev->n; ++j) { bwtintv_t *p = &prev->a[j]; - int max_stop = 0; - bwt_extend(bwt, p, ok, 1); - if (ok[c].x[2] != p->x[2]) { // change of interval size - if (p->info - i - 1 > max_len && p->x[2] < max_intv) - max_stop = 1; - } - if (c < 0 || ok[c].x[2] < min_intv || max_stop) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough + if (c >= 0 && ik.x[2] >= max_intv) bwt_extend(bwt, p, ok, 1); + if (c < 0 || ik.x[2] < max_intv || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches ik = *p; ik.info |= (uint64_t)(i + 1)<<32; @@ -355,7 +352,30 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) { - return bwt_smem1a(bwt, len, q, x, min_intv, INT_MAX, 0, mem, tmpvec); + return bwt_smem1a(bwt, len, q, x, min_intv, 0, mem, tmpvec); +} + +int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem) +{ + int i, c; + bwtintv_t ik, ok[4]; + + memset(mem, 0, sizeof(bwtintv_t)); + if (q[x] > 3) return x + 1; + bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base + for (i = x + 1; i < len; ++i) { // forward search + if (q[i] < 4) { // an A/C/G/T base + c = 3 - q[i]; // complement of q[i] + bwt_extend(bwt, &ik, ok, 0); + if (ok[c].x[2] < max_intv) { + *mem = ok[c]; + mem->info = (uint64_t)x<<32 | (i + 1); + return i + 1; + } + ik = ok[c]; + } else return i + 1; + } + return len; } /************************* diff --git a/bwt.h b/bwt.h index 7ab49bc..afb84d2 100644 --- a/bwt.h +++ b/bwt.h @@ -119,9 +119,9 @@ extern "C" { * Return the end of the longest exact match starting from _x_. */ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); - int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int max_len, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); + int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); - // SMEM iterator interface + int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int max_intv, bwtintv_t *mem); #ifdef __cplusplus } diff --git a/main.c b/main.c index b103798..54f93fe 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r926-dirty" +#define PACKAGE_VERSION "0.7.10-r929-dirty" #endif int bwa_fa2pac(int argc, char *argv[]);