diff --git a/bwamem.c b/bwamem.c index e48f89b..a4c8944 100644 --- a/bwamem.c +++ b/bwamem.c @@ -154,7 +154,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn // fprintf(stderr, "\n"); // first pass: find all SMEMs - fprintf(fp1, "seq: %ld\n", dn++); + //fprintf(fp1, "seq: %ld\n", dn++); //fprintf(stderr, "seq: %ld\n", dn++); //dn ++; while (x < len) { @@ -162,7 +162,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn #ifdef SHOW_PERF int64_t tmp_time = realtime_msec(); #endif - // x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv); + //x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv); x = fmt_smem(fmt, len, seq, x, start_width, &a->mem1, a->tmpv); #ifdef SHOW_PERF tmp_time = realtime_msec() - tmp_time; @@ -170,9 +170,14 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn #endif for (i = 0; i < a->mem1.n; ++i) { bwtintv_t *p = &a->mem1.a[i]; - fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); + //fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); //fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); int slen = (uint32_t)p->info - (p->info >> 32); // seed length + // if (slen < 16) ++n16; + // if (slen < 17) ++n17; + // if (slen < 18) ++n18; + // if (slen < 19) ++n19; + // ++nall; max_seed_len = fmax(max_seed_len, slen); if (slen >= opt->min_seed_len) kv_push(bwtintv_t, a->mem, *p); @@ -181,7 +186,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn } // second pass: find MEMs inside a long SMEM //if (max_seed_len == len) - goto collect_intv_end; + // goto collect_intv_end; old_n = a->mem.n; for (k = 0; k < old_n; ++k) { @@ -191,15 +196,29 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn #ifdef SHOW_PERF int64_t tmp_time = realtime_msec(); #endif - bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv); + // bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv); + fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, &a->mem1, a->tmpv); #ifdef SHOW_PERF tmp_time = realtime_msec() - tmp_time; __sync_fetch_and_add(&time_bwt_smem1a, tmp_time); #endif - for (i = 0; i < a->mem1.n; ++i) - if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info>>32) >= opt->min_seed_len) + for (i = 0; i < a->mem1.n; ++i) { + bwtintv_t *p = &a->mem1.a[i]; + //fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); + //fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); + int slen = (uint32_t)p->info - (p->info >> 32); + if (slen < 16) ++n16; + if (slen < 17) ++n17; + if (slen < 18) ++n18; + if (slen < 19) ++n19; + ++nall; + if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info >> 32) >= opt->min_seed_len) kv_push(bwtintv_t, a->mem, a->mem1.a[i]); + } } + + // goto collect_intv_end; + // third pass: LAST-like if (opt->max_mem_intv > 0) { x = 0; @@ -207,17 +226,17 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn if (seq[x] < 4) { if (1) { bwtintv_t m; - x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m); - if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m); - } else { // for now, we never come to this block which is slower #ifdef SHOW_PERF int64_t tmp_time = realtime_msec(); #endif - x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv); + x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m); #ifdef SHOW_PERF tmp_time = realtime_msec() - tmp_time; __sync_fetch_and_add(&time_bwt_smem1a, tmp_time); #endif + if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m); + } else { // for now, we never come to this block which is slower + x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv); for (i = 0; i < a->mem1.n; ++i) kv_push(bwtintv_t, a->mem, a->mem1.a[i]); } diff --git a/bwt.c b/bwt.c index be958b0..e53fe03 100644 --- a/bwt.c +++ b/bwt.c @@ -315,7 +315,7 @@ int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back) { #ifdef SHOW_PERF -#if 1 +#if 0 int64_t tmp_time = realtime_msec(); #endif #endif @@ -331,7 +331,7 @@ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_b ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2]; ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2]; #ifdef SHOW_PERF -#if 1 +#if 0 tmp_time = realtime_msec() - tmp_time; __sync_fetch_and_add(&time_bwt_extend, tmp_time); #endif diff --git a/fastmap.c b/fastmap.c index b1c36da..503ca15 100644 --- a/fastmap.c +++ b/fastmap.c @@ -54,7 +54,7 @@ int64_t time_ksw_extend2 = 0, time_bwt_sa = 0, time_bwt_sa_read = 0; -int64_t dn = 0; +int64_t dn = 0, n16 = 0, n17 = 0, n18 = 0, n19 = 0, nall = 0; FILE *fp1; @@ -437,6 +437,11 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, "time_bwt_sa: %f s\n", time_bwt_sa / 1000.0 / opt->n_threads); fprintf(stderr, "time_ksw_extend2: %f s\n", time_ksw_extend2 / 1000.0 / opt->n_threads); fprintf(stderr, "time_bwt_sa_read: %f s\n", time_bwt_sa_read / 1000.0 / opt->n_threads); + fprintf(stderr, "all seed num: %ld\n", nall); + fprintf(stderr, "all seed num litter than 19: %ld\n", n19); + fprintf(stderr, "all seed num litter than 18: %ld\n", n18); + fprintf(stderr, "all seed num litter than 17: %ld\n", n17); + fprintf(stderr, "all seed num litter than 16: %ld\n", n16); fprintf(stderr, "\n"); fclose(fp1); diff --git a/fmt_idx.c b/fmt_idx.c index 739ee82..078bc44 100644 --- a/fmt_idx.c +++ b/fmt_idx.c @@ -343,7 +343,7 @@ FMTIndex *create_fmt_from_bwt(bwt_t *bwt) } // 扩展两个个碱基,计算bwt base为b的pre-bwt str中各个碱基的occ -void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4]) +inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4]) { uint32_t x = 0; uint32_t *p, *q, tmp; @@ -616,7 +616,7 @@ inline static uint32_t build_backward_kmer(const uint8_t *q, int start_pos, int qbit |= q[i] << ((HASH_KMER_LEN - 1 - j) << 1); } *base_consumed = start_pos - i; - return qbit; + return (~qbit) & ((1 << (HASH_KMER_LEN << 1)) - 1); } // 当x为0,或者q[x-1]为N时,只需要前向搜索即可 @@ -645,7 +645,7 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, bwti // 继续向前扩展 for (i = x + kmer_len; i + 1 < len; i += 2) { - //if (ik.x[2] < 5) + //if (ik.x[2] < 2) // goto fmt_smem_forward_end; if (q[i] < 4 && q[i + 1] < 4) // 两个都可以扩展 { @@ -692,6 +692,7 @@ fmt_smem_forward_end: #endif #endif return ret; + //return len; } // 找smem(seed) @@ -699,13 +700,12 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv { int i, j, ret, kmer_len; bwtintv_t ik, ok1, ok2; - // bwtintv_t tik, tok1, tok2; bwtintv_v a[1], *curr; uint32_t qbit = 0; mem->n = 0; if (q[x] > 3) return x + 1; - //if (x == 0 || q[x-1] > 3) return fmt_smem_forward(fmt, len, q, x, mem); // 只用向前扩展 + // if (x == 0 || q[x-1] > 3) return fmt_smem_forward(fmt, len, q, x, mem); // 只用向前扩展 if (min_intv < 1) min_intv = 1; // the interval size should be at least 1 kv_init(a[0]); @@ -715,19 +715,14 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv fmt_kmer_get(fmt, &ik, qbit, 0); // 初始碱基位置 ik.info = x + 1; - //fmt_set_intv(fmt, q[x], tik); - //tik.info = x + 1; - // check change of the interval size and whether the interval size is too small to be extended further #define CHECK_INTV_CHANGE(iv, ov, end_pos) \ - if (ov.x[2] != iv.x[2]) { kv_push(bwtintv_t, *curr, iv); if (ov.x[2] < min_intv) break; } iv = ov; iv.info = end_pos + if (ov.x[2] != iv.x[2]) { kv_push(bwtintv_t, *curr, iv); if (ov.x[2] < min_intv) break; } iv = ov; iv.info = end_pos #define PUSH_VAL_AND_SKIP(iv) \ do { kv_push(bwtintv_t, *curr, iv); goto backward_search; } while(0) // 处理kmer对应的匹配信息 for (j = 1, curr->n = 0; j < kmer_len; ++j) { - //fmt_extend1(fmt, &tik, &tok1, 0, 3 - q[x + j]); - //tik = tok1; fmt_kmer_get(fmt, &ok1, qbit, j); CHECK_INTV_CHANGE(ik, ok1, x + j + 1); } @@ -771,69 +766,59 @@ backward_search: ret = curr->a[0].info; // this will be the returned value,扩展到的最远的位置 // 按照种子进行遍历,反向扩展 -#define CHECK_PUT_MEM(ok, pos, intv) \ - if (ok.x[2] < min_intv) { \ - if (mem->n == 0 || (pos) < mem->a[mem->n - 1].info >> 32) { \ - (intv).info |= (uint64_t)(pos) << 32; \ - kv_push(bwtintv_t, *mem, intv); \ - } \ - break; } +#define CHECK_ADD_MEM(pos, intv, mem) \ + if (mem->n == 0 || (pos) < mem->a[mem->n - 1].info >> 32) { (intv).info |= (uint64_t)(pos) << 32; kv_push(bwtintv_t, *mem, (intv)); } + +#define CHECK_INTV_ADD_MEM(ok, pos, intv, mem) \ + if (ok.x[2] < min_intv) { CHECK_ADD_MEM(pos, intv, mem); break; } for (j = 0; j < curr->n; ++j) { bwtintv_t *p = &curr->a[j]; // 前向扩展的种子 - //if (p->info - x < HASH_KMER_LEN) { - // // 创建反向kmer - // uint32_t qbit = build_backward_kmer(q, p->info - 1, &kmer_len); - // fmt_kmer_get(fmt, &ik, qbit, kmer_len - 1); - //} - // for (i = p->info - kmer_len; i > 0 i -= 2) - - for (i = x - 1; i > 0; i -= 2) + if (p->info - x < HASH_KMER_LEN) { + uint32_t qbit = build_backward_kmer(q, p->info - 1, &kmer_len); // 创建反向kmer + i = 1; + do { fmt_kmer_get(fmt, &ik, qbit, kmer_len - i++); } while (ik.x[2] < min_intv); + p->x[0] = ik.x[1]; p->x[1] = ik.x[0]; p->x[2] = ik.x[2]; + i = p->info - (kmer_len - i + 3); + } else { + i = x - 1; + } + for (; i > 0; i -= 2) { if (q[i] < 4 && q[i - 1] < 4) // 两个都可以扩展 { fmt_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]); - CHECK_PUT_MEM(ok1, i + 1, *p); + CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem); ok1.info = p->info; - CHECK_PUT_MEM(ok2, i, ok1); - ok2.info = p->info; - *p = ok2; + CHECK_INTV_ADD_MEM(ok2, i, ok1, mem); + ok2.info = p->info; *p = ok2; } else if (q[i] < 4) // 只能扩展一个 { fmt_extend1(fmt, p, &ok1, 1, q[i]); - CHECK_PUT_MEM(ok1, i + 1, *p); - } else + CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem); + ok1.info = p->info; *p = ok1; + } + else { // 不能扩展 - if (mem->n == 0 || (i + 1) < mem->a[mem->n - 1].info >> 32) - { - p->info |= (uint64_t)(i + 1) << 32; - kv_push(bwtintv_t, *mem, *p); - } + CHECK_ADD_MEM(i + 1, *p, mem); goto fmt_smem_end; } } - if (i == 0) { // 扩展到了第一个碱基 + for (; i == 0; --i) + { // 扩展到了第一个碱基 if (q[i] < 4) { fmt_extend1(fmt, p, &ok1, 1, q[i]); - CHECK_PUT_MEM(ok1, i + 1, *p); + CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem); + ok1.info = p->info; *p = ok1; } else { - if (mem->n == 0 || (i + 1) < mem->a[mem->n - 1].info >> 32) - { - p->info |= (uint64_t)(i + 1) << 32; - kv_push(bwtintv_t, *mem, *p); - } + CHECK_ADD_MEM(i + 1, *p, mem); goto fmt_smem_end; } - --i; } if (i == -1) { - if (mem->n == 0 || (i + 1) < mem->a[mem->n - 1].info >> 32) - { - p->info |= (uint64_t)(i + 1) << 32; - kv_push(bwtintv_t, *mem, *p); - } + CHECK_ADD_MEM(i + 1, *p, mem); goto fmt_smem_end; } } diff --git a/utils.h b/utils.h index 8a1e919..882fa6b 100644 --- a/utils.h +++ b/utils.h @@ -43,7 +43,7 @@ extern int64_t time_ksw_extend2, time_bwt_sa, time_bwt_sa_read; -extern int64_t dn; +extern int64_t dn, n16, n17, n18, n19, nall; extern FILE *fp1;