From 555e39df9478ad5abd65b9ce918700404cd07eee Mon Sep 17 00:00:00 2001 From: zzh Date: Mon, 29 Jan 2024 23:14:46 +0800 Subject: [PATCH] =?UTF-8?q?=E9=AA=8C=E8=AF=81=E4=BA=86fmt-index=E7=9A=84?= =?UTF-8?q?=E6=AD=A3=E7=A1=AE=E6=80=A7=EF=BC=8C=E7=94=A8=E4=B8=A4=E7=A7=8D?= =?UTF-8?q?=E6=96=B9=E5=BC=8F=E8=BF=9B=E8=A1=8C=E4=BA=86=E5=AE=9E=E7=8E=B0?= =?UTF-8?q?=EF=BC=8C=E7=AC=AC=E4=BA=8C=E7=A7=8D=E8=AE=B0=E5=BD=95b1?= =?UTF-8?q?=E5=92=8Cb2=E7=9A=84occ=E4=BB=A5=E5=8F=8A=E5=A4=A7=E4=BA=8E?= =?UTF-8?q?=E4=BB=96=E4=BB=AC=E7=9A=84=E7=A2=B1=E5=9F=BA=E7=9A=84occ?= =?UTF-8?q?=EF=BC=8C=E6=9B=B4=E5=BF=AB=E4=B8=80=E4=BA=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .vscode/launch.json | 2 +- Makefile | 2 +- bwt.cpp | 66 ++++++++++---- fmt_index.cpp | 211 +++++++++++++++++++++++++++++++++++++------- fmt_index.h | 4 + 5 files changed, 234 insertions(+), 51 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 59cb991..279cc62 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -10,7 +10,7 @@ "request": "launch", "program": "${workspaceRoot}/fmtidx", "args": [ - "/home/zzh/data/reference/human_g1k_v37_decoy.fasta.bwt" + "/mnt/d/data/reference/human_g1k_v37_decoy.fasta" ], "cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间 } diff --git a/Makefile b/Makefile index 27f3eef..ce158fc 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ CC= g++ NOWARN= -Wno-unused-result -Wno-unused-function CFLAGS= #-g -Wall $(NOWARN) #-O2 -CPPFLAGS= -g -Wall $(NOWARN) -O2 +CPPFLAGS= -g -Wall $(NOWARN) -O3 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS SHOW_PERF= -DSHOW_PERF AR= ar diff --git a/bwt.cpp b/bwt.cpp index 73d39aa..e23d666 100644 --- a/bwt.cpp +++ b/bwt.cpp @@ -150,9 +150,11 @@ void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) } k -= (k >= bwt->primary); // because $ is not in bwt p = bwt_occ_intv(bwt, k); + // cout << "occ: " << p[0] << ' ' << p[1] << ' ' << p[2] << ' ' << p[3] << endl; memcpy(cnt, p, 4 * sizeof(bwtint_t)); p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t)) end = p + ((k >> 4) - ((k & ~OCC_INTV_MASK) >> 4)); // this is the end point of the following loop + //end = p + 1; for (x = 0; p < end; ++p) x += __occ_aux4(bwt, *p); tmp = *p & ~((1U << ((~k & 15) << 1)) - 1); @@ -167,6 +169,9 @@ void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]) { // double tm_t = realtime(); + bwt_occ4(bwt, k, cntk); + bwt_occ4(bwt, l, cntl); + return; bwtint_t _k, _l; _k = k - (k >= bwt->primary); _l = l - (l >= bwt->primary); @@ -235,13 +240,13 @@ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_b void bwt_search(bwt_t *bwt, const string &q) { bwtintv_t ik, ok[4]; - int i, j, c, x = 0; + int i, c, x = 0; bwt_set_intv(bwt, bval(q[x]), ik); ik.info = x + 1; // cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; - for (i = x + 1; i < q.size(); ++i) + for (i = x + 1; i < (int)q.size(); ++i) { if (bval(q[i]) < 4) { @@ -255,48 +260,71 @@ void bwt_search(bwt_t *bwt, const string &q) } // 扩展两次 -void bwt_extend2(const bwt_t *bwt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, int c1) +void bwt_extend2(const bwt_t *bwt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, int c1, int c2) { bwtint_t tk[4], tl[4]; + bwtint_t interval = 0; int i; bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); // tk表示在k行之前所有各个碱基累积出现次数,tl表示在l行之前的累积 + //for (i = 3; i > c1; --i) + //{ + // interval += tl[i] - tk[i]; + //} + //cout << "interval-1: " << interval << endl; + // 这里是反向扩展 for (i = 0; i != 4; ++i) { ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i]; // 起始行位置,互补链 ok[i].x[2] = tl[i] - tk[i]; // 间隔 } + // ok[c1].x[!is_back] = bwt->L2[c1] + 1 + tk[c1]; // 起始行位置,互补链 + // ok[c1].x[2] = tl[c1] - tk[c1]; // 间隔 + // 因为计算的是互补碱基,所以3对应着0,2对应1,下边是正向扩展 + ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary); + ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2]; + ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2]; + ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2]; + //cout << (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary) << endl; + *ik = ok[c1]; + bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); + //cout << (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary) << ' ' << ok[c1].x[is_back] << endl; + //interval = 0; + //for (i = 3; i > c2; --i) + //{ + // interval += tl[i] - tk[i]; + //} + //cout << "interval-2: " << interval << endl; + + for (i = 0; i != 4; ++i) + { + ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i]; // 起始行位置,互补链 + ok[i].x[2] = tl[i] - tk[i]; // 间隔 + } + + // cout << "occ: " << ok[c2].x[2] << endl; + + // 因为计算的是互补碱基,所以3对应着0,2对应1,下边是正向扩展 ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary); ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2]; ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2]; ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2]; - *ik = ok[c1]; - bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); - for (i = 0; i != 4; ++i) - { - ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i]; // 起始行位置,互补链 - ok[i].x[2] = tl[i] - tk[i]; // 间隔 - } - // 因为计算的是互补碱基,所以3对应着0,2对应1,下边是正向扩展 - ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary); - ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2]; - ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2]; - ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2]; + } // 每次扩展两步 bwtintv_t bwt_search2(bwt_t *bwt, const string &q) { bwtintv_t ik, ok[4]; - int i, j, c1, c2, x = 0; + int i, c1, c2, x = 0; bwt_set_intv(bwt, bval(q[x]), ik); ik.info = x + 1; // cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; - for (i = x + 1; i + 1 < q.size(); i += 2) + for (i = x + 1; i + 1 < (int)q.size(); i += 2) { if (bval(q[i]) < 4 && bval(q[i + 1]) < 4) { @@ -305,7 +333,7 @@ bwtintv_t bwt_search2(bwt_t *bwt, const string &q) c2 = cbval(q[i + 1]); // double tm_t = realtime(); - bwt_extend2(bwt, &ik, ok, 0, c1); + bwt_extend2(bwt, &ik, ok, 0, c1, c2); // t7 += realtime() - tm_t; ik = ok[c2]; @@ -317,7 +345,7 @@ bwtintv_t bwt_search2(bwt_t *bwt, const string &q) break; } } - if (i < q.size() && bval(q[i]) < 4) + if (i < (int)q.size() && bval(q[i]) < 4) { // 最后一次扩展 c1 = cbval(q[i]); // double tm_t = realtime(); diff --git a/fmt_index.cpp b/fmt_index.cpp index 99837b5..147e5a1 100644 --- a/fmt_index.cpp +++ b/fmt_index.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include "util.h" #include "fmt_index.h" @@ -112,6 +113,33 @@ void fmt_gen_cnt_table(FMTIndex *fmt) } } +// 生成occ,每个字节对应一个pattern +void fmt_gen_cnt_occ(FMTIndex *fmt) +{ + // 0-8:大于a的occ,8-16:大于b的occ,16-24:b的occ + int i, a, b, ti; + uint32_t oa, ooa, ob, oob; + for (i = 0; i != 256; ++i) // 遍历单个字节的各种情况 + { + for (a = 0; a < 4; ++a) // ba格式 + { + oa = 0; + ooa = 0; + oa += ((i >> 4 & 3) == a) + ((i & 3) == a); + ooa += ((i >> 4 & 3) > a) + ((i & 3) > a); + for (b = 0; b < 4; ++b) + { + oob = ob = 0; + oob += ((i >> 6 & 3) > b && (i >> 4 & 3) == a) + ((i >> 2 & 3) > b && (i & 3) == a); + ob += ((i >> 6 & 3) == b && (i >> 4 & 3) == a) + ((i >> 2 & 3) == b && (i & 3) == a); + + ti = a << 2 | b; + fmt->cnt_occ[ti][i]= ob << 24 | oob << 16 | oa << 8 | ooa ; + } + } + } +} + void dump_fmt(const char *fn, const FMTIndex *fmt) { FILE *fp; @@ -147,6 +175,7 @@ FMTIndex *restore_fmt(const char *fn) fmt->seq_len = fmt->L2[4]; fclose(fp); fmt_gen_cnt_table(fmt); // 字节所能表示的各种碱基组合中,各个碱基的累积数量 + fmt_gen_cnt_occ(fmt); return fmt; } @@ -336,10 +365,10 @@ void fmt_e2_occ4(const FMTIndex *fmt, bwtint_t k, int b, uint32_t cnt1[4], uint3 void fmt_e2_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, int b, uint32_t cntk1[4], uint32_t cntk2[4], uint32_t cntl1[4], uint32_t cntl2[4]) { - // fmt_e2_occ4(fmt, k, b, cntk1, cntk2); - // fmt_e2_occ4(fmt, l, b, cntl1, cntl2); - // return; - //double tm_t = realtime(); + fmt_e2_occ4(fmt, k, b, cntk1, cntk2); + fmt_e2_occ4(fmt, l, b, cntl1, cntl2); + return; + // double tm_t = realtime(); bwtint_t _k, _l; _k = k - (k >= fmt->primary); // 换算成了seq的行 _l = l - (l >= fmt->primary); @@ -418,7 +447,7 @@ void fmt_e2_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, int b, // cout << "fmt-occ: " << l << '\t' << cntl1[0] << '\t' << cntl1[1] << '\t' << cntl1[2] << '\t' << cntl1[3] << endl; // cout << "fmt-occ-2: " << l << '\t' << cntl2[0] << '\t' << cntl2[1] << '\t' << cntl2[2] << '\t' << cntl2[3] << endl; } - //t6 += realtime() - tm_t; + // t6 += realtime() - tm_t; // f2 += 1; } @@ -453,7 +482,7 @@ void fmt_e1_occ4(const FMTIndex *fmt, bwtint_t k, uint32_t cnt[4]) // 对k行和l行同时计算bwt str的occ,如果k和l落在同一个interval区间,可以减少一些计算量和访存 void fmt_e1_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, uint32_t cntk[4], uint32_t cntl[4]) { - //double tm_t = realtime(); + double tm_t = realtime(); bwtint_t _k, _l; _k = k - (k >= fmt->primary); // 换算成了seq的行 _l = l - (l >= fmt->primary); @@ -500,11 +529,11 @@ void fmt_e1_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, uint32_t cntk[4], cntl[2] += y1 >> 16 & 0xff; cntl[3] += y1 >> 24; } - //t11 += realtime() - tm_t; + t11 += realtime() - tm_t; } // 扩展一个碱基 -void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, int b1) +void fmt_extend1_1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, int b1) { uint32_t tk[4], tl[4]; int i; @@ -525,7 +554,7 @@ void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac } // 扩展两个碱基 -void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, int b1, int b2) +void fmt_extend2_1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, int b1, int b2) { uint32_t tk1[4], tl1[4], tk2[4], tl2[4]; int i; @@ -539,8 +568,8 @@ void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac ok[i].x[!is_back] = fmt->L2[i] + 1 + tk2[i]; // 起始行位置,互补链 ok[i].x[2] = tl2[i] - tk2[i]; // 间隔 } - ok[b2].x[!is_back] = fmt->L2[b2] + 1 + tk2[b2]; - ok[b2].x[2] = tl2[b2] - tk2[b2]; + // ok[b2].x[!is_back] = fmt->L2[b2] + 1 + tk2[b2]; + // ok[b2].x[2] = tl2[b2] - tk2[b2]; // 因为计算的是互补碱基,所以3对应着0,2对应1,下边是正向扩展 @@ -558,10 +587,130 @@ void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac *ik = ok[b2]; } + +static inline int __occ_aux(uint64_t y, int c) +{ + // reduce nucleotide counting to bits counting + y = ((c & 2) ? y : ~y) >> 1 & ((c & 1) ? y : ~y) & 0x5555555555555555ull; + // count the number of 1s in y + y = (y & 0x3333333333333333ull) + (y >> 2 & 0x3333333333333333ull); + return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56; +} + +// 扩展两个个碱基,计算bwt base为b的pre-bwt str中各个碱基的occ +void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4]) +{ + uint32_t x; + uint32_t *p, *q, tmp, *end; + bwtint_t bwt_k_line = k, bwt_k_base_line = k >> OCC_INTV_SHIFT << OCC_INTV_SHIFT; + int i, ti; + cnt[0] = 0; + cnt[1] = 0; + cnt[2] = 0; + if (k == (bwtint_t)(-1)) + { + p = fmt->bwt + 4 + b1 * 4; + for (i = 3; i > b2; --i) + cnt[2] += p[i]; + cnt[3] = p[b2]; + return; + } + ti = b1 << 2 | b2; + _mm_prefetch((const char *)(&fmt->cnt_occ[ti]), _MM_HINT_T0); + k -= (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行(去掉了$,所以大于$的行,都减掉1) + p = fmt_occ_intv(fmt, k); + // cout << "occ: " << p[0] << ' ' << p[1] << ' ' << p[2] << ' ' << p[3] << endl; + for (i = 3; i > b1; --i) cnt[0] += p[i]; + cnt[1] = p[b1]; + q = p + 4 + b1 * 4; + for (i = 3; i > b2; --i) cnt[2] += q[i]; + cnt[3] = q[b2]; + p += 20; // 该地址是bwt和pre_bwt字符串数据的首地址 + end = p + ((k >> 3) - ((k & ~OCC_INTV_MASK) >> 3)); // this is the end point of the following loop + //if ((int)(end - p) > 8) + // cout << "interval: " << (int)(end - p) << endl; + + // end = p + 2 + (end - p) / 6; + // end = p + 2; + for (x = 0; p < end; ++p) + { + x += __fmt_occ_e2_aux2(fmt, ti, *p); + } + tmp = *p & ~((1U << ((~k & 7) << 2)) - 1); + // print_base_uint32(tmp); + x += __fmt_occ_e2_aux2(fmt, ti, tmp); + if (b1 == 0) + { + x -= (~k & 7) << 8; + if (b2 == 0) + x -= (~k & 7) << 24; + } + + // 如果跨过了second_primary,那么可能需要减掉一次累积值 + if (b1 == fmt->first_base && bwt_k_base_line < fmt->sec_primary && bwt_k_line >= fmt->sec_primary) + { + if (b2 == fmt->last_base) + x -= 1 << 24; + else if (b2 < fmt->last_base) + x -= 1 << 16; + } + + cnt[0] += x & 0xff; + cnt[1] += x >> 8 & 0xff; + cnt[2] += x >> 16 & 0xff; + cnt[3] += x >> 24 & 0xff; +} + +void fmt_e2_2occ(const FMTIndex *fmt, bwtint_t k, bwtint_t l, + int b1, int b2, bwtint_t cntk[4], bwtint_t cntl[4]) +{ + // double tm_t = realtime(); + fmt_e2_occ(fmt, k, b1, b2, cntk); + fmt_e2_occ(fmt, l, b1, b2, cntl); + return; + bwtint_t _k, _l; + _k = k - (k >= fmt->primary); // 换算成了seq的行 + _l = l - (l >= fmt->primary); + if (_l >> OCC_INTV_SHIFT != _k >> OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) + { + fmt_e2_occ(fmt, k, b1, b2, cntk); + fmt_e2_occ(fmt, l, b1, b2, cntl); + } + else + { + } +} + +// 扩展两个碱基 +void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1, int b2) +{ + bwtint_t tk[4], tl[4], first_pos; + // tk表示在k行之前所有各个碱基累积出现次数,tl表示在l行之前的累积 + fmt_e2_2occ(fmt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], b1, b2, tk, tl); + + // 这里是反向扩展 + ok->x[!is_back] = fmt->L2[b2] + 1 + tk[3]; + ok->x[2] = tl[3] - tk[3]; + + // 第一次正向扩展 + ok->x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary) + tl[0] - tk[0]; + // 第二次正向扩展 + // cout << "inteval-1: " << tl[0] - tk[0] << endl; + // cout << (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary) << endl; + // cout << (ok->x[!is_back] <= fmt->primary && ok->x[!is_back] + ok->x[2] - 1 >= fmt->primary) << ' ' << ok->x[is_back] << endl; + // cout << "inteval-2: " << tl[2] - tk[2] << endl; + // cout << "occ: " << tl[3] - tk[3] << endl; + + first_pos = fmt->L2[b1] + 1 + tk[1]; + ok->x[is_back] = ok->x[is_back] + (first_pos <= fmt->primary && first_pos + tl[1] - tk[1] - 1 >= fmt->primary) + tl[2] - tk[2]; +} + // 利用fmt搜索seed,完整搜索,只需要单向搜索 bwtintv_t fmt_search(FMTIndex *fmt, const string &q) { - bwtintv_t ik, ok[4]; + bwtintv_t ik; + // bwtintv_t ok[4]; + bwtintv_t ok; int i, c1, c2, x = 0; int qlen = (int)q.size(); @@ -577,10 +726,12 @@ bwtintv_t fmt_search(FMTIndex *fmt, const string &q) c1 = cbval(q[i]); c2 = cbval(q[i + 1]); //double tm_t = realtime(); - fmt_extend2(fmt, &ik, ok, 0, c1, c2); - //t8 += realtime() - tm_t; + fmt_extend2(fmt, &ik, &ok, 0, c1, c2); + //fmt_extend2_1(fmt, &ik, ok, 0, c1, c2); + ik = ok; + // t8 += realtime() - tm_t; ik.info = i + 1; - //cout << "fmt : " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; + // cout << "fmt : " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; } else { @@ -591,7 +742,7 @@ bwtintv_t fmt_search(FMTIndex *fmt, const string &q) { // 最后一次扩展 c1 = cbval(q[i]); //double tm_t = realtime(); - fmt_extend1(fmt, &ik, ok, 0, c1); + // fmt_extend1(fmt, &ik, ok, 0, c1); //t9 += realtime() - tm_t; ik.info = i + 1; // cout << "fmt : " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; @@ -616,6 +767,7 @@ int main_fmtidx(int argc, char **argv) FMTIndex *fmt = restore_fmt(fmt_idx.c_str()); vector seed_arr(10000000); + seed_arr[0] = "CCGTCATCATCCG"; srand(time(NULL)); t1 = realtime(); @@ -624,34 +776,33 @@ int main_fmtidx(int argc, char **argv) t1 = realtime() - t1; cout << "[time gen seed:] " << t1 << "s" << endl; - // t2 = realtime(); - // for (int i = 0; i < seed_arr.size(); ++i) - // bwt_search(bwt, seed_arr[i]); - // t2 = realtime() - t2; - // cout << "[time bwt search:] " << t2 << "s" << endl; + // t2 = realtime(); + // for (int i = 0; i < seed_arr.size(); ++i) + // bwt_search(bwt, seed_arr[i]); + // t2 = realtime() - t2; + // cout << "[time bwt search:] " << t2 << "s" << endl; t4 = realtime(); for (int i = 0; i < (int)seed_arr.size(); ++i) { - // bwtintv_t p1 = + bwtintv_t p1 = bwt_search2(bwt, seed_arr[i]); - // bwtintv_t p2 = fmt_search(fmt, seed_arr[i]); - // if (p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2] || p1.x[3] != p2.x[3]) - // cout << seed_arr[i] << endl - // << p1.x[1] << ' ' << p1.x[2] << ' ' << p1.x[3] << endl - // << p2.x[1] << ' ' << p2.x[2] << ' ' << p2.x[3] << endl; + bwtintv_t p2 = fmt_search(fmt, seed_arr[i]); + if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2]) + cout << seed_arr[i] << endl + << p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl + << p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl; } t4 = realtime() - t4; cout << "[time bwt search 2:] " << t4 << "s" << endl; - t3 = realtime(); for (int i = 0; i < (int)seed_arr.size(); ++i) fmt_search(fmt, seed_arr[i]); t3 = realtime() - t3; cout << "[time fmt search:] " << t3 << "s" << endl; - cout << "bwt occ: " << t5 << "s; " << t7 << '\t' << t10 << endl; - cout << "fmt occ: " << t6 << "s; " << t11 << '\t' << t8 << '\t' << t9 << endl; + //cout << "bwt occ: " << t5 << "s; " << t7 << '\t' << t10 << endl; + //cout << "fmt occ: " << t6 << "s; " << t11 << '\t' << t8 << '\t' << t9 << endl; // bwt_search(bwt, s); // bwt_search2(bwt, s); // for (int i = 0; i < 120; ++i) diff --git a/fmt_index.h b/fmt_index.h index cd39e27..b24cab3 100644 --- a/fmt_index.h +++ b/fmt_index.h @@ -11,6 +11,9 @@ #define __fmt_occ_e2_aux4(fmt, b, val) \ ((fmt)->cnt_table[(b)][(val) & 0xff] + (fmt)->cnt_table[b][(val) >> 8 & 0xff] + (fmt)->cnt_table[b][(val) >> 16 & 0xff] + (fmt)->cnt_table[b][(val) >> 24]) +#define __fmt_occ_e2_aux2(fmt, b, val) \ + ((fmt)->cnt_occ[(b)][(val) & 0xff] + (fmt)->cnt_occ[b][(val) >> 8 & 0xff] + (fmt)->cnt_occ[b][(val) >> 16 & 0xff] + (fmt)->cnt_occ[b][(val) >> 24]) + // fm-index, extend twice in one search step (one memory access) struct FMTIndex { @@ -22,6 +25,7 @@ struct FMTIndex uint32_t *bwt; // BWT // occurance array, separated to two parts uint32_t cnt_table[5][256]; // 4对应原来的cnt_table,0,1,2,3,分别对应该碱基的扩展 + uint32_t cnt_occ[16][256]; // 前16-24位表示b(碱基)的occ,8-16位表示大于b的occ,0-8表示大于a的occ,ba格式 uint8_t sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15 uint8_t first_base; // 序列的第一个碱基2bit的int类型,0,1,2,3 uint8_t last_base; // dollar转换成的base