diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..59cb991 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,18 @@ +{ + // 使用 IntelliSense 了解相关属性。 + // 悬停以查看现有属性的描述。 + // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "fmtidx", + "type": "cppdbg", + "request": "launch", + "program": "${workspaceRoot}/fmtidx", + "args": [ + "/home/zzh/data/reference/human_g1k_v37_decoy.fasta.bwt" + ], + "cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间 + } + ] +} \ No newline at end of file diff --git a/Makefile b/Makefile index ce06fba..27f3eef 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,7 @@ CC= g++ NOWARN= -Wno-unused-result -Wno-unused-function -CFLAGS= -g -Wall $(NOWARN) -O2 +CFLAGS= #-g -Wall $(NOWARN) #-O2 +CPPFLAGS= -g -Wall $(NOWARN) -O2 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS SHOW_PERF= -DSHOW_PERF AR= ar diff --git a/bwt.cpp b/bwt.cpp index 7173131..73d39aa 100644 --- a/bwt.cpp +++ b/bwt.cpp @@ -10,7 +10,7 @@ using namespace std; -void bwt_dump_bwt(const char *fn, const bwt_t *bwt) +void dump_bwt(const char *fn, const bwt_t *bwt) { FILE *fp; fp = xopen(fn, "wb"); @@ -166,6 +166,7 @@ void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) // an analogy to bwt_occ4() but more efficient, requiring k <= l void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]) { + // double tm_t = realtime(); bwtint_t _k, _l; _k = k - (k >= bwt->primary); _l = l - (l >= bwt->primary); @@ -206,6 +207,8 @@ void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtin cntl[2] += y >> 16 & 0xff; cntl[3] += y >> 24; } + // t5 += realtime() - tm_t; + // f1 += 1; } void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back) @@ -236,7 +239,7 @@ void bwt_search(bwt_t *bwt, const string &q) bwt_set_intv(bwt, bval(q[x]), ik); ik.info = x + 1; - cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; + // cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; for (i = x + 1; i < q.size(); ++i) { @@ -246,7 +249,7 @@ void bwt_search(bwt_t *bwt, const string &q) bwt_extend(bwt, &ik, ok, 0); ik = ok[c]; ik.info = i + 1; - cout << "bwt-1: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; + // cout << "bwt-1: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; } } } @@ -284,14 +287,14 @@ void bwt_extend2(const bwt_t *bwt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, } // 每次扩展两步 -void bwt_search2(bwt_t *bwt, const string &q) +bwtintv_t bwt_search2(bwt_t *bwt, const string &q) { bwtintv_t ik, ok[4]; int i, j, c1, c2, x = 0; bwt_set_intv(bwt, bval(q[x]), ik); ik.info = x + 1; - cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; + // cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; for (i = x + 1; i + 1 < q.size(); i += 2) { @@ -301,11 +304,13 @@ void bwt_search2(bwt_t *bwt, const string &q) c1 = cbval(q[i]); c2 = cbval(q[i + 1]); + // double tm_t = realtime(); bwt_extend2(bwt, &ik, ok, 0, c1); + // t7 += realtime() - tm_t; ik = ok[c2]; ik.info = i + 1; - cout << "bwt-2: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; + // cout << "bwt-2: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; } else { @@ -315,9 +320,13 @@ void bwt_search2(bwt_t *bwt, const string &q) if (i < q.size() && bval(q[i]) < 4) { // 最后一次扩展 c1 = cbval(q[i]); + // double tm_t = realtime(); bwt_extend(bwt, &ik, ok, 0); + // t10 += realtime() - tm_t; ik = ok[c1]; ik.info = i + 1; - cout << "bwt-2: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; + // cout << "bwt-2: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; } + // cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; + return ik; } \ No newline at end of file diff --git a/bwt.h b/bwt.h index 564b64d..35caf27 100644 --- a/bwt.h +++ b/bwt.h @@ -7,7 +7,7 @@ using std::string; // occ间隔对应的右移位数,5表示间隔32行保存一次 -#define OCC_INTV_SHIFT 5 +#define OCC_INTV_SHIFT 7 #define OCC_INTERVAL (1LL << OCC_INTV_SHIFT) #define OCC_INTV_MASK (OCC_INTERVAL - 1) @@ -63,6 +63,6 @@ void create_interval_occ_bwt(bwt_t *bwt); // 利用bwt搜索seed,完整搜索,只需要单向搜索 void bwt_search(bwt_t *bwt, const string &q); // 每次扩展两步 -void bwt_search2(bwt_t *bwt, const string &q); +bwtintv_t bwt_search2(bwt_t *bwt, const string &q); #endif \ No newline at end of file diff --git a/fmt_index.cpp b/fmt_index.cpp index 4bc645e..99837b5 100644 --- a/fmt_index.cpp +++ b/fmt_index.cpp @@ -12,13 +12,18 @@ #include "fmt_index.h" using namespace std; +double t1 = 0, t2 = 0, t3 = 0, t4 = 0, t5 = 0, t6 = 0, t7 = 0, t8 = 0, t9 = 0, t10 = 0, + t11 = 0, t12 = 0, t13 = 0, t14 = 0; + +long f1 = 0, f2 = 0, f3 = 0, f4 = 0, f5 = 0; + const char BASE[4] = {'A', 'C', 'G', 'T'}; // 求反向互补序列 string calc_reverse_seq(string &seq) { string rseq(seq.size(), '0'); - for (int i = 0; i < seq.size(); ++i) + for (size_t i = 0; i < seq.size(); ++i) { if (seq[i] == 'A') rseq[i] = 'T'; @@ -60,7 +65,7 @@ void create_bwt_mtx(string &seq) bwtint_t seq_len = seq.size() + 1; string sarr[seq_len]; sarr[0] = seq + '$'; - for (int i = 1; i < seq_len; ++i) + for (bwtint_t i = 1; i < seq_len; ++i) { sarr[i] = sarr[0].substr(i) + sarr[0].substr(0, i); } @@ -148,7 +153,7 @@ FMTIndex *restore_fmt(const char *fn) // 根据interval-bwt创建fmt-index FMTIndex *create_fmt_from_bwt(bwt_t *bwt) { - FILE *fmt_out = fopen("fmt.txt", "w"); + // FILE *fmt_out = fopen("fmt.txt", "w"); FMTIndex *fmt = (FMTIndex *)calloc(1, sizeof(FMTIndex)); fmt_gen_cnt_table(fmt); @@ -334,6 +339,7 @@ void fmt_e2_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, int b, // fmt_e2_occ4(fmt, k, b, cntk1, cntk2); // fmt_e2_occ4(fmt, l, b, cntl1, cntl2); // return; + //double tm_t = realtime(); bwtint_t _k, _l; _k = k - (k >= fmt->primary); // 换算成了seq的行 _l = l - (l >= fmt->primary); @@ -412,6 +418,8 @@ void fmt_e2_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, int b, // cout << "fmt-occ: " << l << '\t' << cntl1[0] << '\t' << cntl1[1] << '\t' << cntl1[2] << '\t' << cntl1[3] << endl; // cout << "fmt-occ-2: " << l << '\t' << cntl2[0] << '\t' << cntl2[1] << '\t' << cntl2[2] << '\t' << cntl2[3] << endl; } + //t6 += realtime() - tm_t; + // f2 += 1; } // 扩展一个碱基,计算bwt str中各个碱基的occ @@ -445,6 +453,7 @@ void fmt_e1_occ4(const FMTIndex *fmt, bwtint_t k, uint32_t cnt[4]) // 对k行和l行同时计算bwt str的occ,如果k和l落在同一个interval区间,可以减少一些计算量和访存 void fmt_e1_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, uint32_t cntk[4], uint32_t cntl[4]) { + //double tm_t = realtime(); bwtint_t _k, _l; _k = k - (k >= fmt->primary); // 换算成了seq的行 _l = l - (l >= fmt->primary); @@ -456,7 +465,7 @@ void fmt_e1_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, uint32_t cntk[4], else { uint32_t x1, y1; - uint32_t *p, tmp, *endk, *endl; + uint32_t *p, tmp, *ek, *el; k -= (k >= fmt->primary); // because $ is not in bwt l -= (l >= fmt->primary); @@ -465,21 +474,21 @@ void fmt_e1_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, uint32_t cntk[4], memcpy(cntl, p, 4 * sizeof(uint32_t)); p += 20; // prepare cntk[] - endk = p + ((k >> 3) - ((k & ~OCC_INTV_MASK) >> 3)); - endl = p + ((l >> 3) - ((l & ~OCC_INTV_MASK) >> 3)); - for (x1 = 0; p < endk; ++p) + ek = p + ((k >> 3) - ((k & ~OCC_INTV_MASK) >> 3)); + el = p + ((l >> 3) - ((l & ~OCC_INTV_MASK) >> 3)); + for (x1 = 0; p < ek; ++p) { x1 += __fmt_occ_e2_aux4(fmt, 4, *p); } y1 = x1; tmp = *p & ~((1U << ((~k & 7) << 2)) - 1); x1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~k & 7); - for (; p < endl; ++p) + for (; p < el; ++p) { y1 += __fmt_occ_e2_aux4(fmt, 4, *p); } - tmp = *p & ~((1U << ((~k & 7) << 2)) - 1); - y1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~k & 7); + tmp = *p & ~((1U << ((~l & 7) << 2)) - 1); + y1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~l & 7); cntk[0] += x1 & 0xff; cntk[1] += x1 >> 8 & 0xff; @@ -491,6 +500,7 @@ void fmt_e1_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, uint32_t cntk[4], cntl[2] += y1 >> 16 & 0xff; cntl[3] += y1 >> 24; } + //t11 += realtime() - tm_t; } // 扩展一个碱基 @@ -499,8 +509,8 @@ void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac uint32_t tk[4], tl[4]; int i; - fmt_e1_occ4(fmt, ik->x[!is_back] - 1, tk); - fmt_e1_occ4(fmt, ik->x[!is_back] - 1 + ik->x[2], tl); + // fmt_e1_occ4(fmt, ik->x[!is_back] - 1, tk); + // fmt_e1_occ4(fmt, ik->x[!is_back] - 1 + ik->x[2], tl); fmt_e1_2occ4(fmt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); for (i = 0; i != 4; ++i) { @@ -508,8 +518,9 @@ void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac ok[i].x[2] = tl[i] - tk[i]; // 间隔 } ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary); - for (i = 2; i >= b1; --i) - ok[i].x[is_back] = ok[i + 1].x[is_back] + ok[i + 1].x[2]; + ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2]; + ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2]; + ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2]; *ik = ok[b1]; } @@ -519,22 +530,18 @@ void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac uint32_t tk1[4], tl1[4], tk2[4], tl2[4]; int i; - // fmt_e2_occ4(fmt, ik->x[!is_back] - 1, b1, tk1, tk2); - // fmt_e2_occ4(fmt, ik->x[!is_back] - 1 + ik->x[2], b1, tl1, tl2); // tk表示在k行之前所有各个碱基累积出现次数,tl表示在l行之前的累积 fmt_e2_2occ4(fmt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], b1, tk1, tk2, tl1, tl2); - // cout << "k: " << tk1[0] << '\t' << tk1[1] << '\t' << tk1[2] << '\t' << tk1[3] << endl; - // cout << "l: " << tl1[0] << '\t' << tl1[1] << '\t' << tl1[2] << '\t' << tl1[3] << endl; - // cout << "k: " << tk2[0] << '\t' << tk2[1] << '\t' << tk2[2] << '\t' << tk2[3] << endl; - // cout << "l: " << tl2[0] << '\t' << tl2[1] << '\t' << tl2[2] << '\t' << tl2[3] << endl; - // 这里是反向扩展 for (i = 0; i != 4; ++i) { ok[i].x[!is_back] = fmt->L2[i] + 1 + tk2[i]; // 起始行位置,互补链 ok[i].x[2] = tl2[i] - tk2[i]; // 间隔 } + ok[b2].x[!is_back] = fmt->L2[b2] + 1 + tk2[b2]; + ok[b2].x[2] = tl2[b2] - tk2[b2]; + // 因为计算的是互补碱基,所以3对应着0,2对应1,下边是正向扩展 ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary); @@ -542,7 +549,7 @@ void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac ok[1].x[is_back] = ok[2].x[is_back] + tl1[2] - tk1[2]; ok[0].x[is_back] = ok[1].x[is_back] + tl1[1] - tk1[1]; - cout << "fmt-d: " << BASE[b1] << '\t' << ok[b1].x[is_back] << '\t' << ok[b1].x[2] << endl; + // cout << "fmt-d: " << BASE[b1] << '\t' << ok[b1].x[is_back] << '\t' << ok[b1].x[2] << endl; ok[3].x[is_back] = ok[b1].x[is_back] + (ok[b1].x[!is_back] <= fmt->primary && ok[b1].x[!is_back] + ok[b1].x[2] - 1 >= fmt->primary); ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2]; @@ -552,39 +559,45 @@ void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac *ik = ok[b2]; } // 利用fmt搜索seed,完整搜索,只需要单向搜索 -void fmt_search(FMTIndex *fmt, const string &q) +bwtintv_t fmt_search(FMTIndex *fmt, const string &q) { bwtintv_t ik, ok[4]; - int i, j, c1, c2, x = 0; + int i, c1, c2, x = 0; + int qlen = (int)q.size(); fmt_set_intv(fmt, bval(q[x]), ik); ik.info = x + 1; - cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; + // cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; - for (i = x + 1; i + 1 < q.size(); i += 2) + for (i = x + 1; i + 1 < qlen; i += 2) { if (bval(q[i]) < 4 && bval(q[i + 1]) < 4) { c1 = cbval(q[i]); c2 = cbval(q[i + 1]); - + //double tm_t = realtime(); fmt_extend2(fmt, &ik, ok, 0, c1, c2); + //t8 += realtime() - tm_t; ik.info = i + 1; - cout << "fmt : " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; + //cout << "fmt : " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; } else { break; } } - if (i < q.size() && bval(q[i]) < 4) + if (i < qlen && bval(q[i]) < 4) { // 最后一次扩展 c1 = cbval(q[i]); + //double tm_t = realtime(); fmt_extend1(fmt, &ik, ok, 0, c1); + //t9 += realtime() - tm_t; ik.info = i + 1; - cout << "fmt : " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; + // cout << "fmt : " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; } + // cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; + return ik; } int main_fmtidx(int argc, char **argv) @@ -596,37 +609,67 @@ int main_fmtidx(int argc, char **argv) //create_bwt_mtx(seq); //cout << seq << endl; - bwt_t *bwt = restore_bwt(argv[1]); // 读取bwt原始字符串(带ACGT总的累积量) - // create_interval_occ_bwt(bwt); // 根据bwt字符串创建包含interval occ的bwt(128碱基+ACGT累积量) - cout << "L2: " << bwt->L2[0] << '\t' << bwt->L2[1] << '\t' << bwt->L2[2] << '\t' - << bwt->L2[3] << '\t' << bwt->L2[4] << endl; + string bwt_idx = string(argv[1]) + ".bwt"; + string fmt_idx = string(argv[1]) + ".fmt"; - string s = "AACCCTAA"; + bwt_t *bwt = restore_bwt(bwt_idx.c_str()); + FMTIndex *fmt = restore_fmt(fmt_idx.c_str()); + vector seed_arr(10000000); srand(time(NULL)); - s = generate_rand_seq(10); - cout << "seq: " << s << endl; - // s = "TTC"; - bwt_search(bwt, s); - bwt_search2(bwt, s); + t1 = realtime(); + for (int i = 0; i < (int)seed_arr.size(); ++i) + seed_arr[i] = generate_rand_seq(13); + t1 = realtime() - t1; + cout << "[time gen seed:] " << t1 << "s" << endl; + // t2 = realtime(); + // for (int i = 0; i < seed_arr.size(); ++i) + // bwt_search(bwt, seed_arr[i]); + // t2 = realtime() - t2; + // cout << "[time bwt search:] " << t2 << "s" << endl; + + t4 = realtime(); + for (int i = 0; i < (int)seed_arr.size(); ++i) + { + // bwtintv_t p1 = + bwt_search2(bwt, seed_arr[i]); + // bwtintv_t p2 = fmt_search(fmt, seed_arr[i]); + // if (p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2] || p1.x[3] != p2.x[3]) + // cout << seed_arr[i] << endl + // << p1.x[1] << ' ' << p1.x[2] << ' ' << p1.x[3] << endl + // << p2.x[1] << ' ' << p2.x[2] << ' ' << p2.x[3] << endl; + } + t4 = realtime() - t4; + cout << "[time bwt search 2:] " << t4 << "s" << endl; + + t3 = realtime(); + for (int i = 0; i < (int)seed_arr.size(); ++i) + fmt_search(fmt, seed_arr[i]); + t3 = realtime() - t3; + cout << "[time fmt search:] " << t3 << "s" << endl; + + cout << "bwt occ: " << t5 << "s; " << t7 << '\t' << t10 << endl; + cout << "fmt occ: " << t6 << "s; " << t11 << '\t' << t8 << '\t' << t9 << endl; + // bwt_search(bwt, s); + // bwt_search2(bwt, s); // for (int i = 0; i < 120; ++i) // { // cout << i << '\t' << bwt_B0(bwt, i) << endl; // } // TGGGAT - FMTIndex *fmt = create_fmt_from_bwt(bwt); - dump_fmt("ref.fmt", fmt); + // FMTIndex *fmt = create_fmt_from_bwt(bwt); + // dump_fmt("ref.fmt", fmt); // FMTIndex *fmt = restore_fmt("tiny.fmt"); - fmt_search(fmt, s); + //fmt_search(fmt, s); // cout << bwt->bwt_size << endl; // cout << bwt->seq_len << endl; - cout << "sec_: " << fmt->sec_bcp << '\t' << fmt->sec_primary << endl; + //cout << "sec_: " << fmt->sec_bcp << '\t' << fmt->sec_primary << endl; - uint8_t b8 = 2 << 4 | 2; - cout << "AGAG: " << fmt->cnt_table[2][b8] << endl; - cout << (((b8 >> 6) == 0 && (b8 >> 4 & 3) == 2) + ((b8 >> 2 & 3) == 0 && (b8 & 3) == 2)) << endl; + //uint8_t b8 = 2 << 4 | 2; + //cout << "AGAG: " << fmt->cnt_table[2][b8] << endl; + //cout << (((b8 >> 6) == 0 && (b8 >> 4 & 3) == 2) + ((b8 >> 2 & 3) == 0 && (b8 & 3) == 2)) << endl; return 0; } \ No newline at end of file diff --git a/util.h b/util.h index 2ff81da..6d061c2 100644 --- a/util.h +++ b/util.h @@ -4,6 +4,9 @@ #include #include +extern double t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14; +extern long f1, f2, f3, f4, f5; + typedef uint64_t bwtint_t; #define xassert(cond, msg) \