结果正确性没有问题,正在测试速度,稍快一些

This commit is contained in:
zzh 2024-01-28 14:51:40 +08:00
parent c69cb901bb
commit 22b75d335b
6 changed files with 131 additions and 57 deletions

18
.vscode/launch.json vendored 100644
View File

@ -0,0 +1,18 @@
{
// 使 IntelliSense
//
// 访: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "fmtidx",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/fmtidx",
"args": [
"/home/zzh/data/reference/human_g1k_v37_decoy.fasta.bwt"
],
"cwd": "${workspaceFolder}", //
}
]
}

View File

@ -1,6 +1,7 @@
CC= g++ CC= g++
NOWARN= -Wno-unused-result -Wno-unused-function NOWARN= -Wno-unused-result -Wno-unused-function
CFLAGS= -g -Wall $(NOWARN) -O2 CFLAGS= #-g -Wall $(NOWARN) #-O2
CPPFLAGS= -g -Wall $(NOWARN) -O2
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
SHOW_PERF= -DSHOW_PERF SHOW_PERF= -DSHOW_PERF
AR= ar AR= ar

23
bwt.cpp
View File

@ -10,7 +10,7 @@
using namespace std; using namespace std;
void bwt_dump_bwt(const char *fn, const bwt_t *bwt) void dump_bwt(const char *fn, const bwt_t *bwt)
{ {
FILE *fp; FILE *fp;
fp = xopen(fn, "wb"); fp = xopen(fn, "wb");
@ -166,6 +166,7 @@ void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
// an analogy to bwt_occ4() but more efficient, requiring k <= l // an analogy to bwt_occ4() but more efficient, requiring k <= l
void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]) void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
{ {
// double tm_t = realtime();
bwtint_t _k, _l; bwtint_t _k, _l;
_k = k - (k >= bwt->primary); _k = k - (k >= bwt->primary);
_l = l - (l >= bwt->primary); _l = l - (l >= bwt->primary);
@ -206,6 +207,8 @@ void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtin
cntl[2] += y >> 16 & 0xff; cntl[2] += y >> 16 & 0xff;
cntl[3] += y >> 24; cntl[3] += y >> 24;
} }
// t5 += realtime() - tm_t;
// f1 += 1;
} }
void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back) void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back)
@ -236,7 +239,7 @@ void bwt_search(bwt_t *bwt, const string &q)
bwt_set_intv(bwt, bval(q[x]), ik); bwt_set_intv(bwt, bval(q[x]), ik);
ik.info = x + 1; ik.info = x + 1;
cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; // cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
for (i = x + 1; i < q.size(); ++i) for (i = x + 1; i < q.size(); ++i)
{ {
@ -246,7 +249,7 @@ void bwt_search(bwt_t *bwt, const string &q)
bwt_extend(bwt, &ik, ok, 0); bwt_extend(bwt, &ik, ok, 0);
ik = ok[c]; ik = ok[c];
ik.info = i + 1; ik.info = i + 1;
cout << "bwt-1: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; // cout << "bwt-1: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
} }
} }
} }
@ -284,14 +287,14 @@ void bwt_extend2(const bwt_t *bwt, bwtintv_t *ik, bwtintv_t ok[4], int is_back,
} }
// 每次扩展两步 // 每次扩展两步
void bwt_search2(bwt_t *bwt, const string &q) bwtintv_t bwt_search2(bwt_t *bwt, const string &q)
{ {
bwtintv_t ik, ok[4]; bwtintv_t ik, ok[4];
int i, j, c1, c2, x = 0; int i, j, c1, c2, x = 0;
bwt_set_intv(bwt, bval(q[x]), ik); bwt_set_intv(bwt, bval(q[x]), ik);
ik.info = x + 1; ik.info = x + 1;
cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; // cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
for (i = x + 1; i + 1 < q.size(); i += 2) for (i = x + 1; i + 1 < q.size(); i += 2)
{ {
@ -301,11 +304,13 @@ void bwt_search2(bwt_t *bwt, const string &q)
c1 = cbval(q[i]); c1 = cbval(q[i]);
c2 = cbval(q[i + 1]); c2 = cbval(q[i + 1]);
// double tm_t = realtime();
bwt_extend2(bwt, &ik, ok, 0, c1); bwt_extend2(bwt, &ik, ok, 0, c1);
// t7 += realtime() - tm_t;
ik = ok[c2]; ik = ok[c2];
ik.info = i + 1; ik.info = i + 1;
cout << "bwt-2: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; // cout << "bwt-2: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
} }
else else
{ {
@ -315,9 +320,13 @@ void bwt_search2(bwt_t *bwt, const string &q)
if (i < q.size() && bval(q[i]) < 4) if (i < q.size() && bval(q[i]) < 4)
{ // 最后一次扩展 { // 最后一次扩展
c1 = cbval(q[i]); c1 = cbval(q[i]);
// double tm_t = realtime();
bwt_extend(bwt, &ik, ok, 0); bwt_extend(bwt, &ik, ok, 0);
// t10 += realtime() - tm_t;
ik = ok[c1]; ik = ok[c1];
ik.info = i + 1; ik.info = i + 1;
cout << "bwt-2: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; // cout << "bwt-2: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
} }
// cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
return ik;
} }

4
bwt.h
View File

@ -7,7 +7,7 @@
using std::string; using std::string;
// occ间隔对应的右移位数5表示间隔32行保存一次 // occ间隔对应的右移位数5表示间隔32行保存一次
#define OCC_INTV_SHIFT 5 #define OCC_INTV_SHIFT 7
#define OCC_INTERVAL (1LL << OCC_INTV_SHIFT) #define OCC_INTERVAL (1LL << OCC_INTV_SHIFT)
#define OCC_INTV_MASK (OCC_INTERVAL - 1) #define OCC_INTV_MASK (OCC_INTERVAL - 1)
@ -63,6 +63,6 @@ void create_interval_occ_bwt(bwt_t *bwt);
// 利用bwt搜索seed完整搜索只需要单向搜索 // 利用bwt搜索seed完整搜索只需要单向搜索
void bwt_search(bwt_t *bwt, const string &q); void bwt_search(bwt_t *bwt, const string &q);
// 每次扩展两步 // 每次扩展两步
void bwt_search2(bwt_t *bwt, const string &q); bwtintv_t bwt_search2(bwt_t *bwt, const string &q);
#endif #endif

View File

@ -12,13 +12,18 @@
#include "fmt_index.h" #include "fmt_index.h"
using namespace std; using namespace std;
double t1 = 0, t2 = 0, t3 = 0, t4 = 0, t5 = 0, t6 = 0, t7 = 0, t8 = 0, t9 = 0, t10 = 0,
t11 = 0, t12 = 0, t13 = 0, t14 = 0;
long f1 = 0, f2 = 0, f3 = 0, f4 = 0, f5 = 0;
const char BASE[4] = {'A', 'C', 'G', 'T'}; const char BASE[4] = {'A', 'C', 'G', 'T'};
// 求反向互补序列 // 求反向互补序列
string calc_reverse_seq(string &seq) string calc_reverse_seq(string &seq)
{ {
string rseq(seq.size(), '0'); string rseq(seq.size(), '0');
for (int i = 0; i < seq.size(); ++i) for (size_t i = 0; i < seq.size(); ++i)
{ {
if (seq[i] == 'A') if (seq[i] == 'A')
rseq[i] = 'T'; rseq[i] = 'T';
@ -60,7 +65,7 @@ void create_bwt_mtx(string &seq)
bwtint_t seq_len = seq.size() + 1; bwtint_t seq_len = seq.size() + 1;
string sarr[seq_len]; string sarr[seq_len];
sarr[0] = seq + '$'; sarr[0] = seq + '$';
for (int i = 1; i < seq_len; ++i) for (bwtint_t i = 1; i < seq_len; ++i)
{ {
sarr[i] = sarr[0].substr(i) + sarr[0].substr(0, i); sarr[i] = sarr[0].substr(i) + sarr[0].substr(0, i);
} }
@ -148,7 +153,7 @@ FMTIndex *restore_fmt(const char *fn)
// 根据interval-bwt创建fmt-index // 根据interval-bwt创建fmt-index
FMTIndex *create_fmt_from_bwt(bwt_t *bwt) FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
{ {
FILE *fmt_out = fopen("fmt.txt", "w"); // FILE *fmt_out = fopen("fmt.txt", "w");
FMTIndex *fmt = (FMTIndex *)calloc(1, sizeof(FMTIndex)); FMTIndex *fmt = (FMTIndex *)calloc(1, sizeof(FMTIndex));
fmt_gen_cnt_table(fmt); fmt_gen_cnt_table(fmt);
@ -334,6 +339,7 @@ void fmt_e2_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, int b,
// fmt_e2_occ4(fmt, k, b, cntk1, cntk2); // fmt_e2_occ4(fmt, k, b, cntk1, cntk2);
// fmt_e2_occ4(fmt, l, b, cntl1, cntl2); // fmt_e2_occ4(fmt, l, b, cntl1, cntl2);
// return; // return;
//double tm_t = realtime();
bwtint_t _k, _l; bwtint_t _k, _l;
_k = k - (k >= fmt->primary); // 换算成了seq的行 _k = k - (k >= fmt->primary); // 换算成了seq的行
_l = l - (l >= fmt->primary); _l = l - (l >= fmt->primary);
@ -412,6 +418,8 @@ void fmt_e2_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, int b,
// cout << "fmt-occ: " << l << '\t' << cntl1[0] << '\t' << cntl1[1] << '\t' << cntl1[2] << '\t' << cntl1[3] << endl; // cout << "fmt-occ: " << l << '\t' << cntl1[0] << '\t' << cntl1[1] << '\t' << cntl1[2] << '\t' << cntl1[3] << endl;
// cout << "fmt-occ-2: " << l << '\t' << cntl2[0] << '\t' << cntl2[1] << '\t' << cntl2[2] << '\t' << cntl2[3] << endl; // cout << "fmt-occ-2: " << l << '\t' << cntl2[0] << '\t' << cntl2[1] << '\t' << cntl2[2] << '\t' << cntl2[3] << endl;
} }
//t6 += realtime() - tm_t;
// f2 += 1;
} }
// 扩展一个碱基计算bwt str中各个碱基的occ // 扩展一个碱基计算bwt str中各个碱基的occ
@ -445,6 +453,7 @@ void fmt_e1_occ4(const FMTIndex *fmt, bwtint_t k, uint32_t cnt[4])
// 对k行和l行同时计算bwt str的occ如果k和l落在同一个interval区间可以减少一些计算量和访存 // 对k行和l行同时计算bwt str的occ如果k和l落在同一个interval区间可以减少一些计算量和访存
void fmt_e1_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, uint32_t cntk[4], uint32_t cntl[4]) void fmt_e1_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, uint32_t cntk[4], uint32_t cntl[4])
{ {
//double tm_t = realtime();
bwtint_t _k, _l; bwtint_t _k, _l;
_k = k - (k >= fmt->primary); // 换算成了seq的行 _k = k - (k >= fmt->primary); // 换算成了seq的行
_l = l - (l >= fmt->primary); _l = l - (l >= fmt->primary);
@ -456,7 +465,7 @@ void fmt_e1_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, uint32_t cntk[4],
else else
{ {
uint32_t x1, y1; uint32_t x1, y1;
uint32_t *p, tmp, *endk, *endl; uint32_t *p, tmp, *ek, *el;
k -= (k >= fmt->primary); // because $ is not in bwt k -= (k >= fmt->primary); // because $ is not in bwt
l -= (l >= fmt->primary); l -= (l >= fmt->primary);
@ -465,21 +474,21 @@ void fmt_e1_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, uint32_t cntk[4],
memcpy(cntl, p, 4 * sizeof(uint32_t)); memcpy(cntl, p, 4 * sizeof(uint32_t));
p += 20; p += 20;
// prepare cntk[] // prepare cntk[]
endk = p + ((k >> 3) - ((k & ~OCC_INTV_MASK) >> 3)); ek = p + ((k >> 3) - ((k & ~OCC_INTV_MASK) >> 3));
endl = p + ((l >> 3) - ((l & ~OCC_INTV_MASK) >> 3)); el = p + ((l >> 3) - ((l & ~OCC_INTV_MASK) >> 3));
for (x1 = 0; p < endk; ++p) for (x1 = 0; p < ek; ++p)
{ {
x1 += __fmt_occ_e2_aux4(fmt, 4, *p); x1 += __fmt_occ_e2_aux4(fmt, 4, *p);
} }
y1 = x1; y1 = x1;
tmp = *p & ~((1U << ((~k & 7) << 2)) - 1); tmp = *p & ~((1U << ((~k & 7) << 2)) - 1);
x1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~k & 7); x1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~k & 7);
for (; p < endl; ++p) for (; p < el; ++p)
{ {
y1 += __fmt_occ_e2_aux4(fmt, 4, *p); y1 += __fmt_occ_e2_aux4(fmt, 4, *p);
} }
tmp = *p & ~((1U << ((~k & 7) << 2)) - 1); tmp = *p & ~((1U << ((~l & 7) << 2)) - 1);
y1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~k & 7); y1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~l & 7);
cntk[0] += x1 & 0xff; cntk[0] += x1 & 0xff;
cntk[1] += x1 >> 8 & 0xff; cntk[1] += x1 >> 8 & 0xff;
@ -491,6 +500,7 @@ void fmt_e1_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, uint32_t cntk[4],
cntl[2] += y1 >> 16 & 0xff; cntl[2] += y1 >> 16 & 0xff;
cntl[3] += y1 >> 24; cntl[3] += y1 >> 24;
} }
//t11 += realtime() - tm_t;
} }
// 扩展一个碱基 // 扩展一个碱基
@ -499,8 +509,8 @@ void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac
uint32_t tk[4], tl[4]; uint32_t tk[4], tl[4];
int i; int i;
fmt_e1_occ4(fmt, ik->x[!is_back] - 1, tk); // fmt_e1_occ4(fmt, ik->x[!is_back] - 1, tk);
fmt_e1_occ4(fmt, ik->x[!is_back] - 1 + ik->x[2], tl); // fmt_e1_occ4(fmt, ik->x[!is_back] - 1 + ik->x[2], tl);
fmt_e1_2occ4(fmt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); fmt_e1_2occ4(fmt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl);
for (i = 0; i != 4; ++i) for (i = 0; i != 4; ++i)
{ {
@ -508,8 +518,9 @@ void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac
ok[i].x[2] = tl[i] - tk[i]; // 间隔 ok[i].x[2] = tl[i] - tk[i]; // 间隔
} }
ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary); ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary);
for (i = 2; i >= b1; --i) ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2];
ok[i].x[is_back] = ok[i + 1].x[is_back] + ok[i + 1].x[2]; ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2];
ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2];
*ik = ok[b1]; *ik = ok[b1];
} }
@ -519,22 +530,18 @@ void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac
uint32_t tk1[4], tl1[4], tk2[4], tl2[4]; uint32_t tk1[4], tl1[4], tk2[4], tl2[4];
int i; int i;
// fmt_e2_occ4(fmt, ik->x[!is_back] - 1, b1, tk1, tk2);
// fmt_e2_occ4(fmt, ik->x[!is_back] - 1 + ik->x[2], b1, tl1, tl2);
// tk表示在k行之前所有各个碱基累积出现次数tl表示在l行之前的累积 // tk表示在k行之前所有各个碱基累积出现次数tl表示在l行之前的累积
fmt_e2_2occ4(fmt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], b1, tk1, tk2, tl1, tl2); fmt_e2_2occ4(fmt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], b1, tk1, tk2, tl1, tl2);
// cout << "k: " << tk1[0] << '\t' << tk1[1] << '\t' << tk1[2] << '\t' << tk1[3] << endl;
// cout << "l: " << tl1[0] << '\t' << tl1[1] << '\t' << tl1[2] << '\t' << tl1[3] << endl;
// cout << "k: " << tk2[0] << '\t' << tk2[1] << '\t' << tk2[2] << '\t' << tk2[3] << endl;
// cout << "l: " << tl2[0] << '\t' << tl2[1] << '\t' << tl2[2] << '\t' << tl2[3] << endl;
// 这里是反向扩展 // 这里是反向扩展
for (i = 0; i != 4; ++i) for (i = 0; i != 4; ++i)
{ {
ok[i].x[!is_back] = fmt->L2[i] + 1 + tk2[i]; // 起始行位置,互补链 ok[i].x[!is_back] = fmt->L2[i] + 1 + tk2[i]; // 起始行位置,互补链
ok[i].x[2] = tl2[i] - tk2[i]; // 间隔 ok[i].x[2] = tl2[i] - tk2[i]; // 间隔
} }
ok[b2].x[!is_back] = fmt->L2[b2] + 1 + tk2[b2];
ok[b2].x[2] = tl2[b2] - tk2[b2];
// 因为计算的是互补碱基所以3对应着0,2对应1下边是正向扩展 // 因为计算的是互补碱基所以3对应着0,2对应1下边是正向扩展
ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary); ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary);
@ -542,7 +549,7 @@ void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac
ok[1].x[is_back] = ok[2].x[is_back] + tl1[2] - tk1[2]; ok[1].x[is_back] = ok[2].x[is_back] + tl1[2] - tk1[2];
ok[0].x[is_back] = ok[1].x[is_back] + tl1[1] - tk1[1]; ok[0].x[is_back] = ok[1].x[is_back] + tl1[1] - tk1[1];
cout << "fmt-d: " << BASE[b1] << '\t' << ok[b1].x[is_back] << '\t' << ok[b1].x[2] << endl; // cout << "fmt-d: " << BASE[b1] << '\t' << ok[b1].x[is_back] << '\t' << ok[b1].x[2] << endl;
ok[3].x[is_back] = ok[b1].x[is_back] + (ok[b1].x[!is_back] <= fmt->primary && ok[b1].x[!is_back] + ok[b1].x[2] - 1 >= fmt->primary); ok[3].x[is_back] = ok[b1].x[is_back] + (ok[b1].x[!is_back] <= fmt->primary && ok[b1].x[!is_back] + ok[b1].x[2] - 1 >= fmt->primary);
ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2]; ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2];
@ -552,39 +559,45 @@ void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac
*ik = ok[b2]; *ik = ok[b2];
} }
// 利用fmt搜索seed完整搜索只需要单向搜索 // 利用fmt搜索seed完整搜索只需要单向搜索
void fmt_search(FMTIndex *fmt, const string &q) bwtintv_t fmt_search(FMTIndex *fmt, const string &q)
{ {
bwtintv_t ik, ok[4]; bwtintv_t ik, ok[4];
int i, j, c1, c2, x = 0; int i, c1, c2, x = 0;
int qlen = (int)q.size();
fmt_set_intv(fmt, bval(q[x]), ik); fmt_set_intv(fmt, bval(q[x]), ik);
ik.info = x + 1; ik.info = x + 1;
cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; // cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
for (i = x + 1; i + 1 < q.size(); i += 2) for (i = x + 1; i + 1 < qlen; i += 2)
{ {
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4) if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
{ {
c1 = cbval(q[i]); c1 = cbval(q[i]);
c2 = cbval(q[i + 1]); c2 = cbval(q[i + 1]);
//double tm_t = realtime();
fmt_extend2(fmt, &ik, ok, 0, c1, c2); fmt_extend2(fmt, &ik, ok, 0, c1, c2);
//t8 += realtime() - tm_t;
ik.info = i + 1; ik.info = i + 1;
cout << "fmt : " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; //cout << "fmt : " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
} }
else else
{ {
break; break;
} }
} }
if (i < q.size() && bval(q[i]) < 4) if (i < qlen && bval(q[i]) < 4)
{ // 最后一次扩展 { // 最后一次扩展
c1 = cbval(q[i]); c1 = cbval(q[i]);
//double tm_t = realtime();
fmt_extend1(fmt, &ik, ok, 0, c1); fmt_extend1(fmt, &ik, ok, 0, c1);
//t9 += realtime() - tm_t;
ik.info = i + 1; ik.info = i + 1;
cout << "fmt : " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl; // cout << "fmt : " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
} }
// cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
return ik;
} }
int main_fmtidx(int argc, char **argv) int main_fmtidx(int argc, char **argv)
@ -596,37 +609,67 @@ int main_fmtidx(int argc, char **argv)
//create_bwt_mtx(seq); //create_bwt_mtx(seq);
//cout << seq << endl; //cout << seq << endl;
bwt_t *bwt = restore_bwt(argv[1]); // 读取bwt原始字符串带ACGT总的累积量 string bwt_idx = string(argv[1]) + ".bwt";
// create_interval_occ_bwt(bwt); // 根据bwt字符串创建包含interval occ的bwt128碱基+ACGT累积量 string fmt_idx = string(argv[1]) + ".fmt";
cout << "L2: " << bwt->L2[0] << '\t' << bwt->L2[1] << '\t' << bwt->L2[2] << '\t'
<< bwt->L2[3] << '\t' << bwt->L2[4] << endl;
string s = "AACCCTAA"; bwt_t *bwt = restore_bwt(bwt_idx.c_str());
FMTIndex *fmt = restore_fmt(fmt_idx.c_str());
vector<string> seed_arr(10000000);
srand(time(NULL)); srand(time(NULL));
s = generate_rand_seq(10);
cout << "seq: " << s << endl;
// s = "TTC";
bwt_search(bwt, s); t1 = realtime();
bwt_search2(bwt, s); for (int i = 0; i < (int)seed_arr.size(); ++i)
seed_arr[i] = generate_rand_seq(13);
t1 = realtime() - t1;
cout << "[time gen seed:] " << t1 << "s" << endl;
// t2 = realtime();
// for (int i = 0; i < seed_arr.size(); ++i)
// bwt_search(bwt, seed_arr[i]);
// t2 = realtime() - t2;
// cout << "[time bwt search:] " << t2 << "s" << endl;
t4 = realtime();
for (int i = 0; i < (int)seed_arr.size(); ++i)
{
// bwtintv_t p1 =
bwt_search2(bwt, seed_arr[i]);
// bwtintv_t p2 = fmt_search(fmt, seed_arr[i]);
// if (p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2] || p1.x[3] != p2.x[3])
// cout << seed_arr[i] << endl
// << p1.x[1] << ' ' << p1.x[2] << ' ' << p1.x[3] << endl
// << p2.x[1] << ' ' << p2.x[2] << ' ' << p2.x[3] << endl;
}
t4 = realtime() - t4;
cout << "[time bwt search 2:] " << t4 << "s" << endl;
t3 = realtime();
for (int i = 0; i < (int)seed_arr.size(); ++i)
fmt_search(fmt, seed_arr[i]);
t3 = realtime() - t3;
cout << "[time fmt search:] " << t3 << "s" << endl;
cout << "bwt occ: " << t5 << "s; " << t7 << '\t' << t10 << endl;
cout << "fmt occ: " << t6 << "s; " << t11 << '\t' << t8 << '\t' << t9 << endl;
// bwt_search(bwt, s);
// bwt_search2(bwt, s);
// for (int i = 0; i < 120; ++i) // for (int i = 0; i < 120; ++i)
// { // {
// cout << i << '\t' << bwt_B0(bwt, i) << endl; // cout << i << '\t' << bwt_B0(bwt, i) << endl;
// } // }
// TGGGAT // TGGGAT
FMTIndex *fmt = create_fmt_from_bwt(bwt); // FMTIndex *fmt = create_fmt_from_bwt(bwt);
dump_fmt("ref.fmt", fmt); // dump_fmt("ref.fmt", fmt);
// FMTIndex *fmt = restore_fmt("tiny.fmt"); // FMTIndex *fmt = restore_fmt("tiny.fmt");
fmt_search(fmt, s); //fmt_search(fmt, s);
// cout << bwt->bwt_size << endl; // cout << bwt->bwt_size << endl;
// cout << bwt->seq_len << endl; // cout << bwt->seq_len << endl;
cout << "sec_: " << fmt->sec_bcp << '\t' << fmt->sec_primary << endl; //cout << "sec_: " << fmt->sec_bcp << '\t' << fmt->sec_primary << endl;
uint8_t b8 = 2 << 4 | 2; //uint8_t b8 = 2 << 4 | 2;
cout << "AGAG: " << fmt->cnt_table[2][b8] << endl; //cout << "AGAG: " << fmt->cnt_table[2][b8] << endl;
cout << (((b8 >> 6) == 0 && (b8 >> 4 & 3) == 2) + ((b8 >> 2 & 3) == 0 && (b8 & 3) == 2)) << endl; //cout << (((b8 >> 6) == 0 && (b8 >> 4 & 3) == 2) + ((b8 >> 2 & 3) == 0 && (b8 & 3) == 2)) << endl;
return 0; return 0;
} }

3
util.h
View File

@ -4,6 +4,9 @@
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
extern double t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
extern long f1, f2, f3, f4, f5;
typedef uint64_t bwtint_t; typedef uint64_t bwtint_t;
#define xassert(cond, msg) \ #define xassert(cond, msg) \