结果正确性没有问题,正在测试速度,稍快一些
This commit is contained in:
parent
c69cb901bb
commit
22b75d335b
|
|
@ -0,0 +1,18 @@
|
||||||
|
{
|
||||||
|
// 使用 IntelliSense 了解相关属性。
|
||||||
|
// 悬停以查看现有属性的描述。
|
||||||
|
// 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387
|
||||||
|
"version": "0.2.0",
|
||||||
|
"configurations": [
|
||||||
|
{
|
||||||
|
"name": "fmtidx",
|
||||||
|
"type": "cppdbg",
|
||||||
|
"request": "launch",
|
||||||
|
"program": "${workspaceRoot}/fmtidx",
|
||||||
|
"args": [
|
||||||
|
"/home/zzh/data/reference/human_g1k_v37_decoy.fasta.bwt"
|
||||||
|
],
|
||||||
|
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
3
Makefile
3
Makefile
|
|
@ -1,6 +1,7 @@
|
||||||
CC= g++
|
CC= g++
|
||||||
NOWARN= -Wno-unused-result -Wno-unused-function
|
NOWARN= -Wno-unused-result -Wno-unused-function
|
||||||
CFLAGS= -g -Wall $(NOWARN) -O2
|
CFLAGS= #-g -Wall $(NOWARN) #-O2
|
||||||
|
CPPFLAGS= -g -Wall $(NOWARN) -O2
|
||||||
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
||||||
SHOW_PERF= -DSHOW_PERF
|
SHOW_PERF= -DSHOW_PERF
|
||||||
AR= ar
|
AR= ar
|
||||||
|
|
|
||||||
23
bwt.cpp
23
bwt.cpp
|
|
@ -10,7 +10,7 @@
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
void bwt_dump_bwt(const char *fn, const bwt_t *bwt)
|
void dump_bwt(const char *fn, const bwt_t *bwt)
|
||||||
{
|
{
|
||||||
FILE *fp;
|
FILE *fp;
|
||||||
fp = xopen(fn, "wb");
|
fp = xopen(fn, "wb");
|
||||||
|
|
@ -166,6 +166,7 @@ void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
|
||||||
// an analogy to bwt_occ4() but more efficient, requiring k <= l
|
// an analogy to bwt_occ4() but more efficient, requiring k <= l
|
||||||
void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
|
void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
|
||||||
{
|
{
|
||||||
|
// double tm_t = realtime();
|
||||||
bwtint_t _k, _l;
|
bwtint_t _k, _l;
|
||||||
_k = k - (k >= bwt->primary);
|
_k = k - (k >= bwt->primary);
|
||||||
_l = l - (l >= bwt->primary);
|
_l = l - (l >= bwt->primary);
|
||||||
|
|
@ -206,6 +207,8 @@ void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtin
|
||||||
cntl[2] += y >> 16 & 0xff;
|
cntl[2] += y >> 16 & 0xff;
|
||||||
cntl[3] += y >> 24;
|
cntl[3] += y >> 24;
|
||||||
}
|
}
|
||||||
|
// t5 += realtime() - tm_t;
|
||||||
|
// f1 += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back)
|
void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back)
|
||||||
|
|
@ -236,7 +239,7 @@ void bwt_search(bwt_t *bwt, const string &q)
|
||||||
|
|
||||||
bwt_set_intv(bwt, bval(q[x]), ik);
|
bwt_set_intv(bwt, bval(q[x]), ik);
|
||||||
ik.info = x + 1;
|
ik.info = x + 1;
|
||||||
cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
|
// cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
|
||||||
|
|
||||||
for (i = x + 1; i < q.size(); ++i)
|
for (i = x + 1; i < q.size(); ++i)
|
||||||
{
|
{
|
||||||
|
|
@ -246,7 +249,7 @@ void bwt_search(bwt_t *bwt, const string &q)
|
||||||
bwt_extend(bwt, &ik, ok, 0);
|
bwt_extend(bwt, &ik, ok, 0);
|
||||||
ik = ok[c];
|
ik = ok[c];
|
||||||
ik.info = i + 1;
|
ik.info = i + 1;
|
||||||
cout << "bwt-1: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
|
// cout << "bwt-1: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -284,14 +287,14 @@ void bwt_extend2(const bwt_t *bwt, bwtintv_t *ik, bwtintv_t ok[4], int is_back,
|
||||||
}
|
}
|
||||||
|
|
||||||
// 每次扩展两步
|
// 每次扩展两步
|
||||||
void bwt_search2(bwt_t *bwt, const string &q)
|
bwtintv_t bwt_search2(bwt_t *bwt, const string &q)
|
||||||
{
|
{
|
||||||
bwtintv_t ik, ok[4];
|
bwtintv_t ik, ok[4];
|
||||||
int i, j, c1, c2, x = 0;
|
int i, j, c1, c2, x = 0;
|
||||||
|
|
||||||
bwt_set_intv(bwt, bval(q[x]), ik);
|
bwt_set_intv(bwt, bval(q[x]), ik);
|
||||||
ik.info = x + 1;
|
ik.info = x + 1;
|
||||||
cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
|
// cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
|
||||||
|
|
||||||
for (i = x + 1; i + 1 < q.size(); i += 2)
|
for (i = x + 1; i + 1 < q.size(); i += 2)
|
||||||
{
|
{
|
||||||
|
|
@ -301,11 +304,13 @@ void bwt_search2(bwt_t *bwt, const string &q)
|
||||||
c1 = cbval(q[i]);
|
c1 = cbval(q[i]);
|
||||||
c2 = cbval(q[i + 1]);
|
c2 = cbval(q[i + 1]);
|
||||||
|
|
||||||
|
// double tm_t = realtime();
|
||||||
bwt_extend2(bwt, &ik, ok, 0, c1);
|
bwt_extend2(bwt, &ik, ok, 0, c1);
|
||||||
|
// t7 += realtime() - tm_t;
|
||||||
|
|
||||||
ik = ok[c2];
|
ik = ok[c2];
|
||||||
ik.info = i + 1;
|
ik.info = i + 1;
|
||||||
cout << "bwt-2: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
|
// cout << "bwt-2: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
|
@ -315,9 +320,13 @@ void bwt_search2(bwt_t *bwt, const string &q)
|
||||||
if (i < q.size() && bval(q[i]) < 4)
|
if (i < q.size() && bval(q[i]) < 4)
|
||||||
{ // 最后一次扩展
|
{ // 最后一次扩展
|
||||||
c1 = cbval(q[i]);
|
c1 = cbval(q[i]);
|
||||||
|
// double tm_t = realtime();
|
||||||
bwt_extend(bwt, &ik, ok, 0);
|
bwt_extend(bwt, &ik, ok, 0);
|
||||||
|
// t10 += realtime() - tm_t;
|
||||||
ik = ok[c1];
|
ik = ok[c1];
|
||||||
ik.info = i + 1;
|
ik.info = i + 1;
|
||||||
cout << "bwt-2: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
|
// cout << "bwt-2: " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
|
||||||
}
|
}
|
||||||
|
// cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
|
||||||
|
return ik;
|
||||||
}
|
}
|
||||||
4
bwt.h
4
bwt.h
|
|
@ -7,7 +7,7 @@
|
||||||
using std::string;
|
using std::string;
|
||||||
|
|
||||||
// occ间隔对应的右移位数,5表示间隔32行保存一次
|
// occ间隔对应的右移位数,5表示间隔32行保存一次
|
||||||
#define OCC_INTV_SHIFT 5
|
#define OCC_INTV_SHIFT 7
|
||||||
#define OCC_INTERVAL (1LL << OCC_INTV_SHIFT)
|
#define OCC_INTERVAL (1LL << OCC_INTV_SHIFT)
|
||||||
#define OCC_INTV_MASK (OCC_INTERVAL - 1)
|
#define OCC_INTV_MASK (OCC_INTERVAL - 1)
|
||||||
|
|
||||||
|
|
@ -63,6 +63,6 @@ void create_interval_occ_bwt(bwt_t *bwt);
|
||||||
// 利用bwt搜索seed,完整搜索,只需要单向搜索
|
// 利用bwt搜索seed,完整搜索,只需要单向搜索
|
||||||
void bwt_search(bwt_t *bwt, const string &q);
|
void bwt_search(bwt_t *bwt, const string &q);
|
||||||
// 每次扩展两步
|
// 每次扩展两步
|
||||||
void bwt_search2(bwt_t *bwt, const string &q);
|
bwtintv_t bwt_search2(bwt_t *bwt, const string &q);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
137
fmt_index.cpp
137
fmt_index.cpp
|
|
@ -12,13 +12,18 @@
|
||||||
#include "fmt_index.h"
|
#include "fmt_index.h"
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
|
double t1 = 0, t2 = 0, t3 = 0, t4 = 0, t5 = 0, t6 = 0, t7 = 0, t8 = 0, t9 = 0, t10 = 0,
|
||||||
|
t11 = 0, t12 = 0, t13 = 0, t14 = 0;
|
||||||
|
|
||||||
|
long f1 = 0, f2 = 0, f3 = 0, f4 = 0, f5 = 0;
|
||||||
|
|
||||||
const char BASE[4] = {'A', 'C', 'G', 'T'};
|
const char BASE[4] = {'A', 'C', 'G', 'T'};
|
||||||
|
|
||||||
// 求反向互补序列
|
// 求反向互补序列
|
||||||
string calc_reverse_seq(string &seq)
|
string calc_reverse_seq(string &seq)
|
||||||
{
|
{
|
||||||
string rseq(seq.size(), '0');
|
string rseq(seq.size(), '0');
|
||||||
for (int i = 0; i < seq.size(); ++i)
|
for (size_t i = 0; i < seq.size(); ++i)
|
||||||
{
|
{
|
||||||
if (seq[i] == 'A')
|
if (seq[i] == 'A')
|
||||||
rseq[i] = 'T';
|
rseq[i] = 'T';
|
||||||
|
|
@ -60,7 +65,7 @@ void create_bwt_mtx(string &seq)
|
||||||
bwtint_t seq_len = seq.size() + 1;
|
bwtint_t seq_len = seq.size() + 1;
|
||||||
string sarr[seq_len];
|
string sarr[seq_len];
|
||||||
sarr[0] = seq + '$';
|
sarr[0] = seq + '$';
|
||||||
for (int i = 1; i < seq_len; ++i)
|
for (bwtint_t i = 1; i < seq_len; ++i)
|
||||||
{
|
{
|
||||||
sarr[i] = sarr[0].substr(i) + sarr[0].substr(0, i);
|
sarr[i] = sarr[0].substr(i) + sarr[0].substr(0, i);
|
||||||
}
|
}
|
||||||
|
|
@ -148,7 +153,7 @@ FMTIndex *restore_fmt(const char *fn)
|
||||||
// 根据interval-bwt创建fmt-index
|
// 根据interval-bwt创建fmt-index
|
||||||
FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
|
FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
|
||||||
{
|
{
|
||||||
FILE *fmt_out = fopen("fmt.txt", "w");
|
// FILE *fmt_out = fopen("fmt.txt", "w");
|
||||||
FMTIndex *fmt = (FMTIndex *)calloc(1, sizeof(FMTIndex));
|
FMTIndex *fmt = (FMTIndex *)calloc(1, sizeof(FMTIndex));
|
||||||
fmt_gen_cnt_table(fmt);
|
fmt_gen_cnt_table(fmt);
|
||||||
|
|
||||||
|
|
@ -334,6 +339,7 @@ void fmt_e2_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, int b,
|
||||||
// fmt_e2_occ4(fmt, k, b, cntk1, cntk2);
|
// fmt_e2_occ4(fmt, k, b, cntk1, cntk2);
|
||||||
// fmt_e2_occ4(fmt, l, b, cntl1, cntl2);
|
// fmt_e2_occ4(fmt, l, b, cntl1, cntl2);
|
||||||
// return;
|
// return;
|
||||||
|
//double tm_t = realtime();
|
||||||
bwtint_t _k, _l;
|
bwtint_t _k, _l;
|
||||||
_k = k - (k >= fmt->primary); // 换算成了seq的行
|
_k = k - (k >= fmt->primary); // 换算成了seq的行
|
||||||
_l = l - (l >= fmt->primary);
|
_l = l - (l >= fmt->primary);
|
||||||
|
|
@ -412,6 +418,8 @@ void fmt_e2_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, int b,
|
||||||
// cout << "fmt-occ: " << l << '\t' << cntl1[0] << '\t' << cntl1[1] << '\t' << cntl1[2] << '\t' << cntl1[3] << endl;
|
// cout << "fmt-occ: " << l << '\t' << cntl1[0] << '\t' << cntl1[1] << '\t' << cntl1[2] << '\t' << cntl1[3] << endl;
|
||||||
// cout << "fmt-occ-2: " << l << '\t' << cntl2[0] << '\t' << cntl2[1] << '\t' << cntl2[2] << '\t' << cntl2[3] << endl;
|
// cout << "fmt-occ-2: " << l << '\t' << cntl2[0] << '\t' << cntl2[1] << '\t' << cntl2[2] << '\t' << cntl2[3] << endl;
|
||||||
}
|
}
|
||||||
|
//t6 += realtime() - tm_t;
|
||||||
|
// f2 += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 扩展一个碱基,计算bwt str中各个碱基的occ
|
// 扩展一个碱基,计算bwt str中各个碱基的occ
|
||||||
|
|
@ -445,6 +453,7 @@ void fmt_e1_occ4(const FMTIndex *fmt, bwtint_t k, uint32_t cnt[4])
|
||||||
// 对k行和l行同时计算bwt str的occ,如果k和l落在同一个interval区间,可以减少一些计算量和访存
|
// 对k行和l行同时计算bwt str的occ,如果k和l落在同一个interval区间,可以减少一些计算量和访存
|
||||||
void fmt_e1_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, uint32_t cntk[4], uint32_t cntl[4])
|
void fmt_e1_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, uint32_t cntk[4], uint32_t cntl[4])
|
||||||
{
|
{
|
||||||
|
//double tm_t = realtime();
|
||||||
bwtint_t _k, _l;
|
bwtint_t _k, _l;
|
||||||
_k = k - (k >= fmt->primary); // 换算成了seq的行
|
_k = k - (k >= fmt->primary); // 换算成了seq的行
|
||||||
_l = l - (l >= fmt->primary);
|
_l = l - (l >= fmt->primary);
|
||||||
|
|
@ -456,7 +465,7 @@ void fmt_e1_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, uint32_t cntk[4],
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
uint32_t x1, y1;
|
uint32_t x1, y1;
|
||||||
uint32_t *p, tmp, *endk, *endl;
|
uint32_t *p, tmp, *ek, *el;
|
||||||
|
|
||||||
k -= (k >= fmt->primary); // because $ is not in bwt
|
k -= (k >= fmt->primary); // because $ is not in bwt
|
||||||
l -= (l >= fmt->primary);
|
l -= (l >= fmt->primary);
|
||||||
|
|
@ -465,21 +474,21 @@ void fmt_e1_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, uint32_t cntk[4],
|
||||||
memcpy(cntl, p, 4 * sizeof(uint32_t));
|
memcpy(cntl, p, 4 * sizeof(uint32_t));
|
||||||
p += 20;
|
p += 20;
|
||||||
// prepare cntk[]
|
// prepare cntk[]
|
||||||
endk = p + ((k >> 3) - ((k & ~OCC_INTV_MASK) >> 3));
|
ek = p + ((k >> 3) - ((k & ~OCC_INTV_MASK) >> 3));
|
||||||
endl = p + ((l >> 3) - ((l & ~OCC_INTV_MASK) >> 3));
|
el = p + ((l >> 3) - ((l & ~OCC_INTV_MASK) >> 3));
|
||||||
for (x1 = 0; p < endk; ++p)
|
for (x1 = 0; p < ek; ++p)
|
||||||
{
|
{
|
||||||
x1 += __fmt_occ_e2_aux4(fmt, 4, *p);
|
x1 += __fmt_occ_e2_aux4(fmt, 4, *p);
|
||||||
}
|
}
|
||||||
y1 = x1;
|
y1 = x1;
|
||||||
tmp = *p & ~((1U << ((~k & 7) << 2)) - 1);
|
tmp = *p & ~((1U << ((~k & 7) << 2)) - 1);
|
||||||
x1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~k & 7);
|
x1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~k & 7);
|
||||||
for (; p < endl; ++p)
|
for (; p < el; ++p)
|
||||||
{
|
{
|
||||||
y1 += __fmt_occ_e2_aux4(fmt, 4, *p);
|
y1 += __fmt_occ_e2_aux4(fmt, 4, *p);
|
||||||
}
|
}
|
||||||
tmp = *p & ~((1U << ((~k & 7) << 2)) - 1);
|
tmp = *p & ~((1U << ((~l & 7) << 2)) - 1);
|
||||||
y1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~k & 7);
|
y1 += __fmt_occ_e2_aux4(fmt, 4, tmp) - (~l & 7);
|
||||||
|
|
||||||
cntk[0] += x1 & 0xff;
|
cntk[0] += x1 & 0xff;
|
||||||
cntk[1] += x1 >> 8 & 0xff;
|
cntk[1] += x1 >> 8 & 0xff;
|
||||||
|
|
@ -491,6 +500,7 @@ void fmt_e1_2occ4(const FMTIndex *fmt, bwtint_t k, bwtint_t l, uint32_t cntk[4],
|
||||||
cntl[2] += y1 >> 16 & 0xff;
|
cntl[2] += y1 >> 16 & 0xff;
|
||||||
cntl[3] += y1 >> 24;
|
cntl[3] += y1 >> 24;
|
||||||
}
|
}
|
||||||
|
//t11 += realtime() - tm_t;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 扩展一个碱基
|
// 扩展一个碱基
|
||||||
|
|
@ -499,8 +509,8 @@ void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac
|
||||||
uint32_t tk[4], tl[4];
|
uint32_t tk[4], tl[4];
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
fmt_e1_occ4(fmt, ik->x[!is_back] - 1, tk);
|
// fmt_e1_occ4(fmt, ik->x[!is_back] - 1, tk);
|
||||||
fmt_e1_occ4(fmt, ik->x[!is_back] - 1 + ik->x[2], tl);
|
// fmt_e1_occ4(fmt, ik->x[!is_back] - 1 + ik->x[2], tl);
|
||||||
fmt_e1_2occ4(fmt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl);
|
fmt_e1_2occ4(fmt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl);
|
||||||
for (i = 0; i != 4; ++i)
|
for (i = 0; i != 4; ++i)
|
||||||
{
|
{
|
||||||
|
|
@ -508,8 +518,9 @@ void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac
|
||||||
ok[i].x[2] = tl[i] - tk[i]; // 间隔
|
ok[i].x[2] = tl[i] - tk[i]; // 间隔
|
||||||
}
|
}
|
||||||
ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary);
|
ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary);
|
||||||
for (i = 2; i >= b1; --i)
|
ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2];
|
||||||
ok[i].x[is_back] = ok[i + 1].x[is_back] + ok[i + 1].x[2];
|
ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2];
|
||||||
|
ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2];
|
||||||
*ik = ok[b1];
|
*ik = ok[b1];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -519,22 +530,18 @@ void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac
|
||||||
uint32_t tk1[4], tl1[4], tk2[4], tl2[4];
|
uint32_t tk1[4], tl1[4], tk2[4], tl2[4];
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
// fmt_e2_occ4(fmt, ik->x[!is_back] - 1, b1, tk1, tk2);
|
|
||||||
// fmt_e2_occ4(fmt, ik->x[!is_back] - 1 + ik->x[2], b1, tl1, tl2);
|
|
||||||
// tk表示在k行之前所有各个碱基累积出现次数,tl表示在l行之前的累积
|
// tk表示在k行之前所有各个碱基累积出现次数,tl表示在l行之前的累积
|
||||||
fmt_e2_2occ4(fmt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], b1, tk1, tk2, tl1, tl2);
|
fmt_e2_2occ4(fmt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], b1, tk1, tk2, tl1, tl2);
|
||||||
|
|
||||||
// cout << "k: " << tk1[0] << '\t' << tk1[1] << '\t' << tk1[2] << '\t' << tk1[3] << endl;
|
|
||||||
// cout << "l: " << tl1[0] << '\t' << tl1[1] << '\t' << tl1[2] << '\t' << tl1[3] << endl;
|
|
||||||
// cout << "k: " << tk2[0] << '\t' << tk2[1] << '\t' << tk2[2] << '\t' << tk2[3] << endl;
|
|
||||||
// cout << "l: " << tl2[0] << '\t' << tl2[1] << '\t' << tl2[2] << '\t' << tl2[3] << endl;
|
|
||||||
|
|
||||||
// 这里是反向扩展
|
// 这里是反向扩展
|
||||||
for (i = 0; i != 4; ++i)
|
for (i = 0; i != 4; ++i)
|
||||||
{
|
{
|
||||||
ok[i].x[!is_back] = fmt->L2[i] + 1 + tk2[i]; // 起始行位置,互补链
|
ok[i].x[!is_back] = fmt->L2[i] + 1 + tk2[i]; // 起始行位置,互补链
|
||||||
ok[i].x[2] = tl2[i] - tk2[i]; // 间隔
|
ok[i].x[2] = tl2[i] - tk2[i]; // 间隔
|
||||||
}
|
}
|
||||||
|
ok[b2].x[!is_back] = fmt->L2[b2] + 1 + tk2[b2];
|
||||||
|
ok[b2].x[2] = tl2[b2] - tk2[b2];
|
||||||
|
|
||||||
// 因为计算的是互补碱基,所以3对应着0,2对应1,下边是正向扩展
|
// 因为计算的是互补碱基,所以3对应着0,2对应1,下边是正向扩展
|
||||||
|
|
||||||
ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary);
|
ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary);
|
||||||
|
|
@ -542,7 +549,7 @@ void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac
|
||||||
ok[1].x[is_back] = ok[2].x[is_back] + tl1[2] - tk1[2];
|
ok[1].x[is_back] = ok[2].x[is_back] + tl1[2] - tk1[2];
|
||||||
ok[0].x[is_back] = ok[1].x[is_back] + tl1[1] - tk1[1];
|
ok[0].x[is_back] = ok[1].x[is_back] + tl1[1] - tk1[1];
|
||||||
|
|
||||||
cout << "fmt-d: " << BASE[b1] << '\t' << ok[b1].x[is_back] << '\t' << ok[b1].x[2] << endl;
|
// cout << "fmt-d: " << BASE[b1] << '\t' << ok[b1].x[is_back] << '\t' << ok[b1].x[2] << endl;
|
||||||
|
|
||||||
ok[3].x[is_back] = ok[b1].x[is_back] + (ok[b1].x[!is_back] <= fmt->primary && ok[b1].x[!is_back] + ok[b1].x[2] - 1 >= fmt->primary);
|
ok[3].x[is_back] = ok[b1].x[is_back] + (ok[b1].x[!is_back] <= fmt->primary && ok[b1].x[!is_back] + ok[b1].x[2] - 1 >= fmt->primary);
|
||||||
ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2];
|
ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2];
|
||||||
|
|
@ -552,39 +559,45 @@ void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t ok[4], int is_bac
|
||||||
*ik = ok[b2];
|
*ik = ok[b2];
|
||||||
}
|
}
|
||||||
// 利用fmt搜索seed,完整搜索,只需要单向搜索
|
// 利用fmt搜索seed,完整搜索,只需要单向搜索
|
||||||
void fmt_search(FMTIndex *fmt, const string &q)
|
bwtintv_t fmt_search(FMTIndex *fmt, const string &q)
|
||||||
{
|
{
|
||||||
bwtintv_t ik, ok[4];
|
bwtintv_t ik, ok[4];
|
||||||
int i, j, c1, c2, x = 0;
|
int i, c1, c2, x = 0;
|
||||||
|
int qlen = (int)q.size();
|
||||||
|
|
||||||
fmt_set_intv(fmt, bval(q[x]), ik);
|
fmt_set_intv(fmt, bval(q[x]), ik);
|
||||||
ik.info = x + 1;
|
ik.info = x + 1;
|
||||||
cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
|
// cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
|
||||||
|
|
||||||
for (i = x + 1; i + 1 < q.size(); i += 2)
|
for (i = x + 1; i + 1 < qlen; i += 2)
|
||||||
{
|
{
|
||||||
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
|
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
|
||||||
{
|
{
|
||||||
|
|
||||||
c1 = cbval(q[i]);
|
c1 = cbval(q[i]);
|
||||||
c2 = cbval(q[i + 1]);
|
c2 = cbval(q[i + 1]);
|
||||||
|
//double tm_t = realtime();
|
||||||
fmt_extend2(fmt, &ik, ok, 0, c1, c2);
|
fmt_extend2(fmt, &ik, ok, 0, c1, c2);
|
||||||
|
//t8 += realtime() - tm_t;
|
||||||
ik.info = i + 1;
|
ik.info = i + 1;
|
||||||
cout << "fmt : " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
|
//cout << "fmt : " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (i < q.size() && bval(q[i]) < 4)
|
if (i < qlen && bval(q[i]) < 4)
|
||||||
{ // 最后一次扩展
|
{ // 最后一次扩展
|
||||||
c1 = cbval(q[i]);
|
c1 = cbval(q[i]);
|
||||||
|
//double tm_t = realtime();
|
||||||
fmt_extend1(fmt, &ik, ok, 0, c1);
|
fmt_extend1(fmt, &ik, ok, 0, c1);
|
||||||
|
//t9 += realtime() - tm_t;
|
||||||
ik.info = i + 1;
|
ik.info = i + 1;
|
||||||
cout << "fmt : " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
|
// cout << "fmt : " << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
|
||||||
}
|
}
|
||||||
|
// cout << ik.x[0] << '\t' << ik.x[1] << '\t' << ik.x[2] << endl;
|
||||||
|
return ik;
|
||||||
}
|
}
|
||||||
|
|
||||||
int main_fmtidx(int argc, char **argv)
|
int main_fmtidx(int argc, char **argv)
|
||||||
|
|
@ -596,37 +609,67 @@ int main_fmtidx(int argc, char **argv)
|
||||||
//create_bwt_mtx(seq);
|
//create_bwt_mtx(seq);
|
||||||
//cout << seq << endl;
|
//cout << seq << endl;
|
||||||
|
|
||||||
bwt_t *bwt = restore_bwt(argv[1]); // 读取bwt原始字符串(带ACGT总的累积量)
|
string bwt_idx = string(argv[1]) + ".bwt";
|
||||||
// create_interval_occ_bwt(bwt); // 根据bwt字符串创建包含interval occ的bwt(128碱基+ACGT累积量)
|
string fmt_idx = string(argv[1]) + ".fmt";
|
||||||
cout << "L2: " << bwt->L2[0] << '\t' << bwt->L2[1] << '\t' << bwt->L2[2] << '\t'
|
|
||||||
<< bwt->L2[3] << '\t' << bwt->L2[4] << endl;
|
|
||||||
|
|
||||||
string s = "AACCCTAA";
|
bwt_t *bwt = restore_bwt(bwt_idx.c_str());
|
||||||
|
FMTIndex *fmt = restore_fmt(fmt_idx.c_str());
|
||||||
|
|
||||||
|
vector<string> seed_arr(10000000);
|
||||||
srand(time(NULL));
|
srand(time(NULL));
|
||||||
s = generate_rand_seq(10);
|
|
||||||
cout << "seq: " << s << endl;
|
|
||||||
// s = "TTC";
|
|
||||||
|
|
||||||
bwt_search(bwt, s);
|
t1 = realtime();
|
||||||
bwt_search2(bwt, s);
|
for (int i = 0; i < (int)seed_arr.size(); ++i)
|
||||||
|
seed_arr[i] = generate_rand_seq(13);
|
||||||
|
t1 = realtime() - t1;
|
||||||
|
cout << "[time gen seed:] " << t1 << "s" << endl;
|
||||||
|
|
||||||
|
// t2 = realtime();
|
||||||
|
// for (int i = 0; i < seed_arr.size(); ++i)
|
||||||
|
// bwt_search(bwt, seed_arr[i]);
|
||||||
|
// t2 = realtime() - t2;
|
||||||
|
// cout << "[time bwt search:] " << t2 << "s" << endl;
|
||||||
|
|
||||||
|
t4 = realtime();
|
||||||
|
for (int i = 0; i < (int)seed_arr.size(); ++i)
|
||||||
|
{
|
||||||
|
// bwtintv_t p1 =
|
||||||
|
bwt_search2(bwt, seed_arr[i]);
|
||||||
|
// bwtintv_t p2 = fmt_search(fmt, seed_arr[i]);
|
||||||
|
// if (p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2] || p1.x[3] != p2.x[3])
|
||||||
|
// cout << seed_arr[i] << endl
|
||||||
|
// << p1.x[1] << ' ' << p1.x[2] << ' ' << p1.x[3] << endl
|
||||||
|
// << p2.x[1] << ' ' << p2.x[2] << ' ' << p2.x[3] << endl;
|
||||||
|
}
|
||||||
|
t4 = realtime() - t4;
|
||||||
|
cout << "[time bwt search 2:] " << t4 << "s" << endl;
|
||||||
|
|
||||||
|
t3 = realtime();
|
||||||
|
for (int i = 0; i < (int)seed_arr.size(); ++i)
|
||||||
|
fmt_search(fmt, seed_arr[i]);
|
||||||
|
t3 = realtime() - t3;
|
||||||
|
cout << "[time fmt search:] " << t3 << "s" << endl;
|
||||||
|
|
||||||
|
cout << "bwt occ: " << t5 << "s; " << t7 << '\t' << t10 << endl;
|
||||||
|
cout << "fmt occ: " << t6 << "s; " << t11 << '\t' << t8 << '\t' << t9 << endl;
|
||||||
|
// bwt_search(bwt, s);
|
||||||
|
// bwt_search2(bwt, s);
|
||||||
// for (int i = 0; i < 120; ++i)
|
// for (int i = 0; i < 120; ++i)
|
||||||
// {
|
// {
|
||||||
// cout << i << '\t' << bwt_B0(bwt, i) << endl;
|
// cout << i << '\t' << bwt_B0(bwt, i) << endl;
|
||||||
// }
|
// }
|
||||||
// TGGGAT
|
// TGGGAT
|
||||||
FMTIndex *fmt = create_fmt_from_bwt(bwt);
|
// FMTIndex *fmt = create_fmt_from_bwt(bwt);
|
||||||
dump_fmt("ref.fmt", fmt);
|
// dump_fmt("ref.fmt", fmt);
|
||||||
// FMTIndex *fmt = restore_fmt("tiny.fmt");
|
// FMTIndex *fmt = restore_fmt("tiny.fmt");
|
||||||
|
|
||||||
fmt_search(fmt, s);
|
//fmt_search(fmt, s);
|
||||||
// cout << bwt->bwt_size << endl;
|
// cout << bwt->bwt_size << endl;
|
||||||
// cout << bwt->seq_len << endl;
|
// cout << bwt->seq_len << endl;
|
||||||
cout << "sec_: " << fmt->sec_bcp << '\t' << fmt->sec_primary << endl;
|
//cout << "sec_: " << fmt->sec_bcp << '\t' << fmt->sec_primary << endl;
|
||||||
|
|
||||||
uint8_t b8 = 2 << 4 | 2;
|
//uint8_t b8 = 2 << 4 | 2;
|
||||||
cout << "AGAG: " << fmt->cnt_table[2][b8] << endl;
|
//cout << "AGAG: " << fmt->cnt_table[2][b8] << endl;
|
||||||
cout << (((b8 >> 6) == 0 && (b8 >> 4 & 3) == 2) + ((b8 >> 2 & 3) == 0 && (b8 & 3) == 2)) << endl;
|
//cout << (((b8 >> 6) == 0 && (b8 >> 4 & 3) == 2) + ((b8 >> 2 & 3) == 0 && (b8 & 3) == 2)) << endl;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
3
util.h
3
util.h
|
|
@ -4,6 +4,9 @@
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
|
extern double t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14;
|
||||||
|
extern long f1, f2, f3, f4, f5;
|
||||||
|
|
||||||
typedef uint64_t bwtint_t;
|
typedef uint64_t bwtint_t;
|
||||||
|
|
||||||
#define xassert(cond, msg) \
|
#define xassert(cond, msg) \
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue