添加了14个碱基长度的kmer结果,还没有合并
This commit is contained in:
parent
46822bef2c
commit
1b77566567
17
bwt.cpp
17
bwt.cpp
|
|
@ -251,25 +251,9 @@ bwtintv_t bwt_search(bwt_t *bwt, const string &q)
|
||||||
bwtintv_t ik, ok[4];
|
bwtintv_t ik, ok[4];
|
||||||
int i, c, x = 0;
|
int i, c, x = 0;
|
||||||
|
|
||||||
//double tt = realtime();
|
|
||||||
|
|
||||||
bwt_set_intv(bwt, bval(q[x]), ik);
|
bwt_set_intv(bwt, bval(q[x]), ik);
|
||||||
ik.info = x + 1;
|
ik.info = x + 1;
|
||||||
|
|
||||||
for (i = x + 1; i < 12; ++i)
|
|
||||||
{
|
|
||||||
if (bval(q[i]) < 4)
|
|
||||||
{
|
|
||||||
c = cbval(q[i]);
|
|
||||||
bwt_extend(bwt, &ik, ok, 0);
|
|
||||||
ik = ok[c];
|
|
||||||
ik.info = i + 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
//t5 += realtime() - tt;
|
|
||||||
x = 11;
|
|
||||||
|
|
||||||
//tt = realtime();
|
|
||||||
for (i = x + 1; i < (int)q.size(); ++i)
|
for (i = x + 1; i < (int)q.size(); ++i)
|
||||||
{
|
{
|
||||||
if (bval(q[i]) < 4)
|
if (bval(q[i]) < 4)
|
||||||
|
|
@ -280,7 +264,6 @@ bwtintv_t bwt_search(bwt_t *bwt, const string &q)
|
||||||
ik.info = i + 1;
|
ik.info = i + 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//t6 += realtime() - tt;
|
|
||||||
return ik;
|
return ik;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
185
fmt_index.cpp
185
fmt_index.cpp
|
|
@ -175,6 +175,26 @@ KmerEntry *restore_kmer_idx(const char *fn)
|
||||||
return kmer_entry;
|
return kmer_entry;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 将full kmer hash数据写入到文件
|
||||||
|
void dump_full_entry_idx(const char *fn, const FullEntry *full_entry)
|
||||||
|
{
|
||||||
|
FILE *fp;
|
||||||
|
fp = xopen(fn, "wb");
|
||||||
|
err_fwrite(full_entry, 1, FULL_ENTRY_ARR_SIZE * sizeof(FullEntry), fp);
|
||||||
|
err_fflush(fp);
|
||||||
|
err_fclose(fp);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 从文件中读取full kmer hash信息
|
||||||
|
FullEntry *restore_full_entry_idx(const char *fn)
|
||||||
|
{
|
||||||
|
FILE *fp = xopen(fn, "rb");
|
||||||
|
uint32_t full_entry_bytes = FULL_ENTRY_ARR_SIZE * sizeof(FullEntry);
|
||||||
|
FullEntry *full_entry = (FullEntry *)malloc(full_entry_bytes);
|
||||||
|
fread_fix(fp, full_entry_bytes, full_entry);
|
||||||
|
err_fclose(fp);
|
||||||
|
return full_entry;
|
||||||
|
}
|
||||||
// 从文件中读取fmt结构数据
|
// 从文件中读取fmt结构数据
|
||||||
FMTIndex *restore_fmt(const char *fn)
|
FMTIndex *restore_fmt(const char *fn)
|
||||||
{
|
{
|
||||||
|
|
@ -475,7 +495,7 @@ void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4]
|
||||||
}
|
}
|
||||||
|
|
||||||
// 扩展两个碱基
|
// 扩展两个碱基
|
||||||
void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1, int b2)
|
inline void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1, int b2)
|
||||||
{
|
{
|
||||||
bwtint_t tk[4], tl[4], first_pos;
|
bwtint_t tk[4], tl[4], first_pos;
|
||||||
// tk表示在k行之前所有各个碱基累积出现次数,tl表示在l行之前的累积
|
// tk表示在k行之前所有各个碱基累积出现次数,tl表示在l行之前的累积
|
||||||
|
|
@ -492,7 +512,7 @@ void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back,
|
||||||
}
|
}
|
||||||
|
|
||||||
// 扩展一个碱基
|
// 扩展一个碱基
|
||||||
void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1)
|
inline void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1)
|
||||||
{
|
{
|
||||||
bwtint_t tk[4], tl[4];
|
bwtint_t tk[4], tl[4];
|
||||||
int b2 = 3; // 如果只扩展一次,那么第二个碱基设置成T,可以减小一些计算量,如计算大于b2的累积数量
|
int b2 = 3; // 如果只扩展一次,那么第二个碱基设置成T,可以减小一些计算量,如计算大于b2的累积数量
|
||||||
|
|
@ -514,29 +534,8 @@ bwtintv_t fmt_search(FMTIndex *fmt, const string &q)
|
||||||
int i, c1, c2, x = 0;
|
int i, c1, c2, x = 0;
|
||||||
int qlen = (int)q.size();
|
int qlen = (int)q.size();
|
||||||
|
|
||||||
//double tt = realtime();
|
|
||||||
fmt_set_intv(fmt, bval(q[x]), ik);
|
fmt_set_intv(fmt, bval(q[x]), ik);
|
||||||
ik.info = x + 1;
|
ik.info = x + 1;
|
||||||
for (i = x + 1; i + 1 < 12; i += 2)
|
|
||||||
{
|
|
||||||
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
|
|
||||||
{
|
|
||||||
c1 = cbval(q[i]);
|
|
||||||
c2 = cbval(q[i + 1]);
|
|
||||||
fmt_extend2(fmt, &ik, &ok, 0, c1, c2);
|
|
||||||
ik = ok;
|
|
||||||
ik.info = i + 1;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
x = 11;
|
|
||||||
//t3 += realtime() - tt;
|
|
||||||
|
|
||||||
//tt = realtime();
|
|
||||||
// 每次扩展两个碱基
|
|
||||||
for (i = x + 1; i + 1 < qlen; i += 2)
|
for (i = x + 1; i + 1 < qlen; i += 2)
|
||||||
{
|
{
|
||||||
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
|
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
|
||||||
|
|
@ -546,20 +545,22 @@ bwtintv_t fmt_search(FMTIndex *fmt, const string &q)
|
||||||
fmt_extend2(fmt, &ik, &ok, 0, c1, c2);
|
fmt_extend2(fmt, &ik, &ok, 0, c1, c2);
|
||||||
ik = ok;
|
ik = ok;
|
||||||
ik.info = i + 1;
|
ik.info = i + 1;
|
||||||
|
//cout << ik.x[0] << ' ' << ik.x[1] << ' ' << ik.x[2] << endl;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i < qlen && bval(q[i]) < 4)
|
if (i < qlen && bval(q[i]) < 4)
|
||||||
{ // 最后一次扩展
|
{ // 最后一次扩展
|
||||||
c1 = cbval(q[i]);
|
c1 = cbval(q[i]);
|
||||||
fmt_extend1(fmt, &ik, &ok, 0, c1);
|
fmt_extend1(fmt, &ik, &ok, 0, c1);
|
||||||
ik = ok;
|
ik = ok;
|
||||||
ik.info = i + 1;
|
ik.info = i + 1;
|
||||||
|
//cout << ik.x[0] << ' ' << ik.x[1] << ' ' << ik.x[2] << endl;
|
||||||
}
|
}
|
||||||
//t4 += realtime() - tt;
|
|
||||||
return ik;
|
return ik;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -595,6 +596,34 @@ inline void kmer_setval_at(KmerEntry *ke, bwtintv_t ik, int pos)
|
||||||
*((uint32_t *)arr) = (uint32_t)ik.x[2];
|
*((uint32_t *)arr) = (uint32_t)ik.x[2];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline void full_entry_set(FullEntry *fe, bwtintv_t ik)
|
||||||
|
{
|
||||||
|
uint8_t *arr = fe->intv_arr;
|
||||||
|
arr[0] = (uint8_t)(ik.x[0] >> 32);
|
||||||
|
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[0];
|
||||||
|
arr += 5;
|
||||||
|
arr[0] = (uint8_t)(ik.x[1] >> 32);
|
||||||
|
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[1];
|
||||||
|
arr += 5;
|
||||||
|
*((uint32_t *)arr) = (uint32_t)ik.x[2];
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void full_entry_get(FullEntry *fe, bwtintv_t *ok)
|
||||||
|
{
|
||||||
|
bwtint_t x0, x1, x2;
|
||||||
|
uint8_t *arr = fe->intv_arr;
|
||||||
|
x0 = *arr;
|
||||||
|
x0 = (x0 << 32) | *((uint32_t *)(arr + 1));
|
||||||
|
arr += 5;
|
||||||
|
x1 = *arr;
|
||||||
|
x1 = (x1 << 32) | *((uint32_t *)(arr + 1));
|
||||||
|
arr += 5;
|
||||||
|
x2 = *((uint32_t *)arr);
|
||||||
|
ok->x[0] = x0;
|
||||||
|
ok->x[1] = x1;
|
||||||
|
ok->x[2] = x2;
|
||||||
|
}
|
||||||
|
|
||||||
// 查找并保存kmer中每扩展一个碱基对应的fmt位置信息
|
// 查找并保存kmer中每扩展一个碱基对应的fmt位置信息
|
||||||
void fmt_search_store_kmer(FMTIndex *fmt, const string &q, KmerEntry &ke)
|
void fmt_search_store_kmer(FMTIndex *fmt, const string &q, KmerEntry &ke)
|
||||||
{
|
{
|
||||||
|
|
@ -687,10 +716,55 @@ bwtintv_t fmt_search_use_kmer(FMTIndex *fmt, const string &q)
|
||||||
{ // 最后一次扩展
|
{ // 最后一次扩展
|
||||||
c1 = cbval(q[i]);
|
c1 = cbval(q[i]);
|
||||||
fmt_extend1(fmt, &ik, &ok, 0, c1);
|
fmt_extend1(fmt, &ik, &ok, 0, c1);
|
||||||
|
//fmt_extend2(fmt, &ik, &ok, 0, c1, 3);
|
||||||
ik = ok;
|
ik = ok;
|
||||||
ik.info = i + 1;
|
ik.info = i + 1;
|
||||||
}
|
}
|
||||||
//t2 += realtime() - tt;
|
//t2 += realtime() - tt;
|
||||||
|
//cout << ik.x[2] << endl;
|
||||||
|
return ik;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 利用fmt搜索seed,利用full entry加速搜索过程
|
||||||
|
bwtintv_t fmt_search_use_full_entry(FMTIndex *fmt, const string &q)
|
||||||
|
{
|
||||||
|
bwtintv_t ik;
|
||||||
|
bwtintv_t ok;
|
||||||
|
int i, c1, c2, x = 0;
|
||||||
|
int qlen = (int)q.size();
|
||||||
|
|
||||||
|
// 先利用kmer进行索引查询
|
||||||
|
uint32_t qbit = 0;
|
||||||
|
x = min(FULL_ENTRY_LEN, qlen);
|
||||||
|
for (i = 0; i < x; ++i)
|
||||||
|
{
|
||||||
|
qbit |= bval(q[i]) << ((FULL_ENTRY_LEN - 1 - i) << 1);
|
||||||
|
}
|
||||||
|
full_entry_get(&fmt->full_entry[qbit], &ik);
|
||||||
|
|
||||||
|
// 每次扩展两个碱基
|
||||||
|
for (i = x; i + 1 < qlen; i += 2)
|
||||||
|
{
|
||||||
|
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
|
||||||
|
{
|
||||||
|
c1 = cbval(q[i]);
|
||||||
|
c2 = cbval(q[i + 1]);
|
||||||
|
fmt_extend2(fmt, &ik, &ok, 0, c1, c2);
|
||||||
|
ik = ok;
|
||||||
|
ik.info = i + 1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (i < qlen && bval(q[i]) < 4)
|
||||||
|
{ // 最后一次扩展
|
||||||
|
c1 = cbval(q[i]);
|
||||||
|
fmt_extend1(fmt, &ik, &ok, 0, c1);
|
||||||
|
ik = ok;
|
||||||
|
ik.info = i + 1;
|
||||||
|
}
|
||||||
return ik;
|
return ik;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -721,6 +795,7 @@ void gen_all_seq(vector<string> &vseq, int kmer_len)
|
||||||
#define GEN_BWT_IDX 0
|
#define GEN_BWT_IDX 0
|
||||||
#define GEN_FMT_IDX 0
|
#define GEN_FMT_IDX 0
|
||||||
#define GEN_KMER_IDX 0
|
#define GEN_KMER_IDX 0
|
||||||
|
#define GEN_FULL_ENTRY_IDX 0
|
||||||
#define CMP_FMT_BWT_TIME 1
|
#define CMP_FMT_BWT_TIME 1
|
||||||
#define CMP_FMT_BWT_RESULT 1
|
#define CMP_FMT_BWT_RESULT 1
|
||||||
|
|
||||||
|
|
@ -729,13 +804,14 @@ int main_fmtidx(int argc, char **argv)
|
||||||
{
|
{
|
||||||
string prefix = argv[1];
|
string prefix = argv[1];
|
||||||
string bwt_str = prefix + ".bwt.str";
|
string bwt_str = prefix + ".bwt.str";
|
||||||
string bwt_idx, fmt_idx, kmer_idx;
|
string bwt_idx, fmt_idx, kmer_idx, full_entry_idx;
|
||||||
bwt_t *bwt;
|
bwt_t *bwt;
|
||||||
FMTIndex *fmt;
|
FMTIndex *fmt;
|
||||||
ostringstream oss_bwt, oss_fmt, oss_kmer;
|
ostringstream oss_bwt, oss_fmt, oss_kmer, oss_full_entry;
|
||||||
oss_bwt << '.' << OCC_INTERVAL;
|
oss_bwt << '.' << OCC_INTERVAL;
|
||||||
oss_fmt << '.' << FMT_OCC_INTERVAL;
|
oss_fmt << '.' << FMT_OCC_INTERVAL;
|
||||||
oss_kmer << '.' << KMER_LEN;
|
oss_kmer << '.' << KMER_LEN;
|
||||||
|
oss_full_entry << '.' << FULL_ENTRY_LEN;
|
||||||
#ifdef FMT_MID_INTERVAL
|
#ifdef FMT_MID_INTERVAL
|
||||||
oss_fmt << '.' << FMT_MID_INTERVAL;
|
oss_fmt << '.' << FMT_MID_INTERVAL;
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -743,10 +819,12 @@ int main_fmtidx(int argc, char **argv)
|
||||||
bwt_idx = prefix + oss_bwt.str() + ".bwt";
|
bwt_idx = prefix + oss_bwt.str() + ".bwt";
|
||||||
fmt_idx = prefix + oss_fmt.str() + ".fmt";
|
fmt_idx = prefix + oss_fmt.str() + ".fmt";
|
||||||
kmer_idx = prefix + oss_kmer.str() + ".kmer";
|
kmer_idx = prefix + oss_kmer.str() + ".kmer";
|
||||||
|
full_entry_idx = prefix + oss_full_entry.str() + ".full_entry";
|
||||||
|
|
||||||
cout << bwt_idx << endl;
|
cout << bwt_idx << endl;
|
||||||
cout << fmt_idx << endl;
|
cout << fmt_idx << endl;
|
||||||
cout << kmer_idx << endl;
|
cout << kmer_idx << endl;
|
||||||
|
cout << full_entry_idx << endl;
|
||||||
|
|
||||||
// 生成或读取bwt索引文件
|
// 生成或读取bwt索引文件
|
||||||
double time_read_bwt = realtime();
|
double time_read_bwt = realtime();
|
||||||
|
|
@ -772,9 +850,9 @@ int main_fmtidx(int argc, char **argv)
|
||||||
cout << "[time gen/read fmt:] " << time_read_fmt << "s" << endl;
|
cout << "[time gen/read fmt:] " << time_read_fmt << "s" << endl;
|
||||||
|
|
||||||
// 生成或读取kmer信息
|
// 生成或读取kmer信息
|
||||||
vector<string> vkmer;
|
|
||||||
double time_gen_kmer = realtime();
|
double time_gen_kmer = realtime();
|
||||||
#if GEN_KMER_IDX
|
#if GEN_KMER_IDX
|
||||||
|
vector<string> vkmer;
|
||||||
gen_all_seq(vkmer, KMER_LEN);
|
gen_all_seq(vkmer, KMER_LEN);
|
||||||
fmt->kmer_entry = (KmerEntry*)malloc(KMER_ARR_SIZE * sizeof(KmerEntry));
|
fmt->kmer_entry = (KmerEntry*)malloc(KMER_ARR_SIZE * sizeof(KmerEntry));
|
||||||
for (size_t i = 0; i < vkmer.size(); ++i)
|
for (size_t i = 0; i < vkmer.size(); ++i)
|
||||||
|
|
@ -795,13 +873,37 @@ int main_fmtidx(int argc, char **argv)
|
||||||
time_gen_kmer = realtime() - time_gen_kmer;
|
time_gen_kmer = realtime() - time_gen_kmer;
|
||||||
cout << "[time gen kmer:] " << time_gen_kmer << "s" << endl;
|
cout << "[time gen kmer:] " << time_gen_kmer << "s" << endl;
|
||||||
|
|
||||||
|
// 生成或读取kmer匹配的最后单一结果
|
||||||
|
double time_gen_full_entry = realtime();
|
||||||
|
#if GEN_FULL_ENTRY_IDX
|
||||||
|
vector<string> kmer_arr;
|
||||||
|
gen_all_seq(kmer_arr, FULL_ENTRY_LEN);
|
||||||
|
fmt->full_entry = (FullEntry *)malloc(FULL_ENTRY_ARR_SIZE * sizeof(FullEntry));
|
||||||
|
for (size_t i = 0; i < kmer_arr.size(); ++i)
|
||||||
|
{
|
||||||
|
bwtintv_t p1 = fmt_search(fmt, kmer_arr[i]);
|
||||||
|
full_entry_set(&fmt->full_entry[i], p1);
|
||||||
|
bwtintv_t p2 = bwt_search(bwt, kmer_arr[i]);
|
||||||
|
if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2])
|
||||||
|
cout << kmer_arr[i] << endl
|
||||||
|
<< p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl
|
||||||
|
<< p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl;
|
||||||
|
}
|
||||||
|
dump_full_entry_idx(full_entry_idx.c_str(), fmt->full_entry);
|
||||||
|
#else
|
||||||
|
fmt->full_entry = restore_full_entry_idx(full_entry_idx.c_str());
|
||||||
|
#endif
|
||||||
|
time_gen_full_entry = realtime() - time_gen_full_entry;
|
||||||
|
cout << "[time gen full kmer entry:] " << time_gen_full_entry << "s" << endl;
|
||||||
|
|
||||||
// 生成随机序列进行测试
|
// 生成随机序列进行测试
|
||||||
int seq_size = 10000000;
|
int seq_size = 10000000;
|
||||||
// int seq_size = 1;
|
//int seq_size = 1;
|
||||||
int seq_len = 23;
|
int seq_len = 14;
|
||||||
vector<string> seq_arr(seq_size);
|
vector<string> seq_arr(seq_size);
|
||||||
// seq_arr[0] = "GCGATACTAAGA";
|
// seq_arr[0] = "GCGATACTAAGA";
|
||||||
// seq_arr[0] = "GAGAGCTGTTCCCGTGTTTTCCATGGTTT";
|
// seq_arr[0] = "GAGAGCTGTTCCCGTGTTTTCCATGGTTT";
|
||||||
|
seq_arr[0] = "ACCTTTCGAATTGA";
|
||||||
#if 1
|
#if 1
|
||||||
srand(time(NULL));
|
srand(time(NULL));
|
||||||
double time_gen_seq = realtime();
|
double time_gen_seq = realtime();
|
||||||
|
|
@ -812,10 +914,17 @@ int main_fmtidx(int argc, char **argv)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// 对比bwt和fmt搜索的时间
|
// 对比bwt和fmt搜索的时间
|
||||||
|
bwtint_t max_gap = 0;
|
||||||
#if CMP_FMT_BWT_TIME
|
#if CMP_FMT_BWT_TIME
|
||||||
|
// int nintv_0 = 0;
|
||||||
double time_bwt_search = realtime();
|
double time_bwt_search = realtime();
|
||||||
for (int i = 0; i < (int)seq_arr.size(); ++i)
|
for (int i = 0; i < (int)seq_arr.size(); ++i) {
|
||||||
bwt_search(bwt, seq_arr[i]);
|
bwtintv_t p = bwt_search(bwt, seq_arr[i]);
|
||||||
|
if (max_gap < p.x[2])
|
||||||
|
max_gap = p.x[2];
|
||||||
|
}
|
||||||
|
// cout << "all: " << seq_arr.size() << " zero: " << nintv_0 << endl;
|
||||||
|
cout << "max gap: " << max_gap << endl;
|
||||||
time_bwt_search = realtime() - time_bwt_search;
|
time_bwt_search = realtime() - time_bwt_search;
|
||||||
cout << "[time bwt search:] " << time_bwt_search << "s" << endl;
|
cout << "[time bwt search:] " << time_bwt_search << "s" << endl;
|
||||||
|
|
||||||
|
|
@ -831,16 +940,22 @@ int main_fmtidx(int argc, char **argv)
|
||||||
time_fmt_kmer_search = realtime() - time_fmt_kmer_search;
|
time_fmt_kmer_search = realtime() - time_fmt_kmer_search;
|
||||||
cout << "[time fmt kmer search:] " << time_fmt_kmer_search << "s" << endl;
|
cout << "[time fmt kmer search:] " << time_fmt_kmer_search << "s" << endl;
|
||||||
|
|
||||||
cout << "t1: " << t1 << "s; t2: " << t2 << "s" << endl;
|
double time_fmt_full_entry_search = realtime();
|
||||||
cout << "t3: " << t3 << "s; t4: " << t4 << "s" << endl;
|
for (int i = 0; i < (int)seq_arr.size(); ++i)
|
||||||
cout << "t5: " << t5 << "s; t4: " << t6 << "s" << endl;
|
fmt_search_use_full_entry(fmt, seq_arr[i]);
|
||||||
|
time_fmt_full_entry_search = realtime() - time_fmt_full_entry_search;
|
||||||
|
cout << "[time fmt full entry search:] " << time_fmt_full_entry_search << "s" << endl;
|
||||||
|
|
||||||
|
// cout << "t1: " << t1 << "s; t2: " << t2 << "s" << endl;
|
||||||
|
// cout << "t3: " << t3 << "s; t4: " << t4 << "s" << endl;
|
||||||
|
// cout << "t5: " << t5 << "s; t4: " << t6 << "s" << endl;
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// 对比bwt和fmt搜索的结果
|
// 对比bwt和fmt搜索的结果
|
||||||
#if CMP_FMT_BWT_RESULT
|
#if CMP_FMT_BWT_RESULT
|
||||||
double time_cmp = realtime();
|
double time_cmp = realtime();
|
||||||
for (int i = 0; i < (int)seq_arr.size() / 100; ++i)
|
for (int i = 0; i < (int)seq_arr.size(); ++i)
|
||||||
{
|
{
|
||||||
bwtintv_t p1 = bwt_search2(bwt, seq_arr[i]);
|
bwtintv_t p1 = bwt_search2(bwt, seq_arr[i]);
|
||||||
// bwtintv_t p2 = fmt_search(fmt, seq_arr[i]);
|
// bwtintv_t p2 = fmt_search(fmt, seq_arr[i]);
|
||||||
|
|
|
||||||
11
fmt_index.h
11
fmt_index.h
|
|
@ -39,7 +39,7 @@
|
||||||
((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff)
|
((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff)
|
||||||
|
|
||||||
#define KMER_LEN 12
|
#define KMER_LEN 12
|
||||||
#define KMER_ARR_SIZE ((1 << (KMER_LEN << 1)))
|
#define KMER_ARR_SIZE (1 << (KMER_LEN << 1))
|
||||||
// 用来保存kmer对应的fmt的位置信息
|
// 用来保存kmer对应的fmt的位置信息
|
||||||
struct KmerEntry
|
struct KmerEntry
|
||||||
{
|
{
|
||||||
|
|
@ -47,6 +47,14 @@ struct KmerEntry
|
||||||
uint8_t intv_arr[14 * KMER_LEN]; // 保存kmer中每扩展一个碱基对应的bwtintv_t数据
|
uint8_t intv_arr[14 * KMER_LEN]; // 保存kmer中每扩展一个碱基对应的bwtintv_t数据
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#define FULL_ENTRY_LEN 14
|
||||||
|
#define FULL_ENTRY_ARR_SIZE (1 << (FULL_ENTRY_LEN << 1))
|
||||||
|
// kmer全部匹配对应的最后fmt匹配结果
|
||||||
|
struct FullEntry
|
||||||
|
{
|
||||||
|
uint8_t intv_arr[14];
|
||||||
|
};
|
||||||
|
|
||||||
// fm-index, extend twice in one search step (one memory access)
|
// fm-index, extend twice in one search step (one memory access)
|
||||||
struct FMTIndex
|
struct FMTIndex
|
||||||
{
|
{
|
||||||
|
|
@ -63,6 +71,7 @@ struct FMTIndex
|
||||||
uint8_t last_base; // dollar转换成的base
|
uint8_t last_base; // dollar转换成的base
|
||||||
// 保存kmer对应的fmt位置信息
|
// 保存kmer对应的fmt位置信息
|
||||||
KmerEntry *kmer_entry;
|
KmerEntry *kmer_entry;
|
||||||
|
FullEntry *full_entry;
|
||||||
// suffix array
|
// suffix array
|
||||||
int sa_intv;
|
int sa_intv;
|
||||||
bwtint_t n_sa;
|
bwtint_t n_sa;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue