添加了kmer index,有效果,但是不明显,有可能是随着序列的边长,后边的扩展更耗时
This commit is contained in:
parent
5a055625c3
commit
914cbd34ab
3
bwt.cpp
3
bwt.cpp
|
|
@ -246,7 +246,7 @@ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_b
|
|||
}
|
||||
|
||||
// 利用bwt搜索seed,完整搜索,只需要单向搜索
|
||||
void bwt_search(bwt_t *bwt, const string &q)
|
||||
bwtintv_t bwt_search(bwt_t *bwt, const string &q)
|
||||
{
|
||||
bwtintv_t ik, ok[4];
|
||||
int i, c, x = 0;
|
||||
|
|
@ -264,6 +264,7 @@ void bwt_search(bwt_t *bwt, const string &q)
|
|||
ik.info = i + 1;
|
||||
}
|
||||
}
|
||||
return ik;
|
||||
}
|
||||
|
||||
// 扩展两次
|
||||
|
|
|
|||
2
bwt.h
2
bwt.h
|
|
@ -57,7 +57,7 @@ bwt_t *restore_bwt(const char *fn);
|
|||
void create_interval_occ_bwt(bwt_t *bwt);
|
||||
void dump_bwt(const char *fn, const bwt_t *bwt);
|
||||
// 利用bwt搜索seed,完整搜索,只需要单向搜索
|
||||
void bwt_search(bwt_t *bwt, const string &q);
|
||||
bwtintv_t bwt_search(bwt_t *bwt, const string &q);
|
||||
// 每次扩展两步
|
||||
bwtintv_t bwt_search2(bwt_t *bwt, const string &q);
|
||||
|
||||
|
|
|
|||
238
fmt_index.cpp
238
fmt_index.cpp
|
|
@ -138,6 +138,7 @@ void fmt_gen_cnt_table(uint32_t cnt_table[4][256])
|
|||
}
|
||||
}
|
||||
|
||||
// 将fmt结构数据写入到二进制文件
|
||||
void dump_fmt(const char *fn, const FMTIndex *fmt)
|
||||
{
|
||||
FILE *fp;
|
||||
|
|
@ -153,25 +154,47 @@ void dump_fmt(const char *fn, const FMTIndex *fmt)
|
|||
err_fclose(fp);
|
||||
}
|
||||
|
||||
// 将kmer hash数据写入到文件
|
||||
void dump_kmer_idx(const char *fn, const KmerEntry *kmer_entry)
|
||||
{
|
||||
FILE *fp;
|
||||
fp = xopen(fn, "wb");
|
||||
err_fwrite(kmer_entry, 1, KMER_ARR_SIZE * sizeof(KmerEntry), fp);
|
||||
err_fflush(fp);
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
||||
// 从文件中读取kmer hash信息
|
||||
KmerEntry *restore_kmer_idx(const char *fn)
|
||||
{
|
||||
FILE *fp = xopen(fn, "rb");
|
||||
uint32_t kmer_bytes = KMER_ARR_SIZE * sizeof(KmerEntry);
|
||||
KmerEntry *kmer_entry = (KmerEntry *)malloc(kmer_bytes);
|
||||
fread_fix(fp, kmer_bytes, kmer_entry);
|
||||
err_fclose(fp);
|
||||
return kmer_entry;
|
||||
}
|
||||
|
||||
// 从文件中读取fmt结构数据
|
||||
FMTIndex *restore_fmt(const char *fn)
|
||||
{
|
||||
FMTIndex *fmt;
|
||||
fmt = (FMTIndex *)calloc(1, sizeof(FMTIndex));
|
||||
FILE *fp = fopen(fn, "rb");
|
||||
FILE *fp = xopen(fn, "rb");
|
||||
|
||||
fseek(fp, 0, SEEK_END);
|
||||
fmt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 6 - 3) >> 2; // 以32位word为单位计算的size
|
||||
fmt->bwt = (uint32_t *)calloc(fmt->bwt_size, 4);
|
||||
fseek(fp, 0, SEEK_SET);
|
||||
fread(&fmt->primary, sizeof(bwtint_t), 1, fp);
|
||||
fread(&fmt->sec_primary, sizeof(bwtint_t), 1, fp);
|
||||
fread(&fmt->sec_bcp, sizeof(uint8_t), 1, fp);
|
||||
fread(&fmt->first_base, sizeof(uint8_t), 1, fp);
|
||||
fread(&fmt->last_base, sizeof(uint8_t), 1, fp);
|
||||
fread(fmt->L2 + 1, sizeof(bwtint_t), 4, fp);
|
||||
err_fread_noeof(&fmt->primary, sizeof(bwtint_t), 1, fp);
|
||||
err_fread_noeof(&fmt->sec_primary, sizeof(bwtint_t), 1, fp);
|
||||
err_fread_noeof(&fmt->sec_bcp, sizeof(uint8_t), 1, fp);
|
||||
err_fread_noeof(&fmt->first_base, sizeof(uint8_t), 1, fp);
|
||||
err_fread_noeof(&fmt->last_base, sizeof(uint8_t), 1, fp);
|
||||
err_fread_noeof(fmt->L2 + 1, sizeof(bwtint_t), 4, fp);
|
||||
fread_fix(fp, fmt->bwt_size << 2, fmt->bwt);
|
||||
fmt->seq_len = fmt->L2[4];
|
||||
fclose(fp);
|
||||
err_fclose(fp);
|
||||
fmt_gen_cnt_occ(fmt); // 字节所能表示的各种碱基组合中,各个碱基的累积数量
|
||||
return fmt;
|
||||
}
|
||||
|
|
@ -520,6 +543,138 @@ bwtintv_t fmt_search(FMTIndex *fmt, const string &q)
|
|||
return ik;
|
||||
}
|
||||
|
||||
// 获取kmer的fmt匹配信息
|
||||
inline void kmer_getval_at(KmerEntry *ke, bwtintv_t *ok, int pos)
|
||||
{
|
||||
bwtint_t x0, x1, x2;
|
||||
int byte_idx = pos * 14;
|
||||
uint8_t *arr = ke->intv_arr + byte_idx;
|
||||
x0 = *arr;
|
||||
x0 = (x0 << 32) | *((uint32_t *)(arr + 1));
|
||||
arr += 5;
|
||||
x1 = *arr;
|
||||
x1 = (x1 << 32) | *((uint32_t *)(arr + 1));
|
||||
arr += 5;
|
||||
x2 = *((uint32_t *)arr);
|
||||
ok->x[0] = x0;
|
||||
ok->x[1] = x1;
|
||||
ok->x[2] = x2;
|
||||
}
|
||||
|
||||
// 设置kmer第pos个碱基对应的fmt匹配信息
|
||||
inline void kmer_setval_at(KmerEntry *ke, bwtintv_t ik, int pos)
|
||||
{
|
||||
int byte_idx = pos * 14;
|
||||
uint8_t *arr = ke->intv_arr + byte_idx;
|
||||
arr[0] = (uint8_t)(ik.x[0] >> 32);
|
||||
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[0];
|
||||
arr += 5;
|
||||
arr[0] = (uint8_t)(ik.x[1] >> 32);
|
||||
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[1];
|
||||
arr += 5;
|
||||
*((uint32_t *)arr) = (uint32_t)ik.x[2];
|
||||
}
|
||||
|
||||
// 查找并保存kmer中每扩展一个碱基对应的fmt位置信息
|
||||
void fmt_search_store_kmer(FMTIndex *fmt, const string &q, KmerEntry &ke)
|
||||
{
|
||||
bwtintv_t ik;
|
||||
int i, c1, c2;
|
||||
int qlen = (int)q.size();
|
||||
bwtint_t tk[4], tl[4];
|
||||
|
||||
fmt_set_intv(fmt, bval(q[0]), ik);
|
||||
kmer_setval_at(&ke, ik, 0);
|
||||
|
||||
// 每次扩展两个碱基
|
||||
for (i = 1; i + 1 < qlen; i += 2)
|
||||
{
|
||||
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
|
||||
{
|
||||
c1 = cbval(q[i]);
|
||||
c2 = cbval(q[i + 1]);
|
||||
|
||||
fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk);
|
||||
fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl);
|
||||
// 第一次扩展的结果
|
||||
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
|
||||
ik.x[1] = fmt->L2[c1] + 1 + tk[1];
|
||||
ik.x[2] = tl[1] - tk[1];
|
||||
kmer_setval_at(&ke, ik, i);
|
||||
|
||||
// 第二次扩展的结果
|
||||
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[2] - tk[2];
|
||||
ik.x[1] = fmt->L2[c2] + 1 + tk[3];
|
||||
ik.x[2] = tl[3] - tk[3];
|
||||
kmer_setval_at(&ke, ik, i + 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i < qlen && bval(q[i]) < 4)
|
||||
{ // 最后一次扩展
|
||||
c1 = cbval(q[i]);
|
||||
c2 = 3;
|
||||
fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk);
|
||||
fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl);
|
||||
// 第一次扩展的结果
|
||||
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
|
||||
ik.x[1] = fmt->L2[c1] + 1 + tk[1];
|
||||
ik.x[2] = tl[1] - tk[1];
|
||||
kmer_setval_at(&ke, ik, i);
|
||||
}
|
||||
}
|
||||
|
||||
// 利用fmt搜索seed,利用kmer加速搜索过程
|
||||
bwtintv_t fmt_search_use_kmer(FMTIndex *fmt, const string &q)
|
||||
{
|
||||
bwtintv_t ik;
|
||||
bwtintv_t ok;
|
||||
int i, c1, c2, x = 0;
|
||||
int qlen = (int)q.size();
|
||||
|
||||
// 先利用kmer进行索引查询
|
||||
uint32_t qbit = 0;
|
||||
x = min(KMER_LEN, qlen);
|
||||
double tt = realtime();
|
||||
for (i = 0; i < x; ++i)
|
||||
{
|
||||
qbit |= bval(q[i]) << ((KMER_LEN - 1 - i) << 1);
|
||||
}
|
||||
kmer_getval_at(&fmt->kmer_entry[qbit], &ik, x - 1);
|
||||
t1 += realtime() - tt;
|
||||
|
||||
tt = realtime();
|
||||
// 每次扩展两个碱基
|
||||
for (i = x; i + 1 < qlen; i += 2)
|
||||
{
|
||||
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
|
||||
{
|
||||
c1 = cbval(q[i]);
|
||||
c2 = cbval(q[i + 1]);
|
||||
fmt_extend2(fmt, &ik, &ok, 0, c1, c2);
|
||||
ik = ok;
|
||||
ik.info = i + 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i < qlen && bval(q[i]) < 4)
|
||||
{ // 最后一次扩展
|
||||
c1 = cbval(q[i]);
|
||||
fmt_extend1(fmt, &ik, &ok, 0, c1);
|
||||
ik = ok;
|
||||
ik.info = i + 1;
|
||||
}
|
||||
t2 += realtime() - tt;
|
||||
return ik;
|
||||
}
|
||||
|
||||
// 将用字符串表示的序列计算成用2bit表示的序列
|
||||
uint32_t str2bit(string &str)
|
||||
{
|
||||
uint32_t pac = 0;
|
||||
|
|
@ -528,8 +683,24 @@ uint32_t str2bit(string &str)
|
|||
return pac;
|
||||
}
|
||||
|
||||
// 生成所有KMER_LEN长度的序列,字符串表示
|
||||
void gen_all_seq(vector<string> &vseq, int kmer_len)
|
||||
{
|
||||
uint32_t seq_up_val = (1 << (kmer_len << 1));
|
||||
vseq.clear();
|
||||
vseq.resize(seq_up_val);
|
||||
for (uint32_t i = 0; i < seq_up_val; ++i)
|
||||
{
|
||||
for (int j = kmer_len - 1; j >= 0; --j)
|
||||
{
|
||||
vseq[i].push_back(BASE[(i >> (j << 1)) & 3]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#define GEN_BWT_IDX 0
|
||||
#define GEN_FMT_IDX 0
|
||||
#define GEN_KMER_IDX 0
|
||||
#define CMP_FMT_BWT_TIME 1
|
||||
#define CMP_FMT_BWT_RESULT 1
|
||||
|
||||
|
|
@ -538,21 +709,24 @@ int main_fmtidx(int argc, char **argv)
|
|||
{
|
||||
string prefix = argv[1];
|
||||
string bwt_str = prefix + ".bwt.str";
|
||||
string bwt_idx, fmt_idx;
|
||||
string bwt_idx, fmt_idx, kmer_idx;
|
||||
bwt_t *bwt;
|
||||
FMTIndex *fmt;
|
||||
ostringstream oss_bwt, oss_fmt;
|
||||
ostringstream oss_bwt, oss_fmt, oss_kmer;
|
||||
oss_bwt << '.' << OCC_INTERVAL;
|
||||
oss_fmt << '.' << FMT_OCC_INTERVAL;
|
||||
oss_kmer << '.' << KMER_LEN;
|
||||
#ifdef FMT_MID_INTERVAL
|
||||
oss_fmt << '.' << FMT_MID_INTERVAL;
|
||||
#endif
|
||||
|
||||
bwt_idx = prefix + oss_bwt.str() + ".bwt";
|
||||
fmt_idx = prefix + oss_fmt.str() + ".fmt";
|
||||
kmer_idx = prefix + oss_kmer.str() + ".kmer";
|
||||
|
||||
cout << bwt_idx << endl;
|
||||
cout << fmt_idx << endl;
|
||||
cout << kmer_idx << endl;
|
||||
|
||||
// 生成或读取bwt索引文件
|
||||
double time_read_bwt = realtime();
|
||||
|
|
@ -577,10 +751,34 @@ int main_fmtidx(int argc, char **argv)
|
|||
time_read_fmt = realtime() - time_read_fmt;
|
||||
cout << "[time gen/read fmt:] " << time_read_fmt << "s" << endl;
|
||||
|
||||
// 生成随机序列进行测试
|
||||
// 生成或读取kmer信息
|
||||
vector<string> vkmer;
|
||||
double time_gen_kmer = realtime();
|
||||
#if GEN_KMER_IDX
|
||||
gen_all_seq(vkmer, KMER_LEN);
|
||||
fmt->kmer_entry = (KmerEntry*)malloc(KMER_ARR_SIZE * sizeof(KmerEntry));
|
||||
for (size_t i = 0; i < vkmer.size(); ++i)
|
||||
{
|
||||
fmt_search_store_kmer(fmt, vkmer[i], fmt->kmer_entry[i]);
|
||||
bwtintv_t p1;
|
||||
kmer_getval_at(&fmt->kmer_entry[i], &p1, KMER_LEN - 1);
|
||||
bwtintv_t p2 = bwt_search(bwt, vkmer[i]);
|
||||
if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2])
|
||||
cout << vkmer[i] << endl
|
||||
<< p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl
|
||||
<< p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl;
|
||||
}
|
||||
dump_kmer_idx(kmer_idx.c_str(), fmt->kmer_entry);
|
||||
#else
|
||||
fmt->kmer_entry = restore_kmer_idx(kmer_idx.c_str());
|
||||
#endif
|
||||
time_gen_kmer = realtime() - time_gen_kmer;
|
||||
cout << "[time gen kmer:] " << time_gen_kmer << "s" << endl;
|
||||
|
||||
// 生成随机序列进行测试
|
||||
int seq_size = 10000000;
|
||||
// int seq_size = 1;
|
||||
int seq_len = 11;
|
||||
int seq_len = 19;
|
||||
vector<string> seq_arr(seq_size);
|
||||
// seq_arr[0] = "GCGATACTAAGA";
|
||||
// seq_arr[0] = "GAGAGCTGTTCCCGTGTTTTCCATGGTTT";
|
||||
|
|
@ -606,15 +804,25 @@ int main_fmtidx(int argc, char **argv)
|
|||
fmt_search(fmt, seq_arr[i]);
|
||||
time_fmt_search = realtime() - time_fmt_search;
|
||||
cout << "[time fmt search:] " << time_fmt_search << "s" << endl;
|
||||
|
||||
double time_fmt_kmer_search = realtime();
|
||||
for (int i = 0; i < (int)seq_arr.size(); ++i)
|
||||
fmt_search_use_kmer(fmt, seq_arr[i]);
|
||||
time_fmt_kmer_search = realtime() - time_fmt_kmer_search;
|
||||
cout << "[time fmt kmer search:] " << time_fmt_kmer_search << "s" << endl;
|
||||
|
||||
cout << "t1: " << t1 << "s; t2: " << t2 << "s" << endl;
|
||||
|
||||
#endif
|
||||
|
||||
// 对比bwt和fmt搜索的结果
|
||||
#if CMP_FMT_BWT_RESULT
|
||||
double time_cmp = realtime();
|
||||
double time_cmp = realtime();
|
||||
for (int i = 0; i < (int)seq_arr.size() / 100; ++i)
|
||||
{
|
||||
bwtintv_t p1 = bwt_search2(bwt, seq_arr[i]);
|
||||
bwtintv_t p2 = fmt_search(fmt, seq_arr[i]);
|
||||
// bwtintv_t p2 = fmt_search(fmt, seq_arr[i]);
|
||||
bwtintv_t p2 = fmt_search_use_kmer(fmt, seq_arr[i]);
|
||||
if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2])
|
||||
cout << seq_arr[i] << endl
|
||||
<< p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl
|
||||
|
|
@ -625,4 +833,4 @@ int main_fmtidx(int argc, char **argv)
|
|||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
13
fmt_index.h
13
fmt_index.h
|
|
@ -5,7 +5,7 @@
|
|||
#define FMT_OCC_INTERVAL (1LL << FMT_OCC_INTV_SHIFT)
|
||||
#define FMT_OCC_INTV_MASK (FMT_OCC_INTERVAL - 1)
|
||||
|
||||
#define FMT_MID_INTV_SHIFT 5
|
||||
#define FMT_MID_INTV_SHIFT 4
|
||||
#define FMT_MID_INTERVAL (1LL << FMT_MID_INTV_SHIFT)
|
||||
#define FMT_MID_INTV_MASK (FMT_MID_INTERVAL - 1)
|
||||
|
||||
|
|
@ -38,6 +38,15 @@
|
|||
#define __fmt_mid_sum(x) \
|
||||
((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff)
|
||||
|
||||
#define KMER_LEN 10
|
||||
#define KMER_ARR_SIZE ((1 << (KMER_LEN << 1)))
|
||||
// 用来保存kmer对应的fmt的位置信息
|
||||
struct KmerEntry
|
||||
{
|
||||
// 40+40+32 14个byte,这样好处理
|
||||
uint8_t intv_arr[14 * KMER_LEN]; // 保存kmer中每扩展一个碱基对应的bwtintv_t数据
|
||||
};
|
||||
|
||||
// fm-index, extend twice in one search step (one memory access)
|
||||
struct FMTIndex
|
||||
{
|
||||
|
|
@ -52,6 +61,8 @@ struct FMTIndex
|
|||
uint8_t sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15
|
||||
uint8_t first_base; // 序列的第一个碱基2bit的int类型,0,1,2,3
|
||||
uint8_t last_base; // dollar转换成的base
|
||||
// 保存kmer对应的fmt位置信息
|
||||
KmerEntry *kmer_entry;
|
||||
// suffix array
|
||||
int sa_intv;
|
||||
bwtint_t n_sa;
|
||||
|
|
|
|||
11
util.cpp
11
util.cpp
|
|
@ -115,6 +115,17 @@ size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
|
|||
return ret;
|
||||
}
|
||||
|
||||
// 读取单个数据
|
||||
size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream)
|
||||
{
|
||||
size_t ret = fread(ptr, size, nmemb, stream);
|
||||
if (ret != nmemb)
|
||||
{
|
||||
_err_fatal_simple("fread", ferror(stream) ? strerror(errno) : "Unexpected end of file");
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// 刷新文件流
|
||||
int err_fflush(FILE *stream)
|
||||
{
|
||||
|
|
|
|||
Loading…
Reference in New Issue