添加了kmer index,有效果,但是不明显,有可能是随着序列的边长,后边的扩展更耗时

This commit is contained in:
zzh 2024-02-03 10:06:51 +08:00
parent 5a055625c3
commit 914cbd34ab
6 changed files with 251 additions and 18 deletions

View File

@ -246,7 +246,7 @@ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_b
}
// 利用bwt搜索seed完整搜索只需要单向搜索
void bwt_search(bwt_t *bwt, const string &q)
bwtintv_t bwt_search(bwt_t *bwt, const string &q)
{
bwtintv_t ik, ok[4];
int i, c, x = 0;
@ -264,6 +264,7 @@ void bwt_search(bwt_t *bwt, const string &q)
ik.info = i + 1;
}
}
return ik;
}
// 扩展两次

2
bwt.h
View File

@ -57,7 +57,7 @@ bwt_t *restore_bwt(const char *fn);
void create_interval_occ_bwt(bwt_t *bwt);
void dump_bwt(const char *fn, const bwt_t *bwt);
// 利用bwt搜索seed完整搜索只需要单向搜索
void bwt_search(bwt_t *bwt, const string &q);
bwtintv_t bwt_search(bwt_t *bwt, const string &q);
// 每次扩展两步
bwtintv_t bwt_search2(bwt_t *bwt, const string &q);

View File

@ -138,6 +138,7 @@ void fmt_gen_cnt_table(uint32_t cnt_table[4][256])
}
}
// 将fmt结构数据写入到二进制文件
void dump_fmt(const char *fn, const FMTIndex *fmt)
{
FILE *fp;
@ -153,25 +154,47 @@ void dump_fmt(const char *fn, const FMTIndex *fmt)
err_fclose(fp);
}
// 将kmer hash数据写入到文件
void dump_kmer_idx(const char *fn, const KmerEntry *kmer_entry)
{
FILE *fp;
fp = xopen(fn, "wb");
err_fwrite(kmer_entry, 1, KMER_ARR_SIZE * sizeof(KmerEntry), fp);
err_fflush(fp);
err_fclose(fp);
}
// 从文件中读取kmer hash信息
KmerEntry *restore_kmer_idx(const char *fn)
{
FILE *fp = xopen(fn, "rb");
uint32_t kmer_bytes = KMER_ARR_SIZE * sizeof(KmerEntry);
KmerEntry *kmer_entry = (KmerEntry *)malloc(kmer_bytes);
fread_fix(fp, kmer_bytes, kmer_entry);
err_fclose(fp);
return kmer_entry;
}
// 从文件中读取fmt结构数据
FMTIndex *restore_fmt(const char *fn)
{
FMTIndex *fmt;
fmt = (FMTIndex *)calloc(1, sizeof(FMTIndex));
FILE *fp = fopen(fn, "rb");
FILE *fp = xopen(fn, "rb");
fseek(fp, 0, SEEK_END);
fmt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 6 - 3) >> 2; // 以32位word为单位计算的size
fmt->bwt = (uint32_t *)calloc(fmt->bwt_size, 4);
fseek(fp, 0, SEEK_SET);
fread(&fmt->primary, sizeof(bwtint_t), 1, fp);
fread(&fmt->sec_primary, sizeof(bwtint_t), 1, fp);
fread(&fmt->sec_bcp, sizeof(uint8_t), 1, fp);
fread(&fmt->first_base, sizeof(uint8_t), 1, fp);
fread(&fmt->last_base, sizeof(uint8_t), 1, fp);
fread(fmt->L2 + 1, sizeof(bwtint_t), 4, fp);
err_fread_noeof(&fmt->primary, sizeof(bwtint_t), 1, fp);
err_fread_noeof(&fmt->sec_primary, sizeof(bwtint_t), 1, fp);
err_fread_noeof(&fmt->sec_bcp, sizeof(uint8_t), 1, fp);
err_fread_noeof(&fmt->first_base, sizeof(uint8_t), 1, fp);
err_fread_noeof(&fmt->last_base, sizeof(uint8_t), 1, fp);
err_fread_noeof(fmt->L2 + 1, sizeof(bwtint_t), 4, fp);
fread_fix(fp, fmt->bwt_size << 2, fmt->bwt);
fmt->seq_len = fmt->L2[4];
fclose(fp);
err_fclose(fp);
fmt_gen_cnt_occ(fmt); // 字节所能表示的各种碱基组合中,各个碱基的累积数量
return fmt;
}
@ -520,6 +543,138 @@ bwtintv_t fmt_search(FMTIndex *fmt, const string &q)
return ik;
}
// 获取kmer的fmt匹配信息
inline void kmer_getval_at(KmerEntry *ke, bwtintv_t *ok, int pos)
{
bwtint_t x0, x1, x2;
int byte_idx = pos * 14;
uint8_t *arr = ke->intv_arr + byte_idx;
x0 = *arr;
x0 = (x0 << 32) | *((uint32_t *)(arr + 1));
arr += 5;
x1 = *arr;
x1 = (x1 << 32) | *((uint32_t *)(arr + 1));
arr += 5;
x2 = *((uint32_t *)arr);
ok->x[0] = x0;
ok->x[1] = x1;
ok->x[2] = x2;
}
// 设置kmer第pos个碱基对应的fmt匹配信息
inline void kmer_setval_at(KmerEntry *ke, bwtintv_t ik, int pos)
{
int byte_idx = pos * 14;
uint8_t *arr = ke->intv_arr + byte_idx;
arr[0] = (uint8_t)(ik.x[0] >> 32);
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[0];
arr += 5;
arr[0] = (uint8_t)(ik.x[1] >> 32);
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[1];
arr += 5;
*((uint32_t *)arr) = (uint32_t)ik.x[2];
}
// 查找并保存kmer中每扩展一个碱基对应的fmt位置信息
void fmt_search_store_kmer(FMTIndex *fmt, const string &q, KmerEntry &ke)
{
bwtintv_t ik;
int i, c1, c2;
int qlen = (int)q.size();
bwtint_t tk[4], tl[4];
fmt_set_intv(fmt, bval(q[0]), ik);
kmer_setval_at(&ke, ik, 0);
// 每次扩展两个碱基
for (i = 1; i + 1 < qlen; i += 2)
{
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
{
c1 = cbval(q[i]);
c2 = cbval(q[i + 1]);
fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk);
fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl);
// 第一次扩展的结果
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
ik.x[1] = fmt->L2[c1] + 1 + tk[1];
ik.x[2] = tl[1] - tk[1];
kmer_setval_at(&ke, ik, i);
// 第二次扩展的结果
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[2] - tk[2];
ik.x[1] = fmt->L2[c2] + 1 + tk[3];
ik.x[2] = tl[3] - tk[3];
kmer_setval_at(&ke, ik, i + 1);
}
else
{
break;
}
}
if (i < qlen && bval(q[i]) < 4)
{ // 最后一次扩展
c1 = cbval(q[i]);
c2 = 3;
fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk);
fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl);
// 第一次扩展的结果
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
ik.x[1] = fmt->L2[c1] + 1 + tk[1];
ik.x[2] = tl[1] - tk[1];
kmer_setval_at(&ke, ik, i);
}
}
// 利用fmt搜索seed利用kmer加速搜索过程
bwtintv_t fmt_search_use_kmer(FMTIndex *fmt, const string &q)
{
bwtintv_t ik;
bwtintv_t ok;
int i, c1, c2, x = 0;
int qlen = (int)q.size();
// 先利用kmer进行索引查询
uint32_t qbit = 0;
x = min(KMER_LEN, qlen);
double tt = realtime();
for (i = 0; i < x; ++i)
{
qbit |= bval(q[i]) << ((KMER_LEN - 1 - i) << 1);
}
kmer_getval_at(&fmt->kmer_entry[qbit], &ik, x - 1);
t1 += realtime() - tt;
tt = realtime();
// 每次扩展两个碱基
for (i = x; i + 1 < qlen; i += 2)
{
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
{
c1 = cbval(q[i]);
c2 = cbval(q[i + 1]);
fmt_extend2(fmt, &ik, &ok, 0, c1, c2);
ik = ok;
ik.info = i + 1;
}
else
{
break;
}
}
if (i < qlen && bval(q[i]) < 4)
{ // 最后一次扩展
c1 = cbval(q[i]);
fmt_extend1(fmt, &ik, &ok, 0, c1);
ik = ok;
ik.info = i + 1;
}
t2 += realtime() - tt;
return ik;
}
// 将用字符串表示的序列计算成用2bit表示的序列
uint32_t str2bit(string &str)
{
uint32_t pac = 0;
@ -528,8 +683,24 @@ uint32_t str2bit(string &str)
return pac;
}
// 生成所有KMER_LEN长度的序列字符串表示
void gen_all_seq(vector<string> &vseq, int kmer_len)
{
uint32_t seq_up_val = (1 << (kmer_len << 1));
vseq.clear();
vseq.resize(seq_up_val);
for (uint32_t i = 0; i < seq_up_val; ++i)
{
for (int j = kmer_len - 1; j >= 0; --j)
{
vseq[i].push_back(BASE[(i >> (j << 1)) & 3]);
}
}
}
#define GEN_BWT_IDX 0
#define GEN_FMT_IDX 0
#define GEN_KMER_IDX 0
#define CMP_FMT_BWT_TIME 1
#define CMP_FMT_BWT_RESULT 1
@ -538,21 +709,24 @@ int main_fmtidx(int argc, char **argv)
{
string prefix = argv[1];
string bwt_str = prefix + ".bwt.str";
string bwt_idx, fmt_idx;
string bwt_idx, fmt_idx, kmer_idx;
bwt_t *bwt;
FMTIndex *fmt;
ostringstream oss_bwt, oss_fmt;
ostringstream oss_bwt, oss_fmt, oss_kmer;
oss_bwt << '.' << OCC_INTERVAL;
oss_fmt << '.' << FMT_OCC_INTERVAL;
oss_kmer << '.' << KMER_LEN;
#ifdef FMT_MID_INTERVAL
oss_fmt << '.' << FMT_MID_INTERVAL;
#endif
bwt_idx = prefix + oss_bwt.str() + ".bwt";
fmt_idx = prefix + oss_fmt.str() + ".fmt";
kmer_idx = prefix + oss_kmer.str() + ".kmer";
cout << bwt_idx << endl;
cout << fmt_idx << endl;
cout << kmer_idx << endl;
// 生成或读取bwt索引文件
double time_read_bwt = realtime();
@ -577,10 +751,34 @@ int main_fmtidx(int argc, char **argv)
time_read_fmt = realtime() - time_read_fmt;
cout << "[time gen/read fmt:] " << time_read_fmt << "s" << endl;
// 生成随机序列进行测试
// 生成或读取kmer信息
vector<string> vkmer;
double time_gen_kmer = realtime();
#if GEN_KMER_IDX
gen_all_seq(vkmer, KMER_LEN);
fmt->kmer_entry = (KmerEntry*)malloc(KMER_ARR_SIZE * sizeof(KmerEntry));
for (size_t i = 0; i < vkmer.size(); ++i)
{
fmt_search_store_kmer(fmt, vkmer[i], fmt->kmer_entry[i]);
bwtintv_t p1;
kmer_getval_at(&fmt->kmer_entry[i], &p1, KMER_LEN - 1);
bwtintv_t p2 = bwt_search(bwt, vkmer[i]);
if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2])
cout << vkmer[i] << endl
<< p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl
<< p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl;
}
dump_kmer_idx(kmer_idx.c_str(), fmt->kmer_entry);
#else
fmt->kmer_entry = restore_kmer_idx(kmer_idx.c_str());
#endif
time_gen_kmer = realtime() - time_gen_kmer;
cout << "[time gen kmer:] " << time_gen_kmer << "s" << endl;
// 生成随机序列进行测试
int seq_size = 10000000;
// int seq_size = 1;
int seq_len = 11;
int seq_len = 19;
vector<string> seq_arr(seq_size);
// seq_arr[0] = "GCGATACTAAGA";
// seq_arr[0] = "GAGAGCTGTTCCCGTGTTTTCCATGGTTT";
@ -606,15 +804,25 @@ int main_fmtidx(int argc, char **argv)
fmt_search(fmt, seq_arr[i]);
time_fmt_search = realtime() - time_fmt_search;
cout << "[time fmt search:] " << time_fmt_search << "s" << endl;
double time_fmt_kmer_search = realtime();
for (int i = 0; i < (int)seq_arr.size(); ++i)
fmt_search_use_kmer(fmt, seq_arr[i]);
time_fmt_kmer_search = realtime() - time_fmt_kmer_search;
cout << "[time fmt kmer search:] " << time_fmt_kmer_search << "s" << endl;
cout << "t1: " << t1 << "s; t2: " << t2 << "s" << endl;
#endif
// 对比bwt和fmt搜索的结果
#if CMP_FMT_BWT_RESULT
double time_cmp = realtime();
double time_cmp = realtime();
for (int i = 0; i < (int)seq_arr.size() / 100; ++i)
{
bwtintv_t p1 = bwt_search2(bwt, seq_arr[i]);
bwtintv_t p2 = fmt_search(fmt, seq_arr[i]);
// bwtintv_t p2 = fmt_search(fmt, seq_arr[i]);
bwtintv_t p2 = fmt_search_use_kmer(fmt, seq_arr[i]);
if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2])
cout << seq_arr[i] << endl
<< p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl
@ -625,4 +833,4 @@ int main_fmtidx(int argc, char **argv)
#endif
return 0;
}
}

View File

@ -5,7 +5,7 @@
#define FMT_OCC_INTERVAL (1LL << FMT_OCC_INTV_SHIFT)
#define FMT_OCC_INTV_MASK (FMT_OCC_INTERVAL - 1)
#define FMT_MID_INTV_SHIFT 5
#define FMT_MID_INTV_SHIFT 4
#define FMT_MID_INTERVAL (1LL << FMT_MID_INTV_SHIFT)
#define FMT_MID_INTV_MASK (FMT_MID_INTERVAL - 1)
@ -38,6 +38,15 @@
#define __fmt_mid_sum(x) \
((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff)
#define KMER_LEN 10
#define KMER_ARR_SIZE ((1 << (KMER_LEN << 1)))
// 用来保存kmer对应的fmt的位置信息
struct KmerEntry
{
// 40+40+32 14个byte这样好处理
uint8_t intv_arr[14 * KMER_LEN]; // 保存kmer中每扩展一个碱基对应的bwtintv_t数据
};
// fm-index, extend twice in one search step (one memory access)
struct FMTIndex
{
@ -52,6 +61,8 @@ struct FMTIndex
uint8_t sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15
uint8_t first_base; // 序列的第一个碱基2bit的int类型0,1,2,3
uint8_t last_base; // dollar转换成的base
// 保存kmer对应的fmt位置信息
KmerEntry *kmer_entry;
// suffix array
int sa_intv;
bwtint_t n_sa;

View File

@ -115,6 +115,17 @@ size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
return ret;
}
// 读取单个数据
size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream)
{
size_t ret = fread(ptr, size, nmemb, stream);
if (ret != nmemb)
{
_err_fatal_simple("fread", ferror(stream) ? strerror(errno) : "Unexpected end of file");
}
return ret;
}
// 刷新文件流
int err_fflush(FILE *stream)
{

2
util.h
View File

@ -23,6 +23,8 @@ struct bwtintv_t
bwtint_t x[3], info; // x[0]表示正链位置x[1]表示互补链位置x[2]表示间隔长度info 表示read的起始结束位置
};
// 读取单个数据
size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream);
// 读取二进制数据
bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a);
// 给出问题信息并终止程序