添加了kmer_bit,用来检测特定长度的kmer有没有对应的fm-index匹配

This commit is contained in:
zzh 2024-02-13 11:39:25 +08:00
parent 1b77566567
commit b0cfa3630e
4 changed files with 408 additions and 35 deletions

2
.vscode/launch.json vendored
View File

@ -8,7 +8,7 @@
"name": "fmtidx",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/fmtidx",
"program": "${workspaceRoot}/bwa_perf",
"args": [
"/home/zzh/reference/human_g1k_v37_decoy.fasta"
],

4
bwt.h
View File

@ -61,4 +61,8 @@ bwtintv_t bwt_search(bwt_t *bwt, const string &q);
// 每次扩展两步
bwtintv_t bwt_search2(bwt_t *bwt, const string &q);
void bwt_extend2(const bwt_t *bwt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, int c1, int c2);
void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back);
#endif

View File

@ -195,6 +195,46 @@ FullEntry *restore_full_entry_idx(const char *fn)
err_fclose(fp);
return full_entry;
}
// 将full kmer hash数据写入到文件
void dump_xmer_idx(const char *fn, const XmerHash *xh)
{
FILE *fp;
fp = xopen(fn, "wb");
err_fwrite(xh->xe10, 1, (1 << (10 << 1)) * sizeof(XmerEntryArr), fp);
err_fwrite(xh->xe11, 1, (1 << (11 << 1)) * sizeof(XmerEntry), fp);
err_fwrite(xh->xe12, 1, (1 << (12 << 1)) * sizeof(XmerEntry), fp);
err_fwrite(xh->xe13, 1, (1 << (13 << 1)) * sizeof(XmerEntry), fp);
err_fwrite(xh->xe14, 1, (1 << (14 << 1)) * sizeof(XmerEntry), fp);
err_fflush(fp);
err_fclose(fp);
}
// 从文件中读取xmer hash信息
XmerHash restore_xmer_idx(const char *fn)
{
FILE *fp = xopen(fn, "rb");
XmerHash xhash;
XmerHash *xh = &xhash;
int len = 1 << (10 << 1);
xh->xe10 = (XmerEntryArr *)malloc(len * sizeof(XmerEntryArr));
fread_fix(fp, len * sizeof(XmerEntryArr), xh->xe10);
len = 1 << (11 << 1);
xh->xe11 = (XmerEntry *)malloc(len * sizeof(XmerEntry));
fread_fix(fp, len * sizeof(XmerEntry), xh->xe11);
len = 1 << (12 << 1);
xh->xe12 = (XmerEntry *)malloc(len * sizeof(XmerEntry));
fread_fix(fp, len * sizeof(XmerEntry), xh->xe12);
len = 1 << (13 << 1);
xh->xe13 = (XmerEntry *)malloc(len * sizeof(XmerEntry));
fread_fix(fp, len * sizeof(XmerEntry), xh->xe13);
len = 1 << (14 << 1);
xh->xe14 = (XmerEntry *)malloc(len * sizeof(XmerEntry));
fread_fix(fp, len * sizeof(XmerEntry), xh->xe14);
err_fclose(fp);
return xhash;
}
// 从文件中读取fmt结构数据
FMTIndex *restore_fmt(const char *fn)
{
@ -398,7 +438,7 @@ FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
}
// 扩展两个个碱基计算bwt base为b的pre-bwt str中各个碱基的occ
void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4])
inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4])
{
uint32_t x = 0;
uint32_t *p, *q, tmp;
@ -694,7 +734,6 @@ bwtintv_t fmt_search_use_kmer(FMTIndex *fmt, const string &q)
}
kmer_getval_at(&fmt->kmer_entry[qbit], &ik, x - 1);
//t1 += realtime() - tt;
// tt = realtime();
// 每次扩展两个碱基
for (i = x; i + 1 < qlen; i += 2)
@ -726,10 +765,10 @@ bwtintv_t fmt_search_use_kmer(FMTIndex *fmt, const string &q)
}
// 利用fmt搜索seed利用full entry加速搜索过程
bwtintv_t fmt_search_use_full_entry(FMTIndex *fmt, const string &q)
bwtintv_t fmt_search_use_full_entry(FMTIndex *fmt, bwt_t *bwt, const string &q)
{
bwtintv_t ik;
bwtintv_t ok;
int i, c1, c2, x = 0;
int qlen = (int)q.size();
@ -742,6 +781,244 @@ bwtintv_t fmt_search_use_full_entry(FMTIndex *fmt, const string &q)
}
full_entry_get(&fmt->full_entry[qbit], &ik);
#if 0
bwtintv_t ok;
// 每次扩展两个碱基
for (i = x; i + 1 < qlen; i += 2)
{
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
{
c1 = cbval(q[i]);
c2 = cbval(q[i + 1]);
fmt_extend2(fmt, &ik, &ok, 0, c1, c2);
ik = ok;
ik.info = i + 1;
}
else
{
break;
}
}
if (i < qlen && bval(q[i]) < 4)
{ // 最后一次扩展
c1 = cbval(q[i]);
fmt_extend1(fmt, &ik, &ok, 0, c1);
ik = ok;
ik.info = i + 1;
}
#else
bwtintv_t ok[4];
for (i = x; i < qlen; ++i)
{
if (bval(q[i]) < 4)
{
c1 = cbval(q[i]);
bwt_extend(bwt, &ik, ok, 0);
ik = ok[c1];
ik.info = i + 1;
}
}
#endif
return ik;
}
// 将用字符串表示的序列计算成用2bit表示的序列
uint32_t str2bit(string &str)
{
uint32_t pac = 0;
for (int i = 0; i < 16; ++i)
pac = (pac << 2) | bval(str[i]);
return pac;
}
// 生成所有KMER_LEN长度的序列字符串表示
void gen_all_seq(vector<string> &vseq, int kmer_len)
{
uint64_t seq_up_val = (1 << (kmer_len << 1));
vseq.clear();
vseq.resize(seq_up_val);
for (uint64_t i = 0; i < seq_up_val; ++i)
{
for (int j = kmer_len - 1; j >= 0; --j)
{
vseq[i].push_back(BASE[(i >> (j << 1)) & 3]);
}
}
}
// 获取xmer的fmt匹配信息
inline void xmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos = 0)
{
bwtint_t x0, x1, x2;
int byte_idx = pos * 14;
uint8_t *arr = mem_addr + byte_idx;
x0 = *arr;
x0 = (x0 << 32) | *((uint32_t *)(arr + 1));
arr += 5;
x1 = *arr;
x1 = (x1 << 32) | *((uint32_t *)(arr + 1));
arr += 5;
x2 = *((uint32_t *)arr);
ok->x[0] = x0;
ok->x[1] = x1;
ok->x[2] = x2;
}
// 设置xmer第pos个碱基对应的fmt匹配信息
inline void xmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos = 0)
{
int byte_idx = pos * 14;
uint8_t *arr = mem_addr + byte_idx;
arr[0] = (uint8_t)(ik.x[0] >> 32);
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[0];
arr += 5;
arr[0] = (uint8_t)(ik.x[1] >> 32);
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[1];
arr += 5;
*((uint32_t *)arr) = (uint32_t)ik.x[2];
}
// 创建xmer hash索引
void create_xmer_index(FMTIndex *fmt)
{
vector<string> qarr;
gen_all_seq(qarr, 10);
bwtintv_t ik;
int i, j, c1, c2;
int qlen = 10;
bwtint_t tk[4], tl[4];
XmerHash *xh = &fmt->xmer_hash;
cout << "qlen: " << qlen << endl;
xh->xe10 = (XmerEntryArr *)malloc((1 << (10 << 1)) * sizeof(XmerEntryArr));
xh->xe11 = (XmerEntry *)malloc((1 << (11 << 1)) * sizeof(XmerEntry));
xh->xe12 = (XmerEntry *)malloc((1 << (12 << 1)) * sizeof(XmerEntry));
xh->xe13 = (XmerEntry *)malloc((1 << (13 << 1)) * sizeof(XmerEntry));
xh->xe14 = (XmerEntry *)malloc((1 << (14 << 1)) * sizeof(XmerEntry));
// 长度为10的kmer
for (j = 0; j < (int)qarr.size(); ++j)
{
string &q = qarr[j];
uint8_t *mem_addr = xh->xe10[j].intv_arr;
fmt_set_intv(fmt, bval(q[0]), ik);
xmer_setval_at(mem_addr, ik, 0);
// 每次扩展两个碱基
for (i = 1; i + 1 < qlen; i += 2)
{
c1 = cbval(q[i]);
c2 = cbval(q[i + 1]);
fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk);
fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl);
// 第一次扩展的结果
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
ik.x[1] = fmt->L2[c1] + 1 + tk[1];
ik.x[2] = tl[1] - tk[1];
xmer_setval_at(mem_addr, ik, i);
// 第二次扩展的结果
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[2] - tk[2];
ik.x[1] = fmt->L2[c2] + 1 + tk[3];
ik.x[2] = tl[3] - tk[3];
xmer_setval_at(mem_addr, ik, i + 1);
}
{ // 最后一次扩展
c1 = cbval(q[i]);
c2 = 3;
fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk);
fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl);
// 第一次扩展的结果
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
ik.x[1] = fmt->L2[c1] + 1 + tk[1];
ik.x[2] = tl[1] - tk[1];
xmer_setval_at(mem_addr, ik, i);
}
}
// 长度11的kmer
gen_all_seq(qarr, 11);
for (j = 0; j < (int)qarr.size(); ++j)
{
bwtintv_t p = fmt_search(fmt, qarr[j]);
xmer_setval_at(fmt->xmer_hash.xe11[j].intv_arr, p);
}
// 长度12的kmer
gen_all_seq(qarr, 12);
for (j = 0; j < (int)qarr.size(); ++j)
{
bwtintv_t p = fmt_search(fmt, qarr[j]);
xmer_setval_at(fmt->xmer_hash.xe12[j].intv_arr, p);
}
//string q = "CACATATTGGCG";
//uint32_t qbit = 0;
//for (i = 0; i < 12; ++i)
//{
// qbit |= bval(q[i]) << ((11 - i) << 1);
//}
//cout << "qbit: " << qbit << endl;
//xmer_getval_at(fmt->xmer_hash.xe12[qbit].intv_arr, &ik);
// 长度13的kmer
gen_all_seq(qarr, 13);
for (j = 0; j < (int)qarr.size(); ++j)
{
bwtintv_t p = fmt_search(fmt, qarr[j]);
xmer_setval_at(fmt->xmer_hash.xe13[j].intv_arr, p);
}
// 长度14的kmer
gen_all_seq(qarr, 14);
for (j = 0; j < (int)qarr.size(); ++j)
{
bwtintv_t p = fmt_search(fmt, qarr[j]);
xmer_setval_at(fmt->xmer_hash.xe14[j].intv_arr, p);
}
}
// 利用fmt搜索seed利用xmer加速搜索过程
bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, const string &q)
{
bwtintv_t ik;
bwtintv_t ok;
int i, c1, c2, x = 0;
int qlen = (int)q.size();
int max_xmer_len = 14;
// 先利用kmer进行索引查询
uint32_t qbit = 0;
x = min(max_xmer_len, qlen);
for (i = 0; i < x; ++i)
{
qbit |= bval(q[i]) << ((max_xmer_len - 1 - i) << 1);
}
if (x == 14)
{
xmer_getval_at(fmt->xmer_hash.xe14[qbit].intv_arr, &ik);
}
else if (x == 13)
{
xmer_getval_at(fmt->xmer_hash.xe13[qbit >> 2].intv_arr, &ik);
}
else if (x == 12)
{
// cout << "qbit: " << qbit << endl;
xmer_getval_at(fmt->xmer_hash.xe12[qbit >> 4].intv_arr, &ik);
}
else if (x == 11)
{
xmer_getval_at(fmt->xmer_hash.xe11[qbit >> 6].intv_arr, &ik);
}
else
{
xmer_getval_at(fmt->xmer_hash.xe10[qbit >> 8].intv_arr, &ik, x - 1);
}
// 每次扩展两个碱基
for (i = x; i + 1 < qlen; i += 2)
{
@ -768,43 +1045,70 @@ bwtintv_t fmt_search_use_full_entry(FMTIndex *fmt, const string &q)
return ik;
}
// 将用字符串表示的序列计算成用2bit表示的序列
uint32_t str2bit(string &str)
uint16_t *restore_kmer_bit(const char *fn, int kmer_len)
{
uint32_t pac = 0;
for (int i = 0; i < 16; ++i)
pac = (pac << 2) | bval(str[i]);
return pac;
uint16_t *kbit = (uint16_t *)calloc(1L << ((kmer_len << 1) - 4), sizeof(uint16_t));
FILE *fp;
fp = xopen(fn, "rb");
fread_fix(fp, (1 << ((kmer_len << 1) - 4)) * sizeof(uint16_t), kbit);
err_fclose(fp);
return kbit;
}
// 生成所有KMER_LEN长度的序列字符串表示
void gen_all_seq(vector<string> &vseq, int kmer_len)
void create_kmer_bit(FMTIndex *fmt, string &prefix, uint64_t kmer_len)
{
uint32_t seq_up_val = (1 << (kmer_len << 1));
vseq.clear();
vseq.resize(seq_up_val);
for (uint32_t i = 0; i < seq_up_val; ++i)
uint16_t *kbit = (uint16_t *)calloc(1L << ((kmer_len << 1) - 4), sizeof(uint16_t));
ostringstream oss;
oss << '.' << kmer_len << ".kmer.bit";
string kmer_bit = prefix + oss.str();
cout << kmer_bit << endl;
uint64_t seq_up_val = (1L << (kmer_len << 1));
cout << (1L << ((kmer_len << 1) - 4)) << '\t' << seq_up_val << endl;
string seq;
seq.resize(kmer_len);
int percent = 1;
double tt = realtime();
for (uint64_t i = 0; i < seq_up_val; ++i)
{
if ((i+1) % (seq_up_val / 100) == 0)
{
tt = realtime() - tt;
cout << percent++ << "\% finished, " << tt << "s elapsed." << endl;
tt = realtime();
}
for (int j = kmer_len - 1; j >= 0; --j)
{
vseq[i].push_back(BASE[(i >> (j << 1)) & 3]);
seq[kmer_len - 1 - j] = BASE[(i >> (j << 1)) & 3];
}
uint32_t pos = i >> 4;
uint32_t bit_pos = i & 0xf;
bwtintv_t p = fmt_search_use_xmer(fmt, seq);
if (p.x[2] > 0) {
kbit[pos] |= 1 << bit_pos;
}
}
FILE *fp;
fp = xopen(kmer_bit.c_str(), "wb");
err_fwrite(kbit, 2, 1L << ((kmer_len << 1) - 4), fp);
err_fflush(fp);
err_fclose(fp);
}
#define GEN_BWT_IDX 0
#define GEN_FMT_IDX 0
#define GEN_KMER_IDX 0
#define GEN_XMER_IDX 0
#define GEN_FULL_ENTRY_IDX 0
#define CMP_FMT_BWT_TIME 1
#define CMP_FMT_BWT_TIME 0
#define CMP_FMT_BWT_RESULT 1
// argv[1] 应该是索引的前缀
int main_fmtidx(int argc, char **argv)
{
//cout << "size: " << sizeof(FullEntry) << '\t' << sizeof(XmerEntryArr) << endl;
string prefix = argv[1];
string bwt_str = prefix + ".bwt.str";
string bwt_idx, fmt_idx, kmer_idx, full_entry_idx;
string bwt_idx, fmt_idx, kmer_idx, full_entry_idx, xmer_idx;
bwt_t *bwt;
FMTIndex *fmt;
ostringstream oss_bwt, oss_fmt, oss_kmer, oss_full_entry;
@ -820,11 +1124,14 @@ int main_fmtidx(int argc, char **argv)
fmt_idx = prefix + oss_fmt.str() + ".fmt";
kmer_idx = prefix + oss_kmer.str() + ".kmer";
full_entry_idx = prefix + oss_full_entry.str() + ".full_entry";
xmer_idx = prefix + ".14.xmer";
cout << bwt_idx << endl;
cout << fmt_idx << endl;
cout << kmer_idx << endl;
cout << full_entry_idx << endl;
cout << xmer_idx << endl;
// 生成或读取bwt索引文件
double time_read_bwt = realtime();
@ -896,14 +1203,29 @@ int main_fmtidx(int argc, char **argv)
time_gen_full_entry = realtime() - time_gen_full_entry;
cout << "[time gen full kmer entry:] " << time_gen_full_entry << "s" << endl;
// 生成xmer索引文件
double time_gen_xmer_hash = realtime();
#if GEN_XMER_IDX
create_xmer_index(fmt);
dump_xmer_idx(xmer_idx.c_str(), &fmt->xmer_hash);
#else
fmt->xmer_hash = restore_xmer_idx(xmer_idx.c_str());
#endif
time_gen_xmer_hash = realtime() - time_gen_xmer_hash;
cout << "[time gen xkmer hash:] " << time_gen_xmer_hash << "s" << endl;
int kmer_bit_len = 18;
//create_kmer_bit(fmt, prefix, kmer_bit_len);
// return 0;
// 生成随机序列进行测试
int seq_size = 10000000;
//int seq_size = 1;
int seq_len = 14;
int seq_len = kmer_bit_len;
vector<string> seq_arr(seq_size);
// seq_arr[0] = "GCGATACTAAGA";
seq_arr[0] = "CACATATTGGCG";
// seq_arr[0] = "GAGAGCTGTTCCCGTGTTTTCCATGGTTT";
seq_arr[0] = "ACCTTTCGAATTGA";
// seq_arr[0] = "ACCTTTCGAATTGA";
#if 1
srand(time(NULL));
double time_gen_seq = realtime();
@ -914,17 +1236,20 @@ int main_fmtidx(int argc, char **argv)
#endif
// 对比bwt和fmt搜索的时间
bwtint_t max_gap = 0;
#if CMP_FMT_BWT_TIME
// int nintv_0 = 0;
int nintv_0 = 0;
//bwtint_t max_gap = 0;
double time_bwt_search = realtime();
for (int i = 0; i < (int)seq_arr.size(); ++i) {
bwtintv_t p = bwt_search(bwt, seq_arr[i]);
if (max_gap < p.x[2])
max_gap = p.x[2];
//if (max_gap < p.x[2])
// max_gap = p.x[2];
if (p.x[2] == 0)
++nintv_0;
}
// cout << "all: " << seq_arr.size() << " zero: " << nintv_0 << endl;
cout << "max gap: " << max_gap << endl;
cout << "all: " << seq_arr.size() << " zero: " << nintv_0 << endl;
//cout << "max gap: " << max_gap << endl;
time_bwt_search = realtime() - time_bwt_search;
cout << "[time bwt search:] " << time_bwt_search << "s" << endl;
@ -942,15 +1267,23 @@ int main_fmtidx(int argc, char **argv)
double time_fmt_full_entry_search = realtime();
for (int i = 0; i < (int)seq_arr.size(); ++i)
fmt_search_use_full_entry(fmt, seq_arr[i]);
fmt_search_use_full_entry(fmt, bwt, seq_arr[i]);
time_fmt_full_entry_search = realtime() - time_fmt_full_entry_search;
cout << "[time fmt full entry search:] " << time_fmt_full_entry_search << "s" << endl;
double time_xmer_search = realtime();
for (int i = 0; i < (int)seq_arr.size(); ++i)
fmt_search_use_xmer(fmt, seq_arr[i]);
time_xmer_search = realtime() - time_xmer_search;
cout << "[time xmer search:] " << time_xmer_search << "s" << endl;
#endif
// cout << "t1: " << t1 << "s; t2: " << t2 << "s" << endl;
// cout << "t3: " << t3 << "s; t4: " << t4 << "s" << endl;
// cout << "t5: " << t5 << "s; t4: " << t6 << "s" << endl;
#endif
ostringstream oss_kmer_bit;
oss_kmer_bit << prefix << "." << kmer_bit_len << ".kmer.bit";
uint16_t *kbit = restore_kmer_bit(oss_kmer_bit.str().c_str(), kmer_bit_len);
// 对比bwt和fmt搜索的结果
#if CMP_FMT_BWT_RESULT
@ -959,11 +1292,25 @@ int main_fmtidx(int argc, char **argv)
{
bwtintv_t p1 = bwt_search2(bwt, seq_arr[i]);
// bwtintv_t p2 = fmt_search(fmt, seq_arr[i]);
bwtintv_t p2 = fmt_search_use_kmer(fmt, seq_arr[i]);
// bwtintv_t p2 = fmt_search_use_kmer(fmt, seq_arr[i]);
bwtintv_t p2 = fmt_search_use_xmer(fmt, seq_arr[i]);
// bwtintv_t p2 = fmt_search_use_full_entry(fmt, seq_arr[i]);
if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2])
cout << seq_arr[i] << endl
<< p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl
<< p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl;
uint64_t qbit = 0;
for (int k = 0; k < kmer_bit_len; ++k)
{
uint64_t base = bval(seq_arr[i][k]);
qbit |= base << ((kmer_bit_len - 1 - k) << 1);
}
uint32_t pos = qbit >> 4;
uint32_t bit_pos = qbit & 0xf;
//cout << pos << '\t' << bit_pos << '\t' << kbit[pos] << '\t' << (kbit[pos] & (1 << bit_pos)) << endl;
if ((p2.x[2] >0) != ((kbit[pos] & (1 << bit_pos)) > 0))
cout << "diff: " << seq_arr[i] << '\t' << p2.x[2] << '\t' << (kbit[pos] & (1 << bit_pos)) << endl;
}
time_cmp = realtime() - time_cmp;
cout << "[time compare bwt & fmt:] " << time_cmp << "s" << endl;

View File

@ -55,6 +55,27 @@ struct FullEntry
uint8_t intv_arr[14];
};
// x-mer entry, 1 - 14个碱基组成的hash key
struct XmerEntry
{
uint8_t intv_arr[14];
};
struct XmerEntryArr
{
uint8_t intv_arr[140];
};
struct XmerHash
{
XmerEntryArr *xe10;
XmerEntry *xe11;
XmerEntry *xe12;
XmerEntry *xe13;
XmerEntry *xe14;
};
// fm-index, extend twice in one search step (one memory access)
struct FMTIndex
{
@ -72,6 +93,7 @@ struct FMTIndex
// 保存kmer对应的fmt位置信息
KmerEntry *kmer_entry;
FullEntry *full_entry;
XmerHash xmer_hash;
// suffix array
int sa_intv;
bwtint_t n_sa;