添加了kmer_bit,用来检测特定长度的kmer有没有对应的fm-index匹配
This commit is contained in:
parent
1b77566567
commit
b0cfa3630e
|
|
@ -8,7 +8,7 @@
|
|||
"name": "fmtidx",
|
||||
"type": "cppdbg",
|
||||
"request": "launch",
|
||||
"program": "${workspaceRoot}/fmtidx",
|
||||
"program": "${workspaceRoot}/bwa_perf",
|
||||
"args": [
|
||||
"/home/zzh/reference/human_g1k_v37_decoy.fasta"
|
||||
],
|
||||
|
|
|
|||
4
bwt.h
4
bwt.h
|
|
@ -61,4 +61,8 @@ bwtintv_t bwt_search(bwt_t *bwt, const string &q);
|
|||
// 每次扩展两步
|
||||
bwtintv_t bwt_search2(bwt_t *bwt, const string &q);
|
||||
|
||||
void bwt_extend2(const bwt_t *bwt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, int c1, int c2);
|
||||
|
||||
void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back);
|
||||
|
||||
#endif
|
||||
415
fmt_index.cpp
415
fmt_index.cpp
|
|
@ -195,6 +195,46 @@ FullEntry *restore_full_entry_idx(const char *fn)
|
|||
err_fclose(fp);
|
||||
return full_entry;
|
||||
}
|
||||
|
||||
// 将full kmer hash数据写入到文件
|
||||
void dump_xmer_idx(const char *fn, const XmerHash *xh)
|
||||
{
|
||||
FILE *fp;
|
||||
fp = xopen(fn, "wb");
|
||||
err_fwrite(xh->xe10, 1, (1 << (10 << 1)) * sizeof(XmerEntryArr), fp);
|
||||
err_fwrite(xh->xe11, 1, (1 << (11 << 1)) * sizeof(XmerEntry), fp);
|
||||
err_fwrite(xh->xe12, 1, (1 << (12 << 1)) * sizeof(XmerEntry), fp);
|
||||
err_fwrite(xh->xe13, 1, (1 << (13 << 1)) * sizeof(XmerEntry), fp);
|
||||
err_fwrite(xh->xe14, 1, (1 << (14 << 1)) * sizeof(XmerEntry), fp);
|
||||
err_fflush(fp);
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
||||
// 从文件中读取xmer hash信息
|
||||
XmerHash restore_xmer_idx(const char *fn)
|
||||
{
|
||||
FILE *fp = xopen(fn, "rb");
|
||||
XmerHash xhash;
|
||||
XmerHash *xh = &xhash;
|
||||
int len = 1 << (10 << 1);
|
||||
xh->xe10 = (XmerEntryArr *)malloc(len * sizeof(XmerEntryArr));
|
||||
fread_fix(fp, len * sizeof(XmerEntryArr), xh->xe10);
|
||||
len = 1 << (11 << 1);
|
||||
xh->xe11 = (XmerEntry *)malloc(len * sizeof(XmerEntry));
|
||||
fread_fix(fp, len * sizeof(XmerEntry), xh->xe11);
|
||||
len = 1 << (12 << 1);
|
||||
xh->xe12 = (XmerEntry *)malloc(len * sizeof(XmerEntry));
|
||||
fread_fix(fp, len * sizeof(XmerEntry), xh->xe12);
|
||||
len = 1 << (13 << 1);
|
||||
xh->xe13 = (XmerEntry *)malloc(len * sizeof(XmerEntry));
|
||||
fread_fix(fp, len * sizeof(XmerEntry), xh->xe13);
|
||||
len = 1 << (14 << 1);
|
||||
xh->xe14 = (XmerEntry *)malloc(len * sizeof(XmerEntry));
|
||||
fread_fix(fp, len * sizeof(XmerEntry), xh->xe14);
|
||||
err_fclose(fp);
|
||||
return xhash;
|
||||
}
|
||||
|
||||
// 从文件中读取fmt结构数据
|
||||
FMTIndex *restore_fmt(const char *fn)
|
||||
{
|
||||
|
|
@ -398,7 +438,7 @@ FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
|
|||
}
|
||||
|
||||
// 扩展两个个碱基,计算bwt base为b的pre-bwt str中各个碱基的occ
|
||||
void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4])
|
||||
inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4])
|
||||
{
|
||||
uint32_t x = 0;
|
||||
uint32_t *p, *q, tmp;
|
||||
|
|
@ -694,9 +734,8 @@ bwtintv_t fmt_search_use_kmer(FMTIndex *fmt, const string &q)
|
|||
}
|
||||
kmer_getval_at(&fmt->kmer_entry[qbit], &ik, x - 1);
|
||||
//t1 += realtime() - tt;
|
||||
|
||||
//tt = realtime();
|
||||
// 每次扩展两个碱基
|
||||
// tt = realtime();
|
||||
// 每次扩展两个碱基
|
||||
for (i = x; i + 1 < qlen; i += 2)
|
||||
{
|
||||
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
|
||||
|
|
@ -726,10 +765,10 @@ bwtintv_t fmt_search_use_kmer(FMTIndex *fmt, const string &q)
|
|||
}
|
||||
|
||||
// 利用fmt搜索seed,利用full entry加速搜索过程
|
||||
bwtintv_t fmt_search_use_full_entry(FMTIndex *fmt, const string &q)
|
||||
bwtintv_t fmt_search_use_full_entry(FMTIndex *fmt, bwt_t *bwt, const string &q)
|
||||
{
|
||||
bwtintv_t ik;
|
||||
bwtintv_t ok;
|
||||
|
||||
int i, c1, c2, x = 0;
|
||||
int qlen = (int)q.size();
|
||||
|
||||
|
|
@ -742,6 +781,244 @@ bwtintv_t fmt_search_use_full_entry(FMTIndex *fmt, const string &q)
|
|||
}
|
||||
full_entry_get(&fmt->full_entry[qbit], &ik);
|
||||
|
||||
#if 0
|
||||
bwtintv_t ok;
|
||||
// 每次扩展两个碱基
|
||||
for (i = x; i + 1 < qlen; i += 2)
|
||||
{
|
||||
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
|
||||
{
|
||||
c1 = cbval(q[i]);
|
||||
c2 = cbval(q[i + 1]);
|
||||
fmt_extend2(fmt, &ik, &ok, 0, c1, c2);
|
||||
ik = ok;
|
||||
ik.info = i + 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i < qlen && bval(q[i]) < 4)
|
||||
{ // 最后一次扩展
|
||||
c1 = cbval(q[i]);
|
||||
fmt_extend1(fmt, &ik, &ok, 0, c1);
|
||||
ik = ok;
|
||||
ik.info = i + 1;
|
||||
}
|
||||
#else
|
||||
bwtintv_t ok[4];
|
||||
for (i = x; i < qlen; ++i)
|
||||
{
|
||||
if (bval(q[i]) < 4)
|
||||
{
|
||||
c1 = cbval(q[i]);
|
||||
bwt_extend(bwt, &ik, ok, 0);
|
||||
ik = ok[c1];
|
||||
ik.info = i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
return ik;
|
||||
}
|
||||
|
||||
// 将用字符串表示的序列计算成用2bit表示的序列
|
||||
uint32_t str2bit(string &str)
|
||||
{
|
||||
uint32_t pac = 0;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
pac = (pac << 2) | bval(str[i]);
|
||||
return pac;
|
||||
}
|
||||
|
||||
// 生成所有KMER_LEN长度的序列,字符串表示
|
||||
void gen_all_seq(vector<string> &vseq, int kmer_len)
|
||||
{
|
||||
uint64_t seq_up_val = (1 << (kmer_len << 1));
|
||||
vseq.clear();
|
||||
vseq.resize(seq_up_val);
|
||||
for (uint64_t i = 0; i < seq_up_val; ++i)
|
||||
{
|
||||
for (int j = kmer_len - 1; j >= 0; --j)
|
||||
{
|
||||
vseq[i].push_back(BASE[(i >> (j << 1)) & 3]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 获取xmer的fmt匹配信息
|
||||
inline void xmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos = 0)
|
||||
{
|
||||
bwtint_t x0, x1, x2;
|
||||
int byte_idx = pos * 14;
|
||||
uint8_t *arr = mem_addr + byte_idx;
|
||||
x0 = *arr;
|
||||
x0 = (x0 << 32) | *((uint32_t *)(arr + 1));
|
||||
arr += 5;
|
||||
x1 = *arr;
|
||||
x1 = (x1 << 32) | *((uint32_t *)(arr + 1));
|
||||
arr += 5;
|
||||
x2 = *((uint32_t *)arr);
|
||||
ok->x[0] = x0;
|
||||
ok->x[1] = x1;
|
||||
ok->x[2] = x2;
|
||||
}
|
||||
|
||||
// 设置xmer第pos个碱基对应的fmt匹配信息
|
||||
inline void xmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos = 0)
|
||||
{
|
||||
int byte_idx = pos * 14;
|
||||
uint8_t *arr = mem_addr + byte_idx;
|
||||
arr[0] = (uint8_t)(ik.x[0] >> 32);
|
||||
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[0];
|
||||
arr += 5;
|
||||
arr[0] = (uint8_t)(ik.x[1] >> 32);
|
||||
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[1];
|
||||
arr += 5;
|
||||
*((uint32_t *)arr) = (uint32_t)ik.x[2];
|
||||
}
|
||||
|
||||
// 创建xmer hash索引
|
||||
void create_xmer_index(FMTIndex *fmt)
|
||||
{
|
||||
vector<string> qarr;
|
||||
gen_all_seq(qarr, 10);
|
||||
bwtintv_t ik;
|
||||
int i, j, c1, c2;
|
||||
int qlen = 10;
|
||||
bwtint_t tk[4], tl[4];
|
||||
XmerHash *xh = &fmt->xmer_hash;
|
||||
|
||||
cout << "qlen: " << qlen << endl;
|
||||
|
||||
xh->xe10 = (XmerEntryArr *)malloc((1 << (10 << 1)) * sizeof(XmerEntryArr));
|
||||
xh->xe11 = (XmerEntry *)malloc((1 << (11 << 1)) * sizeof(XmerEntry));
|
||||
xh->xe12 = (XmerEntry *)malloc((1 << (12 << 1)) * sizeof(XmerEntry));
|
||||
xh->xe13 = (XmerEntry *)malloc((1 << (13 << 1)) * sizeof(XmerEntry));
|
||||
xh->xe14 = (XmerEntry *)malloc((1 << (14 << 1)) * sizeof(XmerEntry));
|
||||
|
||||
// 长度为10的kmer
|
||||
for (j = 0; j < (int)qarr.size(); ++j)
|
||||
{
|
||||
string &q = qarr[j];
|
||||
uint8_t *mem_addr = xh->xe10[j].intv_arr;
|
||||
fmt_set_intv(fmt, bval(q[0]), ik);
|
||||
xmer_setval_at(mem_addr, ik, 0);
|
||||
|
||||
// 每次扩展两个碱基
|
||||
for (i = 1; i + 1 < qlen; i += 2)
|
||||
{
|
||||
c1 = cbval(q[i]);
|
||||
c2 = cbval(q[i + 1]);
|
||||
|
||||
fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk);
|
||||
fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl);
|
||||
// 第一次扩展的结果
|
||||
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
|
||||
ik.x[1] = fmt->L2[c1] + 1 + tk[1];
|
||||
ik.x[2] = tl[1] - tk[1];
|
||||
xmer_setval_at(mem_addr, ik, i);
|
||||
|
||||
// 第二次扩展的结果
|
||||
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[2] - tk[2];
|
||||
ik.x[1] = fmt->L2[c2] + 1 + tk[3];
|
||||
ik.x[2] = tl[3] - tk[3];
|
||||
xmer_setval_at(mem_addr, ik, i + 1);
|
||||
}
|
||||
{ // 最后一次扩展
|
||||
c1 = cbval(q[i]);
|
||||
c2 = 3;
|
||||
fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk);
|
||||
fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl);
|
||||
// 第一次扩展的结果
|
||||
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
|
||||
ik.x[1] = fmt->L2[c1] + 1 + tk[1];
|
||||
ik.x[2] = tl[1] - tk[1];
|
||||
xmer_setval_at(mem_addr, ik, i);
|
||||
}
|
||||
}
|
||||
|
||||
// 长度11的kmer
|
||||
gen_all_seq(qarr, 11);
|
||||
for (j = 0; j < (int)qarr.size(); ++j)
|
||||
{
|
||||
bwtintv_t p = fmt_search(fmt, qarr[j]);
|
||||
xmer_setval_at(fmt->xmer_hash.xe11[j].intv_arr, p);
|
||||
}
|
||||
|
||||
// 长度12的kmer
|
||||
gen_all_seq(qarr, 12);
|
||||
for (j = 0; j < (int)qarr.size(); ++j)
|
||||
{
|
||||
bwtintv_t p = fmt_search(fmt, qarr[j]);
|
||||
xmer_setval_at(fmt->xmer_hash.xe12[j].intv_arr, p);
|
||||
}
|
||||
//string q = "CACATATTGGCG";
|
||||
//uint32_t qbit = 0;
|
||||
//for (i = 0; i < 12; ++i)
|
||||
//{
|
||||
// qbit |= bval(q[i]) << ((11 - i) << 1);
|
||||
//}
|
||||
//cout << "qbit: " << qbit << endl;
|
||||
//xmer_getval_at(fmt->xmer_hash.xe12[qbit].intv_arr, &ik);
|
||||
|
||||
// 长度13的kmer
|
||||
gen_all_seq(qarr, 13);
|
||||
for (j = 0; j < (int)qarr.size(); ++j)
|
||||
{
|
||||
bwtintv_t p = fmt_search(fmt, qarr[j]);
|
||||
xmer_setval_at(fmt->xmer_hash.xe13[j].intv_arr, p);
|
||||
}
|
||||
|
||||
// 长度14的kmer
|
||||
gen_all_seq(qarr, 14);
|
||||
for (j = 0; j < (int)qarr.size(); ++j)
|
||||
{
|
||||
bwtintv_t p = fmt_search(fmt, qarr[j]);
|
||||
xmer_setval_at(fmt->xmer_hash.xe14[j].intv_arr, p);
|
||||
}
|
||||
}
|
||||
|
||||
// 利用fmt搜索seed,利用xmer加速搜索过程
|
||||
bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, const string &q)
|
||||
{
|
||||
bwtintv_t ik;
|
||||
bwtintv_t ok;
|
||||
int i, c1, c2, x = 0;
|
||||
int qlen = (int)q.size();
|
||||
|
||||
int max_xmer_len = 14;
|
||||
// 先利用kmer进行索引查询
|
||||
uint32_t qbit = 0;
|
||||
x = min(max_xmer_len, qlen);
|
||||
for (i = 0; i < x; ++i)
|
||||
{
|
||||
qbit |= bval(q[i]) << ((max_xmer_len - 1 - i) << 1);
|
||||
}
|
||||
if (x == 14)
|
||||
{
|
||||
xmer_getval_at(fmt->xmer_hash.xe14[qbit].intv_arr, &ik);
|
||||
}
|
||||
else if (x == 13)
|
||||
{
|
||||
xmer_getval_at(fmt->xmer_hash.xe13[qbit >> 2].intv_arr, &ik);
|
||||
}
|
||||
else if (x == 12)
|
||||
{
|
||||
// cout << "qbit: " << qbit << endl;
|
||||
xmer_getval_at(fmt->xmer_hash.xe12[qbit >> 4].intv_arr, &ik);
|
||||
}
|
||||
else if (x == 11)
|
||||
{
|
||||
xmer_getval_at(fmt->xmer_hash.xe11[qbit >> 6].intv_arr, &ik);
|
||||
}
|
||||
else
|
||||
{
|
||||
xmer_getval_at(fmt->xmer_hash.xe10[qbit >> 8].intv_arr, &ik, x - 1);
|
||||
}
|
||||
|
||||
// 每次扩展两个碱基
|
||||
for (i = x; i + 1 < qlen; i += 2)
|
||||
{
|
||||
|
|
@ -768,43 +1045,70 @@ bwtintv_t fmt_search_use_full_entry(FMTIndex *fmt, const string &q)
|
|||
return ik;
|
||||
}
|
||||
|
||||
// 将用字符串表示的序列计算成用2bit表示的序列
|
||||
uint32_t str2bit(string &str)
|
||||
uint16_t *restore_kmer_bit(const char *fn, int kmer_len)
|
||||
{
|
||||
uint32_t pac = 0;
|
||||
for (int i = 0; i < 16; ++i)
|
||||
pac = (pac << 2) | bval(str[i]);
|
||||
return pac;
|
||||
uint16_t *kbit = (uint16_t *)calloc(1L << ((kmer_len << 1) - 4), sizeof(uint16_t));
|
||||
FILE *fp;
|
||||
fp = xopen(fn, "rb");
|
||||
fread_fix(fp, (1 << ((kmer_len << 1) - 4)) * sizeof(uint16_t), kbit);
|
||||
err_fclose(fp);
|
||||
return kbit;
|
||||
}
|
||||
|
||||
// 生成所有KMER_LEN长度的序列,字符串表示
|
||||
void gen_all_seq(vector<string> &vseq, int kmer_len)
|
||||
void create_kmer_bit(FMTIndex *fmt, string &prefix, uint64_t kmer_len)
|
||||
{
|
||||
uint32_t seq_up_val = (1 << (kmer_len << 1));
|
||||
vseq.clear();
|
||||
vseq.resize(seq_up_val);
|
||||
for (uint32_t i = 0; i < seq_up_val; ++i)
|
||||
uint16_t *kbit = (uint16_t *)calloc(1L << ((kmer_len << 1) - 4), sizeof(uint16_t));
|
||||
ostringstream oss;
|
||||
oss << '.' << kmer_len << ".kmer.bit";
|
||||
string kmer_bit = prefix + oss.str();
|
||||
cout << kmer_bit << endl;
|
||||
uint64_t seq_up_val = (1L << (kmer_len << 1));
|
||||
cout << (1L << ((kmer_len << 1) - 4)) << '\t' << seq_up_val << endl;
|
||||
string seq;
|
||||
seq.resize(kmer_len);
|
||||
int percent = 1;
|
||||
double tt = realtime();
|
||||
for (uint64_t i = 0; i < seq_up_val; ++i)
|
||||
{
|
||||
if ((i+1) % (seq_up_val / 100) == 0)
|
||||
{
|
||||
tt = realtime() - tt;
|
||||
cout << percent++ << "\% finished, " << tt << "s elapsed." << endl;
|
||||
tt = realtime();
|
||||
}
|
||||
for (int j = kmer_len - 1; j >= 0; --j)
|
||||
{
|
||||
vseq[i].push_back(BASE[(i >> (j << 1)) & 3]);
|
||||
seq[kmer_len - 1 - j] = BASE[(i >> (j << 1)) & 3];
|
||||
}
|
||||
uint32_t pos = i >> 4;
|
||||
uint32_t bit_pos = i & 0xf;
|
||||
bwtintv_t p = fmt_search_use_xmer(fmt, seq);
|
||||
if (p.x[2] > 0) {
|
||||
kbit[pos] |= 1 << bit_pos;
|
||||
}
|
||||
}
|
||||
FILE *fp;
|
||||
fp = xopen(kmer_bit.c_str(), "wb");
|
||||
err_fwrite(kbit, 2, 1L << ((kmer_len << 1) - 4), fp);
|
||||
err_fflush(fp);
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
||||
#define GEN_BWT_IDX 0
|
||||
#define GEN_FMT_IDX 0
|
||||
#define GEN_KMER_IDX 0
|
||||
#define GEN_XMER_IDX 0
|
||||
#define GEN_FULL_ENTRY_IDX 0
|
||||
#define CMP_FMT_BWT_TIME 1
|
||||
#define CMP_FMT_BWT_TIME 0
|
||||
#define CMP_FMT_BWT_RESULT 1
|
||||
|
||||
// argv[1] 应该是索引的前缀
|
||||
int main_fmtidx(int argc, char **argv)
|
||||
{
|
||||
//cout << "size: " << sizeof(FullEntry) << '\t' << sizeof(XmerEntryArr) << endl;
|
||||
string prefix = argv[1];
|
||||
string bwt_str = prefix + ".bwt.str";
|
||||
string bwt_idx, fmt_idx, kmer_idx, full_entry_idx;
|
||||
string bwt_idx, fmt_idx, kmer_idx, full_entry_idx, xmer_idx;
|
||||
bwt_t *bwt;
|
||||
FMTIndex *fmt;
|
||||
ostringstream oss_bwt, oss_fmt, oss_kmer, oss_full_entry;
|
||||
|
|
@ -820,11 +1124,14 @@ int main_fmtidx(int argc, char **argv)
|
|||
fmt_idx = prefix + oss_fmt.str() + ".fmt";
|
||||
kmer_idx = prefix + oss_kmer.str() + ".kmer";
|
||||
full_entry_idx = prefix + oss_full_entry.str() + ".full_entry";
|
||||
xmer_idx = prefix + ".14.xmer";
|
||||
|
||||
cout << bwt_idx << endl;
|
||||
cout << fmt_idx << endl;
|
||||
cout << kmer_idx << endl;
|
||||
cout << full_entry_idx << endl;
|
||||
cout << xmer_idx << endl;
|
||||
|
||||
|
||||
// 生成或读取bwt索引文件
|
||||
double time_read_bwt = realtime();
|
||||
|
|
@ -896,35 +1203,53 @@ int main_fmtidx(int argc, char **argv)
|
|||
time_gen_full_entry = realtime() - time_gen_full_entry;
|
||||
cout << "[time gen full kmer entry:] " << time_gen_full_entry << "s" << endl;
|
||||
|
||||
// 生成xmer索引文件
|
||||
double time_gen_xmer_hash = realtime();
|
||||
#if GEN_XMER_IDX
|
||||
create_xmer_index(fmt);
|
||||
dump_xmer_idx(xmer_idx.c_str(), &fmt->xmer_hash);
|
||||
#else
|
||||
fmt->xmer_hash = restore_xmer_idx(xmer_idx.c_str());
|
||||
#endif
|
||||
time_gen_xmer_hash = realtime() - time_gen_xmer_hash;
|
||||
cout << "[time gen xkmer hash:] " << time_gen_xmer_hash << "s" << endl;
|
||||
|
||||
int kmer_bit_len = 18;
|
||||
//create_kmer_bit(fmt, prefix, kmer_bit_len);
|
||||
// return 0;
|
||||
|
||||
// 生成随机序列进行测试
|
||||
int seq_size = 10000000;
|
||||
//int seq_size = 1;
|
||||
int seq_len = 14;
|
||||
int seq_len = kmer_bit_len;
|
||||
vector<string> seq_arr(seq_size);
|
||||
// seq_arr[0] = "GCGATACTAAGA";
|
||||
seq_arr[0] = "CACATATTGGCG";
|
||||
// seq_arr[0] = "GAGAGCTGTTCCCGTGTTTTCCATGGTTT";
|
||||
seq_arr[0] = "ACCTTTCGAATTGA";
|
||||
// seq_arr[0] = "ACCTTTCGAATTGA";
|
||||
#if 1
|
||||
srand(time(NULL));
|
||||
double time_gen_seq = realtime();
|
||||
for (int i = 0; i < (int)seq_arr.size(); ++i)
|
||||
seq_arr[i] = generate_rand_seq(seq_len);
|
||||
time_gen_seq = realtime() - time_gen_seq;
|
||||
cout << "[time gen seq:] " << time_gen_seq << "s" << endl;
|
||||
cout << "[time gen seq:] " << time_gen_seq << "s" << endl;
|
||||
#endif
|
||||
|
||||
// 对比bwt和fmt搜索的时间
|
||||
bwtint_t max_gap = 0;
|
||||
|
||||
#if CMP_FMT_BWT_TIME
|
||||
// int nintv_0 = 0;
|
||||
int nintv_0 = 0;
|
||||
//bwtint_t max_gap = 0;
|
||||
double time_bwt_search = realtime();
|
||||
for (int i = 0; i < (int)seq_arr.size(); ++i) {
|
||||
bwtintv_t p = bwt_search(bwt, seq_arr[i]);
|
||||
if (max_gap < p.x[2])
|
||||
max_gap = p.x[2];
|
||||
//if (max_gap < p.x[2])
|
||||
// max_gap = p.x[2];
|
||||
if (p.x[2] == 0)
|
||||
++nintv_0;
|
||||
}
|
||||
// cout << "all: " << seq_arr.size() << " zero: " << nintv_0 << endl;
|
||||
cout << "max gap: " << max_gap << endl;
|
||||
cout << "all: " << seq_arr.size() << " zero: " << nintv_0 << endl;
|
||||
//cout << "max gap: " << max_gap << endl;
|
||||
time_bwt_search = realtime() - time_bwt_search;
|
||||
cout << "[time bwt search:] " << time_bwt_search << "s" << endl;
|
||||
|
||||
|
|
@ -942,15 +1267,23 @@ int main_fmtidx(int argc, char **argv)
|
|||
|
||||
double time_fmt_full_entry_search = realtime();
|
||||
for (int i = 0; i < (int)seq_arr.size(); ++i)
|
||||
fmt_search_use_full_entry(fmt, seq_arr[i]);
|
||||
fmt_search_use_full_entry(fmt, bwt, seq_arr[i]);
|
||||
time_fmt_full_entry_search = realtime() - time_fmt_full_entry_search;
|
||||
cout << "[time fmt full entry search:] " << time_fmt_full_entry_search << "s" << endl;
|
||||
|
||||
double time_xmer_search = realtime();
|
||||
for (int i = 0; i < (int)seq_arr.size(); ++i)
|
||||
fmt_search_use_xmer(fmt, seq_arr[i]);
|
||||
time_xmer_search = realtime() - time_xmer_search;
|
||||
cout << "[time xmer search:] " << time_xmer_search << "s" << endl;
|
||||
#endif
|
||||
// cout << "t1: " << t1 << "s; t2: " << t2 << "s" << endl;
|
||||
// cout << "t3: " << t3 << "s; t4: " << t4 << "s" << endl;
|
||||
// cout << "t5: " << t5 << "s; t4: " << t6 << "s" << endl;
|
||||
|
||||
#endif
|
||||
ostringstream oss_kmer_bit;
|
||||
oss_kmer_bit << prefix << "." << kmer_bit_len << ".kmer.bit";
|
||||
uint16_t *kbit = restore_kmer_bit(oss_kmer_bit.str().c_str(), kmer_bit_len);
|
||||
|
||||
// 对比bwt和fmt搜索的结果
|
||||
#if CMP_FMT_BWT_RESULT
|
||||
|
|
@ -959,11 +1292,25 @@ int main_fmtidx(int argc, char **argv)
|
|||
{
|
||||
bwtintv_t p1 = bwt_search2(bwt, seq_arr[i]);
|
||||
// bwtintv_t p2 = fmt_search(fmt, seq_arr[i]);
|
||||
bwtintv_t p2 = fmt_search_use_kmer(fmt, seq_arr[i]);
|
||||
// bwtintv_t p2 = fmt_search_use_kmer(fmt, seq_arr[i]);
|
||||
bwtintv_t p2 = fmt_search_use_xmer(fmt, seq_arr[i]);
|
||||
// bwtintv_t p2 = fmt_search_use_full_entry(fmt, seq_arr[i]);
|
||||
if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2])
|
||||
cout << seq_arr[i] << endl
|
||||
<< p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl
|
||||
<< p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl;
|
||||
|
||||
uint64_t qbit = 0;
|
||||
for (int k = 0; k < kmer_bit_len; ++k)
|
||||
{
|
||||
uint64_t base = bval(seq_arr[i][k]);
|
||||
qbit |= base << ((kmer_bit_len - 1 - k) << 1);
|
||||
}
|
||||
uint32_t pos = qbit >> 4;
|
||||
uint32_t bit_pos = qbit & 0xf;
|
||||
//cout << pos << '\t' << bit_pos << '\t' << kbit[pos] << '\t' << (kbit[pos] & (1 << bit_pos)) << endl;
|
||||
if ((p2.x[2] >0) != ((kbit[pos] & (1 << bit_pos)) > 0))
|
||||
cout << "diff: " << seq_arr[i] << '\t' << p2.x[2] << '\t' << (kbit[pos] & (1 << bit_pos)) << endl;
|
||||
}
|
||||
time_cmp = realtime() - time_cmp;
|
||||
cout << "[time compare bwt & fmt:] " << time_cmp << "s" << endl;
|
||||
|
|
|
|||
22
fmt_index.h
22
fmt_index.h
|
|
@ -55,6 +55,27 @@ struct FullEntry
|
|||
uint8_t intv_arr[14];
|
||||
};
|
||||
|
||||
|
||||
// x-mer entry, 1 - 14个碱基组成的hash key
|
||||
struct XmerEntry
|
||||
{
|
||||
uint8_t intv_arr[14];
|
||||
};
|
||||
|
||||
struct XmerEntryArr
|
||||
{
|
||||
uint8_t intv_arr[140];
|
||||
};
|
||||
|
||||
struct XmerHash
|
||||
{
|
||||
XmerEntryArr *xe10;
|
||||
XmerEntry *xe11;
|
||||
XmerEntry *xe12;
|
||||
XmerEntry *xe13;
|
||||
XmerEntry *xe14;
|
||||
};
|
||||
|
||||
// fm-index, extend twice in one search step (one memory access)
|
||||
struct FMTIndex
|
||||
{
|
||||
|
|
@ -72,6 +93,7 @@ struct FMTIndex
|
|||
// 保存kmer对应的fmt位置信息
|
||||
KmerEntry *kmer_entry;
|
||||
FullEntry *full_entry;
|
||||
XmerHash xmer_hash;
|
||||
// suffix array
|
||||
int sa_intv;
|
||||
bwtint_t n_sa;
|
||||
|
|
|
|||
Loading…
Reference in New Issue