在这个基础上修改用于测试fmt消融测试的版本
This commit is contained in:
parent
b0cfa3630e
commit
bad3112151
|
|
@ -2,6 +2,7 @@
|
|||
"files.associations": {
|
||||
"ostream": "cpp",
|
||||
"iostream": "cpp",
|
||||
"string": "cpp"
|
||||
"string": "cpp",
|
||||
"sstream": "cpp"
|
||||
}
|
||||
}
|
||||
197
fmt_index.cpp
197
fmt_index.cpp
|
|
@ -764,66 +764,6 @@ bwtintv_t fmt_search_use_kmer(FMTIndex *fmt, const string &q)
|
|||
return ik;
|
||||
}
|
||||
|
||||
// 利用fmt搜索seed,利用full entry加速搜索过程
|
||||
bwtintv_t fmt_search_use_full_entry(FMTIndex *fmt, bwt_t *bwt, const string &q)
|
||||
{
|
||||
bwtintv_t ik;
|
||||
|
||||
int i, c1, c2, x = 0;
|
||||
int qlen = (int)q.size();
|
||||
|
||||
// 先利用kmer进行索引查询
|
||||
uint32_t qbit = 0;
|
||||
x = min(FULL_ENTRY_LEN, qlen);
|
||||
for (i = 0; i < x; ++i)
|
||||
{
|
||||
qbit |= bval(q[i]) << ((FULL_ENTRY_LEN - 1 - i) << 1);
|
||||
}
|
||||
full_entry_get(&fmt->full_entry[qbit], &ik);
|
||||
|
||||
#if 0
|
||||
bwtintv_t ok;
|
||||
// 每次扩展两个碱基
|
||||
for (i = x; i + 1 < qlen; i += 2)
|
||||
{
|
||||
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
|
||||
{
|
||||
c1 = cbval(q[i]);
|
||||
c2 = cbval(q[i + 1]);
|
||||
fmt_extend2(fmt, &ik, &ok, 0, c1, c2);
|
||||
ik = ok;
|
||||
ik.info = i + 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i < qlen && bval(q[i]) < 4)
|
||||
{ // 最后一次扩展
|
||||
c1 = cbval(q[i]);
|
||||
fmt_extend1(fmt, &ik, &ok, 0, c1);
|
||||
ik = ok;
|
||||
ik.info = i + 1;
|
||||
}
|
||||
#else
|
||||
bwtintv_t ok[4];
|
||||
for (i = x; i < qlen; ++i)
|
||||
{
|
||||
if (bval(q[i]) < 4)
|
||||
{
|
||||
c1 = cbval(q[i]);
|
||||
bwt_extend(bwt, &ik, ok, 0);
|
||||
ik = ok[c1];
|
||||
ik.info = i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
return ik;
|
||||
}
|
||||
|
||||
// 将用字符串表示的序列计算成用2bit表示的序列
|
||||
uint32_t str2bit(string &str)
|
||||
{
|
||||
|
|
@ -982,10 +922,9 @@ void create_xmer_index(FMTIndex *fmt)
|
|||
}
|
||||
|
||||
// 利用fmt搜索seed,利用xmer加速搜索过程
|
||||
bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, const string &q)
|
||||
bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, bwt_t *bwt, const string &q)
|
||||
{
|
||||
bwtintv_t ik;
|
||||
bwtintv_t ok;
|
||||
int i, c1, c2, x = 0;
|
||||
int qlen = (int)q.size();
|
||||
|
||||
|
|
@ -1018,7 +957,8 @@ bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, const string &q)
|
|||
{
|
||||
xmer_getval_at(fmt->xmer_hash.xe10[qbit >> 8].intv_arr, &ik, x - 1);
|
||||
}
|
||||
|
||||
#if 0
|
||||
bwtintv_t ok;
|
||||
// 每次扩展两个碱基
|
||||
for (i = x; i + 1 < qlen; i += 2)
|
||||
{
|
||||
|
|
@ -1042,6 +982,19 @@ bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, const string &q)
|
|||
ik = ok;
|
||||
ik.info = i + 1;
|
||||
}
|
||||
#else
|
||||
bwtintv_t ok[4];
|
||||
for (i = x; i < qlen; ++i) {
|
||||
if (bval(q[i]) < 4) {
|
||||
c1 = cbval(q[i]);
|
||||
bwt_extend(bwt, &ik, ok, 0);
|
||||
ik = ok[c1];
|
||||
ik.info = i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
return ik;
|
||||
}
|
||||
|
||||
|
|
@ -1082,10 +1035,10 @@ void create_kmer_bit(FMTIndex *fmt, string &prefix, uint64_t kmer_len)
|
|||
}
|
||||
uint32_t pos = i >> 4;
|
||||
uint32_t bit_pos = i & 0xf;
|
||||
bwtintv_t p = fmt_search_use_xmer(fmt, seq);
|
||||
if (p.x[2] > 0) {
|
||||
kbit[pos] |= 1 << bit_pos;
|
||||
}
|
||||
//bwtintv_t p = fmt_search_use_xmer(fmt, seq);
|
||||
//if (p.x[2] > 0) {
|
||||
// kbit[pos] |= 1 << bit_pos;
|
||||
//}
|
||||
}
|
||||
FILE *fp;
|
||||
fp = xopen(kmer_bit.c_str(), "wb");
|
||||
|
|
@ -1096,11 +1049,9 @@ void create_kmer_bit(FMTIndex *fmt, string &prefix, uint64_t kmer_len)
|
|||
|
||||
#define GEN_BWT_IDX 0
|
||||
#define GEN_FMT_IDX 0
|
||||
#define GEN_KMER_IDX 0
|
||||
#define GEN_XMER_IDX 0
|
||||
#define GEN_FULL_ENTRY_IDX 0
|
||||
#define CMP_FMT_BWT_TIME 0
|
||||
#define CMP_FMT_BWT_RESULT 1
|
||||
#define CMP_FMT_BWT_TIME 1
|
||||
#define CMP_FMT_BWT_RESULT 0
|
||||
|
||||
// argv[1] 应该是索引的前缀
|
||||
int main_fmtidx(int argc, char **argv)
|
||||
|
|
@ -1108,28 +1059,21 @@ int main_fmtidx(int argc, char **argv)
|
|||
//cout << "size: " << sizeof(FullEntry) << '\t' << sizeof(XmerEntryArr) << endl;
|
||||
string prefix = argv[1];
|
||||
string bwt_str = prefix + ".bwt.str";
|
||||
string bwt_idx, fmt_idx, kmer_idx, full_entry_idx, xmer_idx;
|
||||
string bwt_idx, fmt_idx, xmer_idx;
|
||||
bwt_t *bwt;
|
||||
FMTIndex *fmt;
|
||||
ostringstream oss_bwt, oss_fmt, oss_kmer, oss_full_entry;
|
||||
ostringstream oss_bwt, oss_fmt, oss_xmer;
|
||||
oss_bwt << '.' << OCC_INTERVAL;
|
||||
oss_fmt << '.' << FMT_OCC_INTERVAL;
|
||||
oss_kmer << '.' << KMER_LEN;
|
||||
oss_full_entry << '.' << FULL_ENTRY_LEN;
|
||||
#ifdef FMT_MID_INTERVAL
|
||||
oss_xmer << '.' << XMER_LEN;
|
||||
oss_fmt << '.' << FMT_MID_INTERVAL;
|
||||
#endif
|
||||
|
||||
bwt_idx = prefix + oss_bwt.str() + ".bwt";
|
||||
fmt_idx = prefix + oss_fmt.str() + ".fmt";
|
||||
kmer_idx = prefix + oss_kmer.str() + ".kmer";
|
||||
full_entry_idx = prefix + oss_full_entry.str() + ".full_entry";
|
||||
xmer_idx = prefix + ".14.xmer";
|
||||
xmer_idx = prefix + ".14.kmer";
|
||||
|
||||
cout << bwt_idx << endl;
|
||||
cout << fmt_idx << endl;
|
||||
cout << kmer_idx << endl;
|
||||
cout << full_entry_idx << endl;
|
||||
cout << xmer_idx << endl;
|
||||
|
||||
|
||||
|
|
@ -1156,53 +1100,6 @@ int main_fmtidx(int argc, char **argv)
|
|||
time_read_fmt = realtime() - time_read_fmt;
|
||||
cout << "[time gen/read fmt:] " << time_read_fmt << "s" << endl;
|
||||
|
||||
// 生成或读取kmer信息
|
||||
double time_gen_kmer = realtime();
|
||||
#if GEN_KMER_IDX
|
||||
vector<string> vkmer;
|
||||
gen_all_seq(vkmer, KMER_LEN);
|
||||
fmt->kmer_entry = (KmerEntry*)malloc(KMER_ARR_SIZE * sizeof(KmerEntry));
|
||||
for (size_t i = 0; i < vkmer.size(); ++i)
|
||||
{
|
||||
fmt_search_store_kmer(fmt, vkmer[i], fmt->kmer_entry[i]);
|
||||
bwtintv_t p1;
|
||||
kmer_getval_at(&fmt->kmer_entry[i], &p1, KMER_LEN - 1);
|
||||
bwtintv_t p2 = bwt_search(bwt, vkmer[i]);
|
||||
if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2])
|
||||
cout << vkmer[i] << endl
|
||||
<< p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl
|
||||
<< p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl;
|
||||
}
|
||||
dump_kmer_idx(kmer_idx.c_str(), fmt->kmer_entry);
|
||||
#else
|
||||
fmt->kmer_entry = restore_kmer_idx(kmer_idx.c_str());
|
||||
#endif
|
||||
time_gen_kmer = realtime() - time_gen_kmer;
|
||||
cout << "[time gen kmer:] " << time_gen_kmer << "s" << endl;
|
||||
|
||||
// 生成或读取kmer匹配的最后单一结果
|
||||
double time_gen_full_entry = realtime();
|
||||
#if GEN_FULL_ENTRY_IDX
|
||||
vector<string> kmer_arr;
|
||||
gen_all_seq(kmer_arr, FULL_ENTRY_LEN);
|
||||
fmt->full_entry = (FullEntry *)malloc(FULL_ENTRY_ARR_SIZE * sizeof(FullEntry));
|
||||
for (size_t i = 0; i < kmer_arr.size(); ++i)
|
||||
{
|
||||
bwtintv_t p1 = fmt_search(fmt, kmer_arr[i]);
|
||||
full_entry_set(&fmt->full_entry[i], p1);
|
||||
bwtintv_t p2 = bwt_search(bwt, kmer_arr[i]);
|
||||
if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2])
|
||||
cout << kmer_arr[i] << endl
|
||||
<< p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl
|
||||
<< p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl;
|
||||
}
|
||||
dump_full_entry_idx(full_entry_idx.c_str(), fmt->full_entry);
|
||||
#else
|
||||
fmt->full_entry = restore_full_entry_idx(full_entry_idx.c_str());
|
||||
#endif
|
||||
time_gen_full_entry = realtime() - time_gen_full_entry;
|
||||
cout << "[time gen full kmer entry:] " << time_gen_full_entry << "s" << endl;
|
||||
|
||||
// 生成xmer索引文件
|
||||
double time_gen_xmer_hash = realtime();
|
||||
#if GEN_XMER_IDX
|
||||
|
|
@ -1238,17 +1135,17 @@ int main_fmtidx(int argc, char **argv)
|
|||
// 对比bwt和fmt搜索的时间
|
||||
|
||||
#if CMP_FMT_BWT_TIME
|
||||
int nintv_0 = 0;
|
||||
//int nintv_0 = 0;
|
||||
//bwtint_t max_gap = 0;
|
||||
double time_bwt_search = realtime();
|
||||
for (int i = 0; i < (int)seq_arr.size(); ++i) {
|
||||
bwtintv_t p = bwt_search(bwt, seq_arr[i]);
|
||||
//if (max_gap < p.x[2])
|
||||
// max_gap = p.x[2];
|
||||
if (p.x[2] == 0)
|
||||
++nintv_0;
|
||||
//if (p.x[2] == 0)
|
||||
// ++nintv_0;
|
||||
}
|
||||
cout << "all: " << seq_arr.size() << " zero: " << nintv_0 << endl;
|
||||
//cout << "all: " << seq_arr.size() << " zero: " << nintv_0 << endl;
|
||||
//cout << "max gap: " << max_gap << endl;
|
||||
time_bwt_search = realtime() - time_bwt_search;
|
||||
cout << "[time bwt search:] " << time_bwt_search << "s" << endl;
|
||||
|
|
@ -1259,21 +1156,9 @@ int main_fmtidx(int argc, char **argv)
|
|||
time_fmt_search = realtime() - time_fmt_search;
|
||||
cout << "[time fmt search:] " << time_fmt_search << "s" << endl;
|
||||
|
||||
double time_fmt_kmer_search = realtime();
|
||||
for (int i = 0; i < (int)seq_arr.size(); ++i)
|
||||
fmt_search_use_kmer(fmt, seq_arr[i]);
|
||||
time_fmt_kmer_search = realtime() - time_fmt_kmer_search;
|
||||
cout << "[time fmt kmer search:] " << time_fmt_kmer_search << "s" << endl;
|
||||
|
||||
double time_fmt_full_entry_search = realtime();
|
||||
for (int i = 0; i < (int)seq_arr.size(); ++i)
|
||||
fmt_search_use_full_entry(fmt, bwt, seq_arr[i]);
|
||||
time_fmt_full_entry_search = realtime() - time_fmt_full_entry_search;
|
||||
cout << "[time fmt full entry search:] " << time_fmt_full_entry_search << "s" << endl;
|
||||
|
||||
double time_xmer_search = realtime();
|
||||
for (int i = 0; i < (int)seq_arr.size(); ++i)
|
||||
fmt_search_use_xmer(fmt, seq_arr[i]);
|
||||
fmt_search_use_xmer(fmt, bwt, seq_arr[i]);
|
||||
time_xmer_search = realtime() - time_xmer_search;
|
||||
cout << "[time xmer search:] " << time_xmer_search << "s" << endl;
|
||||
#endif
|
||||
|
|
@ -1281,9 +1166,9 @@ int main_fmtidx(int argc, char **argv)
|
|||
// cout << "t3: " << t3 << "s; t4: " << t4 << "s" << endl;
|
||||
// cout << "t5: " << t5 << "s; t4: " << t6 << "s" << endl;
|
||||
|
||||
ostringstream oss_kmer_bit;
|
||||
oss_kmer_bit << prefix << "." << kmer_bit_len << ".kmer.bit";
|
||||
uint16_t *kbit = restore_kmer_bit(oss_kmer_bit.str().c_str(), kmer_bit_len);
|
||||
// ostringstream oss_kmer_bit;
|
||||
// oss_kmer_bit << prefix << "." << kmer_bit_len << ".kmer.bit";
|
||||
// uint16_t *kbit = restore_kmer_bit(oss_kmer_bit.str().c_str(), kmer_bit_len);
|
||||
|
||||
// 对比bwt和fmt搜索的结果
|
||||
#if CMP_FMT_BWT_RESULT
|
||||
|
|
@ -1292,25 +1177,11 @@ int main_fmtidx(int argc, char **argv)
|
|||
{
|
||||
bwtintv_t p1 = bwt_search2(bwt, seq_arr[i]);
|
||||
// bwtintv_t p2 = fmt_search(fmt, seq_arr[i]);
|
||||
// bwtintv_t p2 = fmt_search_use_kmer(fmt, seq_arr[i]);
|
||||
bwtintv_t p2 = fmt_search_use_xmer(fmt, seq_arr[i]);
|
||||
// bwtintv_t p2 = fmt_search_use_full_entry(fmt, seq_arr[i]);
|
||||
if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2])
|
||||
cout << seq_arr[i] << endl
|
||||
<< p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl
|
||||
<< p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl;
|
||||
|
||||
uint64_t qbit = 0;
|
||||
for (int k = 0; k < kmer_bit_len; ++k)
|
||||
{
|
||||
uint64_t base = bval(seq_arr[i][k]);
|
||||
qbit |= base << ((kmer_bit_len - 1 - k) << 1);
|
||||
}
|
||||
uint32_t pos = qbit >> 4;
|
||||
uint32_t bit_pos = qbit & 0xf;
|
||||
//cout << pos << '\t' << bit_pos << '\t' << kbit[pos] << '\t' << (kbit[pos] & (1 << bit_pos)) << endl;
|
||||
if ((p2.x[2] >0) != ((kbit[pos] & (1 << bit_pos)) > 0))
|
||||
cout << "diff: " << seq_arr[i] << '\t' << p2.x[2] << '\t' << (kbit[pos] & (1 << bit_pos)) << endl;
|
||||
}
|
||||
time_cmp = realtime() - time_cmp;
|
||||
cout << "[time compare bwt & fmt:] " << time_cmp << "s" << endl;
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@
|
|||
#define FMT_OCC_INTERVAL (1LL << FMT_OCC_INTV_SHIFT)
|
||||
#define FMT_OCC_INTV_MASK (FMT_OCC_INTERVAL - 1)
|
||||
|
||||
#define FMT_MID_INTV_SHIFT 5
|
||||
#define FMT_MID_INTV_SHIFT 6
|
||||
#define FMT_MID_INTERVAL (1LL << FMT_MID_INTV_SHIFT)
|
||||
#define FMT_MID_INTV_MASK (FMT_MID_INTERVAL - 1)
|
||||
|
||||
|
|
@ -40,6 +40,9 @@
|
|||
|
||||
#define KMER_LEN 12
|
||||
#define KMER_ARR_SIZE (1 << (KMER_LEN << 1))
|
||||
|
||||
#define XMER_LEN 14
|
||||
|
||||
// 用来保存kmer对应的fmt的位置信息
|
||||
struct KmerEntry
|
||||
{
|
||||
|
|
|
|||
Loading…
Reference in New Issue