实现了kmer index,有效果,但是随着序列扩展到后边,扩展的耗时越来越长

This commit is contained in:
zzh 2024-02-03 15:54:52 +08:00
parent 914cbd34ab
commit 46822bef2c
3 changed files with 46 additions and 7 deletions

17
bwt.cpp
View File

@ -251,9 +251,25 @@ bwtintv_t bwt_search(bwt_t *bwt, const string &q)
bwtintv_t ik, ok[4]; bwtintv_t ik, ok[4];
int i, c, x = 0; int i, c, x = 0;
//double tt = realtime();
bwt_set_intv(bwt, bval(q[x]), ik); bwt_set_intv(bwt, bval(q[x]), ik);
ik.info = x + 1; ik.info = x + 1;
for (i = x + 1; i < 12; ++i)
{
if (bval(q[i]) < 4)
{
c = cbval(q[i]);
bwt_extend(bwt, &ik, ok, 0);
ik = ok[c];
ik.info = i + 1;
}
}
//t5 += realtime() - tt;
x = 11;
//tt = realtime();
for (i = x + 1; i < (int)q.size(); ++i) for (i = x + 1; i < (int)q.size(); ++i)
{ {
if (bval(q[i]) < 4) if (bval(q[i]) < 4)
@ -264,6 +280,7 @@ bwtintv_t bwt_search(bwt_t *bwt, const string &q)
ik.info = i + 1; ik.info = i + 1;
} }
} }
//t6 += realtime() - tt;
return ik; return ik;
} }

View File

@ -514,9 +514,28 @@ bwtintv_t fmt_search(FMTIndex *fmt, const string &q)
int i, c1, c2, x = 0; int i, c1, c2, x = 0;
int qlen = (int)q.size(); int qlen = (int)q.size();
//double tt = realtime();
fmt_set_intv(fmt, bval(q[x]), ik); fmt_set_intv(fmt, bval(q[x]), ik);
ik.info = x + 1; ik.info = x + 1;
for (i = x + 1; i + 1 < 12; i += 2)
{
if (bval(q[i]) < 4 && bval(q[i + 1]) < 4)
{
c1 = cbval(q[i]);
c2 = cbval(q[i + 1]);
fmt_extend2(fmt, &ik, &ok, 0, c1, c2);
ik = ok;
ik.info = i + 1;
}
else
{
break;
}
}
x = 11;
//t3 += realtime() - tt;
//tt = realtime();
// 每次扩展两个碱基 // 每次扩展两个碱基
for (i = x + 1; i + 1 < qlen; i += 2) for (i = x + 1; i + 1 < qlen; i += 2)
{ {
@ -540,6 +559,7 @@ bwtintv_t fmt_search(FMTIndex *fmt, const string &q)
ik = ok; ik = ok;
ik.info = i + 1; ik.info = i + 1;
} }
//t4 += realtime() - tt;
return ik; return ik;
} }
@ -638,15 +658,15 @@ bwtintv_t fmt_search_use_kmer(FMTIndex *fmt, const string &q)
// 先利用kmer进行索引查询 // 先利用kmer进行索引查询
uint32_t qbit = 0; uint32_t qbit = 0;
x = min(KMER_LEN, qlen); x = min(KMER_LEN, qlen);
double tt = realtime(); //double tt = realtime();
for (i = 0; i < x; ++i) for (i = 0; i < x; ++i)
{ {
qbit |= bval(q[i]) << ((KMER_LEN - 1 - i) << 1); qbit |= bval(q[i]) << ((KMER_LEN - 1 - i) << 1);
} }
kmer_getval_at(&fmt->kmer_entry[qbit], &ik, x - 1); kmer_getval_at(&fmt->kmer_entry[qbit], &ik, x - 1);
t1 += realtime() - tt; //t1 += realtime() - tt;
tt = realtime(); //tt = realtime();
// 每次扩展两个碱基 // 每次扩展两个碱基
for (i = x; i + 1 < qlen; i += 2) for (i = x; i + 1 < qlen; i += 2)
{ {
@ -670,7 +690,7 @@ bwtintv_t fmt_search_use_kmer(FMTIndex *fmt, const string &q)
ik = ok; ik = ok;
ik.info = i + 1; ik.info = i + 1;
} }
t2 += realtime() - tt; //t2 += realtime() - tt;
return ik; return ik;
} }
@ -778,7 +798,7 @@ int main_fmtidx(int argc, char **argv)
// 生成随机序列进行测试 // 生成随机序列进行测试
int seq_size = 10000000; int seq_size = 10000000;
// int seq_size = 1; // int seq_size = 1;
int seq_len = 19; int seq_len = 23;
vector<string> seq_arr(seq_size); vector<string> seq_arr(seq_size);
// seq_arr[0] = "GCGATACTAAGA"; // seq_arr[0] = "GCGATACTAAGA";
// seq_arr[0] = "GAGAGCTGTTCCCGTGTTTTCCATGGTTT"; // seq_arr[0] = "GAGAGCTGTTCCCGTGTTTTCCATGGTTT";
@ -812,6 +832,8 @@ int main_fmtidx(int argc, char **argv)
cout << "[time fmt kmer search:] " << time_fmt_kmer_search << "s" << endl; cout << "[time fmt kmer search:] " << time_fmt_kmer_search << "s" << endl;
cout << "t1: " << t1 << "s; t2: " << t2 << "s" << endl; cout << "t1: " << t1 << "s; t2: " << t2 << "s" << endl;
cout << "t3: " << t3 << "s; t4: " << t4 << "s" << endl;
cout << "t5: " << t5 << "s; t4: " << t6 << "s" << endl;
#endif #endif

View File

@ -5,7 +5,7 @@
#define FMT_OCC_INTERVAL (1LL << FMT_OCC_INTV_SHIFT) #define FMT_OCC_INTERVAL (1LL << FMT_OCC_INTV_SHIFT)
#define FMT_OCC_INTV_MASK (FMT_OCC_INTERVAL - 1) #define FMT_OCC_INTV_MASK (FMT_OCC_INTERVAL - 1)
#define FMT_MID_INTV_SHIFT 4 #define FMT_MID_INTV_SHIFT 5
#define FMT_MID_INTERVAL (1LL << FMT_MID_INTV_SHIFT) #define FMT_MID_INTERVAL (1LL << FMT_MID_INTV_SHIFT)
#define FMT_MID_INTV_MASK (FMT_MID_INTERVAL - 1) #define FMT_MID_INTV_MASK (FMT_MID_INTERVAL - 1)
@ -38,7 +38,7 @@
#define __fmt_mid_sum(x) \ #define __fmt_mid_sum(x) \
((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff) ((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff)
#define KMER_LEN 10 #define KMER_LEN 12
#define KMER_ARR_SIZE ((1 << (KMER_LEN << 1))) #define KMER_ARR_SIZE ((1 << (KMER_LEN << 1)))
// 用来保存kmer对应的fmt的位置信息 // 用来保存kmer对应的fmt的位置信息
struct KmerEntry struct KmerEntry