diff --git a/bwt.cpp b/bwt.cpp index 939d53d..942d73f 100644 --- a/bwt.cpp +++ b/bwt.cpp @@ -251,25 +251,9 @@ bwtintv_t bwt_search(bwt_t *bwt, const string &q) bwtintv_t ik, ok[4]; int i, c, x = 0; - //double tt = realtime(); - bwt_set_intv(bwt, bval(q[x]), ik); ik.info = x + 1; - for (i = x + 1; i < 12; ++i) - { - if (bval(q[i]) < 4) - { - c = cbval(q[i]); - bwt_extend(bwt, &ik, ok, 0); - ik = ok[c]; - ik.info = i + 1; - } - } - //t5 += realtime() - tt; - x = 11; - - //tt = realtime(); for (i = x + 1; i < (int)q.size(); ++i) { if (bval(q[i]) < 4) @@ -280,7 +264,6 @@ bwtintv_t bwt_search(bwt_t *bwt, const string &q) ik.info = i + 1; } } - //t6 += realtime() - tt; return ik; } diff --git a/fmt_index.cpp b/fmt_index.cpp index 2606d65..caeaf8e 100644 --- a/fmt_index.cpp +++ b/fmt_index.cpp @@ -175,6 +175,26 @@ KmerEntry *restore_kmer_idx(const char *fn) return kmer_entry; } +// 将full kmer hash数据写入到文件 +void dump_full_entry_idx(const char *fn, const FullEntry *full_entry) +{ + FILE *fp; + fp = xopen(fn, "wb"); + err_fwrite(full_entry, 1, FULL_ENTRY_ARR_SIZE * sizeof(FullEntry), fp); + err_fflush(fp); + err_fclose(fp); +} + +// 从文件中读取full kmer hash信息 +FullEntry *restore_full_entry_idx(const char *fn) +{ + FILE *fp = xopen(fn, "rb"); + uint32_t full_entry_bytes = FULL_ENTRY_ARR_SIZE * sizeof(FullEntry); + FullEntry *full_entry = (FullEntry *)malloc(full_entry_bytes); + fread_fix(fp, full_entry_bytes, full_entry); + err_fclose(fp); + return full_entry; +} // 从文件中读取fmt结构数据 FMTIndex *restore_fmt(const char *fn) { @@ -475,7 +495,7 @@ void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4] } // 扩展两个碱基 -void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1, int b2) +inline void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1, int b2) { bwtint_t tk[4], tl[4], first_pos; // tk表示在k行之前所有各个碱基累积出现次数,tl表示在l行之前的累积 @@ -492,7 +512,7 @@ void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, } // 扩展一个碱基 -void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1) +inline void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1) { bwtint_t tk[4], tl[4]; int b2 = 3; // 如果只扩展一次,那么第二个碱基设置成T,可以减小一些计算量,如计算大于b2的累积数量 @@ -514,29 +534,8 @@ bwtintv_t fmt_search(FMTIndex *fmt, const string &q) int i, c1, c2, x = 0; int qlen = (int)q.size(); - //double tt = realtime(); fmt_set_intv(fmt, bval(q[x]), ik); ik.info = x + 1; - for (i = x + 1; i + 1 < 12; i += 2) - { - if (bval(q[i]) < 4 && bval(q[i + 1]) < 4) - { - c1 = cbval(q[i]); - c2 = cbval(q[i + 1]); - fmt_extend2(fmt, &ik, &ok, 0, c1, c2); - ik = ok; - ik.info = i + 1; - } - else - { - break; - } - } - x = 11; - //t3 += realtime() - tt; - - //tt = realtime(); - // 每次扩展两个碱基 for (i = x + 1; i + 1 < qlen; i += 2) { if (bval(q[i]) < 4 && bval(q[i + 1]) < 4) @@ -546,20 +545,22 @@ bwtintv_t fmt_search(FMTIndex *fmt, const string &q) fmt_extend2(fmt, &ik, &ok, 0, c1, c2); ik = ok; ik.info = i + 1; + //cout << ik.x[0] << ' ' << ik.x[1] << ' ' << ik.x[2] << endl; } else { break; } } + if (i < qlen && bval(q[i]) < 4) { // 最后一次扩展 c1 = cbval(q[i]); fmt_extend1(fmt, &ik, &ok, 0, c1); ik = ok; ik.info = i + 1; + //cout << ik.x[0] << ' ' << ik.x[1] << ' ' << ik.x[2] << endl; } - //t4 += realtime() - tt; return ik; } @@ -595,6 +596,34 @@ inline void kmer_setval_at(KmerEntry *ke, bwtintv_t ik, int pos) *((uint32_t *)arr) = (uint32_t)ik.x[2]; } +inline void full_entry_set(FullEntry *fe, bwtintv_t ik) +{ + uint8_t *arr = fe->intv_arr; + arr[0] = (uint8_t)(ik.x[0] >> 32); + *((uint32_t *)(arr + 1)) = (uint32_t)ik.x[0]; + arr += 5; + arr[0] = (uint8_t)(ik.x[1] >> 32); + *((uint32_t *)(arr + 1)) = (uint32_t)ik.x[1]; + arr += 5; + *((uint32_t *)arr) = (uint32_t)ik.x[2]; +} + +inline void full_entry_get(FullEntry *fe, bwtintv_t *ok) +{ + bwtint_t x0, x1, x2; + uint8_t *arr = fe->intv_arr; + x0 = *arr; + x0 = (x0 << 32) | *((uint32_t *)(arr + 1)); + arr += 5; + x1 = *arr; + x1 = (x1 << 32) | *((uint32_t *)(arr + 1)); + arr += 5; + x2 = *((uint32_t *)arr); + ok->x[0] = x0; + ok->x[1] = x1; + ok->x[2] = x2; +} + // 查找并保存kmer中每扩展一个碱基对应的fmt位置信息 void fmt_search_store_kmer(FMTIndex *fmt, const string &q, KmerEntry &ke) { @@ -687,10 +716,55 @@ bwtintv_t fmt_search_use_kmer(FMTIndex *fmt, const string &q) { // 最后一次扩展 c1 = cbval(q[i]); fmt_extend1(fmt, &ik, &ok, 0, c1); + //fmt_extend2(fmt, &ik, &ok, 0, c1, 3); ik = ok; ik.info = i + 1; } //t2 += realtime() - tt; + //cout << ik.x[2] << endl; + return ik; +} + +// 利用fmt搜索seed,利用full entry加速搜索过程 +bwtintv_t fmt_search_use_full_entry(FMTIndex *fmt, const string &q) +{ + bwtintv_t ik; + bwtintv_t ok; + int i, c1, c2, x = 0; + int qlen = (int)q.size(); + + // 先利用kmer进行索引查询 + uint32_t qbit = 0; + x = min(FULL_ENTRY_LEN, qlen); + for (i = 0; i < x; ++i) + { + qbit |= bval(q[i]) << ((FULL_ENTRY_LEN - 1 - i) << 1); + } + full_entry_get(&fmt->full_entry[qbit], &ik); + + // 每次扩展两个碱基 + for (i = x; i + 1 < qlen; i += 2) + { + if (bval(q[i]) < 4 && bval(q[i + 1]) < 4) + { + c1 = cbval(q[i]); + c2 = cbval(q[i + 1]); + fmt_extend2(fmt, &ik, &ok, 0, c1, c2); + ik = ok; + ik.info = i + 1; + } + else + { + break; + } + } + if (i < qlen && bval(q[i]) < 4) + { // 最后一次扩展 + c1 = cbval(q[i]); + fmt_extend1(fmt, &ik, &ok, 0, c1); + ik = ok; + ik.info = i + 1; + } return ik; } @@ -721,6 +795,7 @@ void gen_all_seq(vector &vseq, int kmer_len) #define GEN_BWT_IDX 0 #define GEN_FMT_IDX 0 #define GEN_KMER_IDX 0 +#define GEN_FULL_ENTRY_IDX 0 #define CMP_FMT_BWT_TIME 1 #define CMP_FMT_BWT_RESULT 1 @@ -729,13 +804,14 @@ int main_fmtidx(int argc, char **argv) { string prefix = argv[1]; string bwt_str = prefix + ".bwt.str"; - string bwt_idx, fmt_idx, kmer_idx; + string bwt_idx, fmt_idx, kmer_idx, full_entry_idx; bwt_t *bwt; FMTIndex *fmt; - ostringstream oss_bwt, oss_fmt, oss_kmer; + ostringstream oss_bwt, oss_fmt, oss_kmer, oss_full_entry; oss_bwt << '.' << OCC_INTERVAL; oss_fmt << '.' << FMT_OCC_INTERVAL; oss_kmer << '.' << KMER_LEN; + oss_full_entry << '.' << FULL_ENTRY_LEN; #ifdef FMT_MID_INTERVAL oss_fmt << '.' << FMT_MID_INTERVAL; #endif @@ -743,10 +819,12 @@ int main_fmtidx(int argc, char **argv) bwt_idx = prefix + oss_bwt.str() + ".bwt"; fmt_idx = prefix + oss_fmt.str() + ".fmt"; kmer_idx = prefix + oss_kmer.str() + ".kmer"; + full_entry_idx = prefix + oss_full_entry.str() + ".full_entry"; cout << bwt_idx << endl; cout << fmt_idx << endl; cout << kmer_idx << endl; + cout << full_entry_idx << endl; // 生成或读取bwt索引文件 double time_read_bwt = realtime(); @@ -772,9 +850,9 @@ int main_fmtidx(int argc, char **argv) cout << "[time gen/read fmt:] " << time_read_fmt << "s" << endl; // 生成或读取kmer信息 - vector vkmer; double time_gen_kmer = realtime(); #if GEN_KMER_IDX + vector vkmer; gen_all_seq(vkmer, KMER_LEN); fmt->kmer_entry = (KmerEntry*)malloc(KMER_ARR_SIZE * sizeof(KmerEntry)); for (size_t i = 0; i < vkmer.size(); ++i) @@ -795,13 +873,37 @@ int main_fmtidx(int argc, char **argv) time_gen_kmer = realtime() - time_gen_kmer; cout << "[time gen kmer:] " << time_gen_kmer << "s" << endl; +// 生成或读取kmer匹配的最后单一结果 + double time_gen_full_entry = realtime(); +#if GEN_FULL_ENTRY_IDX + vector kmer_arr; + gen_all_seq(kmer_arr, FULL_ENTRY_LEN); + fmt->full_entry = (FullEntry *)malloc(FULL_ENTRY_ARR_SIZE * sizeof(FullEntry)); + for (size_t i = 0; i < kmer_arr.size(); ++i) + { + bwtintv_t p1 = fmt_search(fmt, kmer_arr[i]); + full_entry_set(&fmt->full_entry[i], p1); + bwtintv_t p2 = bwt_search(bwt, kmer_arr[i]); + if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2]) + cout << kmer_arr[i] << endl + << p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl + << p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl; + } + dump_full_entry_idx(full_entry_idx.c_str(), fmt->full_entry); +#else + fmt->full_entry = restore_full_entry_idx(full_entry_idx.c_str()); +#endif + time_gen_full_entry = realtime() - time_gen_full_entry; + cout << "[time gen full kmer entry:] " << time_gen_full_entry << "s" << endl; + // 生成随机序列进行测试 int seq_size = 10000000; - // int seq_size = 1; - int seq_len = 23; + //int seq_size = 1; + int seq_len = 14; vector seq_arr(seq_size); // seq_arr[0] = "GCGATACTAAGA"; // seq_arr[0] = "GAGAGCTGTTCCCGTGTTTTCCATGGTTT"; + seq_arr[0] = "ACCTTTCGAATTGA"; #if 1 srand(time(NULL)); double time_gen_seq = realtime(); @@ -812,10 +914,17 @@ int main_fmtidx(int argc, char **argv) #endif // 对比bwt和fmt搜索的时间 + bwtint_t max_gap = 0; #if CMP_FMT_BWT_TIME + // int nintv_0 = 0; double time_bwt_search = realtime(); - for (int i = 0; i < (int)seq_arr.size(); ++i) - bwt_search(bwt, seq_arr[i]); + for (int i = 0; i < (int)seq_arr.size(); ++i) { + bwtintv_t p = bwt_search(bwt, seq_arr[i]); + if (max_gap < p.x[2]) + max_gap = p.x[2]; + } + // cout << "all: " << seq_arr.size() << " zero: " << nintv_0 << endl; + cout << "max gap: " << max_gap << endl; time_bwt_search = realtime() - time_bwt_search; cout << "[time bwt search:] " << time_bwt_search << "s" << endl; @@ -831,16 +940,22 @@ int main_fmtidx(int argc, char **argv) time_fmt_kmer_search = realtime() - time_fmt_kmer_search; cout << "[time fmt kmer search:] " << time_fmt_kmer_search << "s" << endl; - cout << "t1: " << t1 << "s; t2: " << t2 << "s" << endl; - cout << "t3: " << t3 << "s; t4: " << t4 << "s" << endl; - cout << "t5: " << t5 << "s; t4: " << t6 << "s" << endl; + double time_fmt_full_entry_search = realtime(); + for (int i = 0; i < (int)seq_arr.size(); ++i) + fmt_search_use_full_entry(fmt, seq_arr[i]); + time_fmt_full_entry_search = realtime() - time_fmt_full_entry_search; + cout << "[time fmt full entry search:] " << time_fmt_full_entry_search << "s" << endl; + + // cout << "t1: " << t1 << "s; t2: " << t2 << "s" << endl; + // cout << "t3: " << t3 << "s; t4: " << t4 << "s" << endl; + // cout << "t5: " << t5 << "s; t4: " << t6 << "s" << endl; #endif // 对比bwt和fmt搜索的结果 #if CMP_FMT_BWT_RESULT double time_cmp = realtime(); - for (int i = 0; i < (int)seq_arr.size() / 100; ++i) + for (int i = 0; i < (int)seq_arr.size(); ++i) { bwtintv_t p1 = bwt_search2(bwt, seq_arr[i]); // bwtintv_t p2 = fmt_search(fmt, seq_arr[i]); diff --git a/fmt_index.h b/fmt_index.h index 07114c7..f002302 100644 --- a/fmt_index.h +++ b/fmt_index.h @@ -39,7 +39,7 @@ ((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff) #define KMER_LEN 12 -#define KMER_ARR_SIZE ((1 << (KMER_LEN << 1))) +#define KMER_ARR_SIZE (1 << (KMER_LEN << 1)) // 用来保存kmer对应的fmt的位置信息 struct KmerEntry { @@ -47,6 +47,14 @@ struct KmerEntry uint8_t intv_arr[14 * KMER_LEN]; // 保存kmer中每扩展一个碱基对应的bwtintv_t数据 }; +#define FULL_ENTRY_LEN 14 +#define FULL_ENTRY_ARR_SIZE (1 << (FULL_ENTRY_LEN << 1)) +// kmer全部匹配对应的最后fmt匹配结果 +struct FullEntry +{ + uint8_t intv_arr[14]; +}; + // fm-index, extend twice in one search step (one memory access) struct FMTIndex { @@ -63,6 +71,7 @@ struct FMTIndex uint8_t last_base; // dollar转换成的base // 保存kmer对应的fmt位置信息 KmerEntry *kmer_entry; + FullEntry *full_entry; // suffix array int sa_intv; bwtint_t n_sa;