From bad31121510cdf2057688e7c8d6bbea82fad8fea Mon Sep 17 00:00:00 2001 From: zzh Date: Mon, 25 Mar 2024 10:41:02 +0800 Subject: [PATCH] =?UTF-8?q?=E5=9C=A8=E8=BF=99=E4=B8=AA=E5=9F=BA=E7=A1=80?= =?UTF-8?q?=E4=B8=8A=E4=BF=AE=E6=94=B9=E7=94=A8=E4=BA=8E=E6=B5=8B=E8=AF=95?= =?UTF-8?q?fmt=E6=B6=88=E8=9E=8D=E6=B5=8B=E8=AF=95=E7=9A=84=E7=89=88?= =?UTF-8?q?=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .vscode/settings.json | 3 +- fmt_index.cpp | 197 ++++++++---------------------------------- fmt_index.h | 5 +- 3 files changed, 40 insertions(+), 165 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index e13a436..e522d91 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -2,6 +2,7 @@ "files.associations": { "ostream": "cpp", "iostream": "cpp", - "string": "cpp" + "string": "cpp", + "sstream": "cpp" } } \ No newline at end of file diff --git a/fmt_index.cpp b/fmt_index.cpp index 778e37d..36f619d 100644 --- a/fmt_index.cpp +++ b/fmt_index.cpp @@ -764,66 +764,6 @@ bwtintv_t fmt_search_use_kmer(FMTIndex *fmt, const string &q) return ik; } -// 利用fmt搜索seed,利用full entry加速搜索过程 -bwtintv_t fmt_search_use_full_entry(FMTIndex *fmt, bwt_t *bwt, const string &q) -{ - bwtintv_t ik; - - int i, c1, c2, x = 0; - int qlen = (int)q.size(); - - // 先利用kmer进行索引查询 - uint32_t qbit = 0; - x = min(FULL_ENTRY_LEN, qlen); - for (i = 0; i < x; ++i) - { - qbit |= bval(q[i]) << ((FULL_ENTRY_LEN - 1 - i) << 1); - } - full_entry_get(&fmt->full_entry[qbit], &ik); - -#if 0 - bwtintv_t ok; - // 每次扩展两个碱基 - for (i = x; i + 1 < qlen; i += 2) - { - if (bval(q[i]) < 4 && bval(q[i + 1]) < 4) - { - c1 = cbval(q[i]); - c2 = cbval(q[i + 1]); - fmt_extend2(fmt, &ik, &ok, 0, c1, c2); - ik = ok; - ik.info = i + 1; - } - else - { - break; - } - } - if (i < qlen && bval(q[i]) < 4) - { // 最后一次扩展 - c1 = cbval(q[i]); - fmt_extend1(fmt, &ik, &ok, 0, c1); - ik = ok; - ik.info = i + 1; - } -#else - bwtintv_t ok[4]; - for (i = x; i < qlen; ++i) - { - if (bval(q[i]) < 4) - { - c1 = cbval(q[i]); - bwt_extend(bwt, &ik, ok, 0); - ik = ok[c1]; - ik.info = i + 1; - } - } - -#endif - - return ik; -} - // 将用字符串表示的序列计算成用2bit表示的序列 uint32_t str2bit(string &str) { @@ -982,10 +922,9 @@ void create_xmer_index(FMTIndex *fmt) } // 利用fmt搜索seed,利用xmer加速搜索过程 -bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, const string &q) +bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, bwt_t *bwt, const string &q) { bwtintv_t ik; - bwtintv_t ok; int i, c1, c2, x = 0; int qlen = (int)q.size(); @@ -1018,7 +957,8 @@ bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, const string &q) { xmer_getval_at(fmt->xmer_hash.xe10[qbit >> 8].intv_arr, &ik, x - 1); } - +#if 0 + bwtintv_t ok; // 每次扩展两个碱基 for (i = x; i + 1 < qlen; i += 2) { @@ -1042,6 +982,19 @@ bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, const string &q) ik = ok; ik.info = i + 1; } +#else + bwtintv_t ok[4]; + for (i = x; i < qlen; ++i) { + if (bval(q[i]) < 4) { + c1 = cbval(q[i]); + bwt_extend(bwt, &ik, ok, 0); + ik = ok[c1]; + ik.info = i + 1; + } + } + +#endif + return ik; } @@ -1082,10 +1035,10 @@ void create_kmer_bit(FMTIndex *fmt, string &prefix, uint64_t kmer_len) } uint32_t pos = i >> 4; uint32_t bit_pos = i & 0xf; - bwtintv_t p = fmt_search_use_xmer(fmt, seq); - if (p.x[2] > 0) { - kbit[pos] |= 1 << bit_pos; - } + //bwtintv_t p = fmt_search_use_xmer(fmt, seq); + //if (p.x[2] > 0) { + // kbit[pos] |= 1 << bit_pos; + //} } FILE *fp; fp = xopen(kmer_bit.c_str(), "wb"); @@ -1096,11 +1049,9 @@ void create_kmer_bit(FMTIndex *fmt, string &prefix, uint64_t kmer_len) #define GEN_BWT_IDX 0 #define GEN_FMT_IDX 0 -#define GEN_KMER_IDX 0 #define GEN_XMER_IDX 0 -#define GEN_FULL_ENTRY_IDX 0 -#define CMP_FMT_BWT_TIME 0 -#define CMP_FMT_BWT_RESULT 1 +#define CMP_FMT_BWT_TIME 1 +#define CMP_FMT_BWT_RESULT 0 // argv[1] 应该是索引的前缀 int main_fmtidx(int argc, char **argv) @@ -1108,28 +1059,21 @@ int main_fmtidx(int argc, char **argv) //cout << "size: " << sizeof(FullEntry) << '\t' << sizeof(XmerEntryArr) << endl; string prefix = argv[1]; string bwt_str = prefix + ".bwt.str"; - string bwt_idx, fmt_idx, kmer_idx, full_entry_idx, xmer_idx; + string bwt_idx, fmt_idx, xmer_idx; bwt_t *bwt; FMTIndex *fmt; - ostringstream oss_bwt, oss_fmt, oss_kmer, oss_full_entry; + ostringstream oss_bwt, oss_fmt, oss_xmer; oss_bwt << '.' << OCC_INTERVAL; oss_fmt << '.' << FMT_OCC_INTERVAL; - oss_kmer << '.' << KMER_LEN; - oss_full_entry << '.' << FULL_ENTRY_LEN; -#ifdef FMT_MID_INTERVAL + oss_xmer << '.' << XMER_LEN; oss_fmt << '.' << FMT_MID_INTERVAL; -#endif bwt_idx = prefix + oss_bwt.str() + ".bwt"; fmt_idx = prefix + oss_fmt.str() + ".fmt"; - kmer_idx = prefix + oss_kmer.str() + ".kmer"; - full_entry_idx = prefix + oss_full_entry.str() + ".full_entry"; - xmer_idx = prefix + ".14.xmer"; + xmer_idx = prefix + ".14.kmer"; cout << bwt_idx << endl; cout << fmt_idx << endl; - cout << kmer_idx << endl; - cout << full_entry_idx << endl; cout << xmer_idx << endl; @@ -1156,53 +1100,6 @@ int main_fmtidx(int argc, char **argv) time_read_fmt = realtime() - time_read_fmt; cout << "[time gen/read fmt:] " << time_read_fmt << "s" << endl; -// 生成或读取kmer信息 - double time_gen_kmer = realtime(); -#if GEN_KMER_IDX - vector vkmer; - gen_all_seq(vkmer, KMER_LEN); - fmt->kmer_entry = (KmerEntry*)malloc(KMER_ARR_SIZE * sizeof(KmerEntry)); - for (size_t i = 0; i < vkmer.size(); ++i) - { - fmt_search_store_kmer(fmt, vkmer[i], fmt->kmer_entry[i]); - bwtintv_t p1; - kmer_getval_at(&fmt->kmer_entry[i], &p1, KMER_LEN - 1); - bwtintv_t p2 = bwt_search(bwt, vkmer[i]); - if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2]) - cout << vkmer[i] << endl - << p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl - << p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl; - } - dump_kmer_idx(kmer_idx.c_str(), fmt->kmer_entry); -#else - fmt->kmer_entry = restore_kmer_idx(kmer_idx.c_str()); -#endif - time_gen_kmer = realtime() - time_gen_kmer; - cout << "[time gen kmer:] " << time_gen_kmer << "s" << endl; - -// 生成或读取kmer匹配的最后单一结果 - double time_gen_full_entry = realtime(); -#if GEN_FULL_ENTRY_IDX - vector kmer_arr; - gen_all_seq(kmer_arr, FULL_ENTRY_LEN); - fmt->full_entry = (FullEntry *)malloc(FULL_ENTRY_ARR_SIZE * sizeof(FullEntry)); - for (size_t i = 0; i < kmer_arr.size(); ++i) - { - bwtintv_t p1 = fmt_search(fmt, kmer_arr[i]); - full_entry_set(&fmt->full_entry[i], p1); - bwtintv_t p2 = bwt_search(bwt, kmer_arr[i]); - if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2]) - cout << kmer_arr[i] << endl - << p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl - << p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl; - } - dump_full_entry_idx(full_entry_idx.c_str(), fmt->full_entry); -#else - fmt->full_entry = restore_full_entry_idx(full_entry_idx.c_str()); -#endif - time_gen_full_entry = realtime() - time_gen_full_entry; - cout << "[time gen full kmer entry:] " << time_gen_full_entry << "s" << endl; - // 生成xmer索引文件 double time_gen_xmer_hash = realtime(); #if GEN_XMER_IDX @@ -1238,17 +1135,17 @@ int main_fmtidx(int argc, char **argv) // 对比bwt和fmt搜索的时间 #if CMP_FMT_BWT_TIME - int nintv_0 = 0; + //int nintv_0 = 0; //bwtint_t max_gap = 0; double time_bwt_search = realtime(); for (int i = 0; i < (int)seq_arr.size(); ++i) { bwtintv_t p = bwt_search(bwt, seq_arr[i]); //if (max_gap < p.x[2]) // max_gap = p.x[2]; - if (p.x[2] == 0) - ++nintv_0; + //if (p.x[2] == 0) + // ++nintv_0; } - cout << "all: " << seq_arr.size() << " zero: " << nintv_0 << endl; + //cout << "all: " << seq_arr.size() << " zero: " << nintv_0 << endl; //cout << "max gap: " << max_gap << endl; time_bwt_search = realtime() - time_bwt_search; cout << "[time bwt search:] " << time_bwt_search << "s" << endl; @@ -1259,21 +1156,9 @@ int main_fmtidx(int argc, char **argv) time_fmt_search = realtime() - time_fmt_search; cout << "[time fmt search:] " << time_fmt_search << "s" << endl; - double time_fmt_kmer_search = realtime(); - for (int i = 0; i < (int)seq_arr.size(); ++i) - fmt_search_use_kmer(fmt, seq_arr[i]); - time_fmt_kmer_search = realtime() - time_fmt_kmer_search; - cout << "[time fmt kmer search:] " << time_fmt_kmer_search << "s" << endl; - - double time_fmt_full_entry_search = realtime(); - for (int i = 0; i < (int)seq_arr.size(); ++i) - fmt_search_use_full_entry(fmt, bwt, seq_arr[i]); - time_fmt_full_entry_search = realtime() - time_fmt_full_entry_search; - cout << "[time fmt full entry search:] " << time_fmt_full_entry_search << "s" << endl; - double time_xmer_search = realtime(); for (int i = 0; i < (int)seq_arr.size(); ++i) - fmt_search_use_xmer(fmt, seq_arr[i]); + fmt_search_use_xmer(fmt, bwt, seq_arr[i]); time_xmer_search = realtime() - time_xmer_search; cout << "[time xmer search:] " << time_xmer_search << "s" << endl; #endif @@ -1281,9 +1166,9 @@ int main_fmtidx(int argc, char **argv) // cout << "t3: " << t3 << "s; t4: " << t4 << "s" << endl; // cout << "t5: " << t5 << "s; t4: " << t6 << "s" << endl; - ostringstream oss_kmer_bit; - oss_kmer_bit << prefix << "." << kmer_bit_len << ".kmer.bit"; - uint16_t *kbit = restore_kmer_bit(oss_kmer_bit.str().c_str(), kmer_bit_len); +// ostringstream oss_kmer_bit; +// oss_kmer_bit << prefix << "." << kmer_bit_len << ".kmer.bit"; +// uint16_t *kbit = restore_kmer_bit(oss_kmer_bit.str().c_str(), kmer_bit_len); // 对比bwt和fmt搜索的结果 #if CMP_FMT_BWT_RESULT @@ -1292,25 +1177,11 @@ int main_fmtidx(int argc, char **argv) { bwtintv_t p1 = bwt_search2(bwt, seq_arr[i]); // bwtintv_t p2 = fmt_search(fmt, seq_arr[i]); - // bwtintv_t p2 = fmt_search_use_kmer(fmt, seq_arr[i]); bwtintv_t p2 = fmt_search_use_xmer(fmt, seq_arr[i]); - // bwtintv_t p2 = fmt_search_use_full_entry(fmt, seq_arr[i]); if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2]) cout << seq_arr[i] << endl << p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl << p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl; - - uint64_t qbit = 0; - for (int k = 0; k < kmer_bit_len; ++k) - { - uint64_t base = bval(seq_arr[i][k]); - qbit |= base << ((kmer_bit_len - 1 - k) << 1); - } - uint32_t pos = qbit >> 4; - uint32_t bit_pos = qbit & 0xf; - //cout << pos << '\t' << bit_pos << '\t' << kbit[pos] << '\t' << (kbit[pos] & (1 << bit_pos)) << endl; - if ((p2.x[2] >0) != ((kbit[pos] & (1 << bit_pos)) > 0)) - cout << "diff: " << seq_arr[i] << '\t' << p2.x[2] << '\t' << (kbit[pos] & (1 << bit_pos)) << endl; } time_cmp = realtime() - time_cmp; cout << "[time compare bwt & fmt:] " << time_cmp << "s" << endl; diff --git a/fmt_index.h b/fmt_index.h index cd7a2e3..a17e041 100644 --- a/fmt_index.h +++ b/fmt_index.h @@ -5,7 +5,7 @@ #define FMT_OCC_INTERVAL (1LL << FMT_OCC_INTV_SHIFT) #define FMT_OCC_INTV_MASK (FMT_OCC_INTERVAL - 1) -#define FMT_MID_INTV_SHIFT 5 +#define FMT_MID_INTV_SHIFT 6 #define FMT_MID_INTERVAL (1LL << FMT_MID_INTV_SHIFT) #define FMT_MID_INTV_MASK (FMT_MID_INTERVAL - 1) @@ -40,6 +40,9 @@ #define KMER_LEN 12 #define KMER_ARR_SIZE (1 << (KMER_LEN << 1)) + +#define XMER_LEN 14 + // 用来保存kmer对应的fmt的位置信息 struct KmerEntry {