From 914cbd34abb5775373f0b49cf0147baa5c383152 Mon Sep 17 00:00:00 2001 From: zzh Date: Sat, 3 Feb 2024 10:06:51 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=BA=86kmer=20index?= =?UTF-8?q?=EF=BC=8C=E6=9C=89=E6=95=88=E6=9E=9C=EF=BC=8C=E4=BD=86=E6=98=AF?= =?UTF-8?q?=E4=B8=8D=E6=98=8E=E6=98=BE=EF=BC=8C=E6=9C=89=E5=8F=AF=E8=83=BD?= =?UTF-8?q?=E6=98=AF=E9=9A=8F=E7=9D=80=E5=BA=8F=E5=88=97=E7=9A=84=E8=BE=B9?= =?UTF-8?q?=E9=95=BF=EF=BC=8C=E5=90=8E=E8=BE=B9=E7=9A=84=E6=89=A9=E5=B1=95?= =?UTF-8?q?=E6=9B=B4=E8=80=97=E6=97=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bwt.cpp | 3 +- bwt.h | 2 +- fmt_index.cpp | 238 ++++++++++++++++++++++++++++++++++++++++++++++---- fmt_index.h | 13 ++- util.cpp | 11 +++ util.h | 2 + 6 files changed, 251 insertions(+), 18 deletions(-) diff --git a/bwt.cpp b/bwt.cpp index 1177ef6..942d73f 100644 --- a/bwt.cpp +++ b/bwt.cpp @@ -246,7 +246,7 @@ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_b } // 利用bwt搜索seed,完整搜索,只需要单向搜索 -void bwt_search(bwt_t *bwt, const string &q) +bwtintv_t bwt_search(bwt_t *bwt, const string &q) { bwtintv_t ik, ok[4]; int i, c, x = 0; @@ -264,6 +264,7 @@ void bwt_search(bwt_t *bwt, const string &q) ik.info = i + 1; } } + return ik; } // 扩展两次 diff --git a/bwt.h b/bwt.h index d13d900..5a40b25 100644 --- a/bwt.h +++ b/bwt.h @@ -57,7 +57,7 @@ bwt_t *restore_bwt(const char *fn); void create_interval_occ_bwt(bwt_t *bwt); void dump_bwt(const char *fn, const bwt_t *bwt); // 利用bwt搜索seed,完整搜索,只需要单向搜索 -void bwt_search(bwt_t *bwt, const string &q); +bwtintv_t bwt_search(bwt_t *bwt, const string &q); // 每次扩展两步 bwtintv_t bwt_search2(bwt_t *bwt, const string &q); diff --git a/fmt_index.cpp b/fmt_index.cpp index 365c137..c90ae37 100644 --- a/fmt_index.cpp +++ b/fmt_index.cpp @@ -138,6 +138,7 @@ void fmt_gen_cnt_table(uint32_t cnt_table[4][256]) } } +// 将fmt结构数据写入到二进制文件 void dump_fmt(const char *fn, const FMTIndex *fmt) { FILE *fp; @@ -153,25 +154,47 @@ void dump_fmt(const char *fn, const FMTIndex *fmt) err_fclose(fp); } +// 将kmer hash数据写入到文件 +void dump_kmer_idx(const char *fn, const KmerEntry *kmer_entry) +{ + FILE *fp; + fp = xopen(fn, "wb"); + err_fwrite(kmer_entry, 1, KMER_ARR_SIZE * sizeof(KmerEntry), fp); + err_fflush(fp); + err_fclose(fp); +} + +// 从文件中读取kmer hash信息 +KmerEntry *restore_kmer_idx(const char *fn) +{ + FILE *fp = xopen(fn, "rb"); + uint32_t kmer_bytes = KMER_ARR_SIZE * sizeof(KmerEntry); + KmerEntry *kmer_entry = (KmerEntry *)malloc(kmer_bytes); + fread_fix(fp, kmer_bytes, kmer_entry); + err_fclose(fp); + return kmer_entry; +} + +// 从文件中读取fmt结构数据 FMTIndex *restore_fmt(const char *fn) { FMTIndex *fmt; fmt = (FMTIndex *)calloc(1, sizeof(FMTIndex)); - FILE *fp = fopen(fn, "rb"); + FILE *fp = xopen(fn, "rb"); fseek(fp, 0, SEEK_END); fmt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 6 - 3) >> 2; // 以32位word为单位计算的size fmt->bwt = (uint32_t *)calloc(fmt->bwt_size, 4); fseek(fp, 0, SEEK_SET); - fread(&fmt->primary, sizeof(bwtint_t), 1, fp); - fread(&fmt->sec_primary, sizeof(bwtint_t), 1, fp); - fread(&fmt->sec_bcp, sizeof(uint8_t), 1, fp); - fread(&fmt->first_base, sizeof(uint8_t), 1, fp); - fread(&fmt->last_base, sizeof(uint8_t), 1, fp); - fread(fmt->L2 + 1, sizeof(bwtint_t), 4, fp); + err_fread_noeof(&fmt->primary, sizeof(bwtint_t), 1, fp); + err_fread_noeof(&fmt->sec_primary, sizeof(bwtint_t), 1, fp); + err_fread_noeof(&fmt->sec_bcp, sizeof(uint8_t), 1, fp); + err_fread_noeof(&fmt->first_base, sizeof(uint8_t), 1, fp); + err_fread_noeof(&fmt->last_base, sizeof(uint8_t), 1, fp); + err_fread_noeof(fmt->L2 + 1, sizeof(bwtint_t), 4, fp); fread_fix(fp, fmt->bwt_size << 2, fmt->bwt); fmt->seq_len = fmt->L2[4]; - fclose(fp); + err_fclose(fp); fmt_gen_cnt_occ(fmt); // 字节所能表示的各种碱基组合中,各个碱基的累积数量 return fmt; } @@ -520,6 +543,138 @@ bwtintv_t fmt_search(FMTIndex *fmt, const string &q) return ik; } +// 获取kmer的fmt匹配信息 +inline void kmer_getval_at(KmerEntry *ke, bwtintv_t *ok, int pos) +{ + bwtint_t x0, x1, x2; + int byte_idx = pos * 14; + uint8_t *arr = ke->intv_arr + byte_idx; + x0 = *arr; + x0 = (x0 << 32) | *((uint32_t *)(arr + 1)); + arr += 5; + x1 = *arr; + x1 = (x1 << 32) | *((uint32_t *)(arr + 1)); + arr += 5; + x2 = *((uint32_t *)arr); + ok->x[0] = x0; + ok->x[1] = x1; + ok->x[2] = x2; +} + +// 设置kmer第pos个碱基对应的fmt匹配信息 +inline void kmer_setval_at(KmerEntry *ke, bwtintv_t ik, int pos) +{ + int byte_idx = pos * 14; + uint8_t *arr = ke->intv_arr + byte_idx; + arr[0] = (uint8_t)(ik.x[0] >> 32); + *((uint32_t *)(arr + 1)) = (uint32_t)ik.x[0]; + arr += 5; + arr[0] = (uint8_t)(ik.x[1] >> 32); + *((uint32_t *)(arr + 1)) = (uint32_t)ik.x[1]; + arr += 5; + *((uint32_t *)arr) = (uint32_t)ik.x[2]; +} + +// 查找并保存kmer中每扩展一个碱基对应的fmt位置信息 +void fmt_search_store_kmer(FMTIndex *fmt, const string &q, KmerEntry &ke) +{ + bwtintv_t ik; + int i, c1, c2; + int qlen = (int)q.size(); + bwtint_t tk[4], tl[4]; + + fmt_set_intv(fmt, bval(q[0]), ik); + kmer_setval_at(&ke, ik, 0); + + // 每次扩展两个碱基 + for (i = 1; i + 1 < qlen; i += 2) + { + if (bval(q[i]) < 4 && bval(q[i + 1]) < 4) + { + c1 = cbval(q[i]); + c2 = cbval(q[i + 1]); + + fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk); + fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl); + // 第一次扩展的结果 + ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0]; + ik.x[1] = fmt->L2[c1] + 1 + tk[1]; + ik.x[2] = tl[1] - tk[1]; + kmer_setval_at(&ke, ik, i); + + // 第二次扩展的结果 + ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[2] - tk[2]; + ik.x[1] = fmt->L2[c2] + 1 + tk[3]; + ik.x[2] = tl[3] - tk[3]; + kmer_setval_at(&ke, ik, i + 1); + } + else + { + break; + } + } + if (i < qlen && bval(q[i]) < 4) + { // 最后一次扩展 + c1 = cbval(q[i]); + c2 = 3; + fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk); + fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl); + // 第一次扩展的结果 + ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0]; + ik.x[1] = fmt->L2[c1] + 1 + tk[1]; + ik.x[2] = tl[1] - tk[1]; + kmer_setval_at(&ke, ik, i); + } +} + +// 利用fmt搜索seed,利用kmer加速搜索过程 +bwtintv_t fmt_search_use_kmer(FMTIndex *fmt, const string &q) +{ + bwtintv_t ik; + bwtintv_t ok; + int i, c1, c2, x = 0; + int qlen = (int)q.size(); + + // 先利用kmer进行索引查询 + uint32_t qbit = 0; + x = min(KMER_LEN, qlen); + double tt = realtime(); + for (i = 0; i < x; ++i) + { + qbit |= bval(q[i]) << ((KMER_LEN - 1 - i) << 1); + } + kmer_getval_at(&fmt->kmer_entry[qbit], &ik, x - 1); + t1 += realtime() - tt; + + tt = realtime(); + // 每次扩展两个碱基 + for (i = x; i + 1 < qlen; i += 2) + { + if (bval(q[i]) < 4 && bval(q[i + 1]) < 4) + { + c1 = cbval(q[i]); + c2 = cbval(q[i + 1]); + fmt_extend2(fmt, &ik, &ok, 0, c1, c2); + ik = ok; + ik.info = i + 1; + } + else + { + break; + } + } + if (i < qlen && bval(q[i]) < 4) + { // 最后一次扩展 + c1 = cbval(q[i]); + fmt_extend1(fmt, &ik, &ok, 0, c1); + ik = ok; + ik.info = i + 1; + } + t2 += realtime() - tt; + return ik; +} + +// 将用字符串表示的序列计算成用2bit表示的序列 uint32_t str2bit(string &str) { uint32_t pac = 0; @@ -528,8 +683,24 @@ uint32_t str2bit(string &str) return pac; } +// 生成所有KMER_LEN长度的序列,字符串表示 +void gen_all_seq(vector &vseq, int kmer_len) +{ + uint32_t seq_up_val = (1 << (kmer_len << 1)); + vseq.clear(); + vseq.resize(seq_up_val); + for (uint32_t i = 0; i < seq_up_val; ++i) + { + for (int j = kmer_len - 1; j >= 0; --j) + { + vseq[i].push_back(BASE[(i >> (j << 1)) & 3]); + } + } +} + #define GEN_BWT_IDX 0 #define GEN_FMT_IDX 0 +#define GEN_KMER_IDX 0 #define CMP_FMT_BWT_TIME 1 #define CMP_FMT_BWT_RESULT 1 @@ -538,21 +709,24 @@ int main_fmtidx(int argc, char **argv) { string prefix = argv[1]; string bwt_str = prefix + ".bwt.str"; - string bwt_idx, fmt_idx; + string bwt_idx, fmt_idx, kmer_idx; bwt_t *bwt; FMTIndex *fmt; - ostringstream oss_bwt, oss_fmt; + ostringstream oss_bwt, oss_fmt, oss_kmer; oss_bwt << '.' << OCC_INTERVAL; oss_fmt << '.' << FMT_OCC_INTERVAL; + oss_kmer << '.' << KMER_LEN; #ifdef FMT_MID_INTERVAL oss_fmt << '.' << FMT_MID_INTERVAL; #endif bwt_idx = prefix + oss_bwt.str() + ".bwt"; fmt_idx = prefix + oss_fmt.str() + ".fmt"; + kmer_idx = prefix + oss_kmer.str() + ".kmer"; cout << bwt_idx << endl; cout << fmt_idx << endl; + cout << kmer_idx << endl; // 生成或读取bwt索引文件 double time_read_bwt = realtime(); @@ -577,10 +751,34 @@ int main_fmtidx(int argc, char **argv) time_read_fmt = realtime() - time_read_fmt; cout << "[time gen/read fmt:] " << time_read_fmt << "s" << endl; -// 生成随机序列进行测试 +// 生成或读取kmer信息 + vector vkmer; + double time_gen_kmer = realtime(); +#if GEN_KMER_IDX + gen_all_seq(vkmer, KMER_LEN); + fmt->kmer_entry = (KmerEntry*)malloc(KMER_ARR_SIZE * sizeof(KmerEntry)); + for (size_t i = 0; i < vkmer.size(); ++i) + { + fmt_search_store_kmer(fmt, vkmer[i], fmt->kmer_entry[i]); + bwtintv_t p1; + kmer_getval_at(&fmt->kmer_entry[i], &p1, KMER_LEN - 1); + bwtintv_t p2 = bwt_search(bwt, vkmer[i]); + if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2]) + cout << vkmer[i] << endl + << p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl + << p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl; + } + dump_kmer_idx(kmer_idx.c_str(), fmt->kmer_entry); +#else + fmt->kmer_entry = restore_kmer_idx(kmer_idx.c_str()); +#endif + time_gen_kmer = realtime() - time_gen_kmer; + cout << "[time gen kmer:] " << time_gen_kmer << "s" << endl; + + // 生成随机序列进行测试 int seq_size = 10000000; // int seq_size = 1; - int seq_len = 11; + int seq_len = 19; vector seq_arr(seq_size); // seq_arr[0] = "GCGATACTAAGA"; // seq_arr[0] = "GAGAGCTGTTCCCGTGTTTTCCATGGTTT"; @@ -606,15 +804,25 @@ int main_fmtidx(int argc, char **argv) fmt_search(fmt, seq_arr[i]); time_fmt_search = realtime() - time_fmt_search; cout << "[time fmt search:] " << time_fmt_search << "s" << endl; + + double time_fmt_kmer_search = realtime(); + for (int i = 0; i < (int)seq_arr.size(); ++i) + fmt_search_use_kmer(fmt, seq_arr[i]); + time_fmt_kmer_search = realtime() - time_fmt_kmer_search; + cout << "[time fmt kmer search:] " << time_fmt_kmer_search << "s" << endl; + + cout << "t1: " << t1 << "s; t2: " << t2 << "s" << endl; + #endif // 对比bwt和fmt搜索的结果 #if CMP_FMT_BWT_RESULT - double time_cmp = realtime(); + double time_cmp = realtime(); for (int i = 0; i < (int)seq_arr.size() / 100; ++i) { bwtintv_t p1 = bwt_search2(bwt, seq_arr[i]); - bwtintv_t p2 = fmt_search(fmt, seq_arr[i]); + // bwtintv_t p2 = fmt_search(fmt, seq_arr[i]); + bwtintv_t p2 = fmt_search_use_kmer(fmt, seq_arr[i]); if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2]) cout << seq_arr[i] << endl << p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl @@ -625,4 +833,4 @@ int main_fmtidx(int argc, char **argv) #endif return 0; -} \ No newline at end of file +} diff --git a/fmt_index.h b/fmt_index.h index 8d525a2..68cfceb 100644 --- a/fmt_index.h +++ b/fmt_index.h @@ -5,7 +5,7 @@ #define FMT_OCC_INTERVAL (1LL << FMT_OCC_INTV_SHIFT) #define FMT_OCC_INTV_MASK (FMT_OCC_INTERVAL - 1) -#define FMT_MID_INTV_SHIFT 5 +#define FMT_MID_INTV_SHIFT 4 #define FMT_MID_INTERVAL (1LL << FMT_MID_INTV_SHIFT) #define FMT_MID_INTV_MASK (FMT_MID_INTERVAL - 1) @@ -38,6 +38,15 @@ #define __fmt_mid_sum(x) \ ((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff) +#define KMER_LEN 10 +#define KMER_ARR_SIZE ((1 << (KMER_LEN << 1))) +// 用来保存kmer对应的fmt的位置信息 +struct KmerEntry +{ + // 40+40+32 14个byte,这样好处理 + uint8_t intv_arr[14 * KMER_LEN]; // 保存kmer中每扩展一个碱基对应的bwtintv_t数据 +}; + // fm-index, extend twice in one search step (one memory access) struct FMTIndex { @@ -52,6 +61,8 @@ struct FMTIndex uint8_t sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15 uint8_t first_base; // 序列的第一个碱基2bit的int类型,0,1,2,3 uint8_t last_base; // dollar转换成的base + // 保存kmer对应的fmt位置信息 + KmerEntry *kmer_entry; // suffix array int sa_intv; bwtint_t n_sa; diff --git a/util.cpp b/util.cpp index 3be7894..637fa42 100644 --- a/util.cpp +++ b/util.cpp @@ -115,6 +115,17 @@ size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) return ret; } +// 读取单个数据 +size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream) +{ + size_t ret = fread(ptr, size, nmemb, stream); + if (ret != nmemb) + { + _err_fatal_simple("fread", ferror(stream) ? strerror(errno) : "Unexpected end of file"); + } + return ret; +} + // 刷新文件流 int err_fflush(FILE *stream) { diff --git a/util.h b/util.h index 6d061c2..aa608d6 100644 --- a/util.h +++ b/util.h @@ -23,6 +23,8 @@ struct bwtintv_t bwtint_t x[3], info; // x[0]表示正链位置,x[1]表示互补链位置,x[2]表示间隔长度,info 表示read的起始结束位置 }; +// 读取单个数据 +size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream); // 读取二进制数据 bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a); // 给出问题信息并终止程序