From b0cfa3630ea5c36c6d1e59259c7fa6b2ce185aec Mon Sep 17 00:00:00 2001 From: zzh Date: Tue, 13 Feb 2024 11:39:25 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=BA=86kmer=5Fbit=EF=BC=8C?= =?UTF-8?q?=E7=94=A8=E6=9D=A5=E6=A3=80=E6=B5=8B=E7=89=B9=E5=AE=9A=E9=95=BF?= =?UTF-8?q?=E5=BA=A6=E7=9A=84kmer=E6=9C=89=E6=B2=A1=E6=9C=89=E5=AF=B9?= =?UTF-8?q?=E5=BA=94=E7=9A=84fm-index=E5=8C=B9=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .vscode/launch.json | 2 +- bwt.h | 4 + fmt_index.cpp | 415 ++++++++++++++++++++++++++++++++++++++++---- fmt_index.h | 22 +++ 4 files changed, 408 insertions(+), 35 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 22ae4d4..af194e2 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -8,7 +8,7 @@ "name": "fmtidx", "type": "cppdbg", "request": "launch", - "program": "${workspaceRoot}/fmtidx", + "program": "${workspaceRoot}/bwa_perf", "args": [ "/home/zzh/reference/human_g1k_v37_decoy.fasta" ], diff --git a/bwt.h b/bwt.h index 5a40b25..6034225 100644 --- a/bwt.h +++ b/bwt.h @@ -61,4 +61,8 @@ bwtintv_t bwt_search(bwt_t *bwt, const string &q); // 每次扩展两步 bwtintv_t bwt_search2(bwt_t *bwt, const string &q); +void bwt_extend2(const bwt_t *bwt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, int c1, int c2); + +void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back); + #endif \ No newline at end of file diff --git a/fmt_index.cpp b/fmt_index.cpp index caeaf8e..778e37d 100644 --- a/fmt_index.cpp +++ b/fmt_index.cpp @@ -195,6 +195,46 @@ FullEntry *restore_full_entry_idx(const char *fn) err_fclose(fp); return full_entry; } + +// 将full kmer hash数据写入到文件 +void dump_xmer_idx(const char *fn, const XmerHash *xh) +{ + FILE *fp; + fp = xopen(fn, "wb"); + err_fwrite(xh->xe10, 1, (1 << (10 << 1)) * sizeof(XmerEntryArr), fp); + err_fwrite(xh->xe11, 1, (1 << (11 << 1)) * sizeof(XmerEntry), fp); + err_fwrite(xh->xe12, 1, (1 << (12 << 1)) * sizeof(XmerEntry), fp); + err_fwrite(xh->xe13, 1, (1 << (13 << 1)) * sizeof(XmerEntry), fp); + err_fwrite(xh->xe14, 1, (1 << (14 << 1)) * sizeof(XmerEntry), fp); + err_fflush(fp); + err_fclose(fp); +} + +// 从文件中读取xmer hash信息 +XmerHash restore_xmer_idx(const char *fn) +{ + FILE *fp = xopen(fn, "rb"); + XmerHash xhash; + XmerHash *xh = &xhash; + int len = 1 << (10 << 1); + xh->xe10 = (XmerEntryArr *)malloc(len * sizeof(XmerEntryArr)); + fread_fix(fp, len * sizeof(XmerEntryArr), xh->xe10); + len = 1 << (11 << 1); + xh->xe11 = (XmerEntry *)malloc(len * sizeof(XmerEntry)); + fread_fix(fp, len * sizeof(XmerEntry), xh->xe11); + len = 1 << (12 << 1); + xh->xe12 = (XmerEntry *)malloc(len * sizeof(XmerEntry)); + fread_fix(fp, len * sizeof(XmerEntry), xh->xe12); + len = 1 << (13 << 1); + xh->xe13 = (XmerEntry *)malloc(len * sizeof(XmerEntry)); + fread_fix(fp, len * sizeof(XmerEntry), xh->xe13); + len = 1 << (14 << 1); + xh->xe14 = (XmerEntry *)malloc(len * sizeof(XmerEntry)); + fread_fix(fp, len * sizeof(XmerEntry), xh->xe14); + err_fclose(fp); + return xhash; +} + // 从文件中读取fmt结构数据 FMTIndex *restore_fmt(const char *fn) { @@ -398,7 +438,7 @@ FMTIndex *create_fmt_from_bwt(bwt_t *bwt) } // 扩展两个个碱基,计算bwt base为b的pre-bwt str中各个碱基的occ -void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4]) +inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4]) { uint32_t x = 0; uint32_t *p, *q, tmp; @@ -694,9 +734,8 @@ bwtintv_t fmt_search_use_kmer(FMTIndex *fmt, const string &q) } kmer_getval_at(&fmt->kmer_entry[qbit], &ik, x - 1); //t1 += realtime() - tt; - - //tt = realtime(); - // 每次扩展两个碱基 + // tt = realtime(); + // 每次扩展两个碱基 for (i = x; i + 1 < qlen; i += 2) { if (bval(q[i]) < 4 && bval(q[i + 1]) < 4) @@ -726,10 +765,10 @@ bwtintv_t fmt_search_use_kmer(FMTIndex *fmt, const string &q) } // 利用fmt搜索seed,利用full entry加速搜索过程 -bwtintv_t fmt_search_use_full_entry(FMTIndex *fmt, const string &q) +bwtintv_t fmt_search_use_full_entry(FMTIndex *fmt, bwt_t *bwt, const string &q) { bwtintv_t ik; - bwtintv_t ok; + int i, c1, c2, x = 0; int qlen = (int)q.size(); @@ -742,6 +781,244 @@ bwtintv_t fmt_search_use_full_entry(FMTIndex *fmt, const string &q) } full_entry_get(&fmt->full_entry[qbit], &ik); +#if 0 + bwtintv_t ok; + // 每次扩展两个碱基 + for (i = x; i + 1 < qlen; i += 2) + { + if (bval(q[i]) < 4 && bval(q[i + 1]) < 4) + { + c1 = cbval(q[i]); + c2 = cbval(q[i + 1]); + fmt_extend2(fmt, &ik, &ok, 0, c1, c2); + ik = ok; + ik.info = i + 1; + } + else + { + break; + } + } + if (i < qlen && bval(q[i]) < 4) + { // 最后一次扩展 + c1 = cbval(q[i]); + fmt_extend1(fmt, &ik, &ok, 0, c1); + ik = ok; + ik.info = i + 1; + } +#else + bwtintv_t ok[4]; + for (i = x; i < qlen; ++i) + { + if (bval(q[i]) < 4) + { + c1 = cbval(q[i]); + bwt_extend(bwt, &ik, ok, 0); + ik = ok[c1]; + ik.info = i + 1; + } + } + +#endif + + return ik; +} + +// 将用字符串表示的序列计算成用2bit表示的序列 +uint32_t str2bit(string &str) +{ + uint32_t pac = 0; + for (int i = 0; i < 16; ++i) + pac = (pac << 2) | bval(str[i]); + return pac; +} + +// 生成所有KMER_LEN长度的序列,字符串表示 +void gen_all_seq(vector &vseq, int kmer_len) +{ + uint64_t seq_up_val = (1 << (kmer_len << 1)); + vseq.clear(); + vseq.resize(seq_up_val); + for (uint64_t i = 0; i < seq_up_val; ++i) + { + for (int j = kmer_len - 1; j >= 0; --j) + { + vseq[i].push_back(BASE[(i >> (j << 1)) & 3]); + } + } +} + +// 获取xmer的fmt匹配信息 +inline void xmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos = 0) +{ + bwtint_t x0, x1, x2; + int byte_idx = pos * 14; + uint8_t *arr = mem_addr + byte_idx; + x0 = *arr; + x0 = (x0 << 32) | *((uint32_t *)(arr + 1)); + arr += 5; + x1 = *arr; + x1 = (x1 << 32) | *((uint32_t *)(arr + 1)); + arr += 5; + x2 = *((uint32_t *)arr); + ok->x[0] = x0; + ok->x[1] = x1; + ok->x[2] = x2; +} + +// 设置xmer第pos个碱基对应的fmt匹配信息 +inline void xmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos = 0) +{ + int byte_idx = pos * 14; + uint8_t *arr = mem_addr + byte_idx; + arr[0] = (uint8_t)(ik.x[0] >> 32); + *((uint32_t *)(arr + 1)) = (uint32_t)ik.x[0]; + arr += 5; + arr[0] = (uint8_t)(ik.x[1] >> 32); + *((uint32_t *)(arr + 1)) = (uint32_t)ik.x[1]; + arr += 5; + *((uint32_t *)arr) = (uint32_t)ik.x[2]; +} + +// 创建xmer hash索引 +void create_xmer_index(FMTIndex *fmt) +{ + vector qarr; + gen_all_seq(qarr, 10); + bwtintv_t ik; + int i, j, c1, c2; + int qlen = 10; + bwtint_t tk[4], tl[4]; + XmerHash *xh = &fmt->xmer_hash; + + cout << "qlen: " << qlen << endl; + + xh->xe10 = (XmerEntryArr *)malloc((1 << (10 << 1)) * sizeof(XmerEntryArr)); + xh->xe11 = (XmerEntry *)malloc((1 << (11 << 1)) * sizeof(XmerEntry)); + xh->xe12 = (XmerEntry *)malloc((1 << (12 << 1)) * sizeof(XmerEntry)); + xh->xe13 = (XmerEntry *)malloc((1 << (13 << 1)) * sizeof(XmerEntry)); + xh->xe14 = (XmerEntry *)malloc((1 << (14 << 1)) * sizeof(XmerEntry)); + + // 长度为10的kmer + for (j = 0; j < (int)qarr.size(); ++j) + { + string &q = qarr[j]; + uint8_t *mem_addr = xh->xe10[j].intv_arr; + fmt_set_intv(fmt, bval(q[0]), ik); + xmer_setval_at(mem_addr, ik, 0); + + // 每次扩展两个碱基 + for (i = 1; i + 1 < qlen; i += 2) + { + c1 = cbval(q[i]); + c2 = cbval(q[i + 1]); + + fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk); + fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl); + // 第一次扩展的结果 + ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0]; + ik.x[1] = fmt->L2[c1] + 1 + tk[1]; + ik.x[2] = tl[1] - tk[1]; + xmer_setval_at(mem_addr, ik, i); + + // 第二次扩展的结果 + ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[2] - tk[2]; + ik.x[1] = fmt->L2[c2] + 1 + tk[3]; + ik.x[2] = tl[3] - tk[3]; + xmer_setval_at(mem_addr, ik, i + 1); + } + { // 最后一次扩展 + c1 = cbval(q[i]); + c2 = 3; + fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk); + fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl); + // 第一次扩展的结果 + ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0]; + ik.x[1] = fmt->L2[c1] + 1 + tk[1]; + ik.x[2] = tl[1] - tk[1]; + xmer_setval_at(mem_addr, ik, i); + } + } + + // 长度11的kmer + gen_all_seq(qarr, 11); + for (j = 0; j < (int)qarr.size(); ++j) + { + bwtintv_t p = fmt_search(fmt, qarr[j]); + xmer_setval_at(fmt->xmer_hash.xe11[j].intv_arr, p); + } + + // 长度12的kmer + gen_all_seq(qarr, 12); + for (j = 0; j < (int)qarr.size(); ++j) + { + bwtintv_t p = fmt_search(fmt, qarr[j]); + xmer_setval_at(fmt->xmer_hash.xe12[j].intv_arr, p); + } + //string q = "CACATATTGGCG"; + //uint32_t qbit = 0; + //for (i = 0; i < 12; ++i) + //{ + // qbit |= bval(q[i]) << ((11 - i) << 1); + //} + //cout << "qbit: " << qbit << endl; + //xmer_getval_at(fmt->xmer_hash.xe12[qbit].intv_arr, &ik); + + // 长度13的kmer + gen_all_seq(qarr, 13); + for (j = 0; j < (int)qarr.size(); ++j) + { + bwtintv_t p = fmt_search(fmt, qarr[j]); + xmer_setval_at(fmt->xmer_hash.xe13[j].intv_arr, p); + } + + // 长度14的kmer + gen_all_seq(qarr, 14); + for (j = 0; j < (int)qarr.size(); ++j) + { + bwtintv_t p = fmt_search(fmt, qarr[j]); + xmer_setval_at(fmt->xmer_hash.xe14[j].intv_arr, p); + } +} + +// 利用fmt搜索seed,利用xmer加速搜索过程 +bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, const string &q) +{ + bwtintv_t ik; + bwtintv_t ok; + int i, c1, c2, x = 0; + int qlen = (int)q.size(); + + int max_xmer_len = 14; + // 先利用kmer进行索引查询 + uint32_t qbit = 0; + x = min(max_xmer_len, qlen); + for (i = 0; i < x; ++i) + { + qbit |= bval(q[i]) << ((max_xmer_len - 1 - i) << 1); + } + if (x == 14) + { + xmer_getval_at(fmt->xmer_hash.xe14[qbit].intv_arr, &ik); + } + else if (x == 13) + { + xmer_getval_at(fmt->xmer_hash.xe13[qbit >> 2].intv_arr, &ik); + } + else if (x == 12) + { + // cout << "qbit: " << qbit << endl; + xmer_getval_at(fmt->xmer_hash.xe12[qbit >> 4].intv_arr, &ik); + } + else if (x == 11) + { + xmer_getval_at(fmt->xmer_hash.xe11[qbit >> 6].intv_arr, &ik); + } + else + { + xmer_getval_at(fmt->xmer_hash.xe10[qbit >> 8].intv_arr, &ik, x - 1); + } + // 每次扩展两个碱基 for (i = x; i + 1 < qlen; i += 2) { @@ -768,43 +1045,70 @@ bwtintv_t fmt_search_use_full_entry(FMTIndex *fmt, const string &q) return ik; } -// 将用字符串表示的序列计算成用2bit表示的序列 -uint32_t str2bit(string &str) +uint16_t *restore_kmer_bit(const char *fn, int kmer_len) { - uint32_t pac = 0; - for (int i = 0; i < 16; ++i) - pac = (pac << 2) | bval(str[i]); - return pac; + uint16_t *kbit = (uint16_t *)calloc(1L << ((kmer_len << 1) - 4), sizeof(uint16_t)); + FILE *fp; + fp = xopen(fn, "rb"); + fread_fix(fp, (1 << ((kmer_len << 1) - 4)) * sizeof(uint16_t), kbit); + err_fclose(fp); + return kbit; } -// 生成所有KMER_LEN长度的序列,字符串表示 -void gen_all_seq(vector &vseq, int kmer_len) +void create_kmer_bit(FMTIndex *fmt, string &prefix, uint64_t kmer_len) { - uint32_t seq_up_val = (1 << (kmer_len << 1)); - vseq.clear(); - vseq.resize(seq_up_val); - for (uint32_t i = 0; i < seq_up_val; ++i) + uint16_t *kbit = (uint16_t *)calloc(1L << ((kmer_len << 1) - 4), sizeof(uint16_t)); + ostringstream oss; + oss << '.' << kmer_len << ".kmer.bit"; + string kmer_bit = prefix + oss.str(); + cout << kmer_bit << endl; + uint64_t seq_up_val = (1L << (kmer_len << 1)); + cout << (1L << ((kmer_len << 1) - 4)) << '\t' << seq_up_val << endl; + string seq; + seq.resize(kmer_len); + int percent = 1; + double tt = realtime(); + for (uint64_t i = 0; i < seq_up_val; ++i) { + if ((i+1) % (seq_up_val / 100) == 0) + { + tt = realtime() - tt; + cout << percent++ << "\% finished, " << tt << "s elapsed." << endl; + tt = realtime(); + } for (int j = kmer_len - 1; j >= 0; --j) { - vseq[i].push_back(BASE[(i >> (j << 1)) & 3]); + seq[kmer_len - 1 - j] = BASE[(i >> (j << 1)) & 3]; + } + uint32_t pos = i >> 4; + uint32_t bit_pos = i & 0xf; + bwtintv_t p = fmt_search_use_xmer(fmt, seq); + if (p.x[2] > 0) { + kbit[pos] |= 1 << bit_pos; } } + FILE *fp; + fp = xopen(kmer_bit.c_str(), "wb"); + err_fwrite(kbit, 2, 1L << ((kmer_len << 1) - 4), fp); + err_fflush(fp); + err_fclose(fp); } #define GEN_BWT_IDX 0 #define GEN_FMT_IDX 0 #define GEN_KMER_IDX 0 +#define GEN_XMER_IDX 0 #define GEN_FULL_ENTRY_IDX 0 -#define CMP_FMT_BWT_TIME 1 +#define CMP_FMT_BWT_TIME 0 #define CMP_FMT_BWT_RESULT 1 // argv[1] 应该是索引的前缀 int main_fmtidx(int argc, char **argv) { + //cout << "size: " << sizeof(FullEntry) << '\t' << sizeof(XmerEntryArr) << endl; string prefix = argv[1]; string bwt_str = prefix + ".bwt.str"; - string bwt_idx, fmt_idx, kmer_idx, full_entry_idx; + string bwt_idx, fmt_idx, kmer_idx, full_entry_idx, xmer_idx; bwt_t *bwt; FMTIndex *fmt; ostringstream oss_bwt, oss_fmt, oss_kmer, oss_full_entry; @@ -820,11 +1124,14 @@ int main_fmtidx(int argc, char **argv) fmt_idx = prefix + oss_fmt.str() + ".fmt"; kmer_idx = prefix + oss_kmer.str() + ".kmer"; full_entry_idx = prefix + oss_full_entry.str() + ".full_entry"; + xmer_idx = prefix + ".14.xmer"; cout << bwt_idx << endl; cout << fmt_idx << endl; cout << kmer_idx << endl; cout << full_entry_idx << endl; + cout << xmer_idx << endl; + // 生成或读取bwt索引文件 double time_read_bwt = realtime(); @@ -896,35 +1203,53 @@ int main_fmtidx(int argc, char **argv) time_gen_full_entry = realtime() - time_gen_full_entry; cout << "[time gen full kmer entry:] " << time_gen_full_entry << "s" << endl; +// 生成xmer索引文件 + double time_gen_xmer_hash = realtime(); +#if GEN_XMER_IDX + create_xmer_index(fmt); + dump_xmer_idx(xmer_idx.c_str(), &fmt->xmer_hash); +#else + fmt->xmer_hash = restore_xmer_idx(xmer_idx.c_str()); +#endif + time_gen_xmer_hash = realtime() - time_gen_xmer_hash; + cout << "[time gen xkmer hash:] " << time_gen_xmer_hash << "s" << endl; + + int kmer_bit_len = 18; + //create_kmer_bit(fmt, prefix, kmer_bit_len); + // return 0; + // 生成随机序列进行测试 int seq_size = 10000000; //int seq_size = 1; - int seq_len = 14; + int seq_len = kmer_bit_len; vector seq_arr(seq_size); - // seq_arr[0] = "GCGATACTAAGA"; + seq_arr[0] = "CACATATTGGCG"; // seq_arr[0] = "GAGAGCTGTTCCCGTGTTTTCCATGGTTT"; - seq_arr[0] = "ACCTTTCGAATTGA"; + // seq_arr[0] = "ACCTTTCGAATTGA"; #if 1 srand(time(NULL)); double time_gen_seq = realtime(); for (int i = 0; i < (int)seq_arr.size(); ++i) seq_arr[i] = generate_rand_seq(seq_len); time_gen_seq = realtime() - time_gen_seq; - cout << "[time gen seq:] " << time_gen_seq << "s" << endl; + cout << "[time gen seq:] " << time_gen_seq << "s" << endl; #endif // 对比bwt和fmt搜索的时间 - bwtint_t max_gap = 0; + #if CMP_FMT_BWT_TIME - // int nintv_0 = 0; + int nintv_0 = 0; + //bwtint_t max_gap = 0; double time_bwt_search = realtime(); for (int i = 0; i < (int)seq_arr.size(); ++i) { bwtintv_t p = bwt_search(bwt, seq_arr[i]); - if (max_gap < p.x[2]) - max_gap = p.x[2]; + //if (max_gap < p.x[2]) + // max_gap = p.x[2]; + if (p.x[2] == 0) + ++nintv_0; } - // cout << "all: " << seq_arr.size() << " zero: " << nintv_0 << endl; - cout << "max gap: " << max_gap << endl; + cout << "all: " << seq_arr.size() << " zero: " << nintv_0 << endl; + //cout << "max gap: " << max_gap << endl; time_bwt_search = realtime() - time_bwt_search; cout << "[time bwt search:] " << time_bwt_search << "s" << endl; @@ -942,15 +1267,23 @@ int main_fmtidx(int argc, char **argv) double time_fmt_full_entry_search = realtime(); for (int i = 0; i < (int)seq_arr.size(); ++i) - fmt_search_use_full_entry(fmt, seq_arr[i]); + fmt_search_use_full_entry(fmt, bwt, seq_arr[i]); time_fmt_full_entry_search = realtime() - time_fmt_full_entry_search; cout << "[time fmt full entry search:] " << time_fmt_full_entry_search << "s" << endl; + double time_xmer_search = realtime(); + for (int i = 0; i < (int)seq_arr.size(); ++i) + fmt_search_use_xmer(fmt, seq_arr[i]); + time_xmer_search = realtime() - time_xmer_search; + cout << "[time xmer search:] " << time_xmer_search << "s" << endl; +#endif // cout << "t1: " << t1 << "s; t2: " << t2 << "s" << endl; // cout << "t3: " << t3 << "s; t4: " << t4 << "s" << endl; // cout << "t5: " << t5 << "s; t4: " << t6 << "s" << endl; -#endif + ostringstream oss_kmer_bit; + oss_kmer_bit << prefix << "." << kmer_bit_len << ".kmer.bit"; + uint16_t *kbit = restore_kmer_bit(oss_kmer_bit.str().c_str(), kmer_bit_len); // 对比bwt和fmt搜索的结果 #if CMP_FMT_BWT_RESULT @@ -959,11 +1292,25 @@ int main_fmtidx(int argc, char **argv) { bwtintv_t p1 = bwt_search2(bwt, seq_arr[i]); // bwtintv_t p2 = fmt_search(fmt, seq_arr[i]); - bwtintv_t p2 = fmt_search_use_kmer(fmt, seq_arr[i]); + // bwtintv_t p2 = fmt_search_use_kmer(fmt, seq_arr[i]); + bwtintv_t p2 = fmt_search_use_xmer(fmt, seq_arr[i]); + // bwtintv_t p2 = fmt_search_use_full_entry(fmt, seq_arr[i]); if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2]) cout << seq_arr[i] << endl << p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl << p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl; + + uint64_t qbit = 0; + for (int k = 0; k < kmer_bit_len; ++k) + { + uint64_t base = bval(seq_arr[i][k]); + qbit |= base << ((kmer_bit_len - 1 - k) << 1); + } + uint32_t pos = qbit >> 4; + uint32_t bit_pos = qbit & 0xf; + //cout << pos << '\t' << bit_pos << '\t' << kbit[pos] << '\t' << (kbit[pos] & (1 << bit_pos)) << endl; + if ((p2.x[2] >0) != ((kbit[pos] & (1 << bit_pos)) > 0)) + cout << "diff: " << seq_arr[i] << '\t' << p2.x[2] << '\t' << (kbit[pos] & (1 << bit_pos)) << endl; } time_cmp = realtime() - time_cmp; cout << "[time compare bwt & fmt:] " << time_cmp << "s" << endl; diff --git a/fmt_index.h b/fmt_index.h index f002302..cd7a2e3 100644 --- a/fmt_index.h +++ b/fmt_index.h @@ -55,6 +55,27 @@ struct FullEntry uint8_t intv_arr[14]; }; + +// x-mer entry, 1 - 14个碱基组成的hash key +struct XmerEntry +{ + uint8_t intv_arr[14]; +}; + +struct XmerEntryArr +{ + uint8_t intv_arr[140]; +}; + +struct XmerHash +{ + XmerEntryArr *xe10; + XmerEntry *xe11; + XmerEntry *xe12; + XmerEntry *xe13; + XmerEntry *xe14; +}; + // fm-index, extend twice in one search step (one memory access) struct FMTIndex { @@ -72,6 +93,7 @@ struct FMTIndex // 保存kmer对应的fmt位置信息 KmerEntry *kmer_entry; FullEntry *full_entry; + XmerHash xmer_hash; // suffix array int sa_intv; bwtint_t n_sa;