From 46822bef2c2cf3d0aa40e4a924a72a817ed1417e Mon Sep 17 00:00:00 2001 From: zzh Date: Sat, 3 Feb 2024 15:54:52 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=9E=E7=8E=B0=E4=BA=86kmer=20index?= =?UTF-8?q?=EF=BC=8C=E6=9C=89=E6=95=88=E6=9E=9C=EF=BC=8C=E4=BD=86=E6=98=AF?= =?UTF-8?q?=E9=9A=8F=E7=9D=80=E5=BA=8F=E5=88=97=E6=89=A9=E5=B1=95=E5=88=B0?= =?UTF-8?q?=E5=90=8E=E8=BE=B9=EF=BC=8C=E6=89=A9=E5=B1=95=E7=9A=84=E8=80=97?= =?UTF-8?q?=E6=97=B6=E8=B6=8A=E6=9D=A5=E8=B6=8A=E9=95=BF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bwt.cpp | 17 +++++++++++++++++ fmt_index.cpp | 32 +++++++++++++++++++++++++++----- fmt_index.h | 4 ++-- 3 files changed, 46 insertions(+), 7 deletions(-) diff --git a/bwt.cpp b/bwt.cpp index 942d73f..939d53d 100644 --- a/bwt.cpp +++ b/bwt.cpp @@ -251,9 +251,25 @@ bwtintv_t bwt_search(bwt_t *bwt, const string &q) bwtintv_t ik, ok[4]; int i, c, x = 0; + //double tt = realtime(); + bwt_set_intv(bwt, bval(q[x]), ik); ik.info = x + 1; + for (i = x + 1; i < 12; ++i) + { + if (bval(q[i]) < 4) + { + c = cbval(q[i]); + bwt_extend(bwt, &ik, ok, 0); + ik = ok[c]; + ik.info = i + 1; + } + } + //t5 += realtime() - tt; + x = 11; + + //tt = realtime(); for (i = x + 1; i < (int)q.size(); ++i) { if (bval(q[i]) < 4) @@ -264,6 +280,7 @@ bwtintv_t bwt_search(bwt_t *bwt, const string &q) ik.info = i + 1; } } + //t6 += realtime() - tt; return ik; } diff --git a/fmt_index.cpp b/fmt_index.cpp index c90ae37..2606d65 100644 --- a/fmt_index.cpp +++ b/fmt_index.cpp @@ -514,9 +514,28 @@ bwtintv_t fmt_search(FMTIndex *fmt, const string &q) int i, c1, c2, x = 0; int qlen = (int)q.size(); + //double tt = realtime(); fmt_set_intv(fmt, bval(q[x]), ik); ik.info = x + 1; + for (i = x + 1; i + 1 < 12; i += 2) + { + if (bval(q[i]) < 4 && bval(q[i + 1]) < 4) + { + c1 = cbval(q[i]); + c2 = cbval(q[i + 1]); + fmt_extend2(fmt, &ik, &ok, 0, c1, c2); + ik = ok; + ik.info = i + 1; + } + else + { + break; + } + } + x = 11; + //t3 += realtime() - tt; + //tt = realtime(); // 每次扩展两个碱基 for (i = x + 1; i + 1 < qlen; i += 2) { @@ -540,6 +559,7 @@ bwtintv_t fmt_search(FMTIndex *fmt, const string &q) ik = ok; ik.info = i + 1; } + //t4 += realtime() - tt; return ik; } @@ -638,15 +658,15 @@ bwtintv_t fmt_search_use_kmer(FMTIndex *fmt, const string &q) // 先利用kmer进行索引查询 uint32_t qbit = 0; x = min(KMER_LEN, qlen); - double tt = realtime(); + //double tt = realtime(); for (i = 0; i < x; ++i) { qbit |= bval(q[i]) << ((KMER_LEN - 1 - i) << 1); } kmer_getval_at(&fmt->kmer_entry[qbit], &ik, x - 1); - t1 += realtime() - tt; + //t1 += realtime() - tt; - tt = realtime(); + //tt = realtime(); // 每次扩展两个碱基 for (i = x; i + 1 < qlen; i += 2) { @@ -670,7 +690,7 @@ bwtintv_t fmt_search_use_kmer(FMTIndex *fmt, const string &q) ik = ok; ik.info = i + 1; } - t2 += realtime() - tt; + //t2 += realtime() - tt; return ik; } @@ -778,7 +798,7 @@ int main_fmtidx(int argc, char **argv) // 生成随机序列进行测试 int seq_size = 10000000; // int seq_size = 1; - int seq_len = 19; + int seq_len = 23; vector seq_arr(seq_size); // seq_arr[0] = "GCGATACTAAGA"; // seq_arr[0] = "GAGAGCTGTTCCCGTGTTTTCCATGGTTT"; @@ -812,6 +832,8 @@ int main_fmtidx(int argc, char **argv) cout << "[time fmt kmer search:] " << time_fmt_kmer_search << "s" << endl; cout << "t1: " << t1 << "s; t2: " << t2 << "s" << endl; + cout << "t3: " << t3 << "s; t4: " << t4 << "s" << endl; + cout << "t5: " << t5 << "s; t4: " << t6 << "s" << endl; #endif diff --git a/fmt_index.h b/fmt_index.h index 68cfceb..07114c7 100644 --- a/fmt_index.h +++ b/fmt_index.h @@ -5,7 +5,7 @@ #define FMT_OCC_INTERVAL (1LL << FMT_OCC_INTV_SHIFT) #define FMT_OCC_INTV_MASK (FMT_OCC_INTERVAL - 1) -#define FMT_MID_INTV_SHIFT 4 +#define FMT_MID_INTV_SHIFT 5 #define FMT_MID_INTERVAL (1LL << FMT_MID_INTV_SHIFT) #define FMT_MID_INTV_MASK (FMT_MID_INTERVAL - 1) @@ -38,7 +38,7 @@ #define __fmt_mid_sum(x) \ ((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff) -#define KMER_LEN 10 +#define KMER_LEN 12 #define KMER_ARR_SIZE ((1 << (KMER_LEN << 1))) // 用来保存kmer对应的fmt的位置信息 struct KmerEntry