From 98ae5bb7790df893a294872874e763f77eb6d50f Mon Sep 17 00:00:00 2001 From: zzh Date: Mon, 1 Apr 2024 21:32:48 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B6=88=E8=9E=8D=E5=AE=9E=E9=AA=8C=E6=B5=8B?= =?UTF-8?q?=E8=AF=95=E7=89=88=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bwt.cpp | 22 ++ bwt.h | 7 +- fmt_index.cpp | 590 ++++++++++++++++++++++++++------------------------ fmt_index.h | 28 +-- util.h | 10 + 5 files changed, 348 insertions(+), 309 deletions(-) diff --git a/bwt.cpp b/bwt.cpp index 942d73f..093acf1 100644 --- a/bwt.cpp +++ b/bwt.cpp @@ -227,6 +227,28 @@ void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtin } } +// 设置某一行的排序值,sa的索引有效值从1开始,(0设置为-1, 小端模式) +void bwt_set_sa(uint8_t *sa_arr, bwtint_t k, bwtint_t val) +{ + const bwtint_t block_idx = (k >> 3) * 33; // 8个数为一组,共享33个字节 + const int val_idx_in_block = k & 7; + const bwtint_t start_byte_idx = block_idx + (val_idx_in_block << 2); + bwtint_t *sa_addr = (bwtint_t *)(sa_arr + start_byte_idx); + // *sa_addr &= (1 << val_idx_in_block) - 1; // 如果开辟内存的时候清零了,这一步可以省略,会清除后面的数据,只适合按递增顺序赋值 + *sa_addr |= (val & ((1L << 33) - 1)) << val_idx_in_block; +} + +// 获取某一行的排序值(小端模式) +bwtint_t bwt_get_sa(uint8_t *sa_arr, bwtint_t k) +{ + const bwtint_t block_idx = (k >> 3) * 33; // 8个数为一组,共享33个字节 + const int val_idx_in_block = k & 7; + const bwtint_t start_byte_idx = block_idx + (val_idx_in_block << 2); + bwtint_t val = *(bwtint_t *)(sa_arr + start_byte_idx); + val = (val >> val_idx_in_block) & 8589934591; + return val; +} + void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back) { bwtint_t tk[4], tl[4]; diff --git a/bwt.h b/bwt.h index 6034225..aeee31a 100644 --- a/bwt.h +++ b/bwt.h @@ -7,7 +7,7 @@ using std::string; // occ间隔对应的右移位数,5表示间隔32行保存一次 -#define OCC_INTV_SHIFT 6 +#define OCC_INTV_SHIFT 7 #define OCC_INTERVAL (1LL << OCC_INTV_SHIFT) #define OCC_INTV_MASK (OCC_INTERVAL - 1) @@ -47,6 +47,9 @@ struct bwt_t uint8_t *sa; }; +void bwt_set_sa(uint8_t *sa_arr, bwtint_t k, bwtint_t val); +bwtint_t bwt_get_sa(uint8_t *sa_arr, bwtint_t k); + // k行(包含,bwt mtx行)之前,碱基c的累计总数, interval大于等于32才能正确计算 bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, uint8_t c); // 统计k行(bwt mtx行,包含k行本身)之前4种碱基累积数量,这里的k是bwt矩阵里的行,比bwt字符串多1 @@ -65,4 +68,4 @@ void bwt_extend2(const bwt_t *bwt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back); -#endif \ No newline at end of file +#endif diff --git a/fmt_index.cpp b/fmt_index.cpp index 36f619d..190c21d 100644 --- a/fmt_index.cpp +++ b/fmt_index.cpp @@ -9,11 +9,14 @@ #include #include #include +#include #include "util.h" #include "bwt.h" #include "fmt_index.h" using namespace std; +using std::cout; +using std::ifstream; double t1 = 0, t2 = 0, t3 = 0, t4 = 0, t5 = 0, t6 = 0, t7 = 0, t8 = 0, t9 = 0, t10 = 0, t11 = 0, t12 = 0, t13 = 0, t14 = 0; @@ -155,48 +158,6 @@ void dump_fmt(const char *fn, const FMTIndex *fmt) } // 将kmer hash数据写入到文件 -void dump_kmer_idx(const char *fn, const KmerEntry *kmer_entry) -{ - FILE *fp; - fp = xopen(fn, "wb"); - err_fwrite(kmer_entry, 1, KMER_ARR_SIZE * sizeof(KmerEntry), fp); - err_fflush(fp); - err_fclose(fp); -} - -// 从文件中读取kmer hash信息 -KmerEntry *restore_kmer_idx(const char *fn) -{ - FILE *fp = xopen(fn, "rb"); - uint32_t kmer_bytes = KMER_ARR_SIZE * sizeof(KmerEntry); - KmerEntry *kmer_entry = (KmerEntry *)malloc(kmer_bytes); - fread_fix(fp, kmer_bytes, kmer_entry); - err_fclose(fp); - return kmer_entry; -} - -// 将full kmer hash数据写入到文件 -void dump_full_entry_idx(const char *fn, const FullEntry *full_entry) -{ - FILE *fp; - fp = xopen(fn, "wb"); - err_fwrite(full_entry, 1, FULL_ENTRY_ARR_SIZE * sizeof(FullEntry), fp); - err_fflush(fp); - err_fclose(fp); -} - -// 从文件中读取full kmer hash信息 -FullEntry *restore_full_entry_idx(const char *fn) -{ - FILE *fp = xopen(fn, "rb"); - uint32_t full_entry_bytes = FULL_ENTRY_ARR_SIZE * sizeof(FullEntry); - FullEntry *full_entry = (FullEntry *)malloc(full_entry_bytes); - fread_fix(fp, full_entry_bytes, full_entry); - err_fclose(fp); - return full_entry; -} - -// 将full kmer hash数据写入到文件 void dump_xmer_idx(const char *fn, const XmerHash *xh) { FILE *fp; @@ -259,6 +220,41 @@ FMTIndex *restore_fmt(const char *fn) return fmt; } +// 读取sa数据 +void fmt_restore_sa(const char *fn, FMTIndex *fmt) +{ + char skipped[256]; + FILE *fp; + bwtint_t primary; + fp = xopen(fn, "rb"); + err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); + xassert(primary == fmt->primary, "SA-BWT inconsistency: primary is not the same."); + err_fread_noeof(skipped, sizeof(bwtint_t), 4, fp); // skip + err_fread_noeof(&fmt->sa_intv, sizeof(bwtint_t), 1, fp); + err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); + xassert(primary == fmt->seq_len, "SA-BWT inconsistency: seq_len is not the same."); + fmt->n_sa = (fmt->seq_len + fmt->sa_intv) / fmt->sa_intv; + fmt->sa = (uint8_t *)malloc(SA_BYTES(fmt->n_sa)); + fread_fix(fp, SA_BYTES(fmt->n_sa), fmt->sa); + err_fclose(fp); +} + +// 读取pac ref数据 +void fmt_load_ref_pac(const char *fn_prefix, FMTIndex *fmt) +{ + string ann_fn = string(fn_prefix) + ".ann"; + string pac_fn = string(fn_prefix) + ".pac"; + FILE *fp; + fp = xopen(ann_fn.c_str(), "r"); + fscanf(fp, "%ld", &fmt->l_pac); + err_fclose(fp); + // load pac + fmt->pac = (uint8_t*)calloc(fmt->l_pac / 4 + 1, 1); + fp = xopen(pac_fn.c_str(), "r"); + err_fread_noeof(fmt->pac, 1, fmt->l_pac / 4 + 1, fp); // concatenated 2-bit encoded sequence + err_fclose(fp); +} + // 根据interval-bwt创建fmt-index FMTIndex *create_fmt_from_bwt(bwt_t *bwt) { @@ -534,6 +530,59 @@ inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[3] += x >> 24 & 0xff; } +// 这里的k是bwt str的行 +inline static void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_t *b1, uint8_t *b2) +{ + uint32_t *p; + uint8_t base2; + // 第一步,找到check point位置 + p = fmt_occ_intv(fmt, k); // check point起始位置 + p += 20; // bwt碱基起始位置 + // 第二步,找到mid check point位置 + int mk = k & FMT_OCC_INTV_MASK; + int n_mintv = mk >> FMT_MID_INTV_SHIFT; + p += n_mintv * (4 + (FMT_MID_INTERVAL >> 3)); // 跳过mid间隔的bwt碱基位置 + // 第三步,找到具体的uint32_t + p += (k & FMT_MID_INTV_MASK) >> 3; // 每个uint32_t包含8个碱基(和8个倒数第二bwt碱基) + // 第四步,获取碱基 + base2 = *p >> ((~(k) & 0x7) << 2) & 0xf; + *b2 = base2 >> 2 & 3; + *b1 = base2 & 3; +} + +// k, k1, k2都是bwt矩阵对应的行 +inline static void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2) +{ + uint8_t b1, b2; + bwtint_t tk[4], kk; + kk = k - (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行(去掉了$,所以大于$的行,都减掉1) + fmt_get_previous_base(fmt, kk, &b1, &b2); + fmt_e2_occ(fmt, k, b1, b2, tk); + *k1 = fmt->L2[b1] + tk[1]; + *k2 = fmt->L2[b2] + tk[3]; +} + +bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k) +{ + bwtint_t sa = 0, mask = fmt->sa_intv - 1; + bwtint_t k1, k2; + while (k & mask) + { + ++sa; + fmt_previous_line(fmt, k, &k1, &k2); + __builtin_prefetch(fmt_occ_intv(fmt, k2), 0, 2); + if (!(k1 & mask)) + { + k = k1; + break; + } + ++sa; + k = k2; + } + sa += bwt_get_sa(fmt->sa, k / fmt->sa_intv); + return sa; +} + // 扩展两个碱基 inline void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1, int b2) { @@ -566,6 +615,67 @@ inline void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int i ok->x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary) + tl[0] - tk[0]; } +// 序列和参考基因直接对比 +static void direct_extend(const FMTIndex *fmt, int len, const char *q, int left_pos, int right_pos, bwtint_t mtx_line, bwtintv_t *mt) +{ +#define PAC_BASE(pac, l) ((pac)[(l) >> 2] >> ((~(l) & 3) << 1) & 3) +#define EXTEND_BASE_LOOP(qcond, rcond, qstep, rstep) \ + while (k != qcond && r != rcond) \ + { \ + const int base = PAC_BASE(fmt->pac, r); \ + if (q[k] != "ACGT"[base]) \ + break; \ + k += qstep; \ + r += rstep; \ + } +#define EXTEND_BASE_LOOP_COMP(qcond, rcond, qstep, rstep) \ + while (k != qcond && r != rcond) \ + { \ + const int base = 3 - PAC_BASE(fmt->pac, r); \ + if (q[k] != "ACGT"[base]) \ + break; \ + k += qstep; \ + r += rstep; \ + } + + int k; + int64_t r, rp; + mt->num_match = 1; + rp = fmt_sa(fmt, mtx_line); + r = rp >= (int64_t)fmt->l_pac ? (fmt->l_pac << 1) - 1 - rp : rp; + k = right_pos; + if (rp < (int64_t)fmt->l_pac) // 匹配到了正向链 + { + // 向前继续扩展 + r += right_pos - left_pos; + EXTEND_BASE_LOOP(len, (int64_t)fmt->l_pac, 1, 1); + mt->rm[0].qe = k; + mt->rm[0].reverse = 0; + // 向后扩展,x位置之前的碱基 + r -= k - left_pos + 1; + k = left_pos - 1; + EXTEND_BASE_LOOP(-1, (int64_t)-1, -1, -1); + mt->rm[0].qs = k + 1; + mt->rm[0].rs = r + 1; + } + else // 匹配到了互补链 + { + r -= right_pos - left_pos; + EXTEND_BASE_LOOP_COMP(len, (int64_t)-1, 1, -1); + mt->rm[0].qe = k; + mt->rm[0].reverse = 1; + // 扩展x之前的碱基 + r += k - left_pos + 1; + k = left_pos - 1; + EXTEND_BASE_LOOP_COMP(-1, (int64_t)fmt->l_pac, -1, 1); + mt->rm[0].qs = k + 1; + mt->rm[0].rs = r - 1; + } + mt->info = mt->rm[0].qs; + mt->info = mt->info << 32 | mt->rm[0].qe; + mt->x[2] = 1; +} + // 利用fmt搜索seed,完整搜索,只需要单向搜索 bwtintv_t fmt_search(FMTIndex *fmt, const string &q) { @@ -604,166 +714,6 @@ bwtintv_t fmt_search(FMTIndex *fmt, const string &q) return ik; } -// 获取kmer的fmt匹配信息 -inline void kmer_getval_at(KmerEntry *ke, bwtintv_t *ok, int pos) -{ - bwtint_t x0, x1, x2; - int byte_idx = pos * 14; - uint8_t *arr = ke->intv_arr + byte_idx; - x0 = *arr; - x0 = (x0 << 32) | *((uint32_t *)(arr + 1)); - arr += 5; - x1 = *arr; - x1 = (x1 << 32) | *((uint32_t *)(arr + 1)); - arr += 5; - x2 = *((uint32_t *)arr); - ok->x[0] = x0; - ok->x[1] = x1; - ok->x[2] = x2; -} - -// 设置kmer第pos个碱基对应的fmt匹配信息 -inline void kmer_setval_at(KmerEntry *ke, bwtintv_t ik, int pos) -{ - int byte_idx = pos * 14; - uint8_t *arr = ke->intv_arr + byte_idx; - arr[0] = (uint8_t)(ik.x[0] >> 32); - *((uint32_t *)(arr + 1)) = (uint32_t)ik.x[0]; - arr += 5; - arr[0] = (uint8_t)(ik.x[1] >> 32); - *((uint32_t *)(arr + 1)) = (uint32_t)ik.x[1]; - arr += 5; - *((uint32_t *)arr) = (uint32_t)ik.x[2]; -} - -inline void full_entry_set(FullEntry *fe, bwtintv_t ik) -{ - uint8_t *arr = fe->intv_arr; - arr[0] = (uint8_t)(ik.x[0] >> 32); - *((uint32_t *)(arr + 1)) = (uint32_t)ik.x[0]; - arr += 5; - arr[0] = (uint8_t)(ik.x[1] >> 32); - *((uint32_t *)(arr + 1)) = (uint32_t)ik.x[1]; - arr += 5; - *((uint32_t *)arr) = (uint32_t)ik.x[2]; -} - -inline void full_entry_get(FullEntry *fe, bwtintv_t *ok) -{ - bwtint_t x0, x1, x2; - uint8_t *arr = fe->intv_arr; - x0 = *arr; - x0 = (x0 << 32) | *((uint32_t *)(arr + 1)); - arr += 5; - x1 = *arr; - x1 = (x1 << 32) | *((uint32_t *)(arr + 1)); - arr += 5; - x2 = *((uint32_t *)arr); - ok->x[0] = x0; - ok->x[1] = x1; - ok->x[2] = x2; -} - -// 查找并保存kmer中每扩展一个碱基对应的fmt位置信息 -void fmt_search_store_kmer(FMTIndex *fmt, const string &q, KmerEntry &ke) -{ - bwtintv_t ik; - int i, c1, c2; - int qlen = (int)q.size(); - bwtint_t tk[4], tl[4]; - - fmt_set_intv(fmt, bval(q[0]), ik); - kmer_setval_at(&ke, ik, 0); - - // 每次扩展两个碱基 - for (i = 1; i + 1 < qlen; i += 2) - { - if (bval(q[i]) < 4 && bval(q[i + 1]) < 4) - { - c1 = cbval(q[i]); - c2 = cbval(q[i + 1]); - - fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk); - fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl); - // 第一次扩展的结果 - ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0]; - ik.x[1] = fmt->L2[c1] + 1 + tk[1]; - ik.x[2] = tl[1] - tk[1]; - kmer_setval_at(&ke, ik, i); - - // 第二次扩展的结果 - ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[2] - tk[2]; - ik.x[1] = fmt->L2[c2] + 1 + tk[3]; - ik.x[2] = tl[3] - tk[3]; - kmer_setval_at(&ke, ik, i + 1); - } - else - { - break; - } - } - if (i < qlen && bval(q[i]) < 4) - { // 最后一次扩展 - c1 = cbval(q[i]); - c2 = 3; - fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk); - fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl); - // 第一次扩展的结果 - ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0]; - ik.x[1] = fmt->L2[c1] + 1 + tk[1]; - ik.x[2] = tl[1] - tk[1]; - kmer_setval_at(&ke, ik, i); - } -} - -// 利用fmt搜索seed,利用kmer加速搜索过程 -bwtintv_t fmt_search_use_kmer(FMTIndex *fmt, const string &q) -{ - bwtintv_t ik; - bwtintv_t ok; - int i, c1, c2, x = 0; - int qlen = (int)q.size(); - - // 先利用kmer进行索引查询 - uint32_t qbit = 0; - x = min(KMER_LEN, qlen); - //double tt = realtime(); - for (i = 0; i < x; ++i) - { - qbit |= bval(q[i]) << ((KMER_LEN - 1 - i) << 1); - } - kmer_getval_at(&fmt->kmer_entry[qbit], &ik, x - 1); - //t1 += realtime() - tt; - // tt = realtime(); - // 每次扩展两个碱基 - for (i = x; i + 1 < qlen; i += 2) - { - if (bval(q[i]) < 4 && bval(q[i + 1]) < 4) - { - c1 = cbval(q[i]); - c2 = cbval(q[i + 1]); - fmt_extend2(fmt, &ik, &ok, 0, c1, c2); - ik = ok; - ik.info = i + 1; - } - else - { - break; - } - } - if (i < qlen && bval(q[i]) < 4) - { // 最后一次扩展 - c1 = cbval(q[i]); - fmt_extend1(fmt, &ik, &ok, 0, c1); - //fmt_extend2(fmt, &ik, &ok, 0, c1, 3); - ik = ok; - ik.info = i + 1; - } - //t2 += realtime() - tt; - //cout << ik.x[2] << endl; - return ik; -} - // 将用字符串表示的序列计算成用2bit表示的序列 uint32_t str2bit(string &str) { @@ -922,7 +872,7 @@ void create_xmer_index(FMTIndex *fmt) } // 利用fmt搜索seed,利用xmer加速搜索过程 -bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, bwt_t *bwt, const string &q) +bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, bwt_t *bwt, const string &q, int cmp_de) { bwtintv_t ik; int i, c1, c2, x = 0; @@ -957,7 +907,7 @@ bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, bwt_t *bwt, const string &q) { xmer_getval_at(fmt->xmer_hash.xe10[qbit >> 8].intv_arr, &ik, x - 1); } -#if 0 +#if 1 bwtintv_t ok; // 每次扩展两个碱基 for (i = x; i + 1 < qlen; i += 2) @@ -982,7 +932,7 @@ bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, bwt_t *bwt, const string &q) ik = ok; ik.info = i + 1; } -#else +#else // 测试一下bwt的性能 bwtintv_t ok[4]; for (i = x; i < qlen; ++i) { if (bval(q[i]) < 4) { @@ -994,90 +944,117 @@ bwtintv_t fmt_search_use_xmer(FMTIndex *fmt, bwt_t *bwt, const string &q) } #endif - + if (cmp_de) { + int64_t rp = fmt_sa(fmt, ik.x[0]); + rp = rp >= (int64_t)fmt->l_pac ? (fmt->l_pac << 1) - 1 - rp : rp; + ik.rm[0].rs = (uint32_t)rp; + } return ik; } -uint16_t *restore_kmer_bit(const char *fn, int kmer_len) +// 利用fmt搜索seed,利用xmer, direct_extend加速搜索过程 +bwtintv_t fmt_search_use_xmer_de(FMTIndex *fmt, const string &q) { - uint16_t *kbit = (uint16_t *)calloc(1L << ((kmer_len << 1) - 4), sizeof(uint16_t)); - FILE *fp; - fp = xopen(fn, "rb"); - fread_fix(fp, (1 << ((kmer_len << 1) - 4)) * sizeof(uint16_t), kbit); - err_fclose(fp); - return kbit; -} + bwtintv_t ik = {0}; + int i, c1, c2, x = 0; + int qlen = (int)q.size(); + bwtintv_t mt = {0}; -void create_kmer_bit(FMTIndex *fmt, string &prefix, uint64_t kmer_len) -{ - uint16_t *kbit = (uint16_t *)calloc(1L << ((kmer_len << 1) - 4), sizeof(uint16_t)); - ostringstream oss; - oss << '.' << kmer_len << ".kmer.bit"; - string kmer_bit = prefix + oss.str(); - cout << kmer_bit << endl; - uint64_t seq_up_val = (1L << (kmer_len << 1)); - cout << (1L << ((kmer_len << 1) - 4)) << '\t' << seq_up_val << endl; - string seq; - seq.resize(kmer_len); - int percent = 1; - double tt = realtime(); - for (uint64_t i = 0; i < seq_up_val; ++i) + int max_xmer_len = 14; + // 先利用kmer进行索引查询 + uint32_t qbit = 0; + x = min(max_xmer_len, qlen); + for (i = 0; i < x; ++i) { - if ((i+1) % (seq_up_val / 100) == 0) - { - tt = realtime() - tt; - cout << percent++ << "\% finished, " << tt << "s elapsed." << endl; - tt = realtime(); - } - for (int j = kmer_len - 1; j >= 0; --j) - { - seq[kmer_len - 1 - j] = BASE[(i >> (j << 1)) & 3]; - } - uint32_t pos = i >> 4; - uint32_t bit_pos = i & 0xf; - //bwtintv_t p = fmt_search_use_xmer(fmt, seq); - //if (p.x[2] > 0) { - // kbit[pos] |= 1 << bit_pos; - //} + qbit |= bval(q[i]) << ((max_xmer_len - 1 - i) << 1); } - FILE *fp; - fp = xopen(kmer_bit.c_str(), "wb"); - err_fwrite(kbit, 2, 1L << ((kmer_len << 1) - 4), fp); - err_fflush(fp); - err_fclose(fp); + if (x == 14) + { + xmer_getval_at(fmt->xmer_hash.xe14[qbit].intv_arr, &ik); + } + else if (x == 13) + { + xmer_getval_at(fmt->xmer_hash.xe13[qbit >> 2].intv_arr, &ik); + } + else if (x == 12) + { + // cout << "qbit: " << qbit << endl; + xmer_getval_at(fmt->xmer_hash.xe12[qbit >> 4].intv_arr, &ik); + } + else if (x == 11) + { + xmer_getval_at(fmt->xmer_hash.xe11[qbit >> 6].intv_arr, &ik); + } + else + { + xmer_getval_at(fmt->xmer_hash.xe10[qbit >> 8].intv_arr, &ik, x - 1); + } + + bwtintv_t ok = {0}; + // 每次扩展两个碱基 + for (i = x; i + 1 < qlen; i += 2) + { + if (bval(q[i]) < 4 && bval(q[i + 1]) < 4) + { + c1 = cbval(q[i]); + c2 = cbval(q[i + 1]); + fmt_extend2(fmt, &ik, &ok, 0, c1, c2); + if (ok.x[2] == 1) { + direct_extend(fmt, qlen, q.c_str(), 0, i + 2, ok.x[0], &mt); + return mt; + } + ik = ok; + ik.info = i + 1; + } + else + { + break; + } + } + if (i < qlen && bval(q[i]) < 4) + { // 最后一次扩展 + c1 = cbval(q[i]); + fmt_extend1(fmt, &ik, &ok, 0, c1); + ik = ok; + ik.info = i + 1; + } + + return ik; } #define GEN_BWT_IDX 0 -#define GEN_FMT_IDX 0 +#define GEN_FMT_IDX 1 #define GEN_XMER_IDX 0 #define CMP_FMT_BWT_TIME 1 -#define CMP_FMT_BWT_RESULT 0 +#define CMP_FMT_BWT_RESULT 1 // argv[1] 应该是索引的前缀 int main_fmtidx(int argc, char **argv) { - //cout << "size: " << sizeof(FullEntry) << '\t' << sizeof(XmerEntryArr) << endl; string prefix = argv[1]; string bwt_str = prefix + ".bwt.str"; - string bwt_idx, fmt_idx, xmer_idx; + string bwt_idx, fmt_idx, xmer_idx, sa_fn, seq_fn; bwt_t *bwt; FMTIndex *fmt; - ostringstream oss_bwt, oss_fmt, oss_xmer; + ostringstream oss_bwt, oss_fmt, oss_xmer, oss_sa; oss_bwt << '.' << OCC_INTERVAL; oss_fmt << '.' << FMT_OCC_INTERVAL; oss_xmer << '.' << XMER_LEN; oss_fmt << '.' << FMT_MID_INTERVAL; + oss_sa << ".33." << SA_INTV; bwt_idx = prefix + oss_bwt.str() + ".bwt"; fmt_idx = prefix + oss_fmt.str() + ".fmt"; xmer_idx = prefix + ".14.kmer"; + sa_fn = prefix + oss_sa.str() + ".sa"; + seq_fn = "./seqs.txt"; cout << bwt_idx << endl; cout << fmt_idx << endl; cout << xmer_idx << endl; + cout << sa_fn << endl; - -// 生成或读取bwt索引文件 + // 生成或读取bwt索引文件 double time_read_bwt = realtime(); #if GEN_BWT_IDX bwt = restore_bwt(bwt_str.c_str()); @@ -1100,7 +1077,7 @@ int main_fmtidx(int argc, char **argv) time_read_fmt = realtime() - time_read_fmt; cout << "[time gen/read fmt:] " << time_read_fmt << "s" << endl; -// 生成xmer索引文件 +// 生成或读取xmer索引文件 double time_gen_xmer_hash = realtime(); #if GEN_XMER_IDX create_xmer_index(fmt); @@ -1111,80 +1088,119 @@ int main_fmtidx(int argc, char **argv) time_gen_xmer_hash = realtime() - time_gen_xmer_hash; cout << "[time gen xkmer hash:] " << time_gen_xmer_hash << "s" << endl; - int kmer_bit_len = 18; - //create_kmer_bit(fmt, prefix, kmer_bit_len); - // return 0; +// 读取sa + double time_load_sa = realtime(); + fmt_restore_sa(sa_fn.c_str(), fmt); + time_load_sa = realtime() - time_load_sa; + cout << "[time load sa:] " << time_load_sa << "s" << endl; + +// 读取pac + double time_load_pac = realtime(); + fmt_load_ref_pac(prefix.c_str(), fmt); + time_load_pac = realtime() - time_load_pac; + cout << "[time load pac:] " << time_load_pac << "s" << endl; + + //cout << "l_pac: " << fmt->l_pac << endl; + //return 0; // 生成随机序列进行测试 int seq_size = 10000000; //int seq_size = 1; - int seq_len = kmer_bit_len; + int seq_len = 60; vector seq_arr(seq_size); - seq_arr[0] = "CACATATTGGCG"; + //seq_arr[0] = "CACATATTGGCG"; // seq_arr[0] = "GAGAGCTGTTCCCGTGTTTTCCATGGTTT"; // seq_arr[0] = "ACCTTTCGAATTGA"; -#if 1 +#if 0 // 随机生成seqs srand(time(NULL)); double time_gen_seq = realtime(); for (int i = 0; i < (int)seq_arr.size(); ++i) seq_arr[i] = generate_rand_seq(seq_len); time_gen_seq = realtime() - time_gen_seq; - cout << "[time gen seq:] " << time_gen_seq << "s" << endl; + cout << "[time gen seq:] " << time_gen_seq << "s" << endl; +#else + ifstream seq_in(seq_fn.c_str(), ios::in); + string read_line; + seq_arr.resize(0); + while (!seq_in.eof()) + { + seq_in >> read_line; + // cout << read_line.substr(0, seq_len) << endl; + seq_arr.push_back(read_line.substr(0, seq_len)); + } + seq_size = seq_arr.size(); + seq_in.close(); #endif -// 对比bwt和fmt搜索的时间 + // 对比bwt和fmt搜索的时间 #if CMP_FMT_BWT_TIME - //int nintv_0 = 0; - //bwtint_t max_gap = 0; + // bwt double time_bwt_search = realtime(); for (int i = 0; i < (int)seq_arr.size(); ++i) { - bwtintv_t p = bwt_search(bwt, seq_arr[i]); - //if (max_gap < p.x[2]) - // max_gap = p.x[2]; - //if (p.x[2] == 0) - // ++nintv_0; + // bwtintv_t p = + bwt_search(bwt, seq_arr[i]); } - //cout << "all: " << seq_arr.size() << " zero: " << nintv_0 << endl; - //cout << "max gap: " << max_gap << endl; time_bwt_search = realtime() - time_bwt_search; cout << "[time bwt search:] " << time_bwt_search << "s" << endl; + // fmt double time_fmt_search = realtime(); for (int i = 0; i < (int)seq_arr.size(); ++i) fmt_search(fmt, seq_arr[i]); time_fmt_search = realtime() - time_fmt_search; cout << "[time fmt search:] " << time_fmt_search << "s" << endl; + // kmer double time_xmer_search = realtime(); for (int i = 0; i < (int)seq_arr.size(); ++i) - fmt_search_use_xmer(fmt, bwt, seq_arr[i]); + fmt_search_use_xmer(fmt, bwt, seq_arr[i], 0); time_xmer_search = realtime() - time_xmer_search; cout << "[time xmer search:] " << time_xmer_search << "s" << endl; -#endif - // cout << "t1: " << t1 << "s; t2: " << t2 << "s" << endl; - // cout << "t3: " << t3 << "s; t4: " << t4 << "s" << endl; - // cout << "t5: " << t5 << "s; t4: " << t6 << "s" << endl; -// ostringstream oss_kmer_bit; -// oss_kmer_bit << prefix << "." << kmer_bit_len << ".kmer.bit"; -// uint16_t *kbit = restore_kmer_bit(oss_kmer_bit.str().c_str(), kmer_bit_len); + // direct-extend + double time_de_search = realtime(); + for (int i = 0; i < (int)seq_arr.size(); ++i) + fmt_search_use_xmer_de(fmt, seq_arr[i]); + time_de_search = realtime() - time_de_search; + cout << "[time direct-extend search:] " << time_de_search << "s" << endl; +#endif // 对比bwt和fmt搜索的结果 #if CMP_FMT_BWT_RESULT - double time_cmp = realtime(); + double time_cmp = realtime(); + long diff_num = 0; for (int i = 0; i < (int)seq_arr.size(); ++i) { +#if 0 bwtintv_t p1 = bwt_search2(bwt, seq_arr[i]); // bwtintv_t p2 = fmt_search(fmt, seq_arr[i]); - bwtintv_t p2 = fmt_search_use_xmer(fmt, seq_arr[i]); - if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2]) - cout << seq_arr[i] << endl + bwtintv_t p2 = fmt_search_use_xmer(fmt, bwt, seq_arr[i], 0); + if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2]) { + cout << "different: " << seq_arr[i] << endl << p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl << p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl; + diff_num++; + } +#else + bwtintv_t p1 = fmt_search_use_xmer(fmt, bwt, seq_arr[i], 1); + bwtintv_t p2 = fmt_search_use_xmer_de(fmt, seq_arr[i]); + if (p2.num_match == 1) { + if (p1.rm[0].rs != p2.rm[0].rs || p1.x[2] != 1) { + cout << "different: " << seq_arr[i] << '\t' << p1.rm[0].rs << '\t' << p2.rm[0].rs << endl; + diff_num++; + } + } + else if (p1.x[0] != p2.x[0] || p1.x[1] != p2.x[1] || p1.x[2] != p2.x[2]) { + cout << "different: " << seq_arr[i] << endl + << p1.x[0] << ' ' << p1.x[1] << ' ' << p1.x[2] << endl + << p2.x[0] << ' ' << p2.x[1] << ' ' << p2.x[2] << endl; + diff_num++; + } +#endif } time_cmp = realtime() - time_cmp; - cout << "[time compare bwt & fmt:] " << time_cmp << "s" << endl; + cout << "[time compare bwt & fmt:] " << time_cmp << "s\tdifferent num: " << diff_num << endl; #endif return 0; diff --git a/fmt_index.h b/fmt_index.h index a17e041..08df96b 100644 --- a/fmt_index.h +++ b/fmt_index.h @@ -22,7 +22,8 @@ #elif FMT_MID_INTERVAL == 32 #define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 48)) #elif FMT_MID_INTERVAL == 64 -#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 32)) +//#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 32)) +#define fmt_occ_intv(b, k) ((b)->bwt + ((k) >> 8 << 6)) #elif FMT_MID_INTERVAL == 128 #define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 24)) #else @@ -38,26 +39,12 @@ #define __fmt_mid_sum(x) \ ((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff) -#define KMER_LEN 12 -#define KMER_ARR_SIZE (1 << (KMER_LEN << 1)) #define XMER_LEN 14 -// 用来保存kmer对应的fmt的位置信息 -struct KmerEntry -{ - // 40+40+32 14个byte,这样好处理 - uint8_t intv_arr[14 * KMER_LEN]; // 保存kmer中每扩展一个碱基对应的bwtintv_t数据 -}; - -#define FULL_ENTRY_LEN 14 -#define FULL_ENTRY_ARR_SIZE (1 << (FULL_ENTRY_LEN << 1)) -// kmer全部匹配对应的最后fmt匹配结果 -struct FullEntry -{ - uint8_t intv_arr[14]; -}; - +// sa存储的行间隔 +#define SA_INTV 4 +#define SA_BYTES(n_sa) ((((33 * (n_sa) + 7) / 8) & (~7L)) + 8) // x-mer entry, 1 - 14个碱基组成的hash key struct XmerEntry @@ -93,9 +80,10 @@ struct FMTIndex uint8_t sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15 uint8_t first_base; // 序列的第一个碱基2bit的int类型,0,1,2,3 uint8_t last_base; // dollar转换成的base + // ref pac相关 + bwtint_t l_pac; // 参考序列长度 + uint8_t *pac; // 保存2bit编码的参考序列 // 保存kmer对应的fmt位置信息 - KmerEntry *kmer_entry; - FullEntry *full_entry; XmerHash xmer_hash; // suffix array int sa_intv; diff --git a/util.h b/util.h index aa608d6..883efd5 100644 --- a/util.h +++ b/util.h @@ -17,10 +17,20 @@ typedef uint64_t bwtint_t; double realtime(void); +typedef struct +{ + int qs; // query start + int qe; // query end [) + int reverse; // 反向链 + uint32_t rs; // 参考基因的起始点(包含) +} ref_match_t; + // 在fm-indexv(或者bwt)查找过程中,记录结果 struct bwtintv_t { bwtint_t x[3], info; // x[0]表示正链位置,x[1]表示互补链位置,x[2]表示间隔长度,info 表示read的起始结束位置 + int num_match; + ref_match_t rm[2]; }; // 读取单个数据