#ifndef BWT_H_ #define BWT_H_ #include #include #include "util.h" using std::string; // occ间隔对应的右移位数,5表示间隔32行保存一次 #define OCC_INTV_SHIFT 7 #define OCC_INTERVAL (1LL << OCC_INTV_SHIFT) #define OCC_INTV_MASK (OCC_INTERVAL - 1) // 从最原始的bwt碱基串里获取k行对应的碱基,这里的bwt不包括occ check point数据 #define bwt_B00(b, k) ((b)->bwt[(k) >> 4] >> ((~(k) & 0xf) << 1) & 3) /* retrieve a character from the $-removed BWT string. Note that * bwt_t::bwt is not exactly the BWT string and therefore this macro is * called bwt_B0 instead of bwt_B */ // 从构建完成的bwt(包含occ check point)获取k行(不含$,这里的k不输出bwt mtx的行,是bwt字符串的行)的碱基 #define bwt_B0(b, k) (bwt_bwt(b, k) >> ((~(k) & 0xf) << 1) & 3) // 获取碱基c(待查找序列的首个碱基)和对应的互补碱基对应的行,以及间隔 #define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)] + 1, (ik).x[2] = (bwt)->L2[(int)(c) + 1] - (bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3 - (c)] + 1, (ik).info = 0) // The following two lines are ONLY correct when OCC_INTERVAL==0x80 // #define bwt_bwt(b, k) ((b)->bwt[((k) >> 7 << 4) + sizeof(bwtint_t) + (((k) & 0x7f) >> 4)]) // #define bwt_occ_intv(b, k) ((b)->bwt + ((k) >> 7 << 4)) ///* For general OCC_INTERVAL, the following is correct: // k行对应的bwt str碱基,对应的check point bwt str数据起始地址 #define bwt_bwt(b, k) ((b)->bwt[(k) / OCC_INTERVAL * (OCC_INTERVAL / 16 + 4) + 4 + (k) % OCC_INTERVAL / 16]) // k行(bwt str行(不包含$))对应的check point occ数据起始地址(小于k且是OCC_INTERVAL的整数倍) #define bwt_occ_intv(b, k) ((b)->bwt + (k) / OCC_INTERVAL * (OCC_INTERVAL / 16 + 4)) // 字节b中包含的T G C A(按顺序保存在32位整数里(每个占8bit))的数量 #define __occ_aux4(bwt, b) \ ((bwt)->cnt_table[(b) & 0xff] + (bwt)->cnt_table[(b) >> 8 & 0xff] + (bwt)->cnt_table[(b) >> 16 & 0xff] + (bwt)->cnt_table[(b) >> 24]) // 原始fm-index (bwt)结构 struct bwt_t { bwtint_t primary; // S^{-1}(0), or the primary index of BWT bwtint_t L2[5]; // C(), cumulative count bwtint_t seq_len; // sequence length bwtint_t bwt_size; // size of bwt, about seq_len/4 uint32_t *bwt; // BWT // occurance array, separated to two parts uint32_t cnt_table[256]; // suffix array int sa_intv; bwtint_t n_sa; uint8_t *sa; }; // k行(包含,bwt mtx行)之前,碱基c的累计总数, interval大于等于32才能正确计算 bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, uint8_t c); // 统计k行(bwt mtx行,包含k行本身)之前4种碱基累积数量,这里的k是bwt矩阵里的行,比bwt字符串多1 void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]); // 解析两bit的bwt碱基序列 bwt_t *restore_bwt(const char *fn); // 根据原始的字符串bwt创建interval-bwt void create_interval_occ_bwt(bwt_t *bwt); void dump_bwt(const char *fn, const bwt_t *bwt); // 利用bwt搜索seed,完整搜索,只需要单向搜索 void bwt_search(bwt_t *bwt, const string &q); // 每次扩展两步 bwtintv_t bwt_search2(bwt_t *bwt, const string &q); #endif