#ifndef FMT_INDEX_H_
#define FMT_INDEX_H_

/* Occurrence check-point spacing: one check point every 2^8 = 256 rows. */
#define FMT_OCC_INTV_SHIFT 8
#define FMT_OCC_INTERVAL (1LL << FMT_OCC_INTV_SHIFT)
#define FMT_OCC_INTV_MASK (FMT_OCC_INTERVAL - 1)

/* Mid check-point spacing: 2^6 = 64. */
#define FMT_MID_INTV_SHIFT 6
#define FMT_MID_INTERVAL (1LL << FMT_MID_INTV_SHIFT)
#define FMT_MID_INTV_MASK (FMT_MID_INTERVAL - 1)

/*
 * Build-time switch, disabled by default. If FMT_MID_INTERVAL is undefined,
 * the #if chain below evaluates it as 0 and selects the final #else branch.
 */
// #undef FMT_MID_INTERVAL

/*
 * Initialise the interval `ik` for base `c` (the first base of the query
 * sequence) and for its complementary base 3 - c:
 *   ik.x[0] = L2[c] + 1            first row for c
 *   ik.x[1] = L2[3 - c] + 1        first row for the complement of c
 *   ik.x[2] = L2[c + 1] - L2[c]    interval size
 *   ik.info = 0
 * `fmt` is a pointer to an object with an `L2` array; `ik` is an lvalue.
 * Every argument is evaluated more than once, so pass side-effect-free
 * expressions only.
 */
#define fmt_set_intv(fmt, c, ik)                                         \
    ((ik).x[0] = (fmt)->L2[(int)(c)] + 1,                                \
     (ik).x[2] = (fmt)->L2[(int)(c) + 1] - (fmt)->L2[(int)(c)],          \
     (ik).x[1] = (fmt)->L2[3 - (c)] + 1,                                 \
     (ik).info = 0)

/*
 * Start address, inside (b)->bwt, of the check-point occ data that covers
 * row k (a row of the BWT string, not counting '$'). The check point used is
 * the one at the largest multiple of FMT_OCC_INTERVAL that does not exceed k.
 * Each check-point block spans FMT_OCC_INTERVAL / 8 elements of (b)->bwt plus
 * a fixed extra amount that depends on FMT_MID_INTERVAL.
 * NOTE(review): the extra amounts (144/80/48/32/24/20) presumably are the
 * sizes of the occ and mid-occ counters stored with each block -- confirm
 * against the index builder.
 */
#if FMT_MID_INTERVAL == 8
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 144))
#elif FMT_MID_INTERVAL == 16
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 80))
#elif FMT_MID_INTERVAL == 32
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 48))
#elif FMT_MID_INTERVAL == 64
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 32))
#elif FMT_MID_INTERVAL == 128
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 24))
#else
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 20))
#endif

/*
 * Sum the table entries of row `b` for each of the four bytes of `val`.
 * Per the original note: for BWT base b, the result holds the counts of the
 * pre-BWT bases T, G, C, A, packed in that order into one 32-bit integer
 * (8 bits each).
 * The top byte is taken with a plain `>> 24` and no mask, so `val` is
 * expected to be an unsigned 32-bit value.
 * NOTE(review): the FMTIndex struct in this header has `cnt_occ` but no
 * `cnt_table` member -- confirm __fmt_occ_e2_aux4 is still in use.
 */
#define __fmt_occ_e2_aux4(fmt, b, val)                                   \
    ((fmt)->cnt_table[(b)][(val) & 0xff] +                               \
     (fmt)->cnt_table[(b)][(val) >> 8 & 0xff] +                          \
     (fmt)->cnt_table[(b)][(val) >> 16 & 0xff] +                         \
     (fmt)->cnt_table[(b)][(val) >> 24])

/* Same four-byte lookup-and-sum as above, using the `cnt_occ` table. */
#define __fmt_occ_e2_aux2(fmt, b, val)                                   \
    ((fmt)->cnt_occ[(b)][(val) & 0xff] +                                 \
     (fmt)->cnt_occ[(b)][(val) >> 8 & 0xff] +                            \
     (fmt)->cnt_occ[(b)][(val) >> 16 & 0xff] +                           \
     (fmt)->cnt_occ[(b)][(val) >> 24])

/*
 * Sum of the four bytes of x. The whole expansion is parenthesised so the
 * macro can be used safely inside a larger expression.
 */
#define __fmt_mid_sum(x) \
    (((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff))

#define KMER_LEN 12
#define KMER_ARR_SIZE (1 << (KMER_LEN << 1)) /* 4^KMER_LEN = 2^24 */
#define XMER_LEN 14

// Holds the FMT position information associated with a k-mer.
struct KmerEntry { // 40+40+32 14个byte,这样好处理 uint8_t intv_arr[14 * KMER_LEN]; // 保存kmer中每扩展一个碱基对应的bwtintv_t数据 }; #define FULL_ENTRY_LEN 14 #define FULL_ENTRY_ARR_SIZE (1 << (FULL_ENTRY_LEN << 1)) // kmer全部匹配对应的最后fmt匹配结果 struct FullEntry { uint8_t intv_arr[14]; }; // x-mer entry, 1 - 14个碱基组成的hash key struct XmerEntry { uint8_t intv_arr[14]; }; struct XmerEntryArr { uint8_t intv_arr[140]; }; struct XmerHash { XmerEntryArr *xe10; XmerEntry *xe11; XmerEntry *xe12; XmerEntry *xe13; XmerEntry *xe14; }; // fm-index, extend twice in one search step (one memory access) struct FMTIndex { bwtint_t primary; // S^{-1}(0), or the primary index of BWT bwtint_t sec_primary; // second primary line bwtint_t L2[5]; // C(), cumulative count bwtint_t seq_len; // sequence length bwtint_t bwt_size; // size of bwt, about seq_len/4 uint32_t *bwt; // BWT // occurance array, separated to two parts uint32_t cnt_occ[16][256]; // 前16-24位表示b(碱基)的occ,8-16位表示大于b的occ,0-8表示大于a的occ,ba格式 uint8_t sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15 uint8_t first_base; // 序列的第一个碱基2bit的int类型,0,1,2,3 uint8_t last_base; // dollar转换成的base // 保存kmer对应的fmt位置信息 KmerEntry *kmer_entry; FullEntry *full_entry; XmerHash xmer_hash; // suffix array int sa_intv; bwtint_t n_sa; uint8_t *sa; }; #endif