94 lines
3.6 KiB
C
94 lines
3.6 KiB
C
#ifndef FMT_INDEX_H_
|
||
#define FMT_INDEX_H_
|
||
|
||
#define FMT_OCC_INTV_SHIFT 8
|
||
#define FMT_OCC_INTERVAL (1LL << FMT_OCC_INTV_SHIFT)
|
||
#define FMT_OCC_INTV_MASK (FMT_OCC_INTERVAL - 1)
|
||
|
||
#define FMT_MID_INTV_SHIFT 6
|
||
#define FMT_MID_INTERVAL (1LL << FMT_MID_INTV_SHIFT)
|
||
#define FMT_MID_INTV_MASK (FMT_MID_INTERVAL - 1)
|
||
|
||
// #undef FMT_MID_INTERVAL
|
||
|
||
// 获取碱基c(待查找序列的首个碱基)和对应的互补碱基对应的行,以及间隔
|
||
#define fmt_set_intv(fmt, c, ik) ((ik).x[0] = (fmt)->L2[(int)(c)] + 1, (ik).x[2] = (fmt)->L2[(int)(c) + 1] - (fmt)->L2[(int)(c)], (ik).x[1] = (fmt)->L2[3 - (c)] + 1, (ik).info = 0)
|
||
// k行(bwt str行(不包含$))对应的check point occ数据起始地址(小于k且是OCC_INTERVAL的整数倍)
|
||
|
||
#if FMT_MID_INTERVAL == 8
|
||
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 144))
|
||
#elif FMT_MID_INTERVAL == 16
|
||
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 80))
|
||
#elif FMT_MID_INTERVAL == 32
|
||
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 48))
|
||
#elif FMT_MID_INTERVAL == 64
|
||
//#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 32))
|
||
#define fmt_occ_intv(b, k) ((b)->bwt + ((k) >> 8 << 6))
|
||
#elif FMT_MID_INTERVAL == 128
|
||
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 24))
|
||
#else
|
||
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 20))
|
||
#endif
|
||
// 字节val中包含bwt base为b的pre-bwt中T G C A(按顺序保存在32位整数里(每个占8bit))的数量
|
||
#define __fmt_occ_e2_aux4(fmt, b, val) \
|
||
((fmt)->cnt_table[(b)][(val) & 0xff] + (fmt)->cnt_table[b][(val) >> 8 & 0xff] + (fmt)->cnt_table[b][(val) >> 16 & 0xff] + (fmt)->cnt_table[b][(val) >> 24])
|
||
|
||
#define __fmt_occ_e2_aux2(fmt, b, val) \
|
||
((fmt)->cnt_occ[(b)][(val) & 0xff] + (fmt)->cnt_occ[b][(val) >> 8 & 0xff] + (fmt)->cnt_occ[b][(val) >> 16 & 0xff] + (fmt)->cnt_occ[b][(val) >> 24])
|
||
|
||
#define __fmt_mid_sum(x) \
|
||
((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff)
|
||
|
||
|
||
#define XMER_LEN 14
|
||
|
||
// sa存储的行间隔
|
||
#define SA_INTV 4
|
||
#define SA_BYTES(n_sa) ((((33 * (n_sa) + 7) / 8) & (~7L)) + 8)
|
||
|
||
// x-mer entry, 1 - 14个碱基组成的hash key
|
||
struct XmerEntry
|
||
{
|
||
uint8_t intv_arr[14];
|
||
};
|
||
|
||
struct XmerEntryArr
|
||
{
|
||
uint8_t intv_arr[140];
|
||
};
|
||
|
||
struct XmerHash
|
||
{
|
||
XmerEntryArr *xe10;
|
||
XmerEntry *xe11;
|
||
XmerEntry *xe12;
|
||
XmerEntry *xe13;
|
||
XmerEntry *xe14;
|
||
};
|
||
|
||
// fm-index, extend twice in one search step (one memory access)
|
||
struct FMTIndex
|
||
{
|
||
bwtint_t primary; // S^{-1}(0), or the primary index of BWT
|
||
bwtint_t sec_primary; // second primary line
|
||
bwtint_t L2[5]; // C(), cumulative count
|
||
bwtint_t seq_len; // sequence length
|
||
bwtint_t bwt_size; // size of bwt, about seq_len/4
|
||
uint32_t *bwt; // BWT
|
||
// occurance array, separated to two parts
|
||
uint32_t cnt_occ[16][256]; // 前16-24位表示b(碱基)的occ,8-16位表示大于b的occ,0-8表示大于a的occ,ba格式
|
||
uint8_t sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15
|
||
uint8_t first_base; // 序列的第一个碱基2bit的int类型,0,1,2,3
|
||
uint8_t last_base; // dollar转换成的base
|
||
// ref pac相关
|
||
bwtint_t l_pac; // 参考序列长度
|
||
uint8_t *pac; // 保存2bit编码的参考序列
|
||
// 保存kmer对应的fmt位置信息
|
||
XmerHash xmer_hash;
|
||
// suffix array
|
||
int sa_intv;
|
||
bwtint_t n_sa;
|
||
uint8_t *sa;
|
||
};
|
||
|
||
#endif |