bwa_perf/fmt_index.h

94 lines
3.6 KiB
C
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#ifndef FMT_INDEX_H_
#define FMT_INDEX_H_
#define FMT_OCC_INTV_SHIFT 8
#define FMT_OCC_INTERVAL (1LL << FMT_OCC_INTV_SHIFT)
#define FMT_OCC_INTV_MASK (FMT_OCC_INTERVAL - 1)
#define FMT_MID_INTV_SHIFT 6
#define FMT_MID_INTERVAL (1LL << FMT_MID_INTV_SHIFT)
#define FMT_MID_INTV_MASK (FMT_MID_INTERVAL - 1)
// #undef FMT_MID_INTERVAL
// 获取碱基c待查找序列的首个碱基和对应的互补碱基对应的行以及间隔
#define fmt_set_intv(fmt, c, ik) ((ik).x[0] = (fmt)->L2[(int)(c)] + 1, (ik).x[2] = (fmt)->L2[(int)(c) + 1] - (fmt)->L2[(int)(c)], (ik).x[1] = (fmt)->L2[3 - (c)] + 1, (ik).info = 0)
// k行bwt str行不包含$对应的check point occ数据起始地址小于k且是OCC_INTERVAL的整数倍
#if FMT_MID_INTERVAL == 8
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 144))
#elif FMT_MID_INTERVAL == 16
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 80))
#elif FMT_MID_INTERVAL == 32
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 48))
#elif FMT_MID_INTERVAL == 64
//#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 32))
#define fmt_occ_intv(b, k) ((b)->bwt + ((k) >> 8 << 6))
#elif FMT_MID_INTERVAL == 128
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 24))
#else
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 20))
#endif
// 字节val中包含bwt base为b的pre-bwt中T G C A按顺序保存在32位整数里每个占8bit的数量
#define __fmt_occ_e2_aux4(fmt, b, val) \
((fmt)->cnt_table[(b)][(val) & 0xff] + (fmt)->cnt_table[b][(val) >> 8 & 0xff] + (fmt)->cnt_table[b][(val) >> 16 & 0xff] + (fmt)->cnt_table[b][(val) >> 24])
#define __fmt_occ_e2_aux2(fmt, b, val) \
((fmt)->cnt_occ[(b)][(val) & 0xff] + (fmt)->cnt_occ[b][(val) >> 8 & 0xff] + (fmt)->cnt_occ[b][(val) >> 16 & 0xff] + (fmt)->cnt_occ[b][(val) >> 24])
#define __fmt_mid_sum(x) \
((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff)
#define XMER_LEN 14
// sa存储的行间隔
#define SA_INTV 4
#define SA_BYTES(n_sa) ((((33 * (n_sa) + 7) / 8) & (~7L)) + 8)
// x-mer entry, 1 - 14个碱基组成的hash key
struct XmerEntry
{
uint8_t intv_arr[14];
};
struct XmerEntryArr
{
uint8_t intv_arr[140];
};
struct XmerHash
{
XmerEntryArr *xe10;
XmerEntry *xe11;
XmerEntry *xe12;
XmerEntry *xe13;
XmerEntry *xe14;
};
// fm-index, extend twice in one search step (one memory access)
struct FMTIndex
{
bwtint_t primary; // S^{-1}(0), or the primary index of BWT
bwtint_t sec_primary; // second primary line
bwtint_t L2[5]; // C(), cumulative count
bwtint_t seq_len; // sequence length
bwtint_t bwt_size; // size of bwt, about seq_len/4
uint32_t *bwt; // BWT
// occurance array, separated to two parts
uint32_t cnt_occ[16][256]; // 前16-24位表示b碱基的occ8-16位表示大于b的occ0-8表示大于a的occba格式
uint8_t sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15
uint8_t first_base; // 序列的第一个碱基2bit的int类型0,1,2,3
uint8_t last_base; // dollar转换成的base
// ref pac相关
bwtint_t l_pac; // 参考序列长度
uint8_t *pac; // 保存2bit编码的参考序列
// 保存kmer对应的fmt位置信息
XmerHash xmer_hash;
// suffix array
int sa_intv;
bwtint_t n_sa;
uint8_t *sa;
};
#endif