72 lines
3.2 KiB
C++
72 lines
3.2 KiB
C++
#ifndef BWT_H_
|
||
#define BWT_H_
|
||
#include <stdint.h>
|
||
#include <string>
|
||
#include "util.h"
|
||
|
||
using std::string;
|
||
|
||
// occ间隔对应的右移位数,5表示间隔32行保存一次
|
||
#define OCC_INTV_SHIFT 7
|
||
#define OCC_INTERVAL (1LL << OCC_INTV_SHIFT)
|
||
#define OCC_INTV_MASK (OCC_INTERVAL - 1)
|
||
|
||
// 从最原始的bwt碱基串里获取k行对应的碱基,这里的bwt不包括occ check point数据
|
||
#define bwt_B00(b, k) ((b)->bwt[(k) >> 4] >> ((~(k) & 0xf) << 1) & 3)
|
||
|
||
/* retrieve a character from the $-removed BWT string. Note that
|
||
* bwt_t::bwt is not exactly the BWT string and therefore this macro is
|
||
* called bwt_B0 instead of bwt_B */
|
||
// 从构建完成的bwt(包含occ check point)获取k行(不含$,这里的k不输出bwt mtx的行,是bwt字符串的行)的碱基
|
||
#define bwt_B0(b, k) (bwt_bwt(b, k) >> ((~(k) & 0xf) << 1) & 3)
|
||
|
||
// 获取碱基c(待查找序列的首个碱基)和对应的互补碱基对应的行,以及间隔
|
||
#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)] + 1, (ik).x[2] = (bwt)->L2[(int)(c) + 1] - (bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3 - (c)] + 1, (ik).info = 0)
|
||
|
||
// k行对应的bwt str碱基,对应的check point bwt str数据起始地址
|
||
#define bwt_bwt(b, k) ((b)->bwt[(k) / OCC_INTERVAL * (OCC_INTERVAL / 16 + 4) + 4 + (k) % OCC_INTERVAL / 16])
|
||
// k行(bwt str行(不包含$))对应的check point occ数据起始地址(小于k且是OCC_INTERVAL的整数倍)
|
||
#define bwt_occ_intv(b, k) ((b)->bwt + (k) / OCC_INTERVAL * (OCC_INTERVAL / 16 + 4))
|
||
// 字节b中包含的T G C A(按顺序保存在32位整数里(每个占8bit))的数量
|
||
#define __occ_aux4(bwt, b) \
|
||
((bwt)->cnt_table[(b) & 0xff] + (bwt)->cnt_table[(b) >> 8 & 0xff] + (bwt)->cnt_table[(b) >> 16 & 0xff] + (bwt)->cnt_table[(b) >> 24])
|
||
|
||
// 原始fm-index (bwt)结构
|
||
struct bwt_t
|
||
{
|
||
bwtint_t primary; // S^{-1}(0), or the primary index of BWT
|
||
bwtint_t L2[5]; // C(), cumulative count
|
||
bwtint_t seq_len; // sequence length
|
||
bwtint_t bwt_size; // size of bwt, about seq_len/4
|
||
uint32_t *bwt; // BWT
|
||
// occurance array, separated to two parts
|
||
uint32_t cnt_table[256];
|
||
// suffix array
|
||
int sa_intv;
|
||
bwtint_t n_sa;
|
||
uint8_t *sa;
|
||
};
|
||
|
||
void bwt_set_sa(uint8_t *sa_arr, bwtint_t k, bwtint_t val);
|
||
bwtint_t bwt_get_sa(uint8_t *sa_arr, bwtint_t k);
|
||
|
||
// k行(包含,bwt mtx行)之前,碱基c的累计总数, interval大于等于32才能正确计算
|
||
bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, uint8_t c);
|
||
// 统计k行(bwt mtx行,包含k行本身)之前4种碱基累积数量,这里的k是bwt矩阵里的行,比bwt字符串多1
|
||
void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]);
|
||
// 解析两bit的bwt碱基序列
|
||
bwt_t *restore_bwt(const char *fn);
|
||
// 根据原始的字符串bwt创建interval-bwt
|
||
void create_interval_occ_bwt(bwt_t *bwt);
|
||
void dump_bwt(const char *fn, const bwt_t *bwt);
|
||
// 利用bwt搜索seed,完整搜索,只需要单向搜索
|
||
bwtintv_t bwt_search(bwt_t *bwt, const string &q);
|
||
// 每次扩展两步
|
||
bwtintv_t bwt_search2(bwt_t *bwt, const string &q);
|
||
|
||
void bwt_extend2(const bwt_t *bwt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, int c1, int c2);
|
||
|
||
void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back);
|
||
|
||
#endif
|