bwa_perf/bwt.h

68 lines
3.2 KiB
C
Raw Normal View History

#ifndef BWT_H_
#define BWT_H_
#include <stdint.h>
#include <string>
#include "util.h"
using std::string;
// occ间隔对应的右移位数5表示间隔32行保存一次
#define OCC_INTV_SHIFT 7
#define OCC_INTERVAL (1LL << OCC_INTV_SHIFT)
#define OCC_INTV_MASK (OCC_INTERVAL - 1)
// 从最原始的bwt碱基串里获取k行对应的碱基这里的bwt不包括occ check point数据
#define bwt_B00(b, k) ((b)->bwt[(k) >> 4] >> ((~(k) & 0xf) << 1) & 3)
/* retrieve a character from the $-removed BWT string. Note that
* bwt_t::bwt is not exactly the BWT string and therefore this macro is
* called bwt_B0 instead of bwt_B */
// 从构建完成的bwt包含occ check point获取k行不含$这里的k不输出bwt mtx的行是bwt字符串的行的碱基
#define bwt_B0(b, k) (bwt_bwt(b, k) >> ((~(k) & 0xf) << 1) & 3)
// 获取碱基c待查找序列的首个碱基和对应的互补碱基对应的行以及间隔
#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)] + 1, (ik).x[2] = (bwt)->L2[(int)(c) + 1] - (bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3 - (c)] + 1, (ik).info = 0)
// The following two lines are ONLY correct when OCC_INTERVAL==0x80
// #define bwt_bwt(b, k) ((b)->bwt[((k) >> 7 << 4) + sizeof(bwtint_t) + (((k) & 0x7f) >> 4)])
// #define bwt_occ_intv(b, k) ((b)->bwt + ((k) >> 7 << 4))
///* For general OCC_INTERVAL, the following is correct:
// k行对应的bwt str碱基对应的check point bwt str数据起始地址
#define bwt_bwt(b, k) ((b)->bwt[(k) / OCC_INTERVAL * (OCC_INTERVAL / 16 + 4) + 4 + (k) % OCC_INTERVAL / 16])
// k行bwt str行不包含$对应的check point occ数据起始地址小于k且是OCC_INTERVAL的整数倍
#define bwt_occ_intv(b, k) ((b)->bwt + (k) / OCC_INTERVAL * (OCC_INTERVAL / 16 + 4))
// 字节b中包含的T G C A按顺序保存在32位整数里每个占8bit的数量
#define __occ_aux4(bwt, b) \
((bwt)->cnt_table[(b) & 0xff] + (bwt)->cnt_table[(b) >> 8 & 0xff] + (bwt)->cnt_table[(b) >> 16 & 0xff] + (bwt)->cnt_table[(b) >> 24])
// 原始fm-index (bwt)结构
struct bwt_t
{
bwtint_t primary; // S^{-1}(0), or the primary index of BWT
bwtint_t L2[5]; // C(), cumulative count
bwtint_t seq_len; // sequence length
bwtint_t bwt_size; // size of bwt, about seq_len/4
uint32_t *bwt; // BWT
// occurance array, separated to two parts
uint32_t cnt_table[256];
// suffix array
int sa_intv;
bwtint_t n_sa;
uint8_t *sa;
};
// k行(包含bwt mtx行)之前碱基c的累计总数, interval大于等于32才能正确计算
bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, uint8_t c);
// 统计k行bwt mtx行包含k行本身之前4种碱基累积数量这里的k是bwt矩阵里的行比bwt字符串多1
void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]);
// 解析两bit的bwt碱基序列
bwt_t *restore_bwt(const char *fn);
// 根据原始的字符串bwt创建interval-bwt
void create_interval_occ_bwt(bwt_t *bwt);
void dump_bwt(const char *fn, const bwt_t *bwt);
// 利用bwt搜索seed完整搜索只需要单向搜索
void bwt_search(bwt_t *bwt, const string &q);
// 每次扩展两步
bwtintv_t bwt_search2(bwt_t *bwt, const string &q);
#endif