bwa_perf/bwt.h

72 lines
3.2 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#ifndef BWT_H_
#define BWT_H_
#include <stdint.h>
#include <string>
#include "util.h"
using std::string;
// occ间隔对应的右移位数5表示间隔32行保存一次
#define OCC_INTV_SHIFT 7
#define OCC_INTERVAL (1LL << OCC_INTV_SHIFT)
#define OCC_INTV_MASK (OCC_INTERVAL - 1)
// 从最原始的bwt碱基串里获取k行对应的碱基这里的bwt不包括occ check point数据
#define bwt_B00(b, k) ((b)->bwt[(k) >> 4] >> ((~(k) & 0xf) << 1) & 3)
/* retrieve a character from the $-removed BWT string. Note that
* bwt_t::bwt is not exactly the BWT string and therefore this macro is
* called bwt_B0 instead of bwt_B */
// 从构建完成的bwt包含occ check point获取k行不含$这里的k不输出bwt mtx的行是bwt字符串的行的碱基
#define bwt_B0(b, k) (bwt_bwt(b, k) >> ((~(k) & 0xf) << 1) & 3)
// 获取碱基c待查找序列的首个碱基和对应的互补碱基对应的行以及间隔
#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)] + 1, (ik).x[2] = (bwt)->L2[(int)(c) + 1] - (bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3 - (c)] + 1, (ik).info = 0)
// k行对应的bwt str碱基对应的check point bwt str数据起始地址
#define bwt_bwt(b, k) ((b)->bwt[(k) / OCC_INTERVAL * (OCC_INTERVAL / 16 + 4) + 4 + (k) % OCC_INTERVAL / 16])
// k行bwt str行不包含$对应的check point occ数据起始地址小于k且是OCC_INTERVAL的整数倍
#define bwt_occ_intv(b, k) ((b)->bwt + (k) / OCC_INTERVAL * (OCC_INTERVAL / 16 + 4))
// 字节b中包含的T G C A按顺序保存在32位整数里每个占8bit的数量
#define __occ_aux4(bwt, b) \
((bwt)->cnt_table[(b) & 0xff] + (bwt)->cnt_table[(b) >> 8 & 0xff] + (bwt)->cnt_table[(b) >> 16 & 0xff] + (bwt)->cnt_table[(b) >> 24])
// 原始fm-index (bwt)结构
struct bwt_t
{
bwtint_t primary; // S^{-1}(0), or the primary index of BWT
bwtint_t L2[5]; // C(), cumulative count
bwtint_t seq_len; // sequence length
bwtint_t bwt_size; // size of bwt, about seq_len/4
uint32_t *bwt; // BWT
// occurance array, separated to two parts
uint32_t cnt_table[256];
// suffix array
int sa_intv;
bwtint_t n_sa;
uint8_t *sa;
};
void bwt_set_sa(uint8_t *sa_arr, bwtint_t k, bwtint_t val);
bwtint_t bwt_get_sa(uint8_t *sa_arr, bwtint_t k);
// k行(包含bwt mtx行)之前碱基c的累计总数, interval大于等于32才能正确计算
bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, uint8_t c);
// 统计k行bwt mtx行包含k行本身之前4种碱基累积数量这里的k是bwt矩阵里的行比bwt字符串多1
void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]);
// 解析两bit的bwt碱基序列
bwt_t *restore_bwt(const char *fn);
// 根据原始的字符串bwt创建interval-bwt
void create_interval_occ_bwt(bwt_t *bwt);
void dump_bwt(const char *fn, const bwt_t *bwt);
// 利用bwt搜索seed完整搜索只需要单向搜索
bwtintv_t bwt_search(bwt_t *bwt, const string &q);
// 每次扩展两步
bwtintv_t bwt_search2(bwt_t *bwt, const string &q);
void bwt_extend2(const bwt_t *bwt, bwtintv_t *ik, bwtintv_t ok[4], int is_back, int c1, int c2);
void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back);
#endif