fast-bwa/fmt_idx.h

116 lines
4.9 KiB
C
Raw Normal View History

/*
Description: fmt-idxseed (fm-index twice extend in one search step)
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2023/12/24
*/
#ifndef FMT_INDEX_H_
#define FMT_INDEX_H_
#include <stdint.h>
#include <stddef.h>
#include "bwt.h"
#define FMT_OCC_INTV_SHIFT 8
#define FMT_OCC_INTERVAL (1 << FMT_OCC_INTV_SHIFT)
#define FMT_OCC_INTV_MASK (FMT_OCC_INTERVAL - 1)
#define FMT_MID_INTV_SHIFT 5
#define FMT_MID_INTERVAL (1 << FMT_MID_INTV_SHIFT)
#define FMT_MID_INTV_MASK (FMT_MID_INTERVAL - 1)
// #undef FMT_MID_INTERVAL
// 获取碱基c待查找序列的首个碱基和对应的互补碱基对应的行以及间隔
#define fmt_set_intv(fmt, c, ik) ((ik).x[0] = (fmt)->L2[(int)(c)] + 1, (ik).x[2] = (fmt)->L2[(int)(c) + 1] - (fmt)->L2[(int)(c)], (ik).x[1] = (fmt)->L2[3 - (c)] + 1, (ik).info = 0)
// k行bwt str行不包含$对应的check point occ数据起始地址小于k且是OCC_INTERVAL的整数倍
#if FMT_MID_INTERVAL == 8
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 144))
#elif FMT_MID_INTERVAL == 16
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 80))
#elif FMT_MID_INTERVAL == 32
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 48))
#elif FMT_MID_INTERVAL == 64
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 32))
#elif FMT_MID_INTERVAL == 128
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 24))
#else
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 20))
#endif
// 字节val中包含bwt base为b的pre-bwt中T G C A按顺序保存在32位整数里每个占8bit的数量下边应该是计算扩展的两个碱基的occ和大于碱基的occ
#define __fmt_occ_e2_aux2(fmt, b, val) \
((fmt)->cnt_occ[(b)][(val) & 0xff] + (fmt)->cnt_occ[b][(val) >> 8 & 0xff] + (fmt)->cnt_occ[b][(val) >> 16 & 0xff] + (fmt)->cnt_occ[b][(val) >> 24])
#define __fmt_mid_sum(x) \
((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff)
// sa存储的行间隔
#define SA_INTV 2
#define KMER_LEN 12
#define KMER_ARR_SIZE ((1 << (KMER_LEN << 1)))
// 用来保存kmer对应的fmt的位置信息
typedef struct
{
// 40+40+32 14个byte这样好处理
uint8_t intv_arr[14 * KMER_LEN]; // 保存kmer中每扩展一个碱基对应的bwtintv_t数据
} KmerEntry;
// fm-index, extend twice in one search step (one memory access)
typedef struct
{
bwtint_t primary; // S^{-1}(0), or the primary index of BWT
bwtint_t sec_primary; // second primary line
bwtint_t L2[5]; // C(), cumulative count
bwtint_t seq_len; // sequence length
bwtint_t bwt_size; // size of bwt, about seq_len/4
uint32_t *bwt; // BWT
// occurance array, separated to two parts
uint32_t cnt_occ[16][256]; // 前16-24位表示b碱基的occ8-16位表示大于b的occ0-8表示大于a的occba格式
uint8_t sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15
uint8_t first_base; // 序列的第一个碱基2bit的int类型0,1,2,3
uint8_t last_base; // dollar转换成的base
// 保存kmer对应的fmt位置信息
KmerEntry *kmer_entry;
// suffix array
int sa_intv;
bwtint_t n_sa;
uint8_t *sa;
} FMTIndex;
// 将fmt结构数据写入到二进制文件
void dump_fmt(const char *fn, const FMTIndex *fmt);
// 从文件中读取fmt结构数据
FMTIndex *fmt_restore_fmt(const char *fn);
// 将kmer hash数据写入到文件
void dump_kmer_idx(const char *fn, const KmerEntry *kmer_entry);
// 从文件中读取kmer hash信息
KmerEntry *fmt_restore_kmer_idx(const char *fn);
// 读取sa数据
void fmt_restore_sa(const char *fn, FMTIndex *fmt);
// 根据interval-bwt创建fmt-index
FMTIndex *create_fmt_from_bwt(bwt_t *bwt);
// 扩展两个个碱基计算bwt base为b的pre-bwt str中各个碱基的occ
void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4]);
// 扩展两个碱基
void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwtintv_t *ok2, int is_back, int b1, int b2);
// 扩展一个碱基
void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1);
// 查找并保存kmer中每扩展一个碱基对应的fmt位置信息
void fmt_search_store_kmer(FMTIndex *fmt, const char *q, int qlen, KmerEntry *ke);
// 生成所有KMER_LEN长度的序列字符串表示
void gen_all_seq(char **seq_arr, int kmer_len);
// 设置kmer第pos个碱基对应的fmt匹配信息
void kmer_setval_at(KmerEntry *ke, bwtintv_t ik, int pos);
// 获取kmer的fmt匹配信息
void kmer_getval_at(KmerEntry *ke, bwtintv_t *ok, int pos);
// 找smemseed
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
#endif