fast-bwa/fmt_idx.h

142 lines
5.6 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

/*
Description: Accelerates the seeding process with the fmt-idx data structure (FM-index extended twice in one search step)
Copyright : All rights reserved by ICT
Author : Zhang Zhonghai
Date : 2023/12/24
*/
#ifndef FMT_INDEX_H_
#define FMT_INDEX_H_
#include <stdint.h>
#include <stddef.h>
#include "bwt.h"
// Occ interval: log2 of the interval, the interval itself (1 << 8 = 256), and the mask
// that extracts an offset inside one interval.
// NOTE(review): presumably the spacing, in BWT rows, between occ check points — confirm.
#define FMT_OCC_INTV_SHIFT 8
#define FMT_OCC_INTERVAL (1 << FMT_OCC_INTV_SHIFT)
#define FMT_OCC_INTV_MASK (FMT_OCC_INTERVAL - 1)
// Mid interval: the same shift / size / mask triple with a size of 1 << 6 = 64.
// FMT_MID_INTERVAL is tested by the #if chain that selects fmt_occ_intv, so it has to
// remain a preprocessor macro.
#define FMT_MID_INTV_SHIFT 6
#define FMT_MID_INTERVAL (1 << FMT_MID_INTV_SHIFT)
#define FMT_MID_INTV_MASK (FMT_MID_INTERVAL - 1)
// #undef FMT_MID_INTERVAL
// Initialise the interval ik for a single base c (the first base of the query):
//   x[0] = L2[c] + 1            first row for base c
//   x[1] = L2[3 - c] + 1        first row for the complementary base
//   x[2] = L2[c + 1] - L2[c]    size of the interval
//   info = 0
// fmt, c and ik are each evaluated more than once, so they must be free of side effects.
#define fmt_set_intv(fmt, c, ik)                                  \
    ((ik).x[0] = (fmt)->L2[(int)(c)] + 1,                         \
     (ik).x[1] = (fmt)->L2[3 - (c)] + 1,                          \
     (ik).x[2] = (fmt)->L2[(int)(c) + 1] - (fmt)->L2[(int)(c)],   \
     (ik).info = 0)
// Start address of the check-point occ data that covers row k of the bwt string
// (k must not count the $ row). The block index is k / FMT_OCC_INTERVAL, i.e. the
// block begins at the largest multiple of FMT_OCC_INTERVAL that does not exceed k.
// Every block occupies FMT_OCC_INTERVAL / 8 + FMT_OCC_EXTRA_WORDS elements of (b)->bwt.
// FMT_OCC_EXTRA_WORDS is selected by FMT_MID_INTERVAL; any other (or undefined) value
// falls back to 20.
// NOTE(review): the extra words presumably hold the occ / mid check-point counters —
// confirm against the code that builds the index.
#if FMT_MID_INTERVAL == 8
#define FMT_OCC_EXTRA_WORDS 144
#elif FMT_MID_INTERVAL == 16
#define FMT_OCC_EXTRA_WORDS 80
#elif FMT_MID_INTERVAL == 32
#define FMT_OCC_EXTRA_WORDS 48
#elif FMT_MID_INTERVAL == 64
#define FMT_OCC_EXTRA_WORDS 32
#elif FMT_MID_INTERVAL == 128
#define FMT_OCC_EXTRA_WORDS 24
#else
#define FMT_OCC_EXTRA_WORDS 20
#endif
#define fmt_occ_intv(b, k) \
    ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + FMT_OCC_EXTRA_WORDS))
// Sum the four cnt_occ[b] table entries selected by the four bytes of the 32-bit word val.
// Per the original note, val holds bwt / pre-bwt bases and each table entry packs several
// 8-bit counts (occ of the extended bases and occ of the greater bases) into one 32-bit
// integer, so a single addition accumulates all packed counts at once.
// NOTE(review): the exact packing is defined by the code that fills cnt_occ — confirm there.
// Every byte, including the top one, is masked with 0xff so that a val that is negative or
// wider than 32 bits can never index past the 256-entry row.
// b and val are evaluated four times each and must be free of side effects.
#define __fmt_occ_e2_aux2(fmt, b, val) \
    ((fmt)->cnt_occ[(b)][(val) & 0xff] + (fmt)->cnt_occ[(b)][(val) >> 8 & 0xff] + (fmt)->cnt_occ[(b)][(val) >> 16 & 0xff] + (fmt)->cnt_occ[(b)][(val) >> 24 & 0xff])
// Sum of the four bytes of the 32-bit value x.
// The whole expansion is parenthesised so the macro can be used as an operand of a larger
// expression (for example 2 * __fmt_mid_sum(x) or a - __fmt_mid_sum(x)).
// x is evaluated four times and must be free of side effects.
#define __fmt_mid_sum(x) \
    (((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff))
// Row interval at which suffix-array (sa) entries are stored
#define SA_INTV 2
// NOTE(review): presumably the longest k-mer length covered by KmerHash (ke10 .. ke14) — confirm
#define HASH_KMER_LEN 14
// NOTE(review): presumably the k-mer length used for the kmer_bit table — confirm
#define BIT_KMER_LEN 17
// Holds the fmt position information that belongs to one k-mer.
typedef struct {
    // Packed bwtintv_t data for a one-base extension of the k-mer.
    // 40 + 40 + 32 bits = 14 bytes, a size that is easy to handle.
    uint8_t intv_arr[(40 + 40 + 32) / 8];
} KmerEntry;
// Holds the bwt match information for every base of a k-mer of length 10.
typedef struct {
    // 10 bases x 14 bytes of packed interval data per base = 140 bytes.
    uint8_t intv_arr[10 * 14];
} KmerEntryArr;
// Holds the bwt match information for the individual k-mer positions.
// NOTE(review): the numeric suffix of each member presumably is the k-mer length the
// table is indexed by (10 .. 14) — confirm against the code that fills the tables.
typedef struct {
    KmerEntryArr *ke10; // entries that carry all 10 per-base intervals
    KmerEntry *ke11;    // one interval per entry
    KmerEntry *ke12;    // one interval per entry
    KmerEntry *ke13;    // one interval per entry
    KmerEntry *ke14;    // one interval per entry
} KmerHash;
// FM-index that extends twice in one search step (one memory access).
typedef struct {
    bwtint_t primary;     // S^{-1}(0), or the primary index of BWT
    bwtint_t sec_primary; // second primary line
    bwtint_t L2[5];       // C(), cumulative count
    bwtint_t seq_len;     // sequence length
    bwtint_t bwt_size;    // size of bwt, about seq_len/4
    uint32_t *bwt;        // BWT

    // Occurrence array, separated into two parts.
    // Per the original note each entry packs 8-bit counts in "ba" format: bits 16-24 hold
    // the occ of base b, bits 8-16 the occ of bases greater than b, bits 0-8 the occ of
    // bases greater than a.
    // NOTE(review): translated from a terse note — confirm the exact bit layout.
    uint32_t cnt_occ[16][256];
    uint8_t sec_bcp;      // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15
    uint8_t first_base;   // first base of the sequence, a 2-bit value (0, 1, 2, 3)
    uint8_t last_base;    // the base that the dollar sign ($) is converted to

    // Reference pac data.
    bwtint_t l_pac;       // length of the reference sequence
    uint8_t *pac;         // reference sequence in 2-bit encoding

    // fmt position information for k-mers.
    KmerHash kmer_hash;

    // kmer_bit: the original author notes that its benefit seems to be modest.
    uint16_t *kmer_bit;   // used to test whether a sequence of a given length has an fm-index match
    int bit_kmer_len;

    // Suffix array.
    int sa_intv;
    bwtint_t n_sa;
    uint8_t *sa;
} FMTIndex;
// Write the data of the fmt structure to a binary file.
// NOTE(review): fn is presumably the output file path — confirm in the implementation.
void dump_fmt(const char *fn, const FMTIndex *fmt);
// Read the data of the fmt structure from a file.
// NOTE(review): ownership of the returned FMTIndex and the failure behaviour are not
// visible in this header — confirm in the implementation.
FMTIndex *fmt_restore_fmt(const char *fn);
// Write the kmer hash data to a file.
void fmt_dump_kmer_idx(const char *fn, const KmerHash *kh);
// Read the kmer hash information from a file; the KmerHash is returned by value.
KmerHash fmt_restore_kmer_idx(const char *fn);
// Read the kmer bit data.
// NOTE(review): kmer_len presumably selects the k-mer length of the table (see
// BIT_KMER_LEN) and the caller presumably owns the returned buffer — confirm.
uint16_t *fmt_restore_kmer_bit(const char *fn, int kmer_len);
// Read the sa (suffix array) data.
// NOTE(review): presumably fills the sa-related members of fmt — confirm in the implementation.
void fmt_restore_sa(const char *fn, FMTIndex *fmt);
// Create the fmt-index from an interval-bwt.
FMTIndex *create_fmt_from_bwt(bwt_t *bwt);
// Two-base extension: compute the occ of each base in the pre-bwt string whose bwt base is b.
// NOTE(review): cnt presumably receives the four results and k is presumably the bwt row —
// confirm in the implementation.
void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4]);
// Extend by two bases.
// NOTE(review): ik is presumably the input interval, ok1 / ok2 the intervals after the
// first and second extension, and is_back the extension direction — confirm.
void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwtintv_t *ok2, int is_back, int b1, int b2);
// Extend by one base.
// NOTE(review): ik is presumably the input interval, ok the extended interval, and
// is_back the extension direction — confirm.
void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1);
// Generate the string representation of all sequences of length kmer_len.
// NOTE(review): seq_arr presumably must be pre-allocated by the caller — confirm.
void gen_all_seq(char **seq_arr, int kmer_len);
// Set the fmt match information that corresponds to the pos-th base of a kmer.
void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos);
// Get the fmt match information of a kmer.
// NOTE(review): presumably the read counterpart of kmer_setval_at, writing the result to ok — confirm.
void kmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos);
// NOTE(review): presumably looks up the fmt match information of the k-mer encoded in
// qbit at position pos and stores it in ok — confirm in the implementation.
void fmt_kmer_get(const FMTIndex *fmt, bwtintv_t *ok, uint32_t qbit, int pos);
// Find SMEM seeds.
// NOTE(review): the parameter list mirrors a SMEM search over query q of length len
// starting at position x — confirm parameter and return-value semantics in the implementation.
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
// NOTE(review): undocumented in the original; presumably a seeding strategy over query q
// that writes one interval to mem — confirm parameter and return-value semantics.
int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);
// NOTE(review): undocumented in the original; presumably returns the suffix-array value
// for row k — confirm in the implementation.
bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k);
#endif