fast-bwa/fmt_idx.h

116 lines
4.9 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

/*
Description: 通过fmt-idx数据结构对seed过程进行加速 (fm-index twice extend in one search step)
Copyright : All right reserved by ICT
Author : Zhang Zhonghai
Date : 2023/12/24
*/
#ifndef FMT_INDEX_H_
#define FMT_INDEX_H_
#include <stdint.h>
#include <stddef.h>
#include "bwt.h"
#define FMT_OCC_INTV_SHIFT 8
#define FMT_OCC_INTERVAL (1 << FMT_OCC_INTV_SHIFT)
#define FMT_OCC_INTV_MASK (FMT_OCC_INTERVAL - 1)
#define FMT_MID_INTV_SHIFT 5
#define FMT_MID_INTERVAL (1 << FMT_MID_INTV_SHIFT)
#define FMT_MID_INTV_MASK (FMT_MID_INTERVAL - 1)
// #undef FMT_MID_INTERVAL
// 获取碱基c待查找序列的首个碱基和对应的互补碱基对应的行以及间隔
#define fmt_set_intv(fmt, c, ik) ((ik).x[0] = (fmt)->L2[(int)(c)] + 1, (ik).x[2] = (fmt)->L2[(int)(c) + 1] - (fmt)->L2[(int)(c)], (ik).x[1] = (fmt)->L2[3 - (c)] + 1, (ik).info = 0)
// k行bwt str行不包含$对应的check point occ数据起始地址小于k且是OCC_INTERVAL的整数倍
#if FMT_MID_INTERVAL == 8
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 144))
#elif FMT_MID_INTERVAL == 16
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 80))
#elif FMT_MID_INTERVAL == 32
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 48))
#elif FMT_MID_INTERVAL == 64
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 32))
#elif FMT_MID_INTERVAL == 128
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 24))
#else
#define fmt_occ_intv(b, k) ((b)->bwt + (k) / FMT_OCC_INTERVAL * (FMT_OCC_INTERVAL / 8 + 20))
#endif
// 字节val中包含bwt base为b的pre-bwt中T G C A按顺序保存在32位整数里每个占8bit的数量下边应该是计算扩展的两个碱基的occ和大于碱基的occ
#define __fmt_occ_e2_aux2(fmt, b, val) \
((fmt)->cnt_occ[(b)][(val) & 0xff] + (fmt)->cnt_occ[b][(val) >> 8 & 0xff] + (fmt)->cnt_occ[b][(val) >> 16 & 0xff] + (fmt)->cnt_occ[b][(val) >> 24])
#define __fmt_mid_sum(x) \
((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff)
// sa存储的行间隔
#define SA_INTV 2
#define KMER_LEN 12
#define KMER_ARR_SIZE ((1 << (KMER_LEN << 1)))
// 用来保存kmer对应的fmt的位置信息
typedef struct
{
// 40+40+32 14个byte这样好处理
uint8_t intv_arr[14 * KMER_LEN]; // 保存kmer中每扩展一个碱基对应的bwtintv_t数据
} KmerEntry;
// fm-index, extend twice in one search step (one memory access)
typedef struct
{
bwtint_t primary; // S^{-1}(0), or the primary index of BWT
bwtint_t sec_primary; // second primary line
bwtint_t L2[5]; // C(), cumulative count
bwtint_t seq_len; // sequence length
bwtint_t bwt_size; // size of bwt, about seq_len/4
uint32_t *bwt; // BWT
// occurance array, separated to two parts
uint32_t cnt_occ[16][256]; // 前16-24位表示b碱基的occ8-16位表示大于b的occ0-8表示大于a的occba格式
uint8_t sec_bcp; // base couple for sec primary line, AA=>0, AC=>1 ... TT=>15
uint8_t first_base; // 序列的第一个碱基2bit的int类型0,1,2,3
uint8_t last_base; // dollar转换成的base
// 保存kmer对应的fmt位置信息
KmerEntry *kmer_entry;
// suffix array
int sa_intv;
bwtint_t n_sa;
uint8_t *sa;
} FMTIndex;
// 将fmt结构数据写入到二进制文件
void dump_fmt(const char *fn, const FMTIndex *fmt);
// 从文件中读取fmt结构数据
FMTIndex *fmt_restore_fmt(const char *fn);
// 将kmer hash数据写入到文件
void dump_kmer_idx(const char *fn, const KmerEntry *kmer_entry);
// 从文件中读取kmer hash信息
KmerEntry *fmt_restore_kmer_idx(const char *fn);
// 读取sa数据
void fmt_restore_sa(const char *fn, FMTIndex *fmt);
// 根据interval-bwt创建fmt-index
FMTIndex *create_fmt_from_bwt(bwt_t *bwt);
// 扩展两个个碱基计算bwt base为b的pre-bwt str中各个碱基的occ
void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[4]);
// 扩展两个碱基
void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwtintv_t *ok2, int is_back, int b1, int b2);
// 扩展一个碱基
void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1);
// 查找并保存kmer中每扩展一个碱基对应的fmt位置信息
void fmt_search_store_kmer(FMTIndex *fmt, const char *q, int qlen, KmerEntry *ke);
// 生成所有KMER_LEN长度的序列字符串表示
void gen_all_seq(char **seq_arr, int kmer_len);
// 设置kmer第pos个碱基对应的fmt匹配信息
void kmer_setval_at(KmerEntry *ke, bwtintv_t ik, int pos);
// 获取kmer的fmt匹配信息
void kmer_getval_at(KmerEntry *ke, bwtintv_t *ok, int pos);
// 找smemseed
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
#endif