hyb-align/hyb_idx.h

182 lines
8.7 KiB
C
Raw Permalink Normal View History

#pragma once
#include <inttypes.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include "debug.h"
#include "kvec.h"
#include "utils.h"
///////////////////////////////////////////
// 宏定义
// 数据
#define HYB_KMER_LEN 16
#define HYB_KMER_MASK 4294967295ULL
#define HYB_KMER_DATA_BYTES 5 // 5bytes
#define HYB_KMER_DATA_MASK 1099511627775ULL // 5bytes
#define HYB_KMER_DATA_TYPE_BITS 5
#define HYB_KMER_DATA_TYPE_MASK 31
#define HYB_HIT_THRESH 30
#define HYB_NODE_SA_MASK 1099511627775ULL
#define HYB_MAX_SEQ_LEN 301
#define HYB_LEAF_NODE 7
// 节点类型
#define HYB_BP_1 0
#define HYB_BP_2 1
#define HYB_BP_3 2
#define HYB_BP_PATH 3
static const uint32_t ga_hybHitsMask[5] = {0, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, 0xFFFFFFFF}; // hits掩码
static const uint32_t ga_hybOffMask[5] = {0, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, 0xFFFFFFFF}; // offset掩码
static const uint32_t ga_hybLearnedMask[5] = {0, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, 0xFFFFFFFF}; // learned counts 掩码
// 函数
#define _kmer_from_pos(bit_seq, start) (((*(uint64_t*)&bit_seq[start >> 2]) >> ((start & 3) << 1)) & HYB_KMER_MASK)
#define _reverse_range(rs, re, seq_len, new_range) Range new_range = {seq_len - (re), seq_len - (rs), (re) - (rs)};
#define _rev_ref(hyb, ref_pos) ((hyb->ref_len << 1) - 1 - (ref_pos))
///////////////////////////////////////
// 结构体
// hybrid-index数据结构
typedef struct {
uint8_t* kmer_data; // kmer数据
uint8_t* index_data; // 二级索引数据
uint8_t* sa; // 后缀数组
uint8_t* ref_bits; // 2-bit编码的ref序列
uint64_t ref_len; // ref长度
} HybridIndex;
// Read序列需要的数据
typedef struct {
int len; // read长度
uint8_t* seq; // 每个碱基占一个字节0-A, 1-C, 2-G, 3-G, 4-N
uint8_t* rseq; // 反向互补序列
uint8_t* for_bits; // 正向序列 2-bit编码
uint8_t* back_bits; // 反向互补序列 2-bit编码
int id; // for test;
2025-11-16 01:37:21 +08:00
// char* seqstr;
} ReadSeq;
typedef kvec_t(ReadSeq) ReadSeqArr;
// seeding过程生成的SMEM
typedef struct {
int first_len; // 记录第一次匹配时的长度应该是用来减少seeding-3的计算的
int seed_start; // 左闭右开区间,种子开始位置,对应在序列上的位置
int seed_end;
int read_start; // 左闭右开区间, 默认是[0, read_len), 如果有N, 则是[N_pos + 1, next_N_pos)
int read_end;
uint64_v ref_pos_arr;
} HybSeed;
// smem数组
typedef kvec_t(HybSeed) HybSeedArr;
// read区间用来处理包含N的read
// 左闭右开区间, 默认是[0, read_len), 如果有N, 则是[N_pos + 1, next_N_pos)
typedef struct {
int start;
int end;
int len;
} Range;
#define _set_range(r, s, e) {(r).start = s; (r).end = e; (r).len = (r).end - (r).start;}
#define _range_equal(r1, r2) ((r1).start == (r2).start && (r1).end == (r2).end)
#define _range_cross(r1, r2) \
(((r1).start < (r2).start && (r1).end < (r2).end) || ((r1).start > (r2).start && (r1).end > (r2).end))
// read区间数组
typedef kvec_t(Range) RangeArr;
#define __check_add_seed(seed) \
HybSeed* seed = kv_pushp(HybSeed, *seeds); \
if (seeds->m > *seeds_cap) { \
uint64_t i = *seeds_cap; \
for (i = *seeds_cap; i < seeds->m; ++i) kv_init(seeds->a[i].ref_pos_arr); \
*seeds_cap = seeds->m; \
} \
seed->ref_pos_arr.n = 0;
#define __add_seed_one_pos(seed_var, ref_pos, s_start, s_end) \
__check_add_seed(seed_var); \
kv_push(uint64_t, seed_var->ref_pos_arr, ref_pos); \
seed_var->first_len = 0; \
seed_var->seed_start = s_start; \
seed_var->seed_end = s_end; \
seed_var->read_start = read_range->start; \
seed_var->read_end = read_range->end;
#define __check_add_seed_one_pos(seed_var, ref_pos, s_start, s_end) \
if ((s_end) - (s_start) >= min_seed_len) { \
__add_seed_one_pos(seed_var, ref_pos, s_start, s_end); \
}
#define __copy_seed(src, dst) \
do { \
(dst).seed_start = (src).seed_start; \
(dst).seed_end = (src).seed_end; \
(dst).read_start = (src).read_start; \
(dst).read_end = (src).read_end; \
int i; \
for (i = 0; i < (src).ref_pos_arr.n; ++i) kv_push(uint64_t, (dst).ref_pos_arr, (src).ref_pos_arr.a[i]); \
} while (0)
///////////////////////////////////////////////////////
// functions
// 加载hybrid index
// HybridIndex* load_hybrid_idx(const char* prefix);
// 创建正向反向互补bits
void create_seq_fb_bits(uint8_t* bs, int len, uint8_t* forward_bits, uint8_t* re);
// 用于将seq和ref正向直接比较返回匹配的长度
int forward_match_len(uint8_t* seq, int64_t seq_pos, int64_t seq_end, uint8_t* ref, int64_t ref_pos, int64_t ref_len);
// 用于将seq和ref反向直接比较返回匹配的长度
int backward_match_len(uint8_t* seq, int64_t seq_pos, int64_t seq_start, uint8_t* ref, int64_t ref_pos);
// 根据sa的行获取对应的ref position小端模式
uint64_t hyb_sa_to_ref_pos(uint8_t* sa_arr, uint64_t row);
// 解析第一个节点, 返回后续对应的节点地址
uint8_t* parse_first_hyb_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p,
uint64_t* sa_start_p, uint32_t* hits_p, uint8_t* cmp_ref, int tid);
// 解析后续的正常节点
uint8_t* parse_one_hyb_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p,
uint64_t* sa_start_p, uint32_t* hits_p, uint8_t* cmp_ref, int tid);
void parse_one_hyb_node_min_hits(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int min_hits,
int is_head, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, int tid);
void parse_one_hyb_node_max_hits(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int max_hits, int min_bp,
int is_head, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, int tid);
void get_leaf_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p, uint32_t* hits_p,
uint64_t* sa_start_p, uint8_t* cmp_ref, int tid);
void get_kmer_data(const HybridIndex* hyb, uint8_t* seq_bits, int kmer_pos, uint8_t* type_hits, uint64_t* offset);
void right_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits,
int kmer_start, int init_match_len, uint64_t ref_pos, int* right_match);
void left_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits,
int kmer_start, int init_match_len, uint64_t ref_pos, int* left_match);
void both_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits,
int kmer_start, int init_match_len, uint64_t ref_pos, int* left_match, int* right_match);
// 用hybrid-index来寻找smem(seeding-1 & seeding-2)
void hyb_first_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const int min_seed_len,
HybSeedArr* seeds, int tid);
int hyb_second_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, int seed_start, int seed_end, int read_start,
int read_end, uint64_t first_ref, int min_hits, int pre_pivot, int pre_start, int pre_end,
const int min_seed_len, HybSeedArr* seeds, int tid);
// for seeding-3
void hyb_third_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* seeds_range,
const int min_seed_len, const int max_hits, HybSeedArr* seeds, int tid);