#pragma once #include #include #include #include #include "debug.h" #include "kvec.h" #include "utils.h" /////////////////////////////////////////// // 宏定义 // 数据 #define HYB_KMER_LEN 16 #define HYB_KMER_MASK 4294967295ULL #define HYB_KMER_DATA_BYTES 5 // 5bytes #define HYB_KMER_DATA_MASK 1099511627775ULL // 5bytes #define HYB_KMER_DATA_TYPE_BITS 5 #define HYB_KMER_DATA_TYPE_MASK 31 #define HYB_HIT_THRESH 30 #define HYB_NODE_SA_MASK 1099511627775ULL #define HYB_MAX_SEQ_LEN 301 #define HYB_LEAF_NODE 7 // 节点类型 #define HYB_BP_1 0 #define HYB_BP_2 1 #define HYB_BP_3 2 #define HYB_BP_PATH 3 static const uint32_t ga_hybHitsMask[5] = {0, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, 0xFFFFFFFF}; // hits掩码 static const uint32_t ga_hybOffMask[5] = {0, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, 0xFFFFFFFF}; // offset掩码 static const uint32_t ga_hybLearnedMask[5] = {0, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, 0xFFFFFFFF}; // learned counts 掩码 // 函数 #define _kmer_from_pos(bit_seq, start) (((*(uint64_t*)&bit_seq[start >> 2]) >> ((start & 3) << 1)) & HYB_KMER_MASK) #define _reverse_range(rs, re, seq_len, new_range) Range new_range = {seq_len - (re), seq_len - (rs), (re) - (rs)}; #define _rev_ref(hyb, ref_pos) ((hyb->ref_len << 1) - 1 - (ref_pos)) /////////////////////////////////////// // 结构体 // hybrid-index数据结构 typedef struct { uint8_t* kmer_data; // kmer数据 uint8_t* index_data; // 二级索引数据 uint8_t* sa; // 后缀数组 uint8_t* ref_bits; // 2-bit编码的ref序列 uint64_t ref_len; // ref长度 } HybridIndex; // Read序列需要的数据 typedef struct { int len; // read长度 uint8_t* seq; // 每个碱基占一个字节(0-A, 1-C, 2-G, 3-G, 4-N) uint8_t* rseq; // 反向互补序列 uint8_t* for_bits; // 正向序列 2-bit编码 uint8_t* back_bits; // 反向互补序列 2-bit编码 int id; // for test; char* seqstr; } ReadSeq; typedef kvec_t(ReadSeq) ReadSeqArr; // seeding过程生成的SMEM typedef struct { int first_len; int seed_start; // 左闭右开区间 int seed_end; int read_start; // 左闭右开区间, 默认是[0, read_len), 如果有N, 则是[N_pos + 1, next_N_pos) int read_end; uint64_v ref_pos_arr; } HybSeed; // smem数组 typedef kvec_t(HybSeed) HybSeedArr; // read区间,用来处理包含N的read // 左闭右开区间, 默认是[0, read_len), 如果有N, 则是[N_pos + 1, next_N_pos) typedef struct { int start; int end; int len; } Range; #define _set_range(r, s, e) {(r).start = s; (r).end = e; (r).len = (r).end - (r).start;} #define _range_equal(r1, r2) ((r1).start == (r2).start && (r1).end == (r2).end) #define _range_cross(r1, r2) \ (((r1).start < (r2).start && (r1).end < (r2).end) || ((r1).start > (r2).start && (r1).end > (r2).end)) // read区间数组 typedef kvec_t(Range) RangeArr; #define __check_add_seed(seed) \ HybSeed* seed = kv_pushp(HybSeed, *seeds); \ if (seeds->m > *seeds_cap) { \ uint64_t i = *seeds_cap; \ for (i = *seeds_cap; i < seeds->m; ++i) kv_init(seeds->a[i].ref_pos_arr); \ *seeds_cap = seeds->m; \ } \ seed->ref_pos_arr.n = 0; #define __add_seed_one_pos(seed_var, ref_pos, s_start, s_end) \ __check_add_seed(seed_var); \ kv_push(uint64_t, seed_var->ref_pos_arr, ref_pos); \ seed_var->first_len = 0; \ seed_var->seed_start = s_start; \ seed_var->seed_end = s_end; \ seed_var->read_start = read_range->start; \ seed_var->read_end = read_range->end; #define __check_add_seed_one_pos(seed_var, ref_pos, s_start, s_end) \ if ((s_end) - (s_start) >= min_seed_len) { \ __add_seed_one_pos(seed_var, ref_pos, s_start, s_end); \ } #define __copy_seed(src, dst) \ do { \ (dst).seed_start = (src).seed_start; \ (dst).seed_end = (src).seed_end; \ (dst).read_start = (src).read_start; \ (dst).read_end = (src).read_end; \ int i; \ for (i = 0; i < (src).ref_pos_arr.n; ++i) kv_push(uint64_t, (dst).ref_pos_arr, (src).ref_pos_arr.a[i]); \ } while (0) /////////////////////////////////////////////////////// // functions // 加载hybrid index HybridIndex* load_hybrid_idx(const char* prefix); // 创建正向反向互补bits void create_seq_fb_bits(uint8_t* bs, int len, uint8_t* forward_bits, uint8_t* re); // 用于将seq和ref正向直接比较,返回匹配的长度 int forward_match_len(uint8_t* seq, int64_t seq_pos, int64_t seq_end, uint8_t* ref, int64_t ref_pos, int64_t ref_len); // 用于将seq和ref反向直接比较,返回匹配的长度 int backward_match_len(uint8_t* seq, int64_t seq_pos, int64_t seq_start, uint8_t* ref, int64_t ref_pos); // 根据sa的行获取对应的ref position(小端模式) uint64_t hyb_sa_to_ref_pos(uint8_t* sa_arr, uint64_t row); // 解析第一个节点, 返回后续对应的节点地址 uint8_t* parse_first_hyb_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, uint8_t* cmp_ref, int tid); // 解析后续的正常节点 uint8_t* parse_one_hyb_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, uint8_t* cmp_ref, int tid); void parse_one_hyb_node_min_hits(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int min_hits, int is_head, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, int tid); void parse_one_hyb_node_max_hits(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int max_hits, int min_bp, int is_head, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, int tid); void get_leaf_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p, uint32_t* hits_p, uint64_t* sa_start_p, uint8_t* cmp_ref, int tid); void get_kmer_data(const HybridIndex* hyb, uint8_t* seq_bits, int kmer_pos, uint8_t* type_hits, uint64_t* offset); void right_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits, int kmer_start, int init_match_len, uint64_t ref_pos, int* right_match); void left_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits, int kmer_start, int init_match_len, uint64_t ref_pos, int* left_match); void both_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits, int kmer_start, int init_match_len, uint64_t ref_pos, int* left_match, int* right_match); // 用hybrid-index来寻找smem(seeding-1 & seeding-2) void hyb_first_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const int min_seed_len, HybSeedArr* seeds, int tid); int hyb_second_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, int seed_start, int seed_end, int read_start, int read_end, uint64_t first_ref, int min_hits, int pre_pivot, int pre_start, int pre_end, const int min_seed_len, HybSeedArr* seeds, int tid); // for seeding-3 void hyb_third_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* seeds_range, const int min_seed_len, const int max_hits, HybSeedArr* seeds, int tid);