hyb-align/hyb_idx.h

182 lines
8.7 KiB
C
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#pragma once
#include <inttypes.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include "debug.h"
#include "kvec.h"
#include "utils.h"
///////////////////////////////////////////
// 宏定义
// 数据
#define HYB_KMER_LEN 16
#define HYB_KMER_MASK 4294967295ULL
#define HYB_KMER_DATA_BYTES 5 // 5bytes
#define HYB_KMER_DATA_MASK 1099511627775ULL // 5bytes
#define HYB_KMER_DATA_TYPE_BITS 5
#define HYB_KMER_DATA_TYPE_MASK 31
#define HYB_HIT_THRESH 30
#define HYB_NODE_SA_MASK 1099511627775ULL
#define HYB_MAX_SEQ_LEN 301
#define HYB_LEAF_NODE 7
// 节点类型
#define HYB_BP_1 0
#define HYB_BP_2 1
#define HYB_BP_3 2
#define HYB_BP_PATH 3
static const uint32_t ga_hybHitsMask[5] = {0, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, 0xFFFFFFFF}; // hits掩码
static const uint32_t ga_hybOffMask[5] = {0, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, 0xFFFFFFFF}; // offset掩码
static const uint32_t ga_hybLearnedMask[5] = {0, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, 0xFFFFFFFF}; // learned counts 掩码
// 函数
#define _kmer_from_pos(bit_seq, start) (((*(uint64_t*)&bit_seq[start >> 2]) >> ((start & 3) << 1)) & HYB_KMER_MASK)
#define _reverse_range(rs, re, seq_len, new_range) Range new_range = {seq_len - (re), seq_len - (rs), (re) - (rs)};
#define _rev_ref(hyb, ref_pos) ((hyb->ref_len << 1) - 1 - (ref_pos))
///////////////////////////////////////
// 结构体
// hybrid-index数据结构
typedef struct {
uint8_t* kmer_data; // kmer数据
uint8_t* index_data; // 二级索引数据
uint8_t* sa; // 后缀数组
uint8_t* ref_bits; // 2-bit编码的ref序列
uint64_t ref_len; // ref长度
} HybridIndex;
// Read序列需要的数据
typedef struct {
int len; // read长度
uint8_t* seq; // 每个碱基占一个字节0-A, 1-C, 2-G, 3-G, 4-N
uint8_t* rseq; // 反向互补序列
uint8_t* for_bits; // 正向序列 2-bit编码
uint8_t* back_bits; // 反向互补序列 2-bit编码
int id; // for test;
// char* seqstr;
} ReadSeq;
typedef kvec_t(ReadSeq) ReadSeqArr;
// seeding过程生成的SMEM
typedef struct {
int first_len; // 记录第一次匹配时的长度应该是用来减少seeding-3的计算的
int seed_start; // 左闭右开区间,种子开始位置,对应在序列上的位置
int seed_end;
int read_start; // 左闭右开区间, 默认是[0, read_len), 如果有N, 则是[N_pos + 1, next_N_pos)
int read_end;
uint64_v ref_pos_arr;
} HybSeed;
// smem数组
typedef kvec_t(HybSeed) HybSeedArr;
// read区间用来处理包含N的read
// 左闭右开区间, 默认是[0, read_len), 如果有N, 则是[N_pos + 1, next_N_pos)
typedef struct {
int start;
int end;
int len;
} Range;
#define _set_range(r, s, e) {(r).start = s; (r).end = e; (r).len = (r).end - (r).start;}
#define _range_equal(r1, r2) ((r1).start == (r2).start && (r1).end == (r2).end)
#define _range_cross(r1, r2) \
(((r1).start < (r2).start && (r1).end < (r2).end) || ((r1).start > (r2).start && (r1).end > (r2).end))
// read区间数组
typedef kvec_t(Range) RangeArr;
#define __check_add_seed(seed) \
HybSeed* seed = kv_pushp(HybSeed, *seeds); \
if (seeds->m > *seeds_cap) { \
uint64_t i = *seeds_cap; \
for (i = *seeds_cap; i < seeds->m; ++i) kv_init(seeds->a[i].ref_pos_arr); \
*seeds_cap = seeds->m; \
} \
seed->ref_pos_arr.n = 0;
#define __add_seed_one_pos(seed_var, ref_pos, s_start, s_end) \
__check_add_seed(seed_var); \
kv_push(uint64_t, seed_var->ref_pos_arr, ref_pos); \
seed_var->first_len = 0; \
seed_var->seed_start = s_start; \
seed_var->seed_end = s_end; \
seed_var->read_start = read_range->start; \
seed_var->read_end = read_range->end;
#define __check_add_seed_one_pos(seed_var, ref_pos, s_start, s_end) \
if ((s_end) - (s_start) >= min_seed_len) { \
__add_seed_one_pos(seed_var, ref_pos, s_start, s_end); \
}
#define __copy_seed(src, dst) \
do { \
(dst).seed_start = (src).seed_start; \
(dst).seed_end = (src).seed_end; \
(dst).read_start = (src).read_start; \
(dst).read_end = (src).read_end; \
int i; \
for (i = 0; i < (src).ref_pos_arr.n; ++i) kv_push(uint64_t, (dst).ref_pos_arr, (src).ref_pos_arr.a[i]); \
} while (0)
///////////////////////////////////////////////////////
// functions
// 加载hybrid index
// HybridIndex* load_hybrid_idx(const char* prefix);
// 创建正向反向互补bits
void create_seq_fb_bits(uint8_t* bs, int len, uint8_t* forward_bits, uint8_t* re);
// 用于将seq和ref正向直接比较返回匹配的长度
int forward_match_len(uint8_t* seq, int64_t seq_pos, int64_t seq_end, uint8_t* ref, int64_t ref_pos, int64_t ref_len);
// 用于将seq和ref反向直接比较返回匹配的长度
int backward_match_len(uint8_t* seq, int64_t seq_pos, int64_t seq_start, uint8_t* ref, int64_t ref_pos);
// 根据sa的行获取对应的ref position小端模式
uint64_t hyb_sa_to_ref_pos(uint8_t* sa_arr, uint64_t row);
// 解析第一个节点, 返回后续对应的节点地址
uint8_t* parse_first_hyb_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p,
uint64_t* sa_start_p, uint32_t* hits_p, uint8_t* cmp_ref, int tid);
// 解析后续的正常节点
uint8_t* parse_one_hyb_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p,
uint64_t* sa_start_p, uint32_t* hits_p, uint8_t* cmp_ref, int tid);
void parse_one_hyb_node_min_hits(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int min_hits,
int is_head, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, int tid);
void parse_one_hyb_node_max_hits(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int max_hits, int min_bp,
int is_head, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, int tid);
void get_leaf_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p, uint32_t* hits_p,
uint64_t* sa_start_p, uint8_t* cmp_ref, int tid);
void get_kmer_data(const HybridIndex* hyb, uint8_t* seq_bits, int kmer_pos, uint8_t* type_hits, uint64_t* offset);
void right_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits,
int kmer_start, int init_match_len, uint64_t ref_pos, int* right_match);
void left_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits,
int kmer_start, int init_match_len, uint64_t ref_pos, int* left_match);
void both_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits,
int kmer_start, int init_match_len, uint64_t ref_pos, int* left_match, int* right_match);
// 用hybrid-index来寻找smem(seeding-1 & seeding-2)
void hyb_first_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const int min_seed_len,
HybSeedArr* seeds, int tid);
int hyb_second_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, int seed_start, int seed_end, int read_start,
int read_end, uint64_t first_ref, int min_hits, int pre_pivot, int pre_start, int pre_end,
const int min_seed_len, HybSeedArr* seeds, int tid);
// for seeding-3
void hyb_third_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* seeds_range,
const int min_seed_len, const int max_hits, HybSeedArr* seeds, int tid);