182 lines
8.7 KiB
C
182 lines
8.7 KiB
C
#pragma once
|
||
|
||
#include <inttypes.h>
|
||
#include <stdlib.h>
|
||
#include <string.h>
|
||
#include <sys/stat.h>
|
||
|
||
#include "debug.h"
|
||
#include "kvec.h"
|
||
#include "utils.h"
|
||
|
||
///////////////////////////////////////////
|
||
// 宏定义
|
||
// 数据
|
||
#define HYB_KMER_LEN 16
|
||
#define HYB_KMER_MASK 4294967295ULL
|
||
#define HYB_KMER_DATA_BYTES 5 // 5bytes
|
||
#define HYB_KMER_DATA_MASK 1099511627775ULL // 5bytes
|
||
#define HYB_KMER_DATA_TYPE_BITS 5
|
||
#define HYB_KMER_DATA_TYPE_MASK 31
|
||
#define HYB_HIT_THRESH 30
|
||
#define HYB_NODE_SA_MASK 1099511627775ULL
|
||
#define HYB_MAX_SEQ_LEN 301
|
||
#define HYB_LEAF_NODE 7
|
||
|
||
// 节点类型
|
||
#define HYB_BP_1 0
|
||
#define HYB_BP_2 1
|
||
#define HYB_BP_3 2
|
||
#define HYB_BP_PATH 3
|
||
|
||
static const uint32_t ga_hybHitsMask[5] = {0, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, 0xFFFFFFFF}; // hits掩码
|
||
static const uint32_t ga_hybOffMask[5] = {0, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, 0xFFFFFFFF}; // offset掩码
|
||
static const uint32_t ga_hybLearnedMask[5] = {0, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, 0xFFFFFFFF}; // learned counts 掩码
|
||
|
||
// 函数
|
||
#define _kmer_from_pos(bit_seq, start) (((*(uint64_t*)&bit_seq[start >> 2]) >> ((start & 3) << 1)) & HYB_KMER_MASK)
|
||
|
||
#define _reverse_range(rs, re, seq_len, new_range) Range new_range = {seq_len - (re), seq_len - (rs), (re) - (rs)};
|
||
|
||
#define _rev_ref(hyb, ref_pos) ((hyb->ref_len << 1) - 1 - (ref_pos))
|
||
///////////////////////////////////////
|
||
// 结构体
|
||
|
||
// hybrid-index数据结构
|
||
typedef struct {
|
||
uint8_t* kmer_data; // kmer数据
|
||
uint8_t* index_data; // 二级索引数据
|
||
uint8_t* sa; // 后缀数组
|
||
uint8_t* ref_bits; // 2-bit编码的ref序列
|
||
uint64_t ref_len; // ref长度
|
||
} HybridIndex;
|
||
|
||
// Read序列需要的数据
|
||
typedef struct {
|
||
int len; // read长度
|
||
uint8_t* seq; // 每个碱基占一个字节(0-A, 1-C, 2-G, 3-G, 4-N)
|
||
uint8_t* rseq; // 反向互补序列
|
||
uint8_t* for_bits; // 正向序列 2-bit编码
|
||
uint8_t* back_bits; // 反向互补序列 2-bit编码
|
||
int id; // for test;
|
||
// char* seqstr;
|
||
} ReadSeq;
|
||
|
||
typedef kvec_t(ReadSeq) ReadSeqArr;
|
||
|
||
// seeding过程生成的SMEM
|
||
typedef struct {
|
||
int first_len; // 记录第一次匹配时的长度,应该是用来减少seeding-3的计算的
|
||
int seed_start; // 左闭右开区间,种子开始位置,对应在序列上的位置
|
||
int seed_end;
|
||
int read_start; // 左闭右开区间, 默认是[0, read_len), 如果有N, 则是[N_pos + 1, next_N_pos)
|
||
int read_end;
|
||
uint64_v ref_pos_arr;
|
||
} HybSeed;
|
||
|
||
// smem数组
|
||
typedef kvec_t(HybSeed) HybSeedArr;
|
||
|
||
// read区间,用来处理包含N的read
|
||
// 左闭右开区间, 默认是[0, read_len), 如果有N, 则是[N_pos + 1, next_N_pos)
|
||
typedef struct {
|
||
int start;
|
||
int end;
|
||
int len;
|
||
} Range;
|
||
|
||
#define _set_range(r, s, e) {(r).start = s; (r).end = e; (r).len = (r).end - (r).start;}
|
||
|
||
#define _range_equal(r1, r2) ((r1).start == (r2).start && (r1).end == (r2).end)
|
||
|
||
#define _range_cross(r1, r2) \
|
||
(((r1).start < (r2).start && (r1).end < (r2).end) || ((r1).start > (r2).start && (r1).end > (r2).end))
|
||
|
||
// read区间数组
|
||
typedef kvec_t(Range) RangeArr;
|
||
|
||
#define __check_add_seed(seed) \
|
||
HybSeed* seed = kv_pushp(HybSeed, *seeds); \
|
||
if (seeds->m > *seeds_cap) { \
|
||
uint64_t i = *seeds_cap; \
|
||
for (i = *seeds_cap; i < seeds->m; ++i) kv_init(seeds->a[i].ref_pos_arr); \
|
||
*seeds_cap = seeds->m; \
|
||
} \
|
||
seed->ref_pos_arr.n = 0;
|
||
|
||
#define __add_seed_one_pos(seed_var, ref_pos, s_start, s_end) \
|
||
__check_add_seed(seed_var); \
|
||
kv_push(uint64_t, seed_var->ref_pos_arr, ref_pos); \
|
||
seed_var->first_len = 0; \
|
||
seed_var->seed_start = s_start; \
|
||
seed_var->seed_end = s_end; \
|
||
seed_var->read_start = read_range->start; \
|
||
seed_var->read_end = read_range->end;
|
||
|
||
#define __check_add_seed_one_pos(seed_var, ref_pos, s_start, s_end) \
|
||
if ((s_end) - (s_start) >= min_seed_len) { \
|
||
__add_seed_one_pos(seed_var, ref_pos, s_start, s_end); \
|
||
}
|
||
|
||
#define __copy_seed(src, dst) \
|
||
do { \
|
||
(dst).seed_start = (src).seed_start; \
|
||
(dst).seed_end = (src).seed_end; \
|
||
(dst).read_start = (src).read_start; \
|
||
(dst).read_end = (src).read_end; \
|
||
int i; \
|
||
for (i = 0; i < (src).ref_pos_arr.n; ++i) kv_push(uint64_t, (dst).ref_pos_arr, (src).ref_pos_arr.a[i]); \
|
||
} while (0)
|
||
|
||
///////////////////////////////////////////////////////
|
||
// functions
|
||
|
||
// 加载hybrid index
|
||
// HybridIndex* load_hybrid_idx(const char* prefix);
|
||
|
||
// 创建正向反向互补bits
|
||
void create_seq_fb_bits(uint8_t* bs, int len, uint8_t* forward_bits, uint8_t* re);
|
||
|
||
// 用于将seq和ref正向直接比较,返回匹配的长度
|
||
int forward_match_len(uint8_t* seq, int64_t seq_pos, int64_t seq_end, uint8_t* ref, int64_t ref_pos, int64_t ref_len);
|
||
|
||
// 用于将seq和ref反向直接比较,返回匹配的长度
|
||
int backward_match_len(uint8_t* seq, int64_t seq_pos, int64_t seq_start, uint8_t* ref, int64_t ref_pos);
|
||
|
||
// 根据sa的行获取对应的ref position(小端模式)
|
||
uint64_t hyb_sa_to_ref_pos(uint8_t* sa_arr, uint64_t row);
|
||
|
||
// 解析第一个节点, 返回后续对应的节点地址
|
||
uint8_t* parse_first_hyb_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p,
|
||
uint64_t* sa_start_p, uint32_t* hits_p, uint8_t* cmp_ref, int tid);
|
||
|
||
// 解析后续的正常节点
|
||
uint8_t* parse_one_hyb_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p,
|
||
uint64_t* sa_start_p, uint32_t* hits_p, uint8_t* cmp_ref, int tid);
|
||
void parse_one_hyb_node_min_hits(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int min_hits,
|
||
int is_head, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, int tid);
|
||
void parse_one_hyb_node_max_hits(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int max_hits, int min_bp,
|
||
int is_head, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, int tid);
|
||
void get_leaf_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p, uint32_t* hits_p,
|
||
uint64_t* sa_start_p, uint8_t* cmp_ref, int tid);
|
||
void get_kmer_data(const HybridIndex* hyb, uint8_t* seq_bits, int kmer_pos, uint8_t* type_hits, uint64_t* offset);
|
||
|
||
void right_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits,
|
||
int kmer_start, int init_match_len, uint64_t ref_pos, int* right_match);
|
||
void left_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits,
|
||
int kmer_start, int init_match_len, uint64_t ref_pos, int* left_match);
|
||
void both_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits,
|
||
int kmer_start, int init_match_len, uint64_t ref_pos, int* left_match, int* right_match);
|
||
|
||
// 用hybrid-index来寻找smem(seeding-1 & seeding-2)
|
||
void hyb_first_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const int min_seed_len,
|
||
HybSeedArr* seeds, int tid);
|
||
|
||
int hyb_second_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, int seed_start, int seed_end, int read_start,
|
||
int read_end, uint64_t first_ref, int min_hits, int pre_pivot, int pre_start, int pre_end,
|
||
const int min_seed_len, HybSeedArr* seeds, int tid);
|
||
|
||
// for seeding-3
|
||
void hyb_third_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* seeds_range,
|
||
const int min_seed_len, const int max_hits, HybSeedArr* seeds, int tid);
|