完成了hybrid index的创建代码,还没测试

This commit is contained in:
zzh 2025-11-08 14:44:52 +08:00
parent 74b464ee4a
commit caf01ce0f3
8 changed files with 1101 additions and 19 deletions

7
.vscode/launch.json vendored
View File

@ -92,13 +92,12 @@
"preLaunchTask": "Build",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/hbwa",
"program": "${workspaceRoot}/hybalign",
"args": [
"bwt2hybrid",
"-e",
"bwt2hyb",
"-t",
"1",
"~/data/fmt_ref/human_g1k_v37_decoy.fasta"
"./index/human_g1k_v37_decoy.fasta"
],
"cwd": "${workspaceFolder}", //
},

View File

@ -106,7 +106,12 @@ mem_opt_t *mem_opt_init()
o->max_chain_extend = 1<<30;
o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len);
bwa_fill_scmat(o->a, o->b, o->mat);
return o;
o->use_bwt = 0;
o->skip_entire_match = 0;
o->batch_size = 256;
return o;
}
/***************************

View File

@ -81,6 +81,9 @@ typedef struct {
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
int max_XA_hits, max_XA_hits_alt; // if there are max_hits or fewer, output them all
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
int batch_size; // number of bases to process in each batch
int use_bwt; // whether to use bwt index for seeding
int skip_entire_match; // whether to skip the second and third seeding steps for entire matching reads
} mem_opt_t;
typedef struct {

View File

@ -150,13 +150,13 @@ int main_mem(int argc, char *argv[])
mem_pestat_t pes[4];
ktp_aux_t aux;
memset(&aux, 0, sizeof(ktp_aux_t));
memset(&aux, 0, sizeof(ktp_aux_t));
memset(pes, 0, 4 * sizeof(mem_pestat_t));
for (i = 0; i < 4; ++i) pes[i].failed = 1;
aux.opt = opt = mem_opt_init();
memset(&opt0, 0, sizeof(mem_opt_t));
while ((c = getopt(argc, argv, "51qpaMCSPVYjuk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:")) >= 0) {
while ((c = getopt(argc, argv, "51qpaMCSPVYjuk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:b:we")) >= 0) {
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
else if (c == '1') no_mt_io = 1;
else if (c == 'x') mode = optarg;
@ -254,8 +254,12 @@ int main_mem(int argc, char *argv[])
if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] mean insert size: %.3f, stddev: %.3f, max: %d, min: %d\n",
__func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low);
}
else return 1;
} else if (c == 'b')
opt->batch_size = atoi(optarg) >> 1 << 1, opt->batch_size = opt->batch_size > 1 ? opt->batch_size : 256;
else if (c == 'w')
opt->use_bwt = 1;
else if (c == 'e') opt->skip_entire_match = 1;
else return 1;
}
if (rg_line) {
@ -320,7 +324,10 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " (4 sigma from the mean if absent) and min of the insert size distribution.\n");
fprintf(stderr, " FR orientation only. [inferred]\n");
fprintf(stderr, " -u output XB instead of XA; XB is XA with the alignment score and mapping quality added.\n");
fprintf(stderr, "\n");
fprintf(stderr, " -b INT batch size of reads to process at one time [%d].\n", opt->batch_size);
fprintf(stderr, " -w Use bwt index for seeding\n");
fprintf(stderr, " -e Skip the second and third seeding steps for entire matching reads.\n");
fprintf(stderr, "\n");
fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n");
fprintf(stderr, "\n");
free(opt);

View File

@ -117,7 +117,7 @@ int bwa_extract_byte_sa(int argc, char* argv[]) {
}
}
if (optind + 3 > argc) {
fprintf(stderr, "Usage: bwa extractbytesa [-i %d] <in.bwt> <in.full_bytesa> <out.bytesa>\n", sa_intv);
fprintf(stderr, "Usage: bwa extractbytesa [-i %d] <in.bwt> <in.full_bytesa> <out.bytesa>\n\n", sa_intv);
return 1;
}
bwt = bwt_restore_bwt(argv[optind]);
@ -206,25 +206,56 @@ void bwt_create_kmer_idx(bwt_t* bwt) {
free(seq_arr);
}
// 从bwt中创建kmer index并保存到文件
int bwa_bwt2kmer(int argc, char* argv[]) {
bwt_t* bwt;
if (optind + 2 > argc) {
fprintf(stderr, "Usage: bwa extractbytesa <in.bwt> <out.kmer>\n");
fprintf(stderr, "Usage: bwa extractbytesa <in.bwt> <out.kmer>\n\n");
return 1;
}
bwt = bwt_restore_bwt(argv[optind]);
bwt_create_kmer_idx(bwt);
bwt_dump_kmer_idx(argv[optind + 1], &bwt->kmer_hash);
return 0;
}
int bwa_bwt2hyb(int argc, char* argv[]) {}
// 创建hybrid index并保存到文件
int bwa_bwt2hyb(int argc, char* argv[]) {
int hyb_idx_build_and_dump(int num_threads, bwt_t* bwt, const char* idx_prefix);
bwt_t* bwt;
int num_threads = 1;
char c;
int error = 0;
while ((c = getopt(argc, argv, "t:")) >= 0) {
switch (c) {
case 't':
num_threads = atoi(optarg);
assert(num_threads > 0 && num_threads < 512);
break;
default:
error = 1;
break;
}
}
if (optind + 1 > argc || error) {
fprintf(stderr, "Usage: bwa bwt2hyb [Options] <bwt-prefix>\n\n");
fprintf(stderr, "Options: -t INT number of threads for hybrid index building [%d]\n", num_threads);
fprintf(stderr, "\n");
return 1;
}
char fn[MAX_PATH];
snprintf(fn, MAX_PATH, "%s.bwt", argv[optind]);
bwt = bwt_restore_bwt(fn);
snprintf(fn, MAX_PATH, "%s.kmer", argv[optind]);
bwt->kmer_hash = bwt_restore_kmer_idx(fn);
snprintf(fn, MAX_PATH, "%s.hyb.bytesa", argv[optind]);
_load_file_to_data(fn, bwt->byte_sa);
hyb_idx_build_and_dump(num_threads, bwt, argv[optind]);
return 0;
}
// 创建正向的kmer
inline uint64_t build_forward_kmer(const uint8_t* q, int qlen, int kmer_len, int* base_consumed) {
uint64_t build_forward_kmer(const uint8_t* q, int qlen, int kmer_len, int* base_consumed) {
uint64_t qbit = 0, i;
qlen = qlen < kmer_len ? qlen : kmer_len;
for (i = 0; i < qlen; ++i) {
@ -239,7 +270,7 @@ inline uint64_t build_forward_kmer(const uint8_t* q, int qlen, int kmer_len, int
int hyb_test(int argc, char* argv[]) {
bwt_t* bwt;
if (optind + 1 > argc) {
fprintf(stderr, "Usage: bwa hybtest <idx_prefix>\n");
fprintf(stderr, "Usage: bwa hybtest <idx_prefix>\n\n");
return 1;
}

View File

@ -0,0 +1,866 @@
/* The MIT License
Copyright (c) 2025- Zhonghai Zhang (ICT)
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <zlib.h>
#include "bwt.h"
#include "hyb_idx.h"
#include "kvec.h"
#include "rle.h"
#include "rope.h"
#include "utils.h"
#define KMER_START(tid, max_kmer, num_thread) ((uint64_t)(max_kmer) * (uint64_t)(tid) / (uint64_t)(num_thread))
#define KMER_END(tid, max_kmer, num_thread) ((uint64_t)(max_kmer) * (uint64_t)((tid) + 1) / (uint64_t)(num_thread))
///////////////////////////
typedef struct {
uint64_t size;
uint64_t cap;
uint8_t* addr;
} BlockData;
#define _bd_init(bd) \
do { \
(bd).size = 0; \
(bd).cap = 0; \
(bd).addr = 0; \
} while (0)
#define _bd_claim(bd, add_size) \
do { \
if ((bd).size + add_size >= (bd).cap) { \
(bd).cap += add_size + 4L * 1024 * 1024; \
(bd).addr = (uint8_t*)realloc((bd).addr, (bd).cap); \
} \
} while (0)
#define _bd_addr(bd) ((bd).addr + (bd).size)
#define _bd_destory(bd) \
do { \
free((bd).addr); \
} while (0)
typedef struct {
uint32_t idx;
uint32_t cnt;
uint32_t bytes;
} idx_cnt_t;
// 构建hyb-index时候用到的节点数据结构
typedef struct _Node {
uint8_t header; // 1字节头部信息 [0]
uint8_t hit_bytes; // 用多少bytes表示hits
uint8_t off_bytes; // 用多少bytes表示子节点数据偏移量
uint8_t hits_neq; // 和子节点hits数量不相等
uint8_t hits_neq1; // 子节点和孙子节点hits数量不相等
uint8_t max_depth_leaf; // 是否达到最大深度的叶节点
uint8_t normal_leaf; // 正常的叶子节点hits=1
uint16_t seq_len; // 单一分支碱基序列长度如果分支那就是1
uint16_t depth; // 当前节点对应的位置0对应着16不包含16-kmer
uint16_t min_child_depth; // 最近到hit=1的深度
uint16_t max_child_depth; // 最远到hit=1或者到达最大深度时的深度
uint32_t hits; // 匹配个数
uint32_t child_num; // 有多少分支子节点, 第一层子节点个数
uint32_t num_offspring; // 所有后代节点个数
uint32_t min_offspring_hits; // 所有后代中最后最少的hits
uint32_t node_bytes; // 该节点包含所有后代所占内存
uint32_t child_bytes; // 子节点所占内存
uint8_t type; // 当前节点类型
uint64_t mark; // 与type对应4bit16bit64bit
uint64_t sa_pos; // 对应的sa起始位置
uint8_t seq[320]; // 保存所有单一分支的碱基序列
uint8_t kmer; // 当前节点对应的kmerroot节点是uint64用这个不准确
struct _Node* children[64]; // 子节点
struct _Node* parent; // 父节点
} Node;
// 并行构建hyb-index的数据结构
typedef struct {
pthread_t ptid; // 线程ID
int tid; // 线程id
uint64_t kmer_start; // [) 左闭右开
uint64_t kmer_end;
uint64_t bytes; // 当前线程所有kmer数据占用的内存字节数
uint8_t* kmer_data; // kmer数据
uint64_t prev_bd_size; // 前一个kmer对应的index数据所占内存大小用来调整多线程kmer数据对应的地址偏移量的
BlockData idata; // 二级索引数据
bwt_t* bwt; // bwt索引
} PartKmer;
typedef kvec_t(PartKmer) PartKmerVec;
// 当hit变成1后继续扩展节点为了保存2-kmer或者3-kmer对应的匹配信息
typedef struct {
Node* node;
uint64_t mark;
} MarkNode;
/////////////////
// 正向搜索kmerhyb的16-kmer在bwt中的区间信息
bwtintv_t bwt_search_kmer(const bwt_t* bwt, const uint8_t* q) {
bwtintv_t ik = {0}, ok[4] = {0};
int i, x, c;
uint32_t qbit = 0;
x = HASH_KMER_LEN;
for (i = 0; i < x; ++i) {
qbit |= q[i] << ((HASH_KMER_LEN - 1 - i) << 1);
}
bwt_kmer_get(&bwt->kmer_hash, &ik, qbit, x - 1);
if (ik.x[2] < 1)
return ik;
for (i = x; i < HYB_KMER_LEN; ++i) {
c = 3 - q[i];
bwt_extend(bwt, &ik, ok, 0);
if (ok[c].x[2] < 1)
return ok[c];
ik = ok[c];
}
return ik;
}
// 根据子节点更新父节点信息
static void update_parent(Node* n, Node* c, uint64_t k, int update_max) {
// 创建完子节点后,更新一些数据
n->children[n->child_num++] = c;
c->kmer = k;
c->parent = n;
n->mark |= (1ULL << k); // 设置当前节点的mark
n->min_offspring_hits = MIN(n->min_offspring_hits, c->min_offspring_hits);
n->min_child_depth = MIN(n->min_child_depth, c->min_child_depth + 1);
if (update_max)
n->max_child_depth = MAX(n->max_child_depth, c->max_child_depth + 1);
n->num_offspring += c->num_offspring + 1;
if (c->hits_neq)
n->hits_neq1 = 1; // 子节点和孙子节点hits数量不相等
}
// 清除父节点的一些信息, 不释放内存空间
static void clear_parent(Node* n) {
uint32_t i = 0;
n->max_child_depth = 0;
for (; i < n->child_num; ++i) n->children[i] = NULL; // 清除子节点
n->child_num = 0;
n->mark = 0;
n->num_offspring = 0;
n->hits_neq1 = 0;
}
// 更新节点信息
static void update_node_info(PartKmer* pk, Node* n, bwtintv_t* ik, int depth, int max_depth, int update_min) {
n->depth = depth; // 当前节点对应的深度
n->min_offspring_hits = ik->x[2]; // 所有子节点中最小的hits数量
n->sa_pos = ik->x[0]; // bwt_get_sa(pk->fmt->sa, ik->x[0]); // 对应的sa中第一个位置
n->hits = ik->x[2]; // 匹配数量
if (update_min)
n->min_child_depth = max_depth; // 初始化为当前节点所能达到的最大深度(人为设定的阈值)
}
// 普通的leaf节点扩展扩展到第2层和第3层
static void extend_leaf_node(PartKmer* pk, Node* n, bwtintv_t* ik, int depth, int max_depth) {
n->normal_leaf = 1;
update_node_info(pk, n, ik, depth, max_depth, 0);
if (max_depth == 0) {
return;
}
uint8_t k;
int child_hits = 0;
bwtintv_t ok[4] = {0};
bwt_extend(pk->bwt, ik, ok, 0);
for (k = 0; k < 4; ++k) {
bwtintv_t iv = ok[3 - k];
child_hits += iv.x[2];
if (iv.x[2] > 0) { // hits必须大于0即有匹配的位置才创建子节点
Node* child = (Node*)calloc(1, sizeof(Node));
child->seq[child->seq_len++] = k; // 保存序列
// 递归构建子节点
extend_leaf_node(pk, child, &iv, depth + 1, max_depth - 1);
// 创建完子节点后,更新一些数据
update_parent(n, child, k, 0); // 更新父节点信息
break;
}
}
if (child_hits != (int)n->hits) {
n->hits_neq = 1; // 当前碱基有一个匹配到了ref的结尾导致节点hits和子节点hits总和不等
}
}
// 递归构建hyb-tree
// n: 父节点
// depth: kmer之后该碱基在第几个位置/深度)
// max_depth: 遍历到最大深度停止(最长有多少碱基, 从当前节点算起)
static void build_hyb_tree(PartKmer* pk, Node* n, bwtintv_t* ik, int depth, int max_depth) {
/* 递归终止条件 */
if (ik->x[2] == 1) { // 如果hits为0或者1则不再需要子节点
n->min_child_depth = 0;
n->normal_leaf = 1; // 正常的叶子节点hits=1
extend_leaf_node(pk, n, ik, depth, 2); // 继续扩展两个碱基
return;
}
// 给节点n赋值
update_node_info(pk, n, ik, depth, max_depth, 1);
if (max_depth == 0) { // 达到最大深度,不再进行扩展
n->min_child_depth = 0;
n->max_depth_leaf = 1; // 达到最大深度的叶节点
return;
}
int child_hits = 0; // 所有子节点hits数量之和
uint8_t k; // 碱基
bwtintv_t ok[4] = {0};
bwt_extend(pk->bwt, ik, ok, 0);
for (k = 0; k < 4; ++k) { // 1-kmer
bwtintv_t iv = ok[3 - k];
child_hits += iv.x[2];
if (iv.x[2] > 0) { // hits必须大于0即有匹配的位置才创建子节点
Node* child = (Node*)calloc(1, sizeof(Node));
child->seq[child->seq_len++] = k;
// 递归构建子节点
build_hyb_tree(pk, child, &iv, depth + 1, max_depth - 1);
// 创建完子节点后,更新一些数据
update_parent(n, child, k, 1); // 更新父节点信息
}
}
// 子节点hits之和应该和父节点hits数量相等否则就是当前父节点有一个匹配到了ref的结尾
if (child_hits != (int)n->hits) {
n->hits_neq = 1; // 当前碱基有一个匹配到了ref的结尾导致节点hits和子节点hits总和不等
}
}
// 将ptr的子节点赋值给n并释放ptr节点
static void assign_children(Node* n, Node* ptr) {
clear_parent(n);
n->seq[n->seq_len++] = ptr->seq[0]; // 保存序列
uint32_t i = 0;
for (; i < ptr->child_num; ++i) {
update_parent(n, ptr->children[i], ptr->children[i]->seq[0], 1);
}
free(ptr); // 释放ptr节点
}
// 处理path node
static void handle_path_node(Node** n) {
Node* cur = *n;
Node* ptr = cur->children[0]; // 只有一个子节点
while (ptr->child_num == 1 && cur->hits == ptr->hits) { // 继续向下遍历,直到没有子节点, 或父子hits不一致
cur->seq[cur->seq_len++] = ptr->seq[0]; // 保存序列
Node* next = ptr->children[0]; // 获取下一个子节点
free(ptr); // 释放当前节点
ptr = next;
}
if (ptr->child_num == 1) { // hits不一致
clear_parent(cur);
update_parent(cur, ptr, ptr->seq[0], 1);
cur->hits_neq = 1; // 继承子节点的hits_neq
ptr->seq[ptr->seq_len++] = ptr->seq[0]; // 多保存一次该节点的碱基因为他也是path节点
*n = ptr;
handle_path_node(n); // 递归处理下一个path节点
} else {
assign_children(cur, ptr); // 将ptr的子节点赋值给n
}
}
// dfs遍历当前节点的子节点扩展到2或3层然后去掉中间层节点将后续节点直接与父节点连接, layer = 2 or 3
static void shrink_node_layer(PartKmer* pk, Node* n, int layer) {
kvec_t(MarkNode) layer_nodes_stack;
kv_init(layer_nodes_stack); // dfs遍历的stack
kvec_t(MarkNode) kmer_stack;
kv_init(kmer_stack); // 装所有的kmer
kvec_t(Node*) to_release;
kv_init(to_release); // 装所有的待删除的node
int i;
// init stack
for (i = (int)n->child_num - 1; i >= 0; --i) {
MarkNode ln;
ln.node = n->children[i];
ln.mark = 1; // 第一层子节点
kv_push(MarkNode, layer_nodes_stack, ln);
}
// dfs 遍历
while (kv_size(layer_nodes_stack) > 0) {
MarkNode ln = kv_pop(layer_nodes_stack);
if ((int)ln.mark == layer) { // 是第3层的节点创建对应的kmer
uint8_t k = 0;
if (layer == 2)
k = ln.node->parent->seq[0] << 2 | ln.node->seq[0];
else if (layer == 3)
k = ln.node->parent->parent->seq[0] << 4 | ln.node->parent->seq[0] << 2 | ln.node->seq[0];
MarkNode kmer_node;
kmer_node.node = ln.node;
kmer_node.mark = k; // 保存kmer
kv_push(MarkNode, kmer_stack, kmer_node); // 保存kmer
} else {
kv_push(Node*, to_release, ln.node); // 保存当前节点,待释放
for (i = (int)ln.node->child_num - 1; i >= 0; --i) {
MarkNode lnn;
lnn.node = ln.node->children[i];
lnn.mark = ln.mark + 1;
kv_push(MarkNode, layer_nodes_stack, lnn);
}
}
}
// 处理所有kmer
clear_parent(n);
int child_hits = 0;
for (i = 0; i < (int)kv_size(kmer_stack); ++i) {
MarkNode kmer_node = kv_A(kmer_stack, i);
child_hits += kmer_node.node->hits; // 统计子节点的hits数量
update_parent(n, kmer_node.node, kmer_node.mark, 1); // 更新父节点信息
}
if (child_hits != (int)n->hits) {
n->hits_neq = 1; // 当前碱基有一个匹配到了ref的结尾导致节点hits和子节点hits总和不等
}
for (i = 0; i < (int)kv_size(to_release); ++i) {
free(kv_A(to_release, i));
}
kv_destroy(layer_nodes_stack);
kv_destroy(kmer_stack);
kv_destroy(to_release);
}
// 构建完hybrid tree之后从根节点开始遍历设置节点的类型构建path节点321 kmer节点
static void update_hyb_tree(PartKmer* pk, Node* n) {
if (n->child_num == 1 && !n->normal_leaf && !n->max_depth_leaf && !(n->max_child_depth <= 3)) { // path类型
handle_path_node(&n);
}
uint32_t i = 0;
#define __recursive_update_hyb_tree(pk, n) \
for (i = 0; i < n->child_num; ++i) { \
update_hyb_tree(pk, n->children[i]); \
}
// 计算子节点的idata地址和父节点的match_cnt
// 递归终点
if (n->max_child_depth == 5) {
if (n->hits_neq) {
n->type = HYB_BP_1;
} else {
n->type = HYB_BP_2;
shrink_node_layer(pk, n, 2);
}
__recursive_update_hyb_tree(pk, n);
} else if (n->max_child_depth == 4) {
n->type = HYB_BP_1;
__recursive_update_hyb_tree(pk, n);
} else if (n->max_child_depth == 3) {
if (n->hits_neq) {
n->type = HYB_BP_1;
} else if (n->hits_neq1) {
n->type = HYB_BP_2;
shrink_node_layer(pk, n, 2);
} else {
n->type = HYB_BP_3;
shrink_node_layer(pk, n, 3);
}
} else if (n->max_child_depth == 2) {
if (n->hits_neq) {
n->type = HYB_BP_1;
} else {
n->type = HYB_BP_2;
shrink_node_layer(pk, n, 2);
}
} else if (n->max_child_depth == 1) {
n->type = HYB_BP_1;
} else {
if (n->hits_neq) {
n->type = HYB_BP_1;
} else if (n->hits_neq1) {
n->type = HYB_BP_2;
shrink_node_layer(pk, n, 2);
} else {
n->type = HYB_BP_3;
shrink_node_layer(pk, n, 3);
}
__recursive_update_hyb_tree(pk, n);
}
}
// 根据构建的radix tree创建hyb-index
static void calc_node_bytes(Node* n) {
// 递归终点
if (n->normal_leaf || n->max_depth_leaf) { // 叶子节点没有子节点
n->node_bytes = 0;
return;
}
// 先处理子节点
uint32_t i = 0;
for (; i < n->child_num; ++i) {
calc_node_bytes(n->children[i]); // 递归构建子节点
n->child_bytes += n->children[i]->node_bytes; // 累加子节点所占内存
}
// 处理当前节点
n->node_bytes = n->child_bytes;
if (n->depth == 0) n->node_bytes += 5; // sa_pos占用5字节
// 计算偏移地址字节
int off_bytes = 0;
if (n->child_bytes == 0) { // 没有子节点
off_bytes = 0;
} else if (n->child_bytes < 256) {
off_bytes = 1; // 子节点数据偏移量小于256, 1B就行
} else if (n->child_bytes < 65536) {
off_bytes = 2; // 2B就行
} else if (n->child_bytes < 16777216) {
off_bytes = 3; // 3B就行
} else if (n->child_bytes < UINT32_MAX) {
off_bytes = 4; // 使用4B
} else {
fprintf(stderr, "too many child bytes: %d\n", n->child_bytes);
exit(1);
}
n->off_bytes = off_bytes; // 保存子节点数据偏移量字节数
// 计算hits字节
int hits_bytes = 0;
if (n->hits < 256) {
hits_bytes = 1; // hits小于256, 占用1字节
} else if (n->hits < 65536) {
hits_bytes = 2; // hits小于65536, 占用2字节
} else if (n->hits < 16777216) {
hits_bytes = 3; // hits小于16777216, 占用3字节
} else if (n->child_bytes < UINT32_MAX) {
hits_bytes = 4; // 使用4B
} else {
fprintf(stderr, "too many hits: %d\n", n->hits);
exit(1);
}
n->hit_bytes = hits_bytes; // 保存hits字节数
if (n->depth == 0 && n->hits > HYB_HIT_THRESH) {
n->node_bytes += hits_bytes; // 根节点需要保存hits
}
if (n->seq_len > 1) { // path节点
n->node_bytes += 1; // header, path节点相当于两个节点一个path一个普通的
n->node_bytes += 1; // 结合header 记录碱基path长度代替mark
n->node_bytes += ((n->seq_len - 1) * 2 + 7) / 8; // 只有根节点保存本节点的碱基其他的节点不保存seq[0]因为seq[0]是子节点的碱基
}
// 有可能一个path下边还是一个path因为可能父子hits不一致
if (n->child_num > 1 || n->seq_len == 1) { // 有多个子节点或者当前节点不是path节点
n->node_bytes += 1; // header_bits占用1字节
// 叶节点只用bit-map来计算hits偏移就行了, 不用保存hits start信息
int all_child_is_leaf = n->hits == n->child_num || (n->hits_neq && n->hits == n->child_num + 1);
if (!all_child_is_leaf) {
n->node_bytes += (hits_bytes + off_bytes) * (n->child_num - 1); // 不保存第一个子节点offset因为是0
// for test首个节点不保存hits start和offset因为hits start是0或者1offset是0这样index-data减少了1GB总共20GB
// 这样不能根据概率调整内存位置了
}
if (n->type == HYB_BP_1) { // 1-kmer节点
n->node_bytes += 1; // 1字节的mark
} else if (n->type == HYB_BP_2) { // 2-kmer节点
n->node_bytes += 2; // 2字节的mark
} else if (n->type == HYB_BP_3) { // 3-kmer节点
n->node_bytes += 8; // 8字节的mark
} else {
fprintf(stderr, "Unknown node type: %d\n", (int)n->type);
exit(1);
}
}
}
// 释放所有子节点和当前节点
static void release_node(Node* p) {
uint32_t k = 0;
for (k = 0; k < p->child_num; k++) {
release_node(p->children[k]);
}
free(p);
}
void write_seq_bits(uint8_t* bs, int seq_len, BlockData* idata) {
if (seq_len > 0) {
uint32_t num_bytes = ((seq_len << 1) + 7) >> 3;
uint8_t* bit_seq = (uint8_t*)_bd_addr(*idata);
uint8_t bybp = 0;
int i = 0, j = 0, idx = 0;
for (; i + 3 < seq_len; i += 4) {
bybp = bs[i] | bs[i + 1] << 2 | bs[i + 2] << 4 | bs[i + 3] << 6;
*(uint8_t*)&bit_seq[idx++] = bybp;
}
bybp = 0;
for (; i < seq_len; ++i, ++j) {
bybp |= bs[i] << j * 2;
}
*(uint8_t*)&bit_seq[idx++] = bybp;
idata->size += num_bytes; // 更新数据大小
}
}
static void write_normal_node_data(Node* n, BlockData* idata, uint8_t write_sa) {
uint8_t* ptr;
int i, j;
int all_child_is_leaf = 0;
// 2bit子节点类型, 1bit父子hits是否一致2bit hits字节数;
uint8_t header = n->type << 6 | n->hits_neq << 5 | (n->hit_bytes - 1) << 3;
// 叶节点只用bit-map来计算hits偏移就行了
all_child_is_leaf = n->hits == n->child_num || (n->hits_neq && n->hits == n->child_num + 1);
// 3bit子节点地址偏移字节数
if (all_child_is_leaf) {
header |= HYB_LEAF_NODE; // 用off_bytes = 7来表示叶节点不用读取hits和offset信息
} else {
header |= n->off_bytes;
// assert(n->off_bytes != 0); // 这里会出错,为啥有这个断言?
}
// header
idata->addr[idata->size++] = header; // 写入header
// sa_pos
if (n->depth == 0 && write_sa) {
ptr = (uint8_t*)&n->sa_pos;
for (i = 0; i < 5; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
idata->size += i; // 写入sa_pos
}
// hits
if (write_sa && n->depth == 0 && n->hits > HYB_HIT_THRESH) {
ptr = (uint8_t*)&n->hits;
for (i = 0; i < n->hit_bytes; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
idata->size += i; // 写入hits
}
// 处理mark
if (n->type == HYB_BP_1) {
idata->addr[idata->size++] = (uint8_t)(n->mark & 0xFF); // 1-kmer节点
} else if (n->type == HYB_BP_2) {
ptr = (uint8_t*)&n->mark;
for (i = 0; i < 2; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
idata->size += i; // 2-kmer节点
} else if (n->type == HYB_BP_3) {
ptr = (uint8_t*)&n->mark;
for (i = 0; i < 8; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
idata->size += i; // 3-kmer节点
}
if (all_child_is_leaf) {
return; // 叶节点只用bit-map来计算hits start就行了
}
// 处理子节点
uint32_t sa_start = 0;
uint32_t offset = 0; // 子节点数据偏移量
// 剩下的节点
for (j = 1; j < (int)n->child_num; ++j) {
sa_start += n->children[j - 1]->hits; // 累加前一个子节点的hits
ptr = (uint8_t*)&sa_start;
for (i = 0; i < n->hit_bytes; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
idata->size += i; // 写入sa_start
if (n->children[j]->node_bytes == 0 && n->children[j]->hits > 1 && !n->children[j]->max_depth_leaf) {
fprintf(stderr, "Error: no child! hits:%u, depth:%u, max-leaf:%u\n", n->children[j]->hits, n->children[j]->depth,
n->children[j]->max_depth_leaf);
}
offset += n->children[j - 1]->node_bytes;
ptr = (uint8_t*)&offset;
for (i = 0; i < n->off_bytes; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
idata->size += i; // 写入offset
}
}
// 写入path node数据一个path node包含path和normal两种数据
static void write_path_node_data(Node* n, BlockData* idata) {
// 2bit子节点类型, 1bit父子hits是否一致2bit hits字节数;
uint8_t header = HYB_BP_PATH << 6 | n->hits_neq << 5 | (n->hit_bytes - 1) << 3;
int path_len = n->seq_len - 1; // path节点的长度
header |= path_len >> 8; // 此时header最低位保存path_len的高8位9bit足以表示path的长度
idata->addr[idata->size++] = header; // 写入header
int i;
uint8_t* ptr;
if (n->depth == 0) {
ptr = (uint8_t*)&n->sa_pos;
for (i = 0; i < 5; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
idata->size += i; // 写入sa_pos
}
// 处理hits
if (n->depth == 0 && n->hits > HYB_HIT_THRESH) {
ptr = (uint8_t*)&n->hits;
for (i = 0; i < n->hit_bytes; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
idata->size += i; // 写入hits
}
// 处理mark
idata->addr[idata->size++] = (uint8_t)(path_len & 0xFF); // 写入mark
// 写入seq
write_seq_bits(n->seq + 1, path_len, idata); // 跳过seq[0]因为seq[0]是子节点的碱基
// 继续把该节点当做正常节点处理
if (n->child_num > 1)
write_normal_node_data(n, idata, 0); // 写入正常节点数据
}
/*
header-1B: 2bit1bithits2bit hits3bit
*/
static void build_index_from_node(Node* n, BlockData* idata) {
if (n->normal_leaf || n->max_depth_leaf) { // 叶子节点没有子节点
return;
}
uint32_t i = 0;
_bd_claim(*idata, n->node_bytes); // 确保有足够的内存空间
// 先处理当前节点
// 1. 写入header
// 2. 如果是根节点写入sa_pos
// 3. 写入hits
// 4. 写入mark
// 5. 写入子节点sa_pos偏移量和子节点数据偏移量
uint32_t node_self_size = idata->size;
if (n->seq_len > 1) {
write_path_node_data(n, idata); // 写入path节点数据
} else {
write_normal_node_data(n, idata, 1); // 写入正常节点数据
}
node_self_size = idata->size - node_self_size; // 当前节点数据大小
if (node_self_size != n->node_bytes - n->child_bytes) {
fprintf(stderr, "Error: node bytes not match: %u, %u, %u, %u, %u, %u, %u, %u, %u, %u\n", node_self_size,
n->node_bytes - n->child_bytes, n->node_bytes, n->depth, n->max_child_depth, n->seq_len, n->hits, n->child_num,
n->hits_neq, (int)n->type);
Node* p = n;
fprintf(stderr, "Info: father neq: %u, %u, %u\n", p->parent->hits_neq, p->parent->child_num, p->parent->hits);
while (p != NULL) {
fprintf(stderr, "Info: hits: %u, depth: %u, childs: %u, hits_neq: %u, seq_len: %u, max_depth: %u\n", p->hits,
p->depth, p->child_num, p->hits_neq, p->seq_len, p->max_child_depth);
p = p->children[0]; // 只打印第一个子节点
}
p = n;
while (p->depth > 0) {
fprintf(stderr, "Info: hits: %u, depth: %u, childs: %u, hits_neq: %u, seq_len: %u, max_depth: %u\n", p->hits,
p->depth, p->child_num, p->hits_neq, p->seq_len, p->max_child_depth);
p = p->parent; // 向上打印父节点
}
fprintf(stderr, "Info: hits: %u, depth: %u, childs: %u, hits_neq: %u, seq_len: %u, max_depth: %u\n", p->hits, p->depth,
p->child_num, p->hits_neq, p->seq_len, p->max_child_depth);
exit(0);
}
for (i = 0; i < n->child_num; ++i) { // 这里需要按照匹配概率来修改
build_index_from_node(n->children[i], idata); // 递归构建子节点
}
}
// 多线程创建hybrid index
static void* thread_build_hyb_index(void* arg) {
PartKmer* p = (PartKmer*)arg;
uint8_t kmer[HYB_KMER_LEN + 1];
uint64_t all_kmer_num = p->kmer_end - p->kmer_start;
uint64_t percent_mod = all_kmer_num / 100;
int percent = 0;
if (percent_mod == 0)
percent_mod = 1;
uint8_t type_hits = 0;
uint64_t ref_pos = 0;
uint64_t sa_forward = 0;
uint64_t idata_offset = 0;
uint64_t packed_data = 0;
uint8_t* ptr = (uint8_t*)&packed_data;
uint64_t i, j, kmer_size;
fprintf(stderr, "[thread %d] start building hybrid index for kmer range [%lu, %lu)\n", p->tid, p->kmer_start, p->kmer_end);
for (i = p->kmer_start, kmer_size = 0; i < p->kmer_end; ++i, ++kmer_size) {
uint8_t* kd = p->kmer_data + i * HYB_KMER_DATA_BYTES;
for (j = 0; j < HYB_KMER_LEN; ++j) {
kmer[j] = (i >> (j << 1)) & 3;
}
bwtintv_t iv = bwt_search_kmer(p->bwt, kmer);
const uint32_t khits = iv.x[2]; // 有多少个匹配的位置
if (khits == 0) {
// no extra bytes
packed_data = 0;
for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off);
} else if (khits == 1) { // 只有一个匹配,直接保存对应的参考基因位置
// no extra bytes
type_hits = 1;
ref_pos = bwt_get_sa(p->bwt->byte_sa, iv.x[0]); // 直接记录kmer对应的reference pos
packed_data = (ref_pos << HYB_KMER_DATA_TYPE_BITS) | type_hits;
// 这里需要注意packed_data是小端模式的, 5个字节
for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off);
} else if (khits == 2) { // 只有两个匹配直接保存对应的suffix array位置
// no extra bytes
type_hits = 2;
sa_forward = iv.x[0]; // 记录kmer对应的后缀数组的位置
packed_data = (sa_forward << HYB_KMER_DATA_TYPE_BITS) | type_hits;
for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off);
} else {
Node* root = (Node*)calloc(1, sizeof(Node));
root->seq[root->seq_len++] = kmer[HYB_KMER_LEN - 1];
build_hyb_tree(p, root, &iv, 0, HYB_MAX_SEQ_LEN - HYB_KMER_LEN); // 创建基数树
update_hyb_tree(p, root); // 更新hybrid tree
idata_offset = p->idata.size; // 指向当前要用的地址偏移量(起始位置)
calc_node_bytes(root); // 计算每个节点所占内存
build_index_from_node(root, &p->idata); // 构建hybrid index
// 将type和地址偏移信息写入kmer
if (khits <= HYB_HIT_THRESH) { // hits数量小于等于30
type_hits = khits; // hits数量
} else {
type_hits = HYB_HIT_THRESH + 1;
}
packed_data = (idata_offset << HYB_KMER_DATA_TYPE_BITS) | type_hits;
for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off);
p->bytes += root->node_bytes; // 累加当前节点所占内存
release_node(root); // 释放所有子节点和当前节点
}
if (p->tid == 0 && kmer_size % percent_mod == 0) {
fprintf(stderr, "[thread %d] %d %% done, bytes: %fM\n", p->tid, percent++, (double)p->bytes / 1024 / 1024);
}
}
fprintf(stderr, "Finish-[thread %d] bytes: %fM\n", p->tid, (double)p->bytes / 1024 / 1024);
return NULL;
}
// 多线程调整hybrid index中的block data偏移地址整合成连续的一块地址空间
static void* thread_adjust_hyb_index(void* arg) {
PartKmer* p = (PartKmer*)arg;
uint64_t all_kmer_num = p->kmer_end - p->kmer_start;
uint64_t percent_mod = all_kmer_num / 100;
int percent = 0;
if (percent_mod == 0) percent_mod = 1;
uint8_t type_hits = 0;
uint64_t bd_offset = 0;
uint64_t packed_data = 0;
uint8_t* ptr = (uint8_t*)&packed_data;
uint64_t i, kmer_size;
for (i = p->kmer_start, kmer_size = 0; i < p->kmer_end; ++i, ++kmer_size) {
uint8_t* kd = p->kmer_data + i * HYB_KMER_DATA_TYPE_BITS;
type_hits = *(kd)&HYB_KMER_DATA_TYPE_MASK;
if (type_hits > 2) { // block data index数据需要改offset
// bd_offset = ((*(uint64_t *)kd) >> kTypeBits) & p->KMER_MASK; // 指向当前要用的地址偏移量
bd_offset = ((*(uint64_t*)kd) & HYB_KMER_DATA_MASK) >> HYB_KMER_DATA_TYPE_BITS; // 指向当前要用的地址偏移量
bd_offset += p->prev_bd_size;
packed_data = (bd_offset << HYB_KMER_DATA_TYPE_BITS) | type_hits;
for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off);
}
if (p->tid == 0 && kmer_size % percent_mod == 0) {
fprintf(stderr, "Info: %d%% done adjust.\n", percent++);
}
}
return NULL;
}
// 保存hybrid index
static void save_hybrid_idx(const char* prefix, PartKmerVec* pkArr) {
// fprintf(stderr, "size: %ld\n", kv_size(*pkArr));
// return;
char fn[MAX_PATH];
char fn2[MAX_PATH];
sprintf(fn, "%s.hyb.kmer", prefix);
sprintf(fn2, "%s.hyb.data", prefix);
FILE* fpk = fopen(fn, "wb");
FILE* fpbd = fopen(fn2, "wb");
if (fpk == NULL) {
fprintf(stderr, "Error: open file %s failed\n", fn);
return;
}
if (fpbd == NULL) {
fprintf(stderr, "Error: open file %s failed", fn2);
return;
}
uint64_t max_kmer = (1ULL << (HYB_KMER_LEN << 1));
err_fwrite(kv_A(*pkArr, 0).kmer_data, 1, max_kmer * HYB_KMER_DATA_BYTES, fpk); // 小端
int i;
for (i = 0; i < kv_size(*pkArr); ++i) {
err_fwrite(kv_A(*pkArr, i).idata.addr, 1, kv_A(*pkArr, i).idata.size, fpbd);
}
fclose(fpk);
fclose(fpbd);
}
// 构建hybrid-index入口函数
int hyb_idx_build_and_dump(int num_threads, bwt_t* bwt, const char* idx_prefix) {
PartKmerVec pk_arr;
kv_init(pk_arr);
kv_resize(PartKmer, pk_arr, num_threads);
pk_arr.n = num_threads;
kvec_t(pthread_t) tid_arr;
kv_init(tid_arr);
kv_resize(pthread_t, tid_arr, num_threads);
tid_arr.n = num_threads;
uint64_t max_kmer = (1ULL << (HYB_KMER_LEN << 1));
uint8_t* kmer_data = (uint8_t*)malloc(max_kmer * HYB_KMER_DATA_BYTES);
int i;
for (i = 0; i < num_threads; ++i) {
PartKmer* p = &kv_A(pk_arr, i);
p->tid = i;
p->kmer_start = KMER_START(i, max_kmer, num_threads);
p->kmer_end = KMER_END(i, max_kmer, num_threads);
p->bytes = 0;
p->kmer_data = kmer_data;
p->prev_bd_size = 0;
_bd_init(p->idata);
p->bwt = bwt;
pthread_create(&kv_A(tid_arr, i), NULL, thread_build_hyb_index, p);
}
uint64_t all_bytes = 0;
uint64_t prev_bd_size = 0;
for (i = 0; i < num_threads; ++i) {
pthread_join(kv_A(tid_arr, i), NULL);
PartKmer* p = &kv_A(pk_arr, i);
all_bytes += p->bytes;
p->prev_bd_size = prev_bd_size;
prev_bd_size += p->idata.size;
}
for (i = 0; i < num_threads; ++i) {
pthread_create(&kv_A(tid_arr, i), NULL, thread_adjust_hyb_index, &kv_A(pk_arr, i));
}
for (int i = 0; i < num_threads; ++i) {
pthread_join(kv_A(tid_arr, i), NULL);
}
fprintf(stderr, "Hybrid index total bytes: %fM\n", (double)all_bytes / 1024 / 1024);
double time_sava_hybrid_idx = realtime();
save_hybrid_idx(idx_prefix, &pk_arr);
time_sava_hybrid_idx = realtime() - time_sava_hybrid_idx;
fprintf(stderr, "Time save hybrid index: %fs\n", time_sava_hybrid_idx);
kv_destroy(pk_arr);
kv_destroy(tid_arr);
return 0;
}

170
hyb_idx.h
View File

@ -9,3 +9,173 @@
#include "kvec.h"
#include "utils.h"
///////////////////////////////////////////
// 宏定义
// 数据
#define HYB_KMER_LEN 16
#define HYB_KMER_MASK 4294967295ULL
#define HYB_KMER_DATA_BYTES 5 // 5bytes
#define HYB_KMER_DATA_MASK 1099511627775ULL // 5bytes
#define HYB_KMER_DATA_TYPE_BITS 5
#define HYB_KMER_DATA_TYPE_MASK 31
#define HYB_HIT_THRESH 30
#define HYB_NODE_SA_MASK 1099511627775ULL
#define HYB_MAX_SEQ_LEN 301
#define HYB_LEAF_NODE 7
// 节点类型
#define HYB_BP_1 0
#define HYB_BP_2 1
#define HYB_BP_3 2
#define HYB_BP_PATH 3
static const uint32_t ga_hybHitsMask[5] = {0, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, 0xFFFFFFFF}; // hits掩码
static const uint32_t ga_hybOffMask[5] = {0, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, 0xFFFFFFFF}; // offset掩码
static const uint32_t ga_hybLearnedMask[5] = {0, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, 0xFFFFFFFF}; // learned counts 掩码
// 函数
#define _kmer_from_pos(bit_seq, start) (((*(uint64_t*)&bit_seq[start >> 2]) >> ((start & 3) << 1)) & HYB_KMER_MASK)
#define _reverse_range(rs, re, seq_len, new_range) Range new_range = {seq_len - (re), seq_len - (rs), (re) - (rs)};
#define _rev_ref(hyb, ref_pos) ((hyb->ref_len << 1) - 1 - (ref_pos))
///////////////////////////////////////
// 结构体
// hybrid-index数据结构
typedef struct {
uint8_t* kmer_data; // kmer数据
uint8_t* index_data; // 二级索引数据
uint8_t* sa; // 后缀数组
uint8_t* ref_bits; // 2-bit编码的ref序列
uint64_t ref_len; // ref长度
} HybridIndex;
// Read序列需要的数据
typedef struct {
int len; // read长度
uint8_t* seq; // 每个碱基占一个字节0-A, 1-C, 2-G, 3-G, 4-N
uint8_t* rseq; // 反向互补序列
uint8_t* for_bits; // 正向序列 2-bit编码
uint8_t* back_bits; // 反向互补序列 2-bit编码
int id; // for test;
char* seqstr;
} ReadSeq;
typedef kvec_t(ReadSeq) ReadSeqArr;
// seeding过程生成的SMEM
typedef struct {
int first_len;
int seed_start; // 左闭右开区间
int seed_end;
int read_start; // 左闭右开区间, 默认是[0, read_len), 如果有N, 则是[N_pos + 1, next_N_pos)
int read_end;
uint64_v ref_pos_arr;
} HybSeed;
// smem数组
typedef kvec_t(HybSeed) HybSeedArr;
// read区间用来处理包含N的read
// 左闭右开区间, 默认是[0, read_len), 如果有N, 则是[N_pos + 1, next_N_pos)
typedef struct {
int start;
int end;
int len;
} Range;
#define _set_range(r, s, e) {(r).start = s; (r).end = e; (r).len = (r).end - (r).start;}
#define _range_equal(r1, r2) ((r1).start == (r2).start && (r1).end == (r2).end)
#define _range_cross(r1, r2) \
(((r1).start < (r2).start && (r1).end < (r2).end) || ((r1).start > (r2).start && (r1).end > (r2).end))
// read区间数组
typedef kvec_t(Range) RangeArr;
#define __check_add_seed(seed) \
HybSeed* seed = kv_pushp(HybSeed, *seeds); \
if (seeds->m > *seeds_cap) { \
uint64_t i = *seeds_cap; \
for (i = *seeds_cap; i < seeds->m; ++i) kv_init(seeds->a[i].ref_pos_arr); \
*seeds_cap = seeds->m; \
} \
seed->ref_pos_arr.n = 0;
#define __add_seed_one_pos(seed_var, ref_pos, s_start, s_end) \
__check_add_seed(seed_var); \
kv_push(uint64_t, seed_var->ref_pos_arr, ref_pos); \
seed_var->first_len = 0; \
seed_var->seed_start = s_start; \
seed_var->seed_end = s_end; \
seed_var->read_start = read_range->start; \
seed_var->read_end = read_range->end;
#define __check_add_seed_one_pos(seed_var, ref_pos, s_start, s_end) \
if ((s_end) - (s_start) >= min_seed_len) { \
__add_seed_one_pos(seed_var, ref_pos, s_start, s_end); \
}
#define __copy_seed(src, dst) \
do { \
(dst).seed_start = (src).seed_start; \
(dst).seed_end = (src).seed_end; \
(dst).read_start = (src).read_start; \
(dst).read_end = (src).read_end; \
int i; \
for (i = 0; i < (src).ref_pos_arr.n; ++i) kv_push(uint64_t, (dst).ref_pos_arr, (src).ref_pos_arr.a[i]); \
} while (0)
///////////////////////////////////////////////////////
// functions
// 加载hybrid index
HybridIndex* load_hybrid_idx(const char* prefix);
// 创建正向反向互补bits
void create_seq_fb_bits(uint8_t* bs, int len, uint8_t* forward_bits, uint8_t* re);
// 用于将seq和ref正向直接比较返回匹配的长度
int forward_match_len(uint8_t* seq, int64_t seq_pos, int64_t seq_end, uint8_t* ref, int64_t ref_pos, int64_t ref_len);
// 用于将seq和ref反向直接比较返回匹配的长度
int backward_match_len(uint8_t* seq, int64_t seq_pos, int64_t seq_start, uint8_t* ref, int64_t ref_pos);
// 根据sa的行获取对应的ref position小端模式
uint64_t hyb_sa_to_ref_pos(uint8_t* sa_arr, uint64_t row);
// 解析第一个节点, 返回后续对应的节点地址
uint8_t* parse_first_hyb_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p,
uint64_t* sa_start_p, uint32_t* hits_p, uint8_t* cmp_ref, int tid);
// 解析后续的正常节点
uint8_t* parse_one_hyb_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p,
uint64_t* sa_start_p, uint32_t* hits_p, uint8_t* cmp_ref, int tid);
void parse_one_hyb_node_min_hits(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int min_hits,
int is_head, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, int tid);
void parse_one_hyb_node_max_hits(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int max_hits, int min_bp,
int is_head, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, int tid);
void get_leaf_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p, uint32_t* hits_p,
uint64_t* sa_start_p, uint8_t* cmp_ref, int tid);
void get_kmer_data(const HybridIndex* hyb, uint8_t* seq_bits, int kmer_pos, uint8_t* type_hits, uint64_t* offset);
void right_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits,
int kmer_start, int init_match_len, uint64_t ref_pos, int* right_match);
void left_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits,
int kmer_start, int init_match_len, uint64_t ref_pos, int* left_match);
void both_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits,
int kmer_start, int init_match_len, uint64_t ref_pos, int* left_match, int* right_match);
// 用hybrid-index来寻找smem(seeding-1 & seeding-2)
void hyb_first_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const int min_seed_len,
HybSeedArr* seeds, int tid);
int hyb_second_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, int seed_start, int seed_end, int read_start,
int read_end, uint64_t first_ref, int min_hits, int pre_pivot, int pre_start, int pre_end,
const int min_seed_len, HybSeedArr* seeds, int tid);
// for seeding-3
void hyb_third_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* seeds_range,
const int min_seed_len, const int max_hits, HybSeedArr* seeds, int tid);

View File

@ -12,9 +12,10 @@ Date : 2024/04/06
#include "profiling.h"
#include "debug.h"
uint64_t proc_freq = 1000;
#ifdef SHOW_PERF
uint64_t tprof[LIM_THREAD_PROF_TYPE][LIM_THREAD] = {0};
uint64_t proc_freq = 1000;
uint64_t gprof[LIM_GLOBAL_PROF_TYPE] = {0};
#endif