866 lines
34 KiB
C
866 lines
34 KiB
C
/* The MIT License
|
||
|
||
Copyright (c) 2025- Zhonghai Zhang (ICT)
|
||
|
||
Permission is hereby granted, free of charge, to any person obtaining
|
||
a copy of this software and associated documentation files (the
|
||
"Software"), to deal in the Software without restriction, including
|
||
without limitation the rights to use, copy, modify, merge, publish,
|
||
distribute, sublicense, and/or sell copies of the Software, and to
|
||
permit persons to whom the Software is furnished to do so, subject to
|
||
the following conditions:
|
||
|
||
The above copyright notice and this permission notice shall be
|
||
included in all copies or substantial portions of the Software.
|
||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||
SOFTWARE.
|
||
*/
|
||
|
||
#include <assert.h>
|
||
#include <pthread.h>
|
||
#include <stdio.h>
|
||
#include <stdlib.h>
|
||
#include <string.h>
|
||
#include <time.h>
|
||
#include <unistd.h>
|
||
#include <zlib.h>
|
||
|
||
#include "bwt.h"
|
||
#include "hyb_idx.h"
|
||
#include "kvec.h"
|
||
#include "rle.h"
|
||
#include "rope.h"
|
||
#include "utils.h"
|
||
|
||
#define KMER_START(tid, max_kmer, num_thread) ((uint64_t)(max_kmer) * (uint64_t)(tid) / (uint64_t)(num_thread))
|
||
#define KMER_END(tid, max_kmer, num_thread) ((uint64_t)(max_kmer) * (uint64_t)((tid) + 1) / (uint64_t)(num_thread))
|
||
|
||
///////////////////////////
|
||
|
||
typedef struct {
|
||
uint64_t size;
|
||
uint64_t cap;
|
||
uint8_t* addr;
|
||
} BlockData;
|
||
|
||
#define _bd_init(bd) \
|
||
do { \
|
||
(bd).size = 0; \
|
||
(bd).cap = 0; \
|
||
(bd).addr = 0; \
|
||
} while (0)
|
||
|
||
#define _bd_claim(bd, add_size) \
|
||
do { \
|
||
if ((bd).size + add_size >= (bd).cap) { \
|
||
(bd).cap += add_size + 4L * 1024 * 1024; \
|
||
(bd).addr = (uint8_t*)realloc((bd).addr, (bd).cap); \
|
||
} \
|
||
} while (0)
|
||
|
||
#define _bd_addr(bd) ((bd).addr + (bd).size)
|
||
|
||
#define _bd_destory(bd) \
|
||
do { \
|
||
free((bd).addr); \
|
||
} while (0)
|
||
|
||
typedef struct {
|
||
uint32_t idx;
|
||
uint32_t cnt;
|
||
uint32_t bytes;
|
||
} idx_cnt_t;
|
||
|
||
// 构建hyb-index时候用到的节点数据结构
|
||
typedef struct _Node {
|
||
uint8_t header; // 1字节头部信息 [0]
|
||
uint8_t hit_bytes; // 用多少bytes表示hits
|
||
uint8_t off_bytes; // 用多少bytes表示子节点数据偏移量
|
||
|
||
uint8_t hits_neq; // 和子节点hits数量不相等
|
||
uint8_t hits_neq1; // 子节点和孙子节点hits数量不相等
|
||
uint8_t max_depth_leaf; // 是否达到最大深度的叶节点
|
||
uint8_t normal_leaf; // 正常的叶子节点,hits=1
|
||
|
||
uint16_t seq_len; // 单一分支碱基序列长度,如果分支,那就是1
|
||
uint16_t depth; // 当前节点对应的位置,0对应着16(不包含16-kmer)
|
||
uint16_t min_child_depth; // 最近到hit=1的深度
|
||
uint16_t max_child_depth; // 最远到hit=1或者到达最大深度时的深度
|
||
|
||
uint32_t hits; // 匹配个数
|
||
uint32_t child_num; // 有多少分支子节点, 第一层子节点个数
|
||
uint32_t num_offspring; // 所有后代节点个数
|
||
uint32_t min_offspring_hits; // 所有后代中,最后最少的hits
|
||
uint32_t node_bytes; // 该节点包含所有后代所占内存
|
||
uint32_t child_bytes; // 子节点所占内存
|
||
|
||
uint8_t type; // 当前节点类型
|
||
uint64_t mark; // 与type对应,4bit,16bit,64bit
|
||
uint64_t sa_pos; // 对应的sa起始位置
|
||
uint8_t seq[320]; // 保存所有单一分支的碱基序列
|
||
|
||
uint8_t kmer; // 当前节点对应的kmer,root节点是uint64,用这个不准确
|
||
|
||
struct _Node* children[64]; // 子节点
|
||
struct _Node* parent; // 父节点
|
||
} Node;
|
||
|
||
// 并行构建hyb-index的数据结构
|
||
typedef struct {
|
||
pthread_t ptid; // 线程ID
|
||
int tid; // 线程id
|
||
uint64_t kmer_start; // [) 左闭右开
|
||
uint64_t kmer_end;
|
||
uint64_t bytes; // 当前线程所有kmer数据占用的内存字节数
|
||
uint8_t* kmer_data; // kmer数据
|
||
uint64_t prev_bd_size; // 前一个kmer对应的index数据所占内存大小,用来调整多线程kmer数据对应的地址偏移量的
|
||
BlockData idata; // 二级索引数据
|
||
bwt_t* bwt; // bwt索引
|
||
} PartKmer;
|
||
|
||
typedef kvec_t(PartKmer) PartKmerVec;
|
||
|
||
// 当hit变成1后,继续扩展节点,为了保存2-kmer或者3-kmer对应的匹配信息
|
||
typedef struct {
|
||
Node* node;
|
||
uint64_t mark;
|
||
} MarkNode;
|
||
|
||
/////////////////
|
||
|
||
// 正向搜索kmer(hyb的16-kmer)在bwt中的区间信息
|
||
bwtintv_t bwt_search_kmer(const bwt_t* bwt, const uint8_t* q) {
|
||
bwtintv_t ik = {0}, ok[4] = {0};
|
||
int i, x, c;
|
||
uint32_t qbit = 0;
|
||
x = HASH_KMER_LEN;
|
||
for (i = 0; i < x; ++i) {
|
||
qbit |= q[i] << ((HASH_KMER_LEN - 1 - i) << 1);
|
||
}
|
||
bwt_kmer_get(&bwt->kmer_hash, &ik, qbit, x - 1);
|
||
if (ik.x[2] < 1)
|
||
return ik;
|
||
for (i = x; i < HYB_KMER_LEN; ++i) {
|
||
c = 3 - q[i];
|
||
bwt_extend(bwt, &ik, ok, 0);
|
||
if (ok[c].x[2] < 1)
|
||
return ok[c];
|
||
ik = ok[c];
|
||
}
|
||
return ik;
|
||
}
|
||
|
||
// 根据子节点更新父节点信息
|
||
static void update_parent(Node* n, Node* c, uint64_t k, int update_max) {
|
||
// 创建完子节点后,更新一些数据
|
||
n->children[n->child_num++] = c;
|
||
c->kmer = k;
|
||
c->parent = n;
|
||
n->mark |= (1ULL << k); // 设置当前节点的mark
|
||
n->min_offspring_hits = MIN(n->min_offspring_hits, c->min_offspring_hits);
|
||
n->min_child_depth = MIN(n->min_child_depth, c->min_child_depth + 1);
|
||
if (update_max)
|
||
n->max_child_depth = MAX(n->max_child_depth, c->max_child_depth + 1);
|
||
n->num_offspring += c->num_offspring + 1;
|
||
if (c->hits_neq)
|
||
n->hits_neq1 = 1; // 子节点和孙子节点hits数量不相等
|
||
}
|
||
|
||
// 清除父节点的一些信息, 不释放内存空间
|
||
static void clear_parent(Node* n) {
|
||
uint32_t i = 0;
|
||
n->max_child_depth = 0;
|
||
for (; i < n->child_num; ++i) n->children[i] = NULL; // 清除子节点
|
||
n->child_num = 0;
|
||
n->mark = 0;
|
||
n->num_offspring = 0;
|
||
n->hits_neq1 = 0;
|
||
}
|
||
|
||
// 更新节点信息
|
||
static void update_node_info(PartKmer* pk, Node* n, bwtintv_t* ik, int depth, int max_depth, int update_min) {
|
||
n->depth = depth; // 当前节点对应的深度
|
||
n->min_offspring_hits = ik->x[2]; // 所有子节点中最小的hits数量
|
||
n->sa_pos = ik->x[0]; // bwt_get_sa(pk->fmt->sa, ik->x[0]); // 对应的sa中第一个位置
|
||
n->hits = ik->x[2]; // 匹配数量
|
||
if (update_min)
|
||
n->min_child_depth = max_depth; // 初始化为当前节点所能达到的最大深度(人为设定的阈值)
|
||
}
|
||
|
||
// 普通的leaf节点扩展,扩展到第2层和第3层
|
||
static void extend_leaf_node(PartKmer* pk, Node* n, bwtintv_t* ik, int depth, int max_depth) {
|
||
n->normal_leaf = 1;
|
||
update_node_info(pk, n, ik, depth, max_depth, 0);
|
||
if (max_depth == 0) {
|
||
return;
|
||
}
|
||
uint8_t k;
|
||
int child_hits = 0;
|
||
bwtintv_t ok[4] = {0};
|
||
bwt_extend(pk->bwt, ik, ok, 0);
|
||
for (k = 0; k < 4; ++k) {
|
||
bwtintv_t iv = ok[3 - k];
|
||
child_hits += iv.x[2];
|
||
if (iv.x[2] > 0) { // hits必须大于0,即有匹配的位置,才创建子节点
|
||
Node* child = (Node*)calloc(1, sizeof(Node));
|
||
child->seq[child->seq_len++] = k; // 保存序列
|
||
// 递归构建子节点
|
||
extend_leaf_node(pk, child, &iv, depth + 1, max_depth - 1);
|
||
// 创建完子节点后,更新一些数据
|
||
update_parent(n, child, k, 0); // 更新父节点信息
|
||
break;
|
||
}
|
||
}
|
||
if (child_hits != (int)n->hits) {
|
||
n->hits_neq = 1; // 当前碱基有一个匹配到了ref的结尾,导致节点hits和子节点hits总和不等
|
||
}
|
||
}
|
||
|
||
// 递归构建hyb-tree
|
||
// n: 父节点
|
||
// depth: kmer之后该碱基在第几个(位置/深度)
|
||
// max_depth: 遍历到最大深度停止(最长有多少碱基, 从当前节点算起)
|
||
static void build_hyb_tree(PartKmer* pk, Node* n, bwtintv_t* ik, int depth, int max_depth) {
|
||
/* 递归终止条件 */
|
||
if (ik->x[2] == 1) { // 如果hits为0或者1,则不再需要子节点
|
||
n->min_child_depth = 0;
|
||
n->normal_leaf = 1; // 正常的叶子节点,hits=1
|
||
extend_leaf_node(pk, n, ik, depth, 2); // 继续扩展两个碱基
|
||
return;
|
||
}
|
||
// 给节点n赋值
|
||
update_node_info(pk, n, ik, depth, max_depth, 1);
|
||
if (max_depth == 0) { // 达到最大深度,不再进行扩展
|
||
n->min_child_depth = 0;
|
||
n->max_depth_leaf = 1; // 达到最大深度的叶节点
|
||
return;
|
||
}
|
||
int child_hits = 0; // 所有子节点hits数量之和
|
||
uint8_t k; // 碱基
|
||
bwtintv_t ok[4] = {0};
|
||
bwt_extend(pk->bwt, ik, ok, 0);
|
||
for (k = 0; k < 4; ++k) { // 1-kmer
|
||
bwtintv_t iv = ok[3 - k];
|
||
child_hits += iv.x[2];
|
||
if (iv.x[2] > 0) { // hits必须大于0,即有匹配的位置,才创建子节点
|
||
Node* child = (Node*)calloc(1, sizeof(Node));
|
||
child->seq[child->seq_len++] = k;
|
||
// 递归构建子节点
|
||
build_hyb_tree(pk, child, &iv, depth + 1, max_depth - 1);
|
||
// 创建完子节点后,更新一些数据
|
||
update_parent(n, child, k, 1); // 更新父节点信息
|
||
}
|
||
}
|
||
// 子节点hits之和应该和父节点hits数量相等,否则,就是当前父节点有一个匹配到了ref的结尾
|
||
if (child_hits != (int)n->hits) {
|
||
n->hits_neq = 1; // 当前碱基有一个匹配到了ref的结尾,导致节点hits和子节点hits总和不等
|
||
}
|
||
}
|
||
|
||
// 将ptr的子节点赋值给n,并释放ptr节点
|
||
static void assign_children(Node* n, Node* ptr) {
|
||
clear_parent(n);
|
||
n->seq[n->seq_len++] = ptr->seq[0]; // 保存序列
|
||
uint32_t i = 0;
|
||
for (; i < ptr->child_num; ++i) {
|
||
update_parent(n, ptr->children[i], ptr->children[i]->seq[0], 1);
|
||
}
|
||
free(ptr); // 释放ptr节点
|
||
}
|
||
|
||
// 处理path node
|
||
static void handle_path_node(Node** n) {
|
||
Node* cur = *n;
|
||
Node* ptr = cur->children[0]; // 只有一个子节点
|
||
while (ptr->child_num == 1 && cur->hits == ptr->hits) { // 继续向下遍历,直到没有子节点, 或父子hits不一致
|
||
cur->seq[cur->seq_len++] = ptr->seq[0]; // 保存序列
|
||
Node* next = ptr->children[0]; // 获取下一个子节点
|
||
free(ptr); // 释放当前节点
|
||
ptr = next;
|
||
}
|
||
if (ptr->child_num == 1) { // hits不一致
|
||
clear_parent(cur);
|
||
update_parent(cur, ptr, ptr->seq[0], 1);
|
||
cur->hits_neq = 1; // 继承子节点的hits_neq
|
||
ptr->seq[ptr->seq_len++] = ptr->seq[0]; // 多保存一次该节点的碱基,因为他也是path节点
|
||
*n = ptr;
|
||
handle_path_node(n); // 递归处理下一个path节点
|
||
} else {
|
||
assign_children(cur, ptr); // 将ptr的子节点赋值给n
|
||
}
|
||
}
|
||
|
||
// dfs遍历当前节点的子节点,扩展到2或3层,然后去掉中间层节点,将后续节点直接与父节点连接, layer = 2 or 3
|
||
static void shrink_node_layer(PartKmer* pk, Node* n, int layer) {
|
||
kvec_t(MarkNode) layer_nodes_stack;
|
||
kv_init(layer_nodes_stack); // dfs遍历的stack
|
||
kvec_t(MarkNode) kmer_stack;
|
||
kv_init(kmer_stack); // 装所有的kmer
|
||
kvec_t(Node*) to_release;
|
||
kv_init(to_release); // 装所有的待删除的node
|
||
|
||
int i;
|
||
// init stack
|
||
for (i = (int)n->child_num - 1; i >= 0; --i) {
|
||
MarkNode ln;
|
||
ln.node = n->children[i];
|
||
ln.mark = 1; // 第一层子节点
|
||
kv_push(MarkNode, layer_nodes_stack, ln);
|
||
}
|
||
// dfs 遍历
|
||
while (kv_size(layer_nodes_stack) > 0) {
|
||
MarkNode ln = kv_pop(layer_nodes_stack);
|
||
if ((int)ln.mark == layer) { // 是第3层的节点,创建对应的kmer
|
||
uint8_t k = 0;
|
||
if (layer == 2)
|
||
k = ln.node->parent->seq[0] << 2 | ln.node->seq[0];
|
||
else if (layer == 3)
|
||
k = ln.node->parent->parent->seq[0] << 4 | ln.node->parent->seq[0] << 2 | ln.node->seq[0];
|
||
MarkNode kmer_node;
|
||
kmer_node.node = ln.node;
|
||
kmer_node.mark = k; // 保存kmer
|
||
kv_push(MarkNode, kmer_stack, kmer_node); // 保存kmer
|
||
} else {
|
||
kv_push(Node*, to_release, ln.node); // 保存当前节点,待释放
|
||
for (i = (int)ln.node->child_num - 1; i >= 0; --i) {
|
||
MarkNode lnn;
|
||
lnn.node = ln.node->children[i];
|
||
lnn.mark = ln.mark + 1;
|
||
kv_push(MarkNode, layer_nodes_stack, lnn);
|
||
}
|
||
}
|
||
}
|
||
// 处理所有kmer
|
||
clear_parent(n);
|
||
int child_hits = 0;
|
||
for (i = 0; i < (int)kv_size(kmer_stack); ++i) {
|
||
MarkNode kmer_node = kv_A(kmer_stack, i);
|
||
child_hits += kmer_node.node->hits; // 统计子节点的hits数量
|
||
update_parent(n, kmer_node.node, kmer_node.mark, 1); // 更新父节点信息
|
||
}
|
||
if (child_hits != (int)n->hits) {
|
||
n->hits_neq = 1; // 当前碱基有一个匹配到了ref的结尾,导致节点hits和子节点hits总和不等
|
||
}
|
||
for (i = 0; i < (int)kv_size(to_release); ++i) {
|
||
free(kv_A(to_release, i));
|
||
}
|
||
kv_destroy(layer_nodes_stack);
|
||
kv_destroy(kmer_stack);
|
||
kv_destroy(to_release);
|
||
}
|
||
|
||
// 构建完hybrid tree之后,从根节点开始遍历,设置节点的类型,构建path节点,3,2,1 kmer节点
|
||
static void update_hyb_tree(PartKmer* pk, Node* n) {
|
||
if (n->child_num == 1 && !n->normal_leaf && !n->max_depth_leaf && !(n->max_child_depth <= 3)) { // path类型
|
||
handle_path_node(&n);
|
||
}
|
||
uint32_t i = 0;
|
||
#define __recursive_update_hyb_tree(pk, n) \
|
||
for (i = 0; i < n->child_num; ++i) { \
|
||
update_hyb_tree(pk, n->children[i]); \
|
||
}
|
||
// 计算子节点的idata地址,和父节点的match_cnt
|
||
// 递归终点
|
||
if (n->max_child_depth == 5) {
|
||
if (n->hits_neq) {
|
||
n->type = HYB_BP_1;
|
||
} else {
|
||
n->type = HYB_BP_2;
|
||
shrink_node_layer(pk, n, 2);
|
||
}
|
||
__recursive_update_hyb_tree(pk, n);
|
||
} else if (n->max_child_depth == 4) {
|
||
n->type = HYB_BP_1;
|
||
__recursive_update_hyb_tree(pk, n);
|
||
} else if (n->max_child_depth == 3) {
|
||
if (n->hits_neq) {
|
||
n->type = HYB_BP_1;
|
||
} else if (n->hits_neq1) {
|
||
n->type = HYB_BP_2;
|
||
shrink_node_layer(pk, n, 2);
|
||
} else {
|
||
n->type = HYB_BP_3;
|
||
shrink_node_layer(pk, n, 3);
|
||
}
|
||
} else if (n->max_child_depth == 2) {
|
||
if (n->hits_neq) {
|
||
n->type = HYB_BP_1;
|
||
} else {
|
||
n->type = HYB_BP_2;
|
||
shrink_node_layer(pk, n, 2);
|
||
}
|
||
} else if (n->max_child_depth == 1) {
|
||
n->type = HYB_BP_1;
|
||
} else {
|
||
if (n->hits_neq) {
|
||
n->type = HYB_BP_1;
|
||
} else if (n->hits_neq1) {
|
||
n->type = HYB_BP_2;
|
||
shrink_node_layer(pk, n, 2);
|
||
} else {
|
||
n->type = HYB_BP_3;
|
||
shrink_node_layer(pk, n, 3);
|
||
}
|
||
__recursive_update_hyb_tree(pk, n);
|
||
}
|
||
}
|
||
|
||
// 根据构建的radix tree,创建hyb-index
|
||
static void calc_node_bytes(Node* n) {
|
||
// 递归终点
|
||
if (n->normal_leaf || n->max_depth_leaf) { // 叶子节点没有子节点
|
||
n->node_bytes = 0;
|
||
return;
|
||
}
|
||
// 先处理子节点
|
||
uint32_t i = 0;
|
||
for (; i < n->child_num; ++i) {
|
||
calc_node_bytes(n->children[i]); // 递归构建子节点
|
||
n->child_bytes += n->children[i]->node_bytes; // 累加子节点所占内存
|
||
}
|
||
// 处理当前节点
|
||
n->node_bytes = n->child_bytes;
|
||
if (n->depth == 0) n->node_bytes += 5; // sa_pos占用5字节
|
||
|
||
// 计算偏移地址字节
|
||
int off_bytes = 0;
|
||
if (n->child_bytes == 0) { // 没有子节点
|
||
off_bytes = 0;
|
||
} else if (n->child_bytes < 256) {
|
||
off_bytes = 1; // 子节点数据偏移量小于256, 1B就行
|
||
} else if (n->child_bytes < 65536) {
|
||
off_bytes = 2; // 2B就行
|
||
} else if (n->child_bytes < 16777216) {
|
||
off_bytes = 3; // 3B就行
|
||
} else if (n->child_bytes < UINT32_MAX) {
|
||
off_bytes = 4; // 使用4B
|
||
} else {
|
||
fprintf(stderr, "too many child bytes: %d\n", n->child_bytes);
|
||
exit(1);
|
||
}
|
||
n->off_bytes = off_bytes; // 保存子节点数据偏移量字节数
|
||
|
||
// 计算hits字节
|
||
int hits_bytes = 0;
|
||
if (n->hits < 256) {
|
||
hits_bytes = 1; // hits小于256, 占用1字节
|
||
} else if (n->hits < 65536) {
|
||
hits_bytes = 2; // hits小于65536, 占用2字节
|
||
} else if (n->hits < 16777216) {
|
||
hits_bytes = 3; // hits小于16777216, 占用3字节
|
||
} else if (n->child_bytes < UINT32_MAX) {
|
||
hits_bytes = 4; // 使用4B
|
||
} else {
|
||
fprintf(stderr, "too many hits: %d\n", n->hits);
|
||
exit(1);
|
||
}
|
||
n->hit_bytes = hits_bytes; // 保存hits字节数
|
||
|
||
if (n->depth == 0 && n->hits > HYB_HIT_THRESH) {
|
||
n->node_bytes += hits_bytes; // 根节点需要保存hits
|
||
}
|
||
|
||
if (n->seq_len > 1) { // path节点
|
||
n->node_bytes += 1; // header, path节点相当于两个节点,一个path,一个普通的
|
||
n->node_bytes += 1; // 结合header 记录碱基path长度,代替mark
|
||
n->node_bytes += ((n->seq_len - 1) * 2 + 7) / 8; // 只有根节点保存本节点的碱基,其他的节点不保存seq[0],因为seq[0]是子节点的碱基
|
||
}
|
||
// 有可能一个path下边还是一个path,因为可能父子hits不一致
|
||
if (n->child_num > 1 || n->seq_len == 1) { // 有多个子节点,或者当前节点不是path节点
|
||
n->node_bytes += 1; // header_bits占用1字节
|
||
// 叶节点只用bit-map来计算hits偏移就行了, 不用保存hits start信息
|
||
int all_child_is_leaf = n->hits == n->child_num || (n->hits_neq && n->hits == n->child_num + 1);
|
||
if (!all_child_is_leaf) {
|
||
n->node_bytes += (hits_bytes + off_bytes) * (n->child_num - 1); // 不保存第一个子节点offset,因为是0
|
||
// for test,首个节点不保存hits start和offset,因为hits start是0或者1,offset是0,这样index-data减少了1GB,总共20GB
|
||
// 这样不能根据概率调整内存位置了
|
||
}
|
||
if (n->type == HYB_BP_1) { // 1-kmer节点
|
||
n->node_bytes += 1; // 1字节的mark
|
||
} else if (n->type == HYB_BP_2) { // 2-kmer节点
|
||
n->node_bytes += 2; // 2字节的mark
|
||
} else if (n->type == HYB_BP_3) { // 3-kmer节点
|
||
n->node_bytes += 8; // 8字节的mark
|
||
} else {
|
||
fprintf(stderr, "Unknown node type: %d\n", (int)n->type);
|
||
exit(1);
|
||
}
|
||
}
|
||
}
|
||
|
||
// 释放所有子节点和当前节点
|
||
static void release_node(Node* p) {
|
||
uint32_t k = 0;
|
||
for (k = 0; k < p->child_num; k++) {
|
||
release_node(p->children[k]);
|
||
}
|
||
free(p);
|
||
}
|
||
|
||
void write_seq_bits(uint8_t* bs, int seq_len, BlockData* idata) {
|
||
if (seq_len > 0) {
|
||
uint32_t num_bytes = ((seq_len << 1) + 7) >> 3;
|
||
uint8_t* bit_seq = (uint8_t*)_bd_addr(*idata);
|
||
uint8_t bybp = 0;
|
||
int i = 0, j = 0, idx = 0;
|
||
for (; i + 3 < seq_len; i += 4) {
|
||
bybp = bs[i] | bs[i + 1] << 2 | bs[i + 2] << 4 | bs[i + 3] << 6;
|
||
*(uint8_t*)&bit_seq[idx++] = bybp;
|
||
}
|
||
bybp = 0;
|
||
for (; i < seq_len; ++i, ++j) {
|
||
bybp |= bs[i] << j * 2;
|
||
}
|
||
*(uint8_t*)&bit_seq[idx++] = bybp;
|
||
idata->size += num_bytes; // 更新数据大小
|
||
}
|
||
}
|
||
|
||
static void write_normal_node_data(Node* n, BlockData* idata, uint8_t write_sa) {
|
||
uint8_t* ptr;
|
||
int i, j;
|
||
int all_child_is_leaf = 0;
|
||
|
||
// 2bit子节点类型, 1bit父子hits是否一致,2bit hits字节数;
|
||
uint8_t header = n->type << 6 | n->hits_neq << 5 | (n->hit_bytes - 1) << 3;
|
||
// 叶节点只用bit-map来计算hits偏移就行了
|
||
all_child_is_leaf = n->hits == n->child_num || (n->hits_neq && n->hits == n->child_num + 1);
|
||
// 3bit子节点地址偏移字节数
|
||
if (all_child_is_leaf) {
|
||
header |= HYB_LEAF_NODE; // 用off_bytes = 7来表示叶节点,不用读取hits和offset信息
|
||
} else {
|
||
header |= n->off_bytes;
|
||
// assert(n->off_bytes != 0); // 这里会出错,为啥有这个断言?
|
||
}
|
||
|
||
// header
|
||
idata->addr[idata->size++] = header; // 写入header
|
||
// sa_pos
|
||
if (n->depth == 0 && write_sa) {
|
||
ptr = (uint8_t*)&n->sa_pos;
|
||
for (i = 0; i < 5; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
|
||
idata->size += i; // 写入sa_pos
|
||
}
|
||
// hits
|
||
if (write_sa && n->depth == 0 && n->hits > HYB_HIT_THRESH) {
|
||
ptr = (uint8_t*)&n->hits;
|
||
for (i = 0; i < n->hit_bytes; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
|
||
idata->size += i; // 写入hits
|
||
}
|
||
|
||
// 处理mark
|
||
if (n->type == HYB_BP_1) {
|
||
idata->addr[idata->size++] = (uint8_t)(n->mark & 0xFF); // 1-kmer节点
|
||
} else if (n->type == HYB_BP_2) {
|
||
ptr = (uint8_t*)&n->mark;
|
||
for (i = 0; i < 2; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
|
||
idata->size += i; // 2-kmer节点
|
||
} else if (n->type == HYB_BP_3) {
|
||
ptr = (uint8_t*)&n->mark;
|
||
for (i = 0; i < 8; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
|
||
idata->size += i; // 3-kmer节点
|
||
}
|
||
|
||
if (all_child_is_leaf) {
|
||
return; // 叶节点只用bit-map来计算hits start就行了
|
||
}
|
||
|
||
// 处理子节点
|
||
uint32_t sa_start = 0;
|
||
uint32_t offset = 0; // 子节点数据偏移量
|
||
// 剩下的节点
|
||
for (j = 1; j < (int)n->child_num; ++j) {
|
||
sa_start += n->children[j - 1]->hits; // 累加前一个子节点的hits
|
||
ptr = (uint8_t*)&sa_start;
|
||
for (i = 0; i < n->hit_bytes; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
|
||
idata->size += i; // 写入sa_start
|
||
if (n->children[j]->node_bytes == 0 && n->children[j]->hits > 1 && !n->children[j]->max_depth_leaf) {
|
||
fprintf(stderr, "Error: no child! hits:%u, depth:%u, max-leaf:%u\n", n->children[j]->hits, n->children[j]->depth,
|
||
n->children[j]->max_depth_leaf);
|
||
}
|
||
offset += n->children[j - 1]->node_bytes;
|
||
ptr = (uint8_t*)&offset;
|
||
for (i = 0; i < n->off_bytes; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
|
||
idata->size += i; // 写入offset
|
||
}
|
||
}
|
||
|
||
// 写入path node数据,一个path node包含path和normal两种数据
|
||
static void write_path_node_data(Node* n, BlockData* idata) {
|
||
// 2bit子节点类型, 1bit父子hits是否一致,2bit hits字节数;
|
||
uint8_t header = HYB_BP_PATH << 6 | n->hits_neq << 5 | (n->hit_bytes - 1) << 3;
|
||
int path_len = n->seq_len - 1; // path节点的长度
|
||
header |= path_len >> 8; // 此时,header最低位保存path_len的高8位(9bit足以表示path的长度)
|
||
idata->addr[idata->size++] = header; // 写入header
|
||
int i;
|
||
uint8_t* ptr;
|
||
if (n->depth == 0) {
|
||
ptr = (uint8_t*)&n->sa_pos;
|
||
for (i = 0; i < 5; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
|
||
idata->size += i; // 写入sa_pos
|
||
}
|
||
// 处理hits
|
||
if (n->depth == 0 && n->hits > HYB_HIT_THRESH) {
|
||
ptr = (uint8_t*)&n->hits;
|
||
for (i = 0; i < n->hit_bytes; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
|
||
idata->size += i; // 写入hits
|
||
}
|
||
// 处理mark
|
||
idata->addr[idata->size++] = (uint8_t)(path_len & 0xFF); // 写入mark
|
||
// 写入seq
|
||
write_seq_bits(n->seq + 1, path_len, idata); // 跳过seq[0],因为seq[0]是子节点的碱基
|
||
// 继续把该节点当做正常节点处理
|
||
if (n->child_num > 1)
|
||
write_normal_node_data(n, idata, 0); // 写入正常节点数据
|
||
}
|
||
|
||
/*
|
||
header-1B: 2bit子节点类型,1bit父子hits是否一致,2bit hits字节,3bit子节点数据偏移量字节,
|
||
*/
|
||
static void build_index_from_node(Node* n, BlockData* idata) {
|
||
if (n->normal_leaf || n->max_depth_leaf) { // 叶子节点没有子节点
|
||
return;
|
||
}
|
||
uint32_t i = 0;
|
||
_bd_claim(*idata, n->node_bytes); // 确保有足够的内存空间
|
||
|
||
// 先处理当前节点
|
||
// 1. 写入header
|
||
// 2. 如果是根节点,写入sa_pos
|
||
// 3. 写入hits
|
||
// 4. 写入mark
|
||
// 5. 写入子节点sa_pos偏移量和子节点数据偏移量
|
||
uint32_t node_self_size = idata->size;
|
||
if (n->seq_len > 1) {
|
||
write_path_node_data(n, idata); // 写入path节点数据
|
||
} else {
|
||
write_normal_node_data(n, idata, 1); // 写入正常节点数据
|
||
}
|
||
node_self_size = idata->size - node_self_size; // 当前节点数据大小
|
||
if (node_self_size != n->node_bytes - n->child_bytes) {
|
||
fprintf(stderr, "Error: node bytes not match: %u, %u, %u, %u, %u, %u, %u, %u, %u, %u\n", node_self_size,
|
||
n->node_bytes - n->child_bytes, n->node_bytes, n->depth, n->max_child_depth, n->seq_len, n->hits, n->child_num,
|
||
n->hits_neq, (int)n->type);
|
||
Node* p = n;
|
||
fprintf(stderr, "Info: father neq: %u, %u, %u\n", p->parent->hits_neq, p->parent->child_num, p->parent->hits);
|
||
while (p != NULL) {
|
||
fprintf(stderr, "Info: hits: %u, depth: %u, childs: %u, hits_neq: %u, seq_len: %u, max_depth: %u\n", p->hits,
|
||
p->depth, p->child_num, p->hits_neq, p->seq_len, p->max_child_depth);
|
||
p = p->children[0]; // 只打印第一个子节点
|
||
}
|
||
p = n;
|
||
while (p->depth > 0) {
|
||
fprintf(stderr, "Info: hits: %u, depth: %u, childs: %u, hits_neq: %u, seq_len: %u, max_depth: %u\n", p->hits,
|
||
p->depth, p->child_num, p->hits_neq, p->seq_len, p->max_child_depth);
|
||
p = p->parent; // 向上打印父节点
|
||
}
|
||
fprintf(stderr, "Info: hits: %u, depth: %u, childs: %u, hits_neq: %u, seq_len: %u, max_depth: %u\n", p->hits, p->depth,
|
||
p->child_num, p->hits_neq, p->seq_len, p->max_child_depth);
|
||
|
||
exit(0);
|
||
}
|
||
|
||
for (i = 0; i < n->child_num; ++i) { // 这里需要按照匹配概率来修改
|
||
build_index_from_node(n->children[i], idata); // 递归构建子节点
|
||
}
|
||
}
|
||
|
||
// 多线程创建hybrid index
|
||
static void* thread_build_hyb_index(void* arg) {
|
||
PartKmer* p = (PartKmer*)arg;
|
||
uint8_t kmer[HYB_KMER_LEN + 1];
|
||
uint64_t all_kmer_num = p->kmer_end - p->kmer_start;
|
||
uint64_t percent_mod = all_kmer_num / 100;
|
||
int percent = 0;
|
||
if (percent_mod == 0)
|
||
percent_mod = 1;
|
||
|
||
uint8_t type_hits = 0;
|
||
uint64_t ref_pos = 0;
|
||
uint64_t sa_forward = 0;
|
||
uint64_t idata_offset = 0;
|
||
uint64_t packed_data = 0;
|
||
uint8_t* ptr = (uint8_t*)&packed_data;
|
||
uint64_t i, j, kmer_size;
|
||
fprintf(stderr, "[thread %d] start building hybrid index for kmer range [%lu, %lu)\n", p->tid, p->kmer_start, p->kmer_end);
|
||
for (i = p->kmer_start, kmer_size = 0; i < p->kmer_end; ++i, ++kmer_size) {
|
||
uint8_t* kd = p->kmer_data + i * HYB_KMER_DATA_BYTES;
|
||
for (j = 0; j < HYB_KMER_LEN; ++j) {
|
||
kmer[j] = (i >> (j << 1)) & 3;
|
||
}
|
||
bwtintv_t iv = bwt_search_kmer(p->bwt, kmer);
|
||
const uint32_t khits = iv.x[2]; // 有多少个匹配的位置
|
||
if (khits == 0) {
|
||
// no extra bytes
|
||
packed_data = 0;
|
||
for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off);
|
||
} else if (khits == 1) { // 只有一个匹配,直接保存对应的参考基因位置
|
||
// no extra bytes
|
||
type_hits = 1;
|
||
ref_pos = bwt_get_sa(p->bwt->byte_sa, iv.x[0]); // 直接记录kmer对应的reference pos
|
||
packed_data = (ref_pos << HYB_KMER_DATA_TYPE_BITS) | type_hits;
|
||
// 这里需要注意,packed_data是小端模式的, 5个字节
|
||
for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off);
|
||
} else if (khits == 2) { // 只有两个匹配,直接保存对应的suffix array位置
|
||
// no extra bytes
|
||
type_hits = 2;
|
||
sa_forward = iv.x[0]; // 记录kmer对应的后缀数组的位置
|
||
packed_data = (sa_forward << HYB_KMER_DATA_TYPE_BITS) | type_hits;
|
||
for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off);
|
||
} else {
|
||
Node* root = (Node*)calloc(1, sizeof(Node));
|
||
root->seq[root->seq_len++] = kmer[HYB_KMER_LEN - 1];
|
||
build_hyb_tree(p, root, &iv, 0, HYB_MAX_SEQ_LEN - HYB_KMER_LEN); // 创建基数树
|
||
update_hyb_tree(p, root); // 更新hybrid tree
|
||
idata_offset = p->idata.size; // 指向当前要用的地址偏移量(起始位置)
|
||
calc_node_bytes(root); // 计算每个节点所占内存
|
||
build_index_from_node(root, &p->idata); // 构建hybrid index
|
||
// 将type和地址偏移信息写入kmer
|
||
if (khits <= HYB_HIT_THRESH) { // hits数量小于等于30
|
||
type_hits = khits; // hits数量
|
||
} else {
|
||
type_hits = HYB_HIT_THRESH + 1;
|
||
}
|
||
packed_data = (idata_offset << HYB_KMER_DATA_TYPE_BITS) | type_hits;
|
||
for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off);
|
||
p->bytes += root->node_bytes; // 累加当前节点所占内存
|
||
release_node(root); // 释放所有子节点和当前节点
|
||
}
|
||
|
||
if (p->tid == 0 && kmer_size % percent_mod == 0) {
|
||
fprintf(stderr, "[thread %d] %d %% done, bytes: %fM\n", p->tid, percent++, (double)p->bytes / 1024 / 1024);
|
||
}
|
||
}
|
||
|
||
fprintf(stderr, "Finish-[thread %d] bytes: %fM\n", p->tid, (double)p->bytes / 1024 / 1024);
|
||
return NULL;
|
||
}
|
||
|
||
// 多线程调整hybrid index中的block data偏移地址,整合成连续的一块地址空间
|
||
static void* thread_adjust_hyb_index(void* arg) {
|
||
PartKmer* p = (PartKmer*)arg;
|
||
uint64_t all_kmer_num = p->kmer_end - p->kmer_start;
|
||
uint64_t percent_mod = all_kmer_num / 100;
|
||
int percent = 0;
|
||
if (percent_mod == 0) percent_mod = 1;
|
||
uint8_t type_hits = 0;
|
||
uint64_t bd_offset = 0;
|
||
uint64_t packed_data = 0;
|
||
uint8_t* ptr = (uint8_t*)&packed_data;
|
||
uint64_t i, kmer_size;
|
||
for (i = p->kmer_start, kmer_size = 0; i < p->kmer_end; ++i, ++kmer_size) {
|
||
uint8_t* kd = p->kmer_data + i * HYB_KMER_DATA_TYPE_BITS;
|
||
type_hits = *(kd)&HYB_KMER_DATA_TYPE_MASK;
|
||
if (type_hits > 2) { // block data index数据,需要改offset
|
||
// bd_offset = ((*(uint64_t *)kd) >> kTypeBits) & p->KMER_MASK; // 指向当前要用的地址偏移量
|
||
bd_offset = ((*(uint64_t*)kd) & HYB_KMER_DATA_MASK) >> HYB_KMER_DATA_TYPE_BITS; // 指向当前要用的地址偏移量
|
||
bd_offset += p->prev_bd_size;
|
||
packed_data = (bd_offset << HYB_KMER_DATA_TYPE_BITS) | type_hits;
|
||
for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off);
|
||
}
|
||
|
||
if (p->tid == 0 && kmer_size % percent_mod == 0) {
|
||
fprintf(stderr, "Info: %d%% done adjust.\n", percent++);
|
||
}
|
||
}
|
||
return NULL;
|
||
}
|
||
|
||
// 保存hybrid index
|
||
static void save_hybrid_idx(const char* prefix, PartKmerVec* pkArr) {
|
||
// fprintf(stderr, "size: %ld\n", kv_size(*pkArr));
|
||
// return;
|
||
char fn[MAX_PATH];
|
||
char fn2[MAX_PATH];
|
||
|
||
sprintf(fn, "%s.hyb.kmer", prefix);
|
||
sprintf(fn2, "%s.hyb.data", prefix);
|
||
|
||
FILE* fpk = fopen(fn, "wb");
|
||
FILE* fpbd = fopen(fn2, "wb");
|
||
|
||
if (fpk == NULL) {
|
||
fprintf(stderr, "Error: open file %s failed\n", fn);
|
||
return;
|
||
}
|
||
if (fpbd == NULL) {
|
||
fprintf(stderr, "Error: open file %s failed", fn2);
|
||
return;
|
||
}
|
||
|
||
uint64_t max_kmer = (1ULL << (HYB_KMER_LEN << 1));
|
||
err_fwrite(kv_A(*pkArr, 0).kmer_data, 1, max_kmer * HYB_KMER_DATA_BYTES, fpk); // 小端
|
||
int i;
|
||
for (i = 0; i < kv_size(*pkArr); ++i) {
|
||
err_fwrite(kv_A(*pkArr, i).idata.addr, 1, kv_A(*pkArr, i).idata.size, fpbd);
|
||
}
|
||
fclose(fpk);
|
||
fclose(fpbd);
|
||
}
|
||
|
||
// 构建hybrid-index入口函数
|
||
int hyb_idx_build_and_dump(int num_threads, bwt_t* bwt, const char* idx_prefix) {
|
||
PartKmerVec pk_arr;
|
||
kv_init(pk_arr);
|
||
kv_resize(PartKmer, pk_arr, num_threads);
|
||
pk_arr.n = num_threads;
|
||
|
||
kvec_t(pthread_t) tid_arr;
|
||
kv_init(tid_arr);
|
||
kv_resize(pthread_t, tid_arr, num_threads);
|
||
tid_arr.n = num_threads;
|
||
|
||
uint64_t max_kmer = (1ULL << (HYB_KMER_LEN << 1));
|
||
uint8_t* kmer_data = (uint8_t*)malloc(max_kmer * HYB_KMER_DATA_BYTES);
|
||
|
||
int i;
|
||
for (i = 0; i < num_threads; ++i) {
|
||
PartKmer* p = &kv_A(pk_arr, i);
|
||
p->tid = i;
|
||
p->kmer_start = KMER_START(i, max_kmer, num_threads);
|
||
p->kmer_end = KMER_END(i, max_kmer, num_threads);
|
||
p->bytes = 0;
|
||
p->kmer_data = kmer_data;
|
||
p->prev_bd_size = 0;
|
||
_bd_init(p->idata);
|
||
p->bwt = bwt;
|
||
pthread_create(&kv_A(tid_arr, i), NULL, thread_build_hyb_index, p);
|
||
}
|
||
|
||
uint64_t all_bytes = 0;
|
||
uint64_t prev_bd_size = 0;
|
||
for (i = 0; i < num_threads; ++i) {
|
||
pthread_join(kv_A(tid_arr, i), NULL);
|
||
PartKmer* p = &kv_A(pk_arr, i);
|
||
all_bytes += p->bytes;
|
||
p->prev_bd_size = prev_bd_size;
|
||
prev_bd_size += p->idata.size;
|
||
}
|
||
|
||
for (i = 0; i < num_threads; ++i) {
|
||
pthread_create(&kv_A(tid_arr, i), NULL, thread_adjust_hyb_index, &kv_A(pk_arr, i));
|
||
}
|
||
for (int i = 0; i < num_threads; ++i) {
|
||
pthread_join(kv_A(tid_arr, i), NULL);
|
||
}
|
||
|
||
fprintf(stderr, "Hybrid index total bytes: %fM\n", (double)all_bytes / 1024 / 1024);
|
||
|
||
double time_sava_hybrid_idx = realtime();
|
||
save_hybrid_idx(idx_prefix, &pk_arr);
|
||
time_sava_hybrid_idx = realtime() - time_sava_hybrid_idx;
|
||
|
||
fprintf(stderr, "Time save hybrid index: %fs\n", time_sava_hybrid_idx);
|
||
|
||
kv_destroy(pk_arr);
|
||
kv_destroy(tid_arr);
|
||
|
||
return 0;
|
||
} |