/* The MIT License Copyright (c) 2025- Zhonghai Zhang (ICT) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include #include #include #include "bwt.h" #include "hyb_idx.h" #include "kvec.h" #include "rle.h" #include "rope.h" #include "utils.h" #define KMER_START(tid, max_kmer, num_thread) ((uint64_t)(max_kmer) * (uint64_t)(tid) / (uint64_t)(num_thread)) #define KMER_END(tid, max_kmer, num_thread) ((uint64_t)(max_kmer) * (uint64_t)((tid) + 1) / (uint64_t)(num_thread)) /////////////////////////// typedef struct { uint64_t size; uint64_t cap; uint8_t* addr; } BlockData; #define _bd_init(bd) \ do { \ (bd).size = 0; \ (bd).cap = 0; \ (bd).addr = 0; \ } while (0) #define _bd_claim(bd, add_size) \ do { \ if ((bd).size + add_size >= (bd).cap) { \ (bd).cap += add_size + 4L * 1024 * 1024; \ (bd).addr = (uint8_t*)realloc((bd).addr, (bd).cap); \ } \ } while (0) #define _bd_addr(bd) ((bd).addr + (bd).size) #define _bd_destory(bd) \ do { \ free((bd).addr); \ } while (0) typedef struct { uint32_t idx; uint32_t cnt; uint32_t bytes; } idx_cnt_t; // 构建hyb-index时候用到的节点数据结构 typedef struct _Node { uint8_t header; // 1字节头部信息 [0] uint8_t hit_bytes; // 用多少bytes表示hits uint8_t off_bytes; // 用多少bytes表示子节点数据偏移量 uint8_t hits_neq; // 和子节点hits数量不相等 uint8_t hits_neq1; // 子节点和孙子节点hits数量不相等 uint8_t max_depth_leaf; // 是否达到最大深度的叶节点 uint8_t normal_leaf; // 正常的叶子节点,hits=1 uint16_t seq_len; // 单一分支碱基序列长度,如果分支,那就是1 uint16_t depth; // 当前节点对应的位置,0对应着16(不包含16-kmer) uint16_t min_child_depth; // 最近到hit=1的深度 uint16_t max_child_depth; // 最远到hit=1或者到达最大深度时的深度 uint32_t hits; // 匹配个数 uint32_t child_num; // 有多少分支子节点, 第一层子节点个数 uint32_t num_offspring; // 所有后代节点个数 uint32_t min_offspring_hits; // 所有后代中,最后最少的hits uint32_t node_bytes; // 该节点包含所有后代所占内存 uint32_t child_bytes; // 子节点所占内存 uint8_t type; // 当前节点类型 uint64_t mark; // 与type对应,4bit,16bit,64bit uint64_t sa_pos; // 对应的sa起始位置 uint8_t seq[320]; // 保存所有单一分支的碱基序列 uint8_t kmer; // 当前节点对应的kmer,root节点是uint64,用这个不准确 struct _Node* children[64]; // 子节点 struct _Node* parent; // 父节点 } Node; // 并行构建hyb-index的数据结构 typedef struct { pthread_t ptid; // 线程ID int tid; // 线程id uint64_t kmer_start; // [) 左闭右开 uint64_t kmer_end; uint64_t bytes; // 当前线程所有kmer数据占用的内存字节数 uint8_t* kmer_data; // kmer数据 uint64_t prev_bd_size; // 前一个kmer对应的index数据所占内存大小,用来调整多线程kmer数据对应的地址偏移量的 BlockData idata; // 二级索引数据 bwt_t* bwt; // bwt索引 } PartKmer; typedef kvec_t(PartKmer) PartKmerVec; // 当hit变成1后,继续扩展节点,为了保存2-kmer或者3-kmer对应的匹配信息 typedef struct { Node* node; uint64_t mark; } MarkNode; ///////////////// // 正向搜索kmer(hyb的16-kmer)在bwt中的区间信息 bwtintv_t bwt_search_kmer(const bwt_t* bwt, const uint8_t* q) { bwtintv_t ik = {0}, ok[4] = {0}; int i, x, c; uint32_t qbit = 0; x = HASH_KMER_LEN; for (i = 0; i < x; ++i) { qbit |= q[i] << ((HASH_KMER_LEN - 1 - i) << 1); } bwt_kmer_get(&bwt->kmer_hash, &ik, qbit, x - 1); if (ik.x[2] < 1) return ik; for (i = x; i < HYB_KMER_LEN; ++i) { c = 3 - q[i]; bwt_extend(bwt, &ik, ok, 0); if (ok[c].x[2] < 1) return ok[c]; ik = ok[c]; } return ik; } // 根据子节点更新父节点信息 static void update_parent(Node* n, Node* c, uint64_t k, int update_max) { // 创建完子节点后,更新一些数据 n->children[n->child_num++] = c; c->kmer = k; c->parent = n; n->mark |= (1ULL << k); // 设置当前节点的mark n->min_offspring_hits = MIN(n->min_offspring_hits, c->min_offspring_hits); n->min_child_depth = MIN(n->min_child_depth, c->min_child_depth + 1); if (update_max) n->max_child_depth = MAX(n->max_child_depth, c->max_child_depth + 1); n->num_offspring += c->num_offspring + 1; if (c->hits_neq) n->hits_neq1 = 1; // 子节点和孙子节点hits数量不相等 } // 清除父节点的一些信息, 不释放内存空间 static void clear_parent(Node* n) { uint32_t i = 0; n->max_child_depth = 0; for (; i < n->child_num; ++i) n->children[i] = NULL; // 清除子节点 n->child_num = 0; n->mark = 0; n->num_offspring = 0; n->hits_neq1 = 0; } // 更新节点信息 static void update_node_info(PartKmer* pk, Node* n, bwtintv_t* ik, int depth, int max_depth, int update_min) { n->depth = depth; // 当前节点对应的深度 n->min_offspring_hits = ik->x[2]; // 所有子节点中最小的hits数量 n->sa_pos = ik->x[0]; // bwt_get_sa(pk->fmt->sa, ik->x[0]); // 对应的sa中第一个位置 n->hits = ik->x[2]; // 匹配数量 if (update_min) n->min_child_depth = max_depth; // 初始化为当前节点所能达到的最大深度(人为设定的阈值) } // 普通的leaf节点扩展,扩展到第2层和第3层 static void extend_leaf_node(PartKmer* pk, Node* n, bwtintv_t* ik, int depth, int max_depth) { n->normal_leaf = 1; update_node_info(pk, n, ik, depth, max_depth, 0); if (max_depth == 0) { return; } uint8_t k; int child_hits = 0; bwtintv_t ok[4] = {0}; bwt_extend(pk->bwt, ik, ok, 0); for (k = 0; k < 4; ++k) { bwtintv_t iv = ok[3 - k]; child_hits += iv.x[2]; if (iv.x[2] > 0) { // hits必须大于0,即有匹配的位置,才创建子节点 Node* child = (Node*)calloc(1, sizeof(Node)); child->seq[child->seq_len++] = k; // 保存序列 // 递归构建子节点 extend_leaf_node(pk, child, &iv, depth + 1, max_depth - 1); // 创建完子节点后,更新一些数据 update_parent(n, child, k, 0); // 更新父节点信息 break; } } if (child_hits != (int)n->hits) { n->hits_neq = 1; // 当前碱基有一个匹配到了ref的结尾,导致节点hits和子节点hits总和不等 } } // 递归构建hyb-tree // n: 父节点 // depth: kmer之后该碱基在第几个(位置/深度) // max_depth: 遍历到最大深度停止(最长有多少碱基, 从当前节点算起) static void build_hyb_tree(PartKmer* pk, Node* n, bwtintv_t* ik, int depth, int max_depth) { /* 递归终止条件 */ if (ik->x[2] == 1) { // 如果hits为0或者1,则不再需要子节点 n->min_child_depth = 0; n->normal_leaf = 1; // 正常的叶子节点,hits=1 extend_leaf_node(pk, n, ik, depth, 2); // 继续扩展两个碱基 return; } // 给节点n赋值 update_node_info(pk, n, ik, depth, max_depth, 1); if (max_depth == 0) { // 达到最大深度,不再进行扩展 n->min_child_depth = 0; n->max_depth_leaf = 1; // 达到最大深度的叶节点 return; } int child_hits = 0; // 所有子节点hits数量之和 uint8_t k; // 碱基 bwtintv_t ok[4] = {0}; bwt_extend(pk->bwt, ik, ok, 0); for (k = 0; k < 4; ++k) { // 1-kmer bwtintv_t iv = ok[3 - k]; child_hits += iv.x[2]; if (iv.x[2] > 0) { // hits必须大于0,即有匹配的位置,才创建子节点 Node* child = (Node*)calloc(1, sizeof(Node)); child->seq[child->seq_len++] = k; // 递归构建子节点 build_hyb_tree(pk, child, &iv, depth + 1, max_depth - 1); // 创建完子节点后,更新一些数据 update_parent(n, child, k, 1); // 更新父节点信息 } } // 子节点hits之和应该和父节点hits数量相等,否则,就是当前父节点有一个匹配到了ref的结尾 if (child_hits != (int)n->hits) { n->hits_neq = 1; // 当前碱基有一个匹配到了ref的结尾,导致节点hits和子节点hits总和不等 } } // 将ptr的子节点赋值给n,并释放ptr节点 static void assign_children(Node* n, Node* ptr) { clear_parent(n); n->seq[n->seq_len++] = ptr->seq[0]; // 保存序列 uint32_t i = 0; for (; i < ptr->child_num; ++i) { update_parent(n, ptr->children[i], ptr->children[i]->seq[0], 1); } free(ptr); // 释放ptr节点 } // 处理path node static void handle_path_node(Node** n) { Node* cur = *n; Node* ptr = cur->children[0]; // 只有一个子节点 while (ptr->child_num == 1 && cur->hits == ptr->hits) { // 继续向下遍历,直到没有子节点, 或父子hits不一致 cur->seq[cur->seq_len++] = ptr->seq[0]; // 保存序列 Node* next = ptr->children[0]; // 获取下一个子节点 free(ptr); // 释放当前节点 ptr = next; } if (ptr->child_num == 1) { // hits不一致 clear_parent(cur); update_parent(cur, ptr, ptr->seq[0], 1); cur->hits_neq = 1; // 继承子节点的hits_neq ptr->seq[ptr->seq_len++] = ptr->seq[0]; // 多保存一次该节点的碱基,因为他也是path节点 *n = ptr; handle_path_node(n); // 递归处理下一个path节点 } else { assign_children(cur, ptr); // 将ptr的子节点赋值给n } } // dfs遍历当前节点的子节点,扩展到2或3层,然后去掉中间层节点,将后续节点直接与父节点连接, layer = 2 or 3 static void shrink_node_layer(PartKmer* pk, Node* n, int layer) { kvec_t(MarkNode) layer_nodes_stack; kv_init(layer_nodes_stack); // dfs遍历的stack kvec_t(MarkNode) kmer_stack; kv_init(kmer_stack); // 装所有的kmer kvec_t(Node*) to_release; kv_init(to_release); // 装所有的待删除的node int i; // init stack for (i = (int)n->child_num - 1; i >= 0; --i) { MarkNode ln; ln.node = n->children[i]; ln.mark = 1; // 第一层子节点 kv_push(MarkNode, layer_nodes_stack, ln); } // dfs 遍历 while (kv_size(layer_nodes_stack) > 0) { MarkNode ln = kv_pop(layer_nodes_stack); if ((int)ln.mark == layer) { // 是第3层的节点,创建对应的kmer uint8_t k = 0; if (layer == 2) k = ln.node->parent->seq[0] << 2 | ln.node->seq[0]; else if (layer == 3) k = ln.node->parent->parent->seq[0] << 4 | ln.node->parent->seq[0] << 2 | ln.node->seq[0]; MarkNode kmer_node; kmer_node.node = ln.node; kmer_node.mark = k; // 保存kmer kv_push(MarkNode, kmer_stack, kmer_node); // 保存kmer } else { kv_push(Node*, to_release, ln.node); // 保存当前节点,待释放 for (i = (int)ln.node->child_num - 1; i >= 0; --i) { MarkNode lnn; lnn.node = ln.node->children[i]; lnn.mark = ln.mark + 1; kv_push(MarkNode, layer_nodes_stack, lnn); } } } // 处理所有kmer clear_parent(n); int child_hits = 0; for (i = 0; i < (int)kv_size(kmer_stack); ++i) { MarkNode kmer_node = kv_A(kmer_stack, i); child_hits += kmer_node.node->hits; // 统计子节点的hits数量 update_parent(n, kmer_node.node, kmer_node.mark, 1); // 更新父节点信息 } if (child_hits != (int)n->hits) { n->hits_neq = 1; // 当前碱基有一个匹配到了ref的结尾,导致节点hits和子节点hits总和不等 } for (i = 0; i < (int)kv_size(to_release); ++i) { free(kv_A(to_release, i)); } kv_destroy(layer_nodes_stack); kv_destroy(kmer_stack); kv_destroy(to_release); } // 构建完hybrid tree之后,从根节点开始遍历,设置节点的类型,构建path节点,3,2,1 kmer节点 static void update_hyb_tree(PartKmer* pk, Node* n) { if (n->child_num == 1 && !n->normal_leaf && !n->max_depth_leaf && !(n->max_child_depth <= 3)) { // path类型 handle_path_node(&n); } uint32_t i = 0; #define __recursive_update_hyb_tree(pk, n) \ for (i = 0; i < n->child_num; ++i) { \ update_hyb_tree(pk, n->children[i]); \ } // 计算子节点的idata地址,和父节点的match_cnt // 递归终点 if (n->max_child_depth == 5) { if (n->hits_neq) { n->type = HYB_BP_1; } else { n->type = HYB_BP_2; shrink_node_layer(pk, n, 2); } __recursive_update_hyb_tree(pk, n); } else if (n->max_child_depth == 4) { n->type = HYB_BP_1; __recursive_update_hyb_tree(pk, n); } else if (n->max_child_depth == 3) { if (n->hits_neq) { n->type = HYB_BP_1; } else if (n->hits_neq1) { n->type = HYB_BP_2; shrink_node_layer(pk, n, 2); } else { n->type = HYB_BP_3; shrink_node_layer(pk, n, 3); } } else if (n->max_child_depth == 2) { if (n->hits_neq) { n->type = HYB_BP_1; } else { n->type = HYB_BP_2; shrink_node_layer(pk, n, 2); } } else if (n->max_child_depth == 1) { n->type = HYB_BP_1; } else { if (n->hits_neq) { n->type = HYB_BP_1; } else if (n->hits_neq1) { n->type = HYB_BP_2; shrink_node_layer(pk, n, 2); } else { n->type = HYB_BP_3; shrink_node_layer(pk, n, 3); } __recursive_update_hyb_tree(pk, n); } } // 根据构建的radix tree,创建hyb-index static void calc_node_bytes(Node* n) { // 递归终点 if (n->normal_leaf || n->max_depth_leaf) { // 叶子节点没有子节点 n->node_bytes = 0; return; } // 先处理子节点 uint32_t i = 0; for (; i < n->child_num; ++i) { calc_node_bytes(n->children[i]); // 递归构建子节点 n->child_bytes += n->children[i]->node_bytes; // 累加子节点所占内存 } // 处理当前节点 n->node_bytes = n->child_bytes; if (n->depth == 0) n->node_bytes += 5; // sa_pos占用5字节 // 计算偏移地址字节 int off_bytes = 0; if (n->child_bytes == 0) { // 没有子节点 off_bytes = 0; } else if (n->child_bytes < 256) { off_bytes = 1; // 子节点数据偏移量小于256, 1B就行 } else if (n->child_bytes < 65536) { off_bytes = 2; // 2B就行 } else if (n->child_bytes < 16777216) { off_bytes = 3; // 3B就行 } else if (n->child_bytes < UINT32_MAX) { off_bytes = 4; // 使用4B } else { fprintf(stderr, "too many child bytes: %d\n", n->child_bytes); exit(1); } n->off_bytes = off_bytes; // 保存子节点数据偏移量字节数 // 计算hits字节 int hits_bytes = 0; if (n->hits < 256) { hits_bytes = 1; // hits小于256, 占用1字节 } else if (n->hits < 65536) { hits_bytes = 2; // hits小于65536, 占用2字节 } else if (n->hits < 16777216) { hits_bytes = 3; // hits小于16777216, 占用3字节 } else if (n->child_bytes < UINT32_MAX) { hits_bytes = 4; // 使用4B } else { fprintf(stderr, "too many hits: %d\n", n->hits); exit(1); } n->hit_bytes = hits_bytes; // 保存hits字节数 if (n->depth == 0 && n->hits > HYB_HIT_THRESH) { n->node_bytes += hits_bytes; // 根节点需要保存hits } if (n->seq_len > 1) { // path节点 n->node_bytes += 1; // header, path节点相当于两个节点,一个path,一个普通的 n->node_bytes += 1; // 结合header 记录碱基path长度,代替mark n->node_bytes += ((n->seq_len - 1) * 2 + 7) / 8; // 只有根节点保存本节点的碱基,其他的节点不保存seq[0],因为seq[0]是子节点的碱基 } // 有可能一个path下边还是一个path,因为可能父子hits不一致 if (n->child_num > 1 || n->seq_len == 1) { // 有多个子节点,或者当前节点不是path节点 n->node_bytes += 1; // header_bits占用1字节 // 叶节点只用bit-map来计算hits偏移就行了, 不用保存hits start信息 int all_child_is_leaf = n->hits == n->child_num || (n->hits_neq && n->hits == n->child_num + 1); if (!all_child_is_leaf) { n->node_bytes += (hits_bytes + off_bytes) * (n->child_num - 1); // 不保存第一个子节点offset,因为是0 // for test,首个节点不保存hits start和offset,因为hits start是0或者1,offset是0,这样index-data减少了1GB,总共20GB // 这样不能根据概率调整内存位置了 } if (n->type == HYB_BP_1) { // 1-kmer节点 n->node_bytes += 1; // 1字节的mark } else if (n->type == HYB_BP_2) { // 2-kmer节点 n->node_bytes += 2; // 2字节的mark } else if (n->type == HYB_BP_3) { // 3-kmer节点 n->node_bytes += 8; // 8字节的mark } else { fprintf(stderr, "Unknown node type: %d\n", (int)n->type); exit(1); } } } // 释放所有子节点和当前节点 static void release_node(Node* p) { uint32_t k = 0; for (k = 0; k < p->child_num; k++) { release_node(p->children[k]); } free(p); } void write_seq_bits(uint8_t* bs, int seq_len, BlockData* idata) { if (seq_len > 0) { uint32_t num_bytes = ((seq_len << 1) + 7) >> 3; uint8_t* bit_seq = (uint8_t*)_bd_addr(*idata); uint8_t bybp = 0; int i = 0, j = 0, idx = 0; for (; i + 3 < seq_len; i += 4) { bybp = bs[i] | bs[i + 1] << 2 | bs[i + 2] << 4 | bs[i + 3] << 6; *(uint8_t*)&bit_seq[idx++] = bybp; } bybp = 0; for (; i < seq_len; ++i, ++j) { bybp |= bs[i] << j * 2; } *(uint8_t*)&bit_seq[idx++] = bybp; idata->size += num_bytes; // 更新数据大小 } } static void write_normal_node_data(Node* n, BlockData* idata, uint8_t write_sa) { uint8_t* ptr; int i, j; int all_child_is_leaf = 0; // 2bit子节点类型, 1bit父子hits是否一致,2bit hits字节数; uint8_t header = n->type << 6 | n->hits_neq << 5 | (n->hit_bytes - 1) << 3; // 叶节点只用bit-map来计算hits偏移就行了 all_child_is_leaf = n->hits == n->child_num || (n->hits_neq && n->hits == n->child_num + 1); // 3bit子节点地址偏移字节数 if (all_child_is_leaf) { header |= HYB_LEAF_NODE; // 用off_bytes = 7来表示叶节点,不用读取hits和offset信息 } else { header |= n->off_bytes; // assert(n->off_bytes != 0); // 这里会出错,为啥有这个断言? } // header idata->addr[idata->size++] = header; // 写入header // sa_pos if (n->depth == 0 && write_sa) { ptr = (uint8_t*)&n->sa_pos; for (i = 0; i < 5; ++i) *(_bd_addr(*idata) + i) = *(ptr + i); idata->size += i; // 写入sa_pos } // hits if (write_sa && n->depth == 0 && n->hits > HYB_HIT_THRESH) { ptr = (uint8_t*)&n->hits; for (i = 0; i < n->hit_bytes; ++i) *(_bd_addr(*idata) + i) = *(ptr + i); idata->size += i; // 写入hits } // 处理mark if (n->type == HYB_BP_1) { idata->addr[idata->size++] = (uint8_t)(n->mark & 0xFF); // 1-kmer节点 } else if (n->type == HYB_BP_2) { ptr = (uint8_t*)&n->mark; for (i = 0; i < 2; ++i) *(_bd_addr(*idata) + i) = *(ptr + i); idata->size += i; // 2-kmer节点 } else if (n->type == HYB_BP_3) { ptr = (uint8_t*)&n->mark; for (i = 0; i < 8; ++i) *(_bd_addr(*idata) + i) = *(ptr + i); idata->size += i; // 3-kmer节点 } if (all_child_is_leaf) { return; // 叶节点只用bit-map来计算hits start就行了 } // 处理子节点 uint32_t sa_start = 0; uint32_t offset = 0; // 子节点数据偏移量 // 剩下的节点 for (j = 1; j < (int)n->child_num; ++j) { sa_start += n->children[j - 1]->hits; // 累加前一个子节点的hits ptr = (uint8_t*)&sa_start; for (i = 0; i < n->hit_bytes; ++i) *(_bd_addr(*idata) + i) = *(ptr + i); idata->size += i; // 写入sa_start if (n->children[j]->node_bytes == 0 && n->children[j]->hits > 1 && !n->children[j]->max_depth_leaf) { fprintf(stderr, "Error: no child! hits:%u, depth:%u, max-leaf:%u\n", n->children[j]->hits, n->children[j]->depth, n->children[j]->max_depth_leaf); } offset += n->children[j - 1]->node_bytes; ptr = (uint8_t*)&offset; for (i = 0; i < n->off_bytes; ++i) *(_bd_addr(*idata) + i) = *(ptr + i); idata->size += i; // 写入offset } } // 写入path node数据,一个path node包含path和normal两种数据 static void write_path_node_data(Node* n, BlockData* idata) { // 2bit子节点类型, 1bit父子hits是否一致,2bit hits字节数; uint8_t header = HYB_BP_PATH << 6 | n->hits_neq << 5 | (n->hit_bytes - 1) << 3; int path_len = n->seq_len - 1; // path节点的长度 header |= path_len >> 8; // 此时,header最低位保存path_len的高8位(9bit足以表示path的长度) idata->addr[idata->size++] = header; // 写入header int i; uint8_t* ptr; if (n->depth == 0) { ptr = (uint8_t*)&n->sa_pos; for (i = 0; i < 5; ++i) *(_bd_addr(*idata) + i) = *(ptr + i); idata->size += i; // 写入sa_pos } // 处理hits if (n->depth == 0 && n->hits > HYB_HIT_THRESH) { ptr = (uint8_t*)&n->hits; for (i = 0; i < n->hit_bytes; ++i) *(_bd_addr(*idata) + i) = *(ptr + i); idata->size += i; // 写入hits } // 处理mark idata->addr[idata->size++] = (uint8_t)(path_len & 0xFF); // 写入mark // 写入seq write_seq_bits(n->seq + 1, path_len, idata); // 跳过seq[0],因为seq[0]是子节点的碱基 // 继续把该节点当做正常节点处理 if (n->child_num > 1) write_normal_node_data(n, idata, 0); // 写入正常节点数据 } /* header-1B: 2bit子节点类型,1bit父子hits是否一致,2bit hits字节,3bit子节点数据偏移量字节, */ static void build_index_from_node(Node* n, BlockData* idata) { if (n->normal_leaf || n->max_depth_leaf) { // 叶子节点没有子节点 return; } uint32_t i = 0; _bd_claim(*idata, n->node_bytes); // 确保有足够的内存空间 // 先处理当前节点 // 1. 写入header // 2. 如果是根节点,写入sa_pos // 3. 写入hits // 4. 写入mark // 5. 写入子节点sa_pos偏移量和子节点数据偏移量 uint32_t node_self_size = idata->size; if (n->seq_len > 1) { write_path_node_data(n, idata); // 写入path节点数据 } else { write_normal_node_data(n, idata, 1); // 写入正常节点数据 } node_self_size = idata->size - node_self_size; // 当前节点数据大小 if (node_self_size != n->node_bytes - n->child_bytes) { fprintf(stderr, "Error: node bytes not match: %u, %u, %u, %u, %u, %u, %u, %u, %u, %u\n", node_self_size, n->node_bytes - n->child_bytes, n->node_bytes, n->depth, n->max_child_depth, n->seq_len, n->hits, n->child_num, n->hits_neq, (int)n->type); Node* p = n; fprintf(stderr, "Info: father neq: %u, %u, %u\n", p->parent->hits_neq, p->parent->child_num, p->parent->hits); while (p != NULL) { fprintf(stderr, "Info: hits: %u, depth: %u, childs: %u, hits_neq: %u, seq_len: %u, max_depth: %u\n", p->hits, p->depth, p->child_num, p->hits_neq, p->seq_len, p->max_child_depth); p = p->children[0]; // 只打印第一个子节点 } p = n; while (p->depth > 0) { fprintf(stderr, "Info: hits: %u, depth: %u, childs: %u, hits_neq: %u, seq_len: %u, max_depth: %u\n", p->hits, p->depth, p->child_num, p->hits_neq, p->seq_len, p->max_child_depth); p = p->parent; // 向上打印父节点 } fprintf(stderr, "Info: hits: %u, depth: %u, childs: %u, hits_neq: %u, seq_len: %u, max_depth: %u\n", p->hits, p->depth, p->child_num, p->hits_neq, p->seq_len, p->max_child_depth); exit(0); } for (i = 0; i < n->child_num; ++i) { // 这里需要按照匹配概率来修改 build_index_from_node(n->children[i], idata); // 递归构建子节点 } } // 多线程创建hybrid index static void* thread_build_hyb_index(void* arg) { PartKmer* p = (PartKmer*)arg; uint8_t kmer[HYB_KMER_LEN + 1]; uint64_t all_kmer_num = p->kmer_end - p->kmer_start; uint64_t percent_mod = all_kmer_num / 100; int percent = 0; if (percent_mod == 0) percent_mod = 1; uint8_t type_hits = 0; uint64_t ref_pos = 0; uint64_t sa_forward = 0; uint64_t idata_offset = 0; uint64_t packed_data = 0; uint8_t* ptr = (uint8_t*)&packed_data; uint64_t i, j, kmer_size; fprintf(stderr, "[thread %d] start building hybrid index for kmer range [%lu, %lu)\n", p->tid, p->kmer_start, p->kmer_end); for (i = p->kmer_start, kmer_size = 0; i < p->kmer_end; ++i, ++kmer_size) { uint8_t* kd = p->kmer_data + i * HYB_KMER_DATA_BYTES; for (j = 0; j < HYB_KMER_LEN; ++j) { kmer[j] = (i >> (j << 1)) & 3; } bwtintv_t iv = bwt_search_kmer(p->bwt, kmer); const uint32_t khits = iv.x[2]; // 有多少个匹配的位置 if (khits == 0) { // no extra bytes packed_data = 0; for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off); } else if (khits == 1) { // 只有一个匹配,直接保存对应的参考基因位置 // no extra bytes type_hits = 1; ref_pos = bwt_get_sa(p->bwt->byte_sa, iv.x[0]); // 直接记录kmer对应的reference pos packed_data = (ref_pos << HYB_KMER_DATA_TYPE_BITS) | type_hits; // 这里需要注意,packed_data是小端模式的, 5个字节 for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off); } else if (khits == 2) { // 只有两个匹配,直接保存对应的suffix array位置 // no extra bytes type_hits = 2; sa_forward = iv.x[0]; // 记录kmer对应的后缀数组的位置 packed_data = (sa_forward << HYB_KMER_DATA_TYPE_BITS) | type_hits; for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off); } else { Node* root = (Node*)calloc(1, sizeof(Node)); root->seq[root->seq_len++] = kmer[HYB_KMER_LEN - 1]; build_hyb_tree(p, root, &iv, 0, HYB_MAX_SEQ_LEN - HYB_KMER_LEN); // 创建基数树 update_hyb_tree(p, root); // 更新hybrid tree idata_offset = p->idata.size; // 指向当前要用的地址偏移量(起始位置) calc_node_bytes(root); // 计算每个节点所占内存 build_index_from_node(root, &p->idata); // 构建hybrid index // 将type和地址偏移信息写入kmer if (khits <= HYB_HIT_THRESH) { // hits数量小于等于30 type_hits = khits; // hits数量 } else { type_hits = HYB_HIT_THRESH + 1; } packed_data = (idata_offset << HYB_KMER_DATA_TYPE_BITS) | type_hits; for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off); p->bytes += root->node_bytes; // 累加当前节点所占内存 release_node(root); // 释放所有子节点和当前节点 } if (p->tid == 0 && kmer_size % percent_mod == 0) { fprintf(stderr, "[thread %d] %d %% done, bytes: %fM\n", p->tid, percent++, (double)p->bytes / 1024 / 1024); } } fprintf(stderr, "Finish-[thread %d] bytes: %fM\n", p->tid, (double)p->bytes / 1024 / 1024); return NULL; } // 多线程调整hybrid index中的block data偏移地址,整合成连续的一块地址空间 static void* thread_adjust_hyb_index(void* arg) { PartKmer* p = (PartKmer*)arg; uint64_t all_kmer_num = p->kmer_end - p->kmer_start; uint64_t percent_mod = all_kmer_num / 100; int percent = 0; if (percent_mod == 0) percent_mod = 1; uint8_t type_hits = 0; uint64_t bd_offset = 0; uint64_t packed_data = 0; uint8_t* ptr = (uint8_t*)&packed_data; uint64_t i, kmer_size; for (i = p->kmer_start, kmer_size = 0; i < p->kmer_end; ++i, ++kmer_size) { uint8_t* kd = p->kmer_data + i * HYB_KMER_DATA_TYPE_BITS; type_hits = *(kd)&HYB_KMER_DATA_TYPE_MASK; if (type_hits > 2) { // block data index数据,需要改offset // bd_offset = ((*(uint64_t *)kd) >> kTypeBits) & p->KMER_MASK; // 指向当前要用的地址偏移量 bd_offset = ((*(uint64_t*)kd) & HYB_KMER_DATA_MASK) >> HYB_KMER_DATA_TYPE_BITS; // 指向当前要用的地址偏移量 bd_offset += p->prev_bd_size; packed_data = (bd_offset << HYB_KMER_DATA_TYPE_BITS) | type_hits; for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off); } if (p->tid == 0 && kmer_size % percent_mod == 0) { fprintf(stderr, "Info: %d%% done adjust.\n", percent++); } } return NULL; } // 保存hybrid index static void save_hybrid_idx(const char* prefix, PartKmerVec* pkArr) { // fprintf(stderr, "size: %ld\n", kv_size(*pkArr)); // return; char fn[MAX_PATH]; char fn2[MAX_PATH]; sprintf(fn, "%s.hyb.kmer", prefix); sprintf(fn2, "%s.hyb.data", prefix); FILE* fpk = fopen(fn, "wb"); FILE* fpbd = fopen(fn2, "wb"); if (fpk == NULL) { fprintf(stderr, "Error: open file %s failed\n", fn); return; } if (fpbd == NULL) { fprintf(stderr, "Error: open file %s failed", fn2); return; } uint64_t max_kmer = (1ULL << (HYB_KMER_LEN << 1)); err_fwrite(kv_A(*pkArr, 0).kmer_data, 1, max_kmer * HYB_KMER_DATA_BYTES, fpk); // 小端 int i; for (i = 0; i < kv_size(*pkArr); ++i) { err_fwrite(kv_A(*pkArr, i).idata.addr, 1, kv_A(*pkArr, i).idata.size, fpbd); } fclose(fpk); fclose(fpbd); } // 构建hybrid-index入口函数 int hyb_idx_build_and_dump(int num_threads, bwt_t* bwt, const char* idx_prefix) { PartKmerVec pk_arr; kv_init(pk_arr); kv_resize(PartKmer, pk_arr, num_threads); pk_arr.n = num_threads; kvec_t(pthread_t) tid_arr; kv_init(tid_arr); kv_resize(pthread_t, tid_arr, num_threads); tid_arr.n = num_threads; uint64_t max_kmer = (1ULL << (HYB_KMER_LEN << 1)); uint8_t* kmer_data = (uint8_t*)malloc(max_kmer * HYB_KMER_DATA_BYTES); int i; for (i = 0; i < num_threads; ++i) { PartKmer* p = &kv_A(pk_arr, i); p->tid = i; p->kmer_start = KMER_START(i, max_kmer, num_threads); p->kmer_end = KMER_END(i, max_kmer, num_threads); p->bytes = 0; p->kmer_data = kmer_data; p->prev_bd_size = 0; _bd_init(p->idata); p->bwt = bwt; pthread_create(&kv_A(tid_arr, i), NULL, thread_build_hyb_index, p); } uint64_t all_bytes = 0; uint64_t prev_bd_size = 0; for (i = 0; i < num_threads; ++i) { pthread_join(kv_A(tid_arr, i), NULL); PartKmer* p = &kv_A(pk_arr, i); all_bytes += p->bytes; p->prev_bd_size = prev_bd_size; prev_bd_size += p->idata.size; } for (i = 0; i < num_threads; ++i) { pthread_create(&kv_A(tid_arr, i), NULL, thread_adjust_hyb_index, &kv_A(pk_arr, i)); } for (int i = 0; i < num_threads; ++i) { pthread_join(kv_A(tid_arr, i), NULL); } fprintf(stderr, "Hybrid index total bytes: %fM\n", (double)all_bytes / 1024 / 1024); double time_sava_hybrid_idx = realtime(); save_hybrid_idx(idx_prefix, &pk_arr); time_sava_hybrid_idx = realtime() - time_sava_hybrid_idx; fprintf(stderr, "Time save hybrid index: %fs\n", time_sava_hybrid_idx); kv_destroy(pk_arr); kv_destroy(tid_arr); return 0; }