diff --git a/.vscode/launch.json b/.vscode/launch.json index d4d7bc3..834cd67 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -92,13 +92,12 @@ "preLaunchTask": "Build", "type": "cppdbg", "request": "launch", - "program": "${workspaceRoot}/hbwa", + "program": "${workspaceRoot}/hybalign", "args": [ - "bwt2hybrid", - "-e", + "bwt2hyb", "-t", "1", - "~/data/fmt_ref/human_g1k_v37_decoy.fasta" + "./index/human_g1k_v37_decoy.fasta" ], "cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间 }, diff --git a/bwamem.c b/bwamem.c index 7909c79..bf1f925 100644 --- a/bwamem.c +++ b/bwamem.c @@ -106,7 +106,12 @@ mem_opt_t *mem_opt_init() o->max_chain_extend = 1<<30; o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len); bwa_fill_scmat(o->a, o->b, o->mat); - return o; + + o->use_bwt = 0; + o->skip_entire_match = 0; + o->batch_size = 256; + + return o; } /*************************** diff --git a/bwamem.h b/bwamem.h index a518aaf..6451640 100644 --- a/bwamem.h +++ b/bwamem.h @@ -81,6 +81,9 @@ typedef struct { int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end int max_XA_hits, max_XA_hits_alt; // if there are max_hits or fewer, output them all int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset + int batch_size; // number of bases to process in each batch + int use_bwt; // whether to use bwt index for seeding + int skip_entire_match; // whether to skip the second and third seeding steps for entire matching reads } mem_opt_t; typedef struct { diff --git a/fastmap.c b/fastmap.c index be7ba0e..705d3dd 100644 --- a/fastmap.c +++ b/fastmap.c @@ -150,13 +150,13 @@ int main_mem(int argc, char *argv[]) mem_pestat_t pes[4]; ktp_aux_t aux; - memset(&aux, 0, sizeof(ktp_aux_t)); + memset(&aux, 0, sizeof(ktp_aux_t)); memset(pes, 0, 4 * sizeof(mem_pestat_t)); for (i = 0; i < 4; ++i) pes[i].failed = 1; aux.opt = opt = mem_opt_init(); memset(&opt0, 0, sizeof(mem_opt_t)); - while ((c = getopt(argc, argv, "51qpaMCSPVYjuk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:")) >= 0) { + while ((c = getopt(argc, argv, "51qpaMCSPVYjuk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:b:we")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; else if (c == '1') no_mt_io = 1; else if (c == 'x') mode = optarg; @@ -254,8 +254,12 @@ int main_mem(int argc, char *argv[]) if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] mean insert size: %.3f, stddev: %.3f, max: %d, min: %d\n", __func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low); - } - else return 1; + } else if (c == 'b') + opt->batch_size = atoi(optarg) >> 1 << 1, opt->batch_size = opt->batch_size > 1 ? opt->batch_size : 256; + else if (c == 'w') + opt->use_bwt = 1; + else if (c == 'e') opt->skip_entire_match = 1; + else return 1; } if (rg_line) { @@ -320,7 +324,10 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " (4 sigma from the mean if absent) and min of the insert size distribution.\n"); fprintf(stderr, " FR orientation only. [inferred]\n"); fprintf(stderr, " -u output XB instead of XA; XB is XA with the alignment score and mapping quality added.\n"); - fprintf(stderr, "\n"); + fprintf(stderr, " -b INT batch size of reads to process at one time [%d].\n", opt->batch_size); + fprintf(stderr, " -w Use bwt index for seeding\n"); + fprintf(stderr, " -e Skip the second and third seeding steps for entire matching reads.\n"); + fprintf(stderr, "\n"); fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n"); fprintf(stderr, "\n"); free(opt); diff --git a/hyb_bwa.c b/hyb_bwa.c index 028933f..4f4c3a1 100644 --- a/hyb_bwa.c +++ b/hyb_bwa.c @@ -117,7 +117,7 @@ int bwa_extract_byte_sa(int argc, char* argv[]) { } } if (optind + 3 > argc) { - fprintf(stderr, "Usage: bwa extractbytesa [-i %d] \n", sa_intv); + fprintf(stderr, "Usage: bwa extractbytesa [-i %d] \n\n", sa_intv); return 1; } bwt = bwt_restore_bwt(argv[optind]); @@ -206,25 +206,56 @@ void bwt_create_kmer_idx(bwt_t* bwt) { free(seq_arr); } +// 从bwt中创建kmer index,并保存到文件 int bwa_bwt2kmer(int argc, char* argv[]) { bwt_t* bwt; if (optind + 2 > argc) { - fprintf(stderr, "Usage: bwa extractbytesa \n"); + fprintf(stderr, "Usage: bwa extractbytesa \n\n"); return 1; } bwt = bwt_restore_bwt(argv[optind]); - bwt_create_kmer_idx(bwt); - bwt_dump_kmer_idx(argv[optind + 1], &bwt->kmer_hash); - return 0; } -int bwa_bwt2hyb(int argc, char* argv[]) {} +// 创建hybrid index,并保存到文件 +int bwa_bwt2hyb(int argc, char* argv[]) { + int hyb_idx_build_and_dump(int num_threads, bwt_t* bwt, const char* idx_prefix); + bwt_t* bwt; + int num_threads = 1; + char c; + int error = 0; + while ((c = getopt(argc, argv, "t:")) >= 0) { + switch (c) { + case 't': + num_threads = atoi(optarg); + assert(num_threads > 0 && num_threads < 512); + break; + default: + error = 1; + break; + } + } + if (optind + 1 > argc || error) { + fprintf(stderr, "Usage: bwa bwt2hyb [Options] \n\n"); + fprintf(stderr, "Options: -t INT number of threads for hybrid index building [%d]\n", num_threads); + fprintf(stderr, "\n"); + return 1; + } + char fn[MAX_PATH]; + snprintf(fn, MAX_PATH, "%s.bwt", argv[optind]); + bwt = bwt_restore_bwt(fn); + snprintf(fn, MAX_PATH, "%s.kmer", argv[optind]); + bwt->kmer_hash = bwt_restore_kmer_idx(fn); + snprintf(fn, MAX_PATH, "%s.hyb.bytesa", argv[optind]); + _load_file_to_data(fn, bwt->byte_sa); + hyb_idx_build_and_dump(num_threads, bwt, argv[optind]); + return 0; +} // 创建正向的kmer -inline uint64_t build_forward_kmer(const uint8_t* q, int qlen, int kmer_len, int* base_consumed) { +uint64_t build_forward_kmer(const uint8_t* q, int qlen, int kmer_len, int* base_consumed) { uint64_t qbit = 0, i; qlen = qlen < kmer_len ? qlen : kmer_len; for (i = 0; i < qlen; ++i) { @@ -239,7 +270,7 @@ inline uint64_t build_forward_kmer(const uint8_t* q, int qlen, int kmer_len, int int hyb_test(int argc, char* argv[]) { bwt_t* bwt; if (optind + 1 > argc) { - fprintf(stderr, "Usage: bwa hybtest \n"); + fprintf(stderr, "Usage: bwa hybtest \n\n"); return 1; } diff --git a/hyb_create_idx.c b/hyb_create_idx.c index e69de29..0f3709b 100644 --- a/hyb_create_idx.c +++ b/hyb_create_idx.c @@ -0,0 +1,866 @@ +/* The MIT License + + Copyright (c) 2025- Zhonghai Zhang (ICT) + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bwt.h" +#include "hyb_idx.h" +#include "kvec.h" +#include "rle.h" +#include "rope.h" +#include "utils.h" + +#define KMER_START(tid, max_kmer, num_thread) ((uint64_t)(max_kmer) * (uint64_t)(tid) / (uint64_t)(num_thread)) +#define KMER_END(tid, max_kmer, num_thread) ((uint64_t)(max_kmer) * (uint64_t)((tid) + 1) / (uint64_t)(num_thread)) + +/////////////////////////// + +typedef struct { + uint64_t size; + uint64_t cap; + uint8_t* addr; +} BlockData; + +#define _bd_init(bd) \ + do { \ + (bd).size = 0; \ + (bd).cap = 0; \ + (bd).addr = 0; \ + } while (0) + +#define _bd_claim(bd, add_size) \ + do { \ + if ((bd).size + add_size >= (bd).cap) { \ + (bd).cap += add_size + 4L * 1024 * 1024; \ + (bd).addr = (uint8_t*)realloc((bd).addr, (bd).cap); \ + } \ + } while (0) + +#define _bd_addr(bd) ((bd).addr + (bd).size) + +#define _bd_destory(bd) \ + do { \ + free((bd).addr); \ + } while (0) + +typedef struct { + uint32_t idx; + uint32_t cnt; + uint32_t bytes; +} idx_cnt_t; + +// 构建hyb-index时候用到的节点数据结构 +typedef struct _Node { + uint8_t header; // 1字节头部信息 [0] + uint8_t hit_bytes; // 用多少bytes表示hits + uint8_t off_bytes; // 用多少bytes表示子节点数据偏移量 + + uint8_t hits_neq; // 和子节点hits数量不相等 + uint8_t hits_neq1; // 子节点和孙子节点hits数量不相等 + uint8_t max_depth_leaf; // 是否达到最大深度的叶节点 + uint8_t normal_leaf; // 正常的叶子节点,hits=1 + + uint16_t seq_len; // 单一分支碱基序列长度,如果分支,那就是1 + uint16_t depth; // 当前节点对应的位置,0对应着16(不包含16-kmer) + uint16_t min_child_depth; // 最近到hit=1的深度 + uint16_t max_child_depth; // 最远到hit=1或者到达最大深度时的深度 + + uint32_t hits; // 匹配个数 + uint32_t child_num; // 有多少分支子节点, 第一层子节点个数 + uint32_t num_offspring; // 所有后代节点个数 + uint32_t min_offspring_hits; // 所有后代中,最后最少的hits + uint32_t node_bytes; // 该节点包含所有后代所占内存 + uint32_t child_bytes; // 子节点所占内存 + + uint8_t type; // 当前节点类型 + uint64_t mark; // 与type对应,4bit,16bit,64bit + uint64_t sa_pos; // 对应的sa起始位置 + uint8_t seq[320]; // 保存所有单一分支的碱基序列 + + uint8_t kmer; // 当前节点对应的kmer,root节点是uint64,用这个不准确 + + struct _Node* children[64]; // 子节点 + struct _Node* parent; // 父节点 +} Node; + +// 并行构建hyb-index的数据结构 +typedef struct { + pthread_t ptid; // 线程ID + int tid; // 线程id + uint64_t kmer_start; // [) 左闭右开 + uint64_t kmer_end; + uint64_t bytes; // 当前线程所有kmer数据占用的内存字节数 + uint8_t* kmer_data; // kmer数据 + uint64_t prev_bd_size; // 前一个kmer对应的index数据所占内存大小,用来调整多线程kmer数据对应的地址偏移量的 + BlockData idata; // 二级索引数据 + bwt_t* bwt; // bwt索引 +} PartKmer; + +typedef kvec_t(PartKmer) PartKmerVec; + +// 当hit变成1后,继续扩展节点,为了保存2-kmer或者3-kmer对应的匹配信息 +typedef struct { + Node* node; + uint64_t mark; +} MarkNode; + +///////////////// + +// 正向搜索kmer(hyb的16-kmer)在bwt中的区间信息 +bwtintv_t bwt_search_kmer(const bwt_t* bwt, const uint8_t* q) { + bwtintv_t ik = {0}, ok[4] = {0}; + int i, x, c; + uint32_t qbit = 0; + x = HASH_KMER_LEN; + for (i = 0; i < x; ++i) { + qbit |= q[i] << ((HASH_KMER_LEN - 1 - i) << 1); + } + bwt_kmer_get(&bwt->kmer_hash, &ik, qbit, x - 1); + if (ik.x[2] < 1) + return ik; + for (i = x; i < HYB_KMER_LEN; ++i) { + c = 3 - q[i]; + bwt_extend(bwt, &ik, ok, 0); + if (ok[c].x[2] < 1) + return ok[c]; + ik = ok[c]; + } + return ik; +} + +// 根据子节点更新父节点信息 +static void update_parent(Node* n, Node* c, uint64_t k, int update_max) { + // 创建完子节点后,更新一些数据 + n->children[n->child_num++] = c; + c->kmer = k; + c->parent = n; + n->mark |= (1ULL << k); // 设置当前节点的mark + n->min_offspring_hits = MIN(n->min_offspring_hits, c->min_offspring_hits); + n->min_child_depth = MIN(n->min_child_depth, c->min_child_depth + 1); + if (update_max) + n->max_child_depth = MAX(n->max_child_depth, c->max_child_depth + 1); + n->num_offspring += c->num_offspring + 1; + if (c->hits_neq) + n->hits_neq1 = 1; // 子节点和孙子节点hits数量不相等 +} + +// 清除父节点的一些信息, 不释放内存空间 +static void clear_parent(Node* n) { + uint32_t i = 0; + n->max_child_depth = 0; + for (; i < n->child_num; ++i) n->children[i] = NULL; // 清除子节点 + n->child_num = 0; + n->mark = 0; + n->num_offspring = 0; + n->hits_neq1 = 0; +} + +// 更新节点信息 +static void update_node_info(PartKmer* pk, Node* n, bwtintv_t* ik, int depth, int max_depth, int update_min) { + n->depth = depth; // 当前节点对应的深度 + n->min_offspring_hits = ik->x[2]; // 所有子节点中最小的hits数量 + n->sa_pos = ik->x[0]; // bwt_get_sa(pk->fmt->sa, ik->x[0]); // 对应的sa中第一个位置 + n->hits = ik->x[2]; // 匹配数量 + if (update_min) + n->min_child_depth = max_depth; // 初始化为当前节点所能达到的最大深度(人为设定的阈值) +} + +// 普通的leaf节点扩展,扩展到第2层和第3层 +static void extend_leaf_node(PartKmer* pk, Node* n, bwtintv_t* ik, int depth, int max_depth) { + n->normal_leaf = 1; + update_node_info(pk, n, ik, depth, max_depth, 0); + if (max_depth == 0) { + return; + } + uint8_t k; + int child_hits = 0; + bwtintv_t ok[4] = {0}; + bwt_extend(pk->bwt, ik, ok, 0); + for (k = 0; k < 4; ++k) { + bwtintv_t iv = ok[3 - k]; + child_hits += iv.x[2]; + if (iv.x[2] > 0) { // hits必须大于0,即有匹配的位置,才创建子节点 + Node* child = (Node*)calloc(1, sizeof(Node)); + child->seq[child->seq_len++] = k; // 保存序列 + // 递归构建子节点 + extend_leaf_node(pk, child, &iv, depth + 1, max_depth - 1); + // 创建完子节点后,更新一些数据 + update_parent(n, child, k, 0); // 更新父节点信息 + break; + } + } + if (child_hits != (int)n->hits) { + n->hits_neq = 1; // 当前碱基有一个匹配到了ref的结尾,导致节点hits和子节点hits总和不等 + } +} + +// 递归构建hyb-tree +// n: 父节点 +// depth: kmer之后该碱基在第几个(位置/深度) +// max_depth: 遍历到最大深度停止(最长有多少碱基, 从当前节点算起) +static void build_hyb_tree(PartKmer* pk, Node* n, bwtintv_t* ik, int depth, int max_depth) { + /* 递归终止条件 */ + if (ik->x[2] == 1) { // 如果hits为0或者1,则不再需要子节点 + n->min_child_depth = 0; + n->normal_leaf = 1; // 正常的叶子节点,hits=1 + extend_leaf_node(pk, n, ik, depth, 2); // 继续扩展两个碱基 + return; + } + // 给节点n赋值 + update_node_info(pk, n, ik, depth, max_depth, 1); + if (max_depth == 0) { // 达到最大深度,不再进行扩展 + n->min_child_depth = 0; + n->max_depth_leaf = 1; // 达到最大深度的叶节点 + return; + } + int child_hits = 0; // 所有子节点hits数量之和 + uint8_t k; // 碱基 + bwtintv_t ok[4] = {0}; + bwt_extend(pk->bwt, ik, ok, 0); + for (k = 0; k < 4; ++k) { // 1-kmer + bwtintv_t iv = ok[3 - k]; + child_hits += iv.x[2]; + if (iv.x[2] > 0) { // hits必须大于0,即有匹配的位置,才创建子节点 + Node* child = (Node*)calloc(1, sizeof(Node)); + child->seq[child->seq_len++] = k; + // 递归构建子节点 + build_hyb_tree(pk, child, &iv, depth + 1, max_depth - 1); + // 创建完子节点后,更新一些数据 + update_parent(n, child, k, 1); // 更新父节点信息 + } + } + // 子节点hits之和应该和父节点hits数量相等,否则,就是当前父节点有一个匹配到了ref的结尾 + if (child_hits != (int)n->hits) { + n->hits_neq = 1; // 当前碱基有一个匹配到了ref的结尾,导致节点hits和子节点hits总和不等 + } +} + +// 将ptr的子节点赋值给n,并释放ptr节点 +static void assign_children(Node* n, Node* ptr) { + clear_parent(n); + n->seq[n->seq_len++] = ptr->seq[0]; // 保存序列 + uint32_t i = 0; + for (; i < ptr->child_num; ++i) { + update_parent(n, ptr->children[i], ptr->children[i]->seq[0], 1); + } + free(ptr); // 释放ptr节点 +} + +// 处理path node +static void handle_path_node(Node** n) { + Node* cur = *n; + Node* ptr = cur->children[0]; // 只有一个子节点 + while (ptr->child_num == 1 && cur->hits == ptr->hits) { // 继续向下遍历,直到没有子节点, 或父子hits不一致 + cur->seq[cur->seq_len++] = ptr->seq[0]; // 保存序列 + Node* next = ptr->children[0]; // 获取下一个子节点 + free(ptr); // 释放当前节点 + ptr = next; + } + if (ptr->child_num == 1) { // hits不一致 + clear_parent(cur); + update_parent(cur, ptr, ptr->seq[0], 1); + cur->hits_neq = 1; // 继承子节点的hits_neq + ptr->seq[ptr->seq_len++] = ptr->seq[0]; // 多保存一次该节点的碱基,因为他也是path节点 + *n = ptr; + handle_path_node(n); // 递归处理下一个path节点 + } else { + assign_children(cur, ptr); // 将ptr的子节点赋值给n + } +} + +// dfs遍历当前节点的子节点,扩展到2或3层,然后去掉中间层节点,将后续节点直接与父节点连接, layer = 2 or 3 +static void shrink_node_layer(PartKmer* pk, Node* n, int layer) { + kvec_t(MarkNode) layer_nodes_stack; + kv_init(layer_nodes_stack); // dfs遍历的stack + kvec_t(MarkNode) kmer_stack; + kv_init(kmer_stack); // 装所有的kmer + kvec_t(Node*) to_release; + kv_init(to_release); // 装所有的待删除的node + + int i; + // init stack + for (i = (int)n->child_num - 1; i >= 0; --i) { + MarkNode ln; + ln.node = n->children[i]; + ln.mark = 1; // 第一层子节点 + kv_push(MarkNode, layer_nodes_stack, ln); + } + // dfs 遍历 + while (kv_size(layer_nodes_stack) > 0) { + MarkNode ln = kv_pop(layer_nodes_stack); + if ((int)ln.mark == layer) { // 是第3层的节点,创建对应的kmer + uint8_t k = 0; + if (layer == 2) + k = ln.node->parent->seq[0] << 2 | ln.node->seq[0]; + else if (layer == 3) + k = ln.node->parent->parent->seq[0] << 4 | ln.node->parent->seq[0] << 2 | ln.node->seq[0]; + MarkNode kmer_node; + kmer_node.node = ln.node; + kmer_node.mark = k; // 保存kmer + kv_push(MarkNode, kmer_stack, kmer_node); // 保存kmer + } else { + kv_push(Node*, to_release, ln.node); // 保存当前节点,待释放 + for (i = (int)ln.node->child_num - 1; i >= 0; --i) { + MarkNode lnn; + lnn.node = ln.node->children[i]; + lnn.mark = ln.mark + 1; + kv_push(MarkNode, layer_nodes_stack, lnn); + } + } + } + // 处理所有kmer + clear_parent(n); + int child_hits = 0; + for (i = 0; i < (int)kv_size(kmer_stack); ++i) { + MarkNode kmer_node = kv_A(kmer_stack, i); + child_hits += kmer_node.node->hits; // 统计子节点的hits数量 + update_parent(n, kmer_node.node, kmer_node.mark, 1); // 更新父节点信息 + } + if (child_hits != (int)n->hits) { + n->hits_neq = 1; // 当前碱基有一个匹配到了ref的结尾,导致节点hits和子节点hits总和不等 + } + for (i = 0; i < (int)kv_size(to_release); ++i) { + free(kv_A(to_release, i)); + } + kv_destroy(layer_nodes_stack); + kv_destroy(kmer_stack); + kv_destroy(to_release); +} + +// 构建完hybrid tree之后,从根节点开始遍历,设置节点的类型,构建path节点,3,2,1 kmer节点 +static void update_hyb_tree(PartKmer* pk, Node* n) { + if (n->child_num == 1 && !n->normal_leaf && !n->max_depth_leaf && !(n->max_child_depth <= 3)) { // path类型 + handle_path_node(&n); + } + uint32_t i = 0; +#define __recursive_update_hyb_tree(pk, n) \ + for (i = 0; i < n->child_num; ++i) { \ + update_hyb_tree(pk, n->children[i]); \ + } + // 计算子节点的idata地址,和父节点的match_cnt + // 递归终点 + if (n->max_child_depth == 5) { + if (n->hits_neq) { + n->type = HYB_BP_1; + } else { + n->type = HYB_BP_2; + shrink_node_layer(pk, n, 2); + } + __recursive_update_hyb_tree(pk, n); + } else if (n->max_child_depth == 4) { + n->type = HYB_BP_1; + __recursive_update_hyb_tree(pk, n); + } else if (n->max_child_depth == 3) { + if (n->hits_neq) { + n->type = HYB_BP_1; + } else if (n->hits_neq1) { + n->type = HYB_BP_2; + shrink_node_layer(pk, n, 2); + } else { + n->type = HYB_BP_3; + shrink_node_layer(pk, n, 3); + } + } else if (n->max_child_depth == 2) { + if (n->hits_neq) { + n->type = HYB_BP_1; + } else { + n->type = HYB_BP_2; + shrink_node_layer(pk, n, 2); + } + } else if (n->max_child_depth == 1) { + n->type = HYB_BP_1; + } else { + if (n->hits_neq) { + n->type = HYB_BP_1; + } else if (n->hits_neq1) { + n->type = HYB_BP_2; + shrink_node_layer(pk, n, 2); + } else { + n->type = HYB_BP_3; + shrink_node_layer(pk, n, 3); + } + __recursive_update_hyb_tree(pk, n); + } +} + +// 根据构建的radix tree,创建hyb-index +static void calc_node_bytes(Node* n) { + // 递归终点 + if (n->normal_leaf || n->max_depth_leaf) { // 叶子节点没有子节点 + n->node_bytes = 0; + return; + } + // 先处理子节点 + uint32_t i = 0; + for (; i < n->child_num; ++i) { + calc_node_bytes(n->children[i]); // 递归构建子节点 + n->child_bytes += n->children[i]->node_bytes; // 累加子节点所占内存 + } + // 处理当前节点 + n->node_bytes = n->child_bytes; + if (n->depth == 0) n->node_bytes += 5; // sa_pos占用5字节 + + // 计算偏移地址字节 + int off_bytes = 0; + if (n->child_bytes == 0) { // 没有子节点 + off_bytes = 0; + } else if (n->child_bytes < 256) { + off_bytes = 1; // 子节点数据偏移量小于256, 1B就行 + } else if (n->child_bytes < 65536) { + off_bytes = 2; // 2B就行 + } else if (n->child_bytes < 16777216) { + off_bytes = 3; // 3B就行 + } else if (n->child_bytes < UINT32_MAX) { + off_bytes = 4; // 使用4B + } else { + fprintf(stderr, "too many child bytes: %d\n", n->child_bytes); + exit(1); + } + n->off_bytes = off_bytes; // 保存子节点数据偏移量字节数 + + // 计算hits字节 + int hits_bytes = 0; + if (n->hits < 256) { + hits_bytes = 1; // hits小于256, 占用1字节 + } else if (n->hits < 65536) { + hits_bytes = 2; // hits小于65536, 占用2字节 + } else if (n->hits < 16777216) { + hits_bytes = 3; // hits小于16777216, 占用3字节 + } else if (n->child_bytes < UINT32_MAX) { + hits_bytes = 4; // 使用4B + } else { + fprintf(stderr, "too many hits: %d\n", n->hits); + exit(1); + } + n->hit_bytes = hits_bytes; // 保存hits字节数 + + if (n->depth == 0 && n->hits > HYB_HIT_THRESH) { + n->node_bytes += hits_bytes; // 根节点需要保存hits + } + + if (n->seq_len > 1) { // path节点 + n->node_bytes += 1; // header, path节点相当于两个节点,一个path,一个普通的 + n->node_bytes += 1; // 结合header 记录碱基path长度,代替mark + n->node_bytes += ((n->seq_len - 1) * 2 + 7) / 8; // 只有根节点保存本节点的碱基,其他的节点不保存seq[0],因为seq[0]是子节点的碱基 + } + // 有可能一个path下边还是一个path,因为可能父子hits不一致 + if (n->child_num > 1 || n->seq_len == 1) { // 有多个子节点,或者当前节点不是path节点 + n->node_bytes += 1; // header_bits占用1字节 + // 叶节点只用bit-map来计算hits偏移就行了, 不用保存hits start信息 + int all_child_is_leaf = n->hits == n->child_num || (n->hits_neq && n->hits == n->child_num + 1); + if (!all_child_is_leaf) { + n->node_bytes += (hits_bytes + off_bytes) * (n->child_num - 1); // 不保存第一个子节点offset,因为是0 + // for test,首个节点不保存hits start和offset,因为hits start是0或者1,offset是0,这样index-data减少了1GB,总共20GB + // 这样不能根据概率调整内存位置了 + } + if (n->type == HYB_BP_1) { // 1-kmer节点 + n->node_bytes += 1; // 1字节的mark + } else if (n->type == HYB_BP_2) { // 2-kmer节点 + n->node_bytes += 2; // 2字节的mark + } else if (n->type == HYB_BP_3) { // 3-kmer节点 + n->node_bytes += 8; // 8字节的mark + } else { + fprintf(stderr, "Unknown node type: %d\n", (int)n->type); + exit(1); + } + } +} + +// 释放所有子节点和当前节点 +static void release_node(Node* p) { + uint32_t k = 0; + for (k = 0; k < p->child_num; k++) { + release_node(p->children[k]); + } + free(p); +} + +void write_seq_bits(uint8_t* bs, int seq_len, BlockData* idata) { + if (seq_len > 0) { + uint32_t num_bytes = ((seq_len << 1) + 7) >> 3; + uint8_t* bit_seq = (uint8_t*)_bd_addr(*idata); + uint8_t bybp = 0; + int i = 0, j = 0, idx = 0; + for (; i + 3 < seq_len; i += 4) { + bybp = bs[i] | bs[i + 1] << 2 | bs[i + 2] << 4 | bs[i + 3] << 6; + *(uint8_t*)&bit_seq[idx++] = bybp; + } + bybp = 0; + for (; i < seq_len; ++i, ++j) { + bybp |= bs[i] << j * 2; + } + *(uint8_t*)&bit_seq[idx++] = bybp; + idata->size += num_bytes; // 更新数据大小 + } +} + +static void write_normal_node_data(Node* n, BlockData* idata, uint8_t write_sa) { + uint8_t* ptr; + int i, j; + int all_child_is_leaf = 0; + + // 2bit子节点类型, 1bit父子hits是否一致,2bit hits字节数; + uint8_t header = n->type << 6 | n->hits_neq << 5 | (n->hit_bytes - 1) << 3; + // 叶节点只用bit-map来计算hits偏移就行了 + all_child_is_leaf = n->hits == n->child_num || (n->hits_neq && n->hits == n->child_num + 1); + // 3bit子节点地址偏移字节数 + if (all_child_is_leaf) { + header |= HYB_LEAF_NODE; // 用off_bytes = 7来表示叶节点,不用读取hits和offset信息 + } else { + header |= n->off_bytes; +// assert(n->off_bytes != 0); // 这里会出错,为啥有这个断言? + } + + // header + idata->addr[idata->size++] = header; // 写入header + // sa_pos + if (n->depth == 0 && write_sa) { + ptr = (uint8_t*)&n->sa_pos; + for (i = 0; i < 5; ++i) *(_bd_addr(*idata) + i) = *(ptr + i); + idata->size += i; // 写入sa_pos + } + // hits + if (write_sa && n->depth == 0 && n->hits > HYB_HIT_THRESH) { + ptr = (uint8_t*)&n->hits; + for (i = 0; i < n->hit_bytes; ++i) *(_bd_addr(*idata) + i) = *(ptr + i); + idata->size += i; // 写入hits + } + + // 处理mark + if (n->type == HYB_BP_1) { + idata->addr[idata->size++] = (uint8_t)(n->mark & 0xFF); // 1-kmer节点 + } else if (n->type == HYB_BP_2) { + ptr = (uint8_t*)&n->mark; + for (i = 0; i < 2; ++i) *(_bd_addr(*idata) + i) = *(ptr + i); + idata->size += i; // 2-kmer节点 + } else if (n->type == HYB_BP_3) { + ptr = (uint8_t*)&n->mark; + for (i = 0; i < 8; ++i) *(_bd_addr(*idata) + i) = *(ptr + i); + idata->size += i; // 3-kmer节点 + } + + if (all_child_is_leaf) { + return; // 叶节点只用bit-map来计算hits start就行了 + } + + // 处理子节点 + uint32_t sa_start = 0; + uint32_t offset = 0; // 子节点数据偏移量 + // 剩下的节点 + for (j = 1; j < (int)n->child_num; ++j) { + sa_start += n->children[j - 1]->hits; // 累加前一个子节点的hits + ptr = (uint8_t*)&sa_start; + for (i = 0; i < n->hit_bytes; ++i) *(_bd_addr(*idata) + i) = *(ptr + i); + idata->size += i; // 写入sa_start + if (n->children[j]->node_bytes == 0 && n->children[j]->hits > 1 && !n->children[j]->max_depth_leaf) { + fprintf(stderr, "Error: no child! hits:%u, depth:%u, max-leaf:%u\n", n->children[j]->hits, n->children[j]->depth, + n->children[j]->max_depth_leaf); + } + offset += n->children[j - 1]->node_bytes; + ptr = (uint8_t*)&offset; + for (i = 0; i < n->off_bytes; ++i) *(_bd_addr(*idata) + i) = *(ptr + i); + idata->size += i; // 写入offset + } +} + +// 写入path node数据,一个path node包含path和normal两种数据 +static void write_path_node_data(Node* n, BlockData* idata) { + // 2bit子节点类型, 1bit父子hits是否一致,2bit hits字节数; + uint8_t header = HYB_BP_PATH << 6 | n->hits_neq << 5 | (n->hit_bytes - 1) << 3; + int path_len = n->seq_len - 1; // path节点的长度 + header |= path_len >> 8; // 此时,header最低位保存path_len的高8位(9bit足以表示path的长度) + idata->addr[idata->size++] = header; // 写入header + int i; + uint8_t* ptr; + if (n->depth == 0) { + ptr = (uint8_t*)&n->sa_pos; + for (i = 0; i < 5; ++i) *(_bd_addr(*idata) + i) = *(ptr + i); + idata->size += i; // 写入sa_pos + } + // 处理hits + if (n->depth == 0 && n->hits > HYB_HIT_THRESH) { + ptr = (uint8_t*)&n->hits; + for (i = 0; i < n->hit_bytes; ++i) *(_bd_addr(*idata) + i) = *(ptr + i); + idata->size += i; // 写入hits + } + // 处理mark + idata->addr[idata->size++] = (uint8_t)(path_len & 0xFF); // 写入mark + // 写入seq + write_seq_bits(n->seq + 1, path_len, idata); // 跳过seq[0],因为seq[0]是子节点的碱基 + // 继续把该节点当做正常节点处理 + if (n->child_num > 1) + write_normal_node_data(n, idata, 0); // 写入正常节点数据 +} + +/* + header-1B: 2bit子节点类型,1bit父子hits是否一致,2bit hits字节,3bit子节点数据偏移量字节, +*/ +static void build_index_from_node(Node* n, BlockData* idata) { + if (n->normal_leaf || n->max_depth_leaf) { // 叶子节点没有子节点 + return; + } + uint32_t i = 0; + _bd_claim(*idata, n->node_bytes); // 确保有足够的内存空间 + + // 先处理当前节点 + // 1. 写入header + // 2. 如果是根节点,写入sa_pos + // 3. 写入hits + // 4. 写入mark + // 5. 写入子节点sa_pos偏移量和子节点数据偏移量 + uint32_t node_self_size = idata->size; + if (n->seq_len > 1) { + write_path_node_data(n, idata); // 写入path节点数据 + } else { + write_normal_node_data(n, idata, 1); // 写入正常节点数据 + } + node_self_size = idata->size - node_self_size; // 当前节点数据大小 + if (node_self_size != n->node_bytes - n->child_bytes) { + fprintf(stderr, "Error: node bytes not match: %u, %u, %u, %u, %u, %u, %u, %u, %u, %u\n", node_self_size, + n->node_bytes - n->child_bytes, n->node_bytes, n->depth, n->max_child_depth, n->seq_len, n->hits, n->child_num, + n->hits_neq, (int)n->type); + Node* p = n; + fprintf(stderr, "Info: father neq: %u, %u, %u\n", p->parent->hits_neq, p->parent->child_num, p->parent->hits); + while (p != NULL) { + fprintf(stderr, "Info: hits: %u, depth: %u, childs: %u, hits_neq: %u, seq_len: %u, max_depth: %u\n", p->hits, + p->depth, p->child_num, p->hits_neq, p->seq_len, p->max_child_depth); + p = p->children[0]; // 只打印第一个子节点 + } + p = n; + while (p->depth > 0) { + fprintf(stderr, "Info: hits: %u, depth: %u, childs: %u, hits_neq: %u, seq_len: %u, max_depth: %u\n", p->hits, + p->depth, p->child_num, p->hits_neq, p->seq_len, p->max_child_depth); + p = p->parent; // 向上打印父节点 + } + fprintf(stderr, "Info: hits: %u, depth: %u, childs: %u, hits_neq: %u, seq_len: %u, max_depth: %u\n", p->hits, p->depth, + p->child_num, p->hits_neq, p->seq_len, p->max_child_depth); + + exit(0); + } + + for (i = 0; i < n->child_num; ++i) { // 这里需要按照匹配概率来修改 + build_index_from_node(n->children[i], idata); // 递归构建子节点 + } +} + +// 多线程创建hybrid index +static void* thread_build_hyb_index(void* arg) { + PartKmer* p = (PartKmer*)arg; + uint8_t kmer[HYB_KMER_LEN + 1]; + uint64_t all_kmer_num = p->kmer_end - p->kmer_start; + uint64_t percent_mod = all_kmer_num / 100; + int percent = 0; + if (percent_mod == 0) + percent_mod = 1; + + uint8_t type_hits = 0; + uint64_t ref_pos = 0; + uint64_t sa_forward = 0; + uint64_t idata_offset = 0; + uint64_t packed_data = 0; + uint8_t* ptr = (uint8_t*)&packed_data; + uint64_t i, j, kmer_size; + fprintf(stderr, "[thread %d] start building hybrid index for kmer range [%lu, %lu)\n", p->tid, p->kmer_start, p->kmer_end); + for (i = p->kmer_start, kmer_size = 0; i < p->kmer_end; ++i, ++kmer_size) { + uint8_t* kd = p->kmer_data + i * HYB_KMER_DATA_BYTES; + for (j = 0; j < HYB_KMER_LEN; ++j) { + kmer[j] = (i >> (j << 1)) & 3; + } + bwtintv_t iv = bwt_search_kmer(p->bwt, kmer); + const uint32_t khits = iv.x[2]; // 有多少个匹配的位置 + if (khits == 0) { + // no extra bytes + packed_data = 0; + for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off); + } else if (khits == 1) { // 只有一个匹配,直接保存对应的参考基因位置 + // no extra bytes + type_hits = 1; + ref_pos = bwt_get_sa(p->bwt->byte_sa, iv.x[0]); // 直接记录kmer对应的reference pos + packed_data = (ref_pos << HYB_KMER_DATA_TYPE_BITS) | type_hits; + // 这里需要注意,packed_data是小端模式的, 5个字节 + for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off); + } else if (khits == 2) { // 只有两个匹配,直接保存对应的suffix array位置 + // no extra bytes + type_hits = 2; + sa_forward = iv.x[0]; // 记录kmer对应的后缀数组的位置 + packed_data = (sa_forward << HYB_KMER_DATA_TYPE_BITS) | type_hits; + for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off); + } else { + Node* root = (Node*)calloc(1, sizeof(Node)); + root->seq[root->seq_len++] = kmer[HYB_KMER_LEN - 1]; + build_hyb_tree(p, root, &iv, 0, HYB_MAX_SEQ_LEN - HYB_KMER_LEN); // 创建基数树 + update_hyb_tree(p, root); // 更新hybrid tree + idata_offset = p->idata.size; // 指向当前要用的地址偏移量(起始位置) + calc_node_bytes(root); // 计算每个节点所占内存 + build_index_from_node(root, &p->idata); // 构建hybrid index + // 将type和地址偏移信息写入kmer + if (khits <= HYB_HIT_THRESH) { // hits数量小于等于30 + type_hits = khits; // hits数量 + } else { + type_hits = HYB_HIT_THRESH + 1; + } + packed_data = (idata_offset << HYB_KMER_DATA_TYPE_BITS) | type_hits; + for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off); + p->bytes += root->node_bytes; // 累加当前节点所占内存 + release_node(root); // 释放所有子节点和当前节点 + } + + if (p->tid == 0 && kmer_size % percent_mod == 0) { + fprintf(stderr, "[thread %d] %d %% done, bytes: %fM\n", p->tid, percent++, (double)p->bytes / 1024 / 1024); + } + } + + fprintf(stderr, "Finish-[thread %d] bytes: %fM\n", p->tid, (double)p->bytes / 1024 / 1024); + return NULL; +} + +// 多线程调整hybrid index中的block data偏移地址,整合成连续的一块地址空间 +static void* thread_adjust_hyb_index(void* arg) { + PartKmer* p = (PartKmer*)arg; + uint64_t all_kmer_num = p->kmer_end - p->kmer_start; + uint64_t percent_mod = all_kmer_num / 100; + int percent = 0; + if (percent_mod == 0) percent_mod = 1; + uint8_t type_hits = 0; + uint64_t bd_offset = 0; + uint64_t packed_data = 0; + uint8_t* ptr = (uint8_t*)&packed_data; + uint64_t i, kmer_size; + for (i = p->kmer_start, kmer_size = 0; i < p->kmer_end; ++i, ++kmer_size) { + uint8_t* kd = p->kmer_data + i * HYB_KMER_DATA_TYPE_BITS; + type_hits = *(kd)&HYB_KMER_DATA_TYPE_MASK; + if (type_hits > 2) { // block data index数据,需要改offset + // bd_offset = ((*(uint64_t *)kd) >> kTypeBits) & p->KMER_MASK; // 指向当前要用的地址偏移量 + bd_offset = ((*(uint64_t*)kd) & HYB_KMER_DATA_MASK) >> HYB_KMER_DATA_TYPE_BITS; // 指向当前要用的地址偏移量 + bd_offset += p->prev_bd_size; + packed_data = (bd_offset << HYB_KMER_DATA_TYPE_BITS) | type_hits; + for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off); + } + + if (p->tid == 0 && kmer_size % percent_mod == 0) { + fprintf(stderr, "Info: %d%% done adjust.\n", percent++); + } + } + return NULL; +} + +// 保存hybrid index +static void save_hybrid_idx(const char* prefix, PartKmerVec* pkArr) { + // fprintf(stderr, "size: %ld\n", kv_size(*pkArr)); + // return; + char fn[MAX_PATH]; + char fn2[MAX_PATH]; + + sprintf(fn, "%s.hyb.kmer", prefix); + sprintf(fn2, "%s.hyb.data", prefix); + + FILE* fpk = fopen(fn, "wb"); + FILE* fpbd = fopen(fn2, "wb"); + + if (fpk == NULL) { + fprintf(stderr, "Error: open file %s failed\n", fn); + return; + } + if (fpbd == NULL) { + fprintf(stderr, "Error: open file %s failed", fn2); + return; + } + + uint64_t max_kmer = (1ULL << (HYB_KMER_LEN << 1)); + err_fwrite(kv_A(*pkArr, 0).kmer_data, 1, max_kmer * HYB_KMER_DATA_BYTES, fpk); // 小端 + int i; + for (i = 0; i < kv_size(*pkArr); ++i) { + err_fwrite(kv_A(*pkArr, i).idata.addr, 1, kv_A(*pkArr, i).idata.size, fpbd); + } + fclose(fpk); + fclose(fpbd); +} + +// 构建hybrid-index入口函数 +int hyb_idx_build_and_dump(int num_threads, bwt_t* bwt, const char* idx_prefix) { + PartKmerVec pk_arr; + kv_init(pk_arr); + kv_resize(PartKmer, pk_arr, num_threads); + pk_arr.n = num_threads; + + kvec_t(pthread_t) tid_arr; + kv_init(tid_arr); + kv_resize(pthread_t, tid_arr, num_threads); + tid_arr.n = num_threads; + + uint64_t max_kmer = (1ULL << (HYB_KMER_LEN << 1)); + uint8_t* kmer_data = (uint8_t*)malloc(max_kmer * HYB_KMER_DATA_BYTES); + + int i; + for (i = 0; i < num_threads; ++i) { + PartKmer* p = &kv_A(pk_arr, i); + p->tid = i; + p->kmer_start = KMER_START(i, max_kmer, num_threads); + p->kmer_end = KMER_END(i, max_kmer, num_threads); + p->bytes = 0; + p->kmer_data = kmer_data; + p->prev_bd_size = 0; + _bd_init(p->idata); + p->bwt = bwt; + pthread_create(&kv_A(tid_arr, i), NULL, thread_build_hyb_index, p); + } + + uint64_t all_bytes = 0; + uint64_t prev_bd_size = 0; + for (i = 0; i < num_threads; ++i) { + pthread_join(kv_A(tid_arr, i), NULL); + PartKmer* p = &kv_A(pk_arr, i); + all_bytes += p->bytes; + p->prev_bd_size = prev_bd_size; + prev_bd_size += p->idata.size; + } + + for (i = 0; i < num_threads; ++i) { + pthread_create(&kv_A(tid_arr, i), NULL, thread_adjust_hyb_index, &kv_A(pk_arr, i)); + } + for (int i = 0; i < num_threads; ++i) { + pthread_join(kv_A(tid_arr, i), NULL); + } + + fprintf(stderr, "Hybrid index total bytes: %fM\n", (double)all_bytes / 1024 / 1024); + + double time_sava_hybrid_idx = realtime(); + save_hybrid_idx(idx_prefix, &pk_arr); + time_sava_hybrid_idx = realtime() - time_sava_hybrid_idx; + + fprintf(stderr, "Time save hybrid index: %fs\n", time_sava_hybrid_idx); + + kv_destroy(pk_arr); + kv_destroy(tid_arr); + + return 0; +} \ No newline at end of file diff --git a/hyb_idx.h b/hyb_idx.h index eb43c05..6d8a7db 100644 --- a/hyb_idx.h +++ b/hyb_idx.h @@ -9,3 +9,173 @@ #include "kvec.h" #include "utils.h" +/////////////////////////////////////////// +// 宏定义 +// 数据 +#define HYB_KMER_LEN 16 +#define HYB_KMER_MASK 4294967295ULL +#define HYB_KMER_DATA_BYTES 5 // 5bytes +#define HYB_KMER_DATA_MASK 1099511627775ULL // 5bytes +#define HYB_KMER_DATA_TYPE_BITS 5 +#define HYB_KMER_DATA_TYPE_MASK 31 +#define HYB_HIT_THRESH 30 +#define HYB_NODE_SA_MASK 1099511627775ULL +#define HYB_MAX_SEQ_LEN 301 +#define HYB_LEAF_NODE 7 + +// 节点类型 +#define HYB_BP_1 0 +#define HYB_BP_2 1 +#define HYB_BP_3 2 +#define HYB_BP_PATH 3 + +static const uint32_t ga_hybHitsMask[5] = {0, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, 0xFFFFFFFF}; // hits掩码 +static const uint32_t ga_hybOffMask[5] = {0, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, 0xFFFFFFFF}; // offset掩码 +static const uint32_t ga_hybLearnedMask[5] = {0, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, 0xFFFFFFFF}; // learned counts 掩码 + +// 函数 +#define _kmer_from_pos(bit_seq, start) (((*(uint64_t*)&bit_seq[start >> 2]) >> ((start & 3) << 1)) & HYB_KMER_MASK) + +#define _reverse_range(rs, re, seq_len, new_range) Range new_range = {seq_len - (re), seq_len - (rs), (re) - (rs)}; + +#define _rev_ref(hyb, ref_pos) ((hyb->ref_len << 1) - 1 - (ref_pos)) +/////////////////////////////////////// +// 结构体 + +// hybrid-index数据结构 +typedef struct { + uint8_t* kmer_data; // kmer数据 + uint8_t* index_data; // 二级索引数据 + uint8_t* sa; // 后缀数组 + uint8_t* ref_bits; // 2-bit编码的ref序列 + uint64_t ref_len; // ref长度 +} HybridIndex; + +// Read序列需要的数据 +typedef struct { + int len; // read长度 + uint8_t* seq; // 每个碱基占一个字节(0-A, 1-C, 2-G, 3-G, 4-N) + uint8_t* rseq; // 反向互补序列 + uint8_t* for_bits; // 正向序列 2-bit编码 + uint8_t* back_bits; // 反向互补序列 2-bit编码 + int id; // for test; + char* seqstr; +} ReadSeq; + +typedef kvec_t(ReadSeq) ReadSeqArr; + +// seeding过程生成的SMEM +typedef struct { + int first_len; + int seed_start; // 左闭右开区间 + int seed_end; + int read_start; // 左闭右开区间, 默认是[0, read_len), 如果有N, 则是[N_pos + 1, next_N_pos) + int read_end; + uint64_v ref_pos_arr; +} HybSeed; + +// smem数组 +typedef kvec_t(HybSeed) HybSeedArr; + +// read区间,用来处理包含N的read +// 左闭右开区间, 默认是[0, read_len), 如果有N, 则是[N_pos + 1, next_N_pos) +typedef struct { + int start; + int end; + int len; +} Range; + +#define _set_range(r, s, e) {(r).start = s; (r).end = e; (r).len = (r).end - (r).start;} + +#define _range_equal(r1, r2) ((r1).start == (r2).start && (r1).end == (r2).end) + +#define _range_cross(r1, r2) \ + (((r1).start < (r2).start && (r1).end < (r2).end) || ((r1).start > (r2).start && (r1).end > (r2).end)) + +// read区间数组 +typedef kvec_t(Range) RangeArr; + +#define __check_add_seed(seed) \ + HybSeed* seed = kv_pushp(HybSeed, *seeds); \ + if (seeds->m > *seeds_cap) { \ + uint64_t i = *seeds_cap; \ + for (i = *seeds_cap; i < seeds->m; ++i) kv_init(seeds->a[i].ref_pos_arr); \ + *seeds_cap = seeds->m; \ + } \ + seed->ref_pos_arr.n = 0; + +#define __add_seed_one_pos(seed_var, ref_pos, s_start, s_end) \ + __check_add_seed(seed_var); \ + kv_push(uint64_t, seed_var->ref_pos_arr, ref_pos); \ + seed_var->first_len = 0; \ + seed_var->seed_start = s_start; \ + seed_var->seed_end = s_end; \ + seed_var->read_start = read_range->start; \ + seed_var->read_end = read_range->end; + +#define __check_add_seed_one_pos(seed_var, ref_pos, s_start, s_end) \ + if ((s_end) - (s_start) >= min_seed_len) { \ + __add_seed_one_pos(seed_var, ref_pos, s_start, s_end); \ + } + +#define __copy_seed(src, dst) \ + do { \ + (dst).seed_start = (src).seed_start; \ + (dst).seed_end = (src).seed_end; \ + (dst).read_start = (src).read_start; \ + (dst).read_end = (src).read_end; \ + int i; \ + for (i = 0; i < (src).ref_pos_arr.n; ++i) kv_push(uint64_t, (dst).ref_pos_arr, (src).ref_pos_arr.a[i]); \ + } while (0) + +/////////////////////////////////////////////////////// +// functions + +// 加载hybrid index +HybridIndex* load_hybrid_idx(const char* prefix); + +// 创建正向反向互补bits +void create_seq_fb_bits(uint8_t* bs, int len, uint8_t* forward_bits, uint8_t* re); + +// 用于将seq和ref正向直接比较,返回匹配的长度 +int forward_match_len(uint8_t* seq, int64_t seq_pos, int64_t seq_end, uint8_t* ref, int64_t ref_pos, int64_t ref_len); + +// 用于将seq和ref反向直接比较,返回匹配的长度 +int backward_match_len(uint8_t* seq, int64_t seq_pos, int64_t seq_start, uint8_t* ref, int64_t ref_pos); + +// 根据sa的行获取对应的ref position(小端模式) +uint64_t hyb_sa_to_ref_pos(uint8_t* sa_arr, uint64_t row); + +// 解析第一个节点, 返回后续对应的节点地址 +uint8_t* parse_first_hyb_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p, + uint64_t* sa_start_p, uint32_t* hits_p, uint8_t* cmp_ref, int tid); + +// 解析后续的正常节点 +uint8_t* parse_one_hyb_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p, + uint64_t* sa_start_p, uint32_t* hits_p, uint8_t* cmp_ref, int tid); +void parse_one_hyb_node_min_hits(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int min_hits, + int is_head, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, int tid); +void parse_one_hyb_node_max_hits(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int max_hits, int min_bp, + int is_head, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, int tid); +void get_leaf_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p, uint32_t* hits_p, + uint64_t* sa_start_p, uint8_t* cmp_ref, int tid); +void get_kmer_data(const HybridIndex* hyb, uint8_t* seq_bits, int kmer_pos, uint8_t* type_hits, uint64_t* offset); + +void right_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits, + int kmer_start, int init_match_len, uint64_t ref_pos, int* right_match); +void left_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits, + int kmer_start, int init_match_len, uint64_t ref_pos, int* left_match); +void both_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits, + int kmer_start, int init_match_len, uint64_t ref_pos, int* left_match, int* right_match); + +// 用hybrid-index来寻找smem(seeding-1 & seeding-2) +void hyb_first_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const int min_seed_len, + HybSeedArr* seeds, int tid); + +int hyb_second_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, int seed_start, int seed_end, int read_start, + int read_end, uint64_t first_ref, int min_hits, int pre_pivot, int pre_start, int pre_end, + const int min_seed_len, HybSeedArr* seeds, int tid); + +// for seeding-3 +void hyb_third_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* seeds_range, + const int min_seed_len, const int max_hits, HybSeedArr* seeds, int tid); diff --git a/profiling.c b/profiling.c index d591721..b27d055 100644 --- a/profiling.c +++ b/profiling.c @@ -12,9 +12,10 @@ Date : 2024/04/06 #include "profiling.h" #include "debug.h" +uint64_t proc_freq = 1000; + #ifdef SHOW_PERF uint64_t tprof[LIM_THREAD_PROF_TYPE][LIM_THREAD] = {0}; -uint64_t proc_freq = 1000; uint64_t gprof[LIM_GLOBAL_PROF_TYPE] = {0}; #endif