完成了hybrid index的创建代码,还没测试
This commit is contained in:
parent
74b464ee4a
commit
caf01ce0f3
|
|
@ -92,13 +92,12 @@
|
|||
"preLaunchTask": "Build",
|
||||
"type": "cppdbg",
|
||||
"request": "launch",
|
||||
"program": "${workspaceRoot}/hbwa",
|
||||
"program": "${workspaceRoot}/hybalign",
|
||||
"args": [
|
||||
"bwt2hybrid",
|
||||
"-e",
|
||||
"bwt2hyb",
|
||||
"-t",
|
||||
"1",
|
||||
"~/data/fmt_ref/human_g1k_v37_decoy.fasta"
|
||||
"./index/human_g1k_v37_decoy.fasta"
|
||||
],
|
||||
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
||||
},
|
||||
|
|
|
|||
7
bwamem.c
7
bwamem.c
|
|
@ -106,7 +106,12 @@ mem_opt_t *mem_opt_init()
|
|||
o->max_chain_extend = 1<<30;
|
||||
o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len);
|
||||
bwa_fill_scmat(o->a, o->b, o->mat);
|
||||
return o;
|
||||
|
||||
o->use_bwt = 0;
|
||||
o->skip_entire_match = 0;
|
||||
o->batch_size = 256;
|
||||
|
||||
return o;
|
||||
}
|
||||
|
||||
/***************************
|
||||
|
|
|
|||
3
bwamem.h
3
bwamem.h
|
|
@ -81,6 +81,9 @@ typedef struct {
|
|||
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
|
||||
int max_XA_hits, max_XA_hits_alt; // if there are max_hits or fewer, output them all
|
||||
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
|
||||
int batch_size; // number of bases to process in each batch
|
||||
int use_bwt; // whether to use bwt index for seeding
|
||||
int skip_entire_match; // whether to skip the second and third seeding steps for entire matching reads
|
||||
} mem_opt_t;
|
||||
|
||||
typedef struct {
|
||||
|
|
|
|||
17
fastmap.c
17
fastmap.c
|
|
@ -150,13 +150,13 @@ int main_mem(int argc, char *argv[])
|
|||
mem_pestat_t pes[4];
|
||||
ktp_aux_t aux;
|
||||
|
||||
memset(&aux, 0, sizeof(ktp_aux_t));
|
||||
memset(&aux, 0, sizeof(ktp_aux_t));
|
||||
memset(pes, 0, 4 * sizeof(mem_pestat_t));
|
||||
for (i = 0; i < 4; ++i) pes[i].failed = 1;
|
||||
|
||||
aux.opt = opt = mem_opt_init();
|
||||
memset(&opt0, 0, sizeof(mem_opt_t));
|
||||
while ((c = getopt(argc, argv, "51qpaMCSPVYjuk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "51qpaMCSPVYjuk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:b:we")) >= 0) {
|
||||
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
|
||||
else if (c == '1') no_mt_io = 1;
|
||||
else if (c == 'x') mode = optarg;
|
||||
|
|
@ -254,8 +254,12 @@ int main_mem(int argc, char *argv[])
|
|||
if (bwa_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s] mean insert size: %.3f, stddev: %.3f, max: %d, min: %d\n",
|
||||
__func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low);
|
||||
}
|
||||
else return 1;
|
||||
} else if (c == 'b')
|
||||
opt->batch_size = atoi(optarg) >> 1 << 1, opt->batch_size = opt->batch_size > 1 ? opt->batch_size : 256;
|
||||
else if (c == 'w')
|
||||
opt->use_bwt = 1;
|
||||
else if (c == 'e') opt->skip_entire_match = 1;
|
||||
else return 1;
|
||||
}
|
||||
|
||||
if (rg_line) {
|
||||
|
|
@ -320,7 +324,10 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " (4 sigma from the mean if absent) and min of the insert size distribution.\n");
|
||||
fprintf(stderr, " FR orientation only. [inferred]\n");
|
||||
fprintf(stderr, " -u output XB instead of XA; XB is XA with the alignment score and mapping quality added.\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, " -b INT batch size of reads to process at one time [%d].\n", opt->batch_size);
|
||||
fprintf(stderr, " -w Use bwt index for seeding\n");
|
||||
fprintf(stderr, " -e Skip the second and third seeding steps for entire matching reads.\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n");
|
||||
fprintf(stderr, "\n");
|
||||
free(opt);
|
||||
|
|
|
|||
47
hyb_bwa.c
47
hyb_bwa.c
|
|
@ -117,7 +117,7 @@ int bwa_extract_byte_sa(int argc, char* argv[]) {
|
|||
}
|
||||
}
|
||||
if (optind + 3 > argc) {
|
||||
fprintf(stderr, "Usage: bwa extractbytesa [-i %d] <in.bwt> <in.full_bytesa> <out.bytesa>\n", sa_intv);
|
||||
fprintf(stderr, "Usage: bwa extractbytesa [-i %d] <in.bwt> <in.full_bytesa> <out.bytesa>\n\n", sa_intv);
|
||||
return 1;
|
||||
}
|
||||
bwt = bwt_restore_bwt(argv[optind]);
|
||||
|
|
@ -206,25 +206,56 @@ void bwt_create_kmer_idx(bwt_t* bwt) {
|
|||
free(seq_arr);
|
||||
}
|
||||
|
||||
// 从bwt中创建kmer index,并保存到文件
|
||||
int bwa_bwt2kmer(int argc, char* argv[]) {
|
||||
bwt_t* bwt;
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "Usage: bwa extractbytesa <in.bwt> <out.kmer>\n");
|
||||
fprintf(stderr, "Usage: bwa extractbytesa <in.bwt> <out.kmer>\n\n");
|
||||
return 1;
|
||||
}
|
||||
bwt = bwt_restore_bwt(argv[optind]);
|
||||
|
||||
bwt_create_kmer_idx(bwt);
|
||||
|
||||
bwt_dump_kmer_idx(argv[optind + 1], &bwt->kmer_hash);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bwa_bwt2hyb(int argc, char* argv[]) {}
|
||||
// 创建hybrid index,并保存到文件
|
||||
int bwa_bwt2hyb(int argc, char* argv[]) {
|
||||
int hyb_idx_build_and_dump(int num_threads, bwt_t* bwt, const char* idx_prefix);
|
||||
bwt_t* bwt;
|
||||
int num_threads = 1;
|
||||
char c;
|
||||
int error = 0;
|
||||
while ((c = getopt(argc, argv, "t:")) >= 0) {
|
||||
switch (c) {
|
||||
case 't':
|
||||
num_threads = atoi(optarg);
|
||||
assert(num_threads > 0 && num_threads < 512);
|
||||
break;
|
||||
default:
|
||||
error = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (optind + 1 > argc || error) {
|
||||
fprintf(stderr, "Usage: bwa bwt2hyb [Options] <bwt-prefix>\n\n");
|
||||
fprintf(stderr, "Options: -t INT number of threads for hybrid index building [%d]\n", num_threads);
|
||||
fprintf(stderr, "\n");
|
||||
return 1;
|
||||
}
|
||||
char fn[MAX_PATH];
|
||||
snprintf(fn, MAX_PATH, "%s.bwt", argv[optind]);
|
||||
bwt = bwt_restore_bwt(fn);
|
||||
snprintf(fn, MAX_PATH, "%s.kmer", argv[optind]);
|
||||
bwt->kmer_hash = bwt_restore_kmer_idx(fn);
|
||||
snprintf(fn, MAX_PATH, "%s.hyb.bytesa", argv[optind]);
|
||||
_load_file_to_data(fn, bwt->byte_sa);
|
||||
hyb_idx_build_and_dump(num_threads, bwt, argv[optind]);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// 创建正向的kmer
|
||||
inline uint64_t build_forward_kmer(const uint8_t* q, int qlen, int kmer_len, int* base_consumed) {
|
||||
uint64_t build_forward_kmer(const uint8_t* q, int qlen, int kmer_len, int* base_consumed) {
|
||||
uint64_t qbit = 0, i;
|
||||
qlen = qlen < kmer_len ? qlen : kmer_len;
|
||||
for (i = 0; i < qlen; ++i) {
|
||||
|
|
@ -239,7 +270,7 @@ inline uint64_t build_forward_kmer(const uint8_t* q, int qlen, int kmer_len, int
|
|||
int hyb_test(int argc, char* argv[]) {
|
||||
bwt_t* bwt;
|
||||
if (optind + 1 > argc) {
|
||||
fprintf(stderr, "Usage: bwa hybtest <idx_prefix>\n");
|
||||
fprintf(stderr, "Usage: bwa hybtest <idx_prefix>\n\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
|
|||
866
hyb_create_idx.c
866
hyb_create_idx.c
|
|
@ -0,0 +1,866 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2025- Zhonghai Zhang (ICT)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <pthread.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <zlib.h>
|
||||
|
||||
#include "bwt.h"
|
||||
#include "hyb_idx.h"
|
||||
#include "kvec.h"
|
||||
#include "rle.h"
|
||||
#include "rope.h"
|
||||
#include "utils.h"
|
||||
|
||||
#define KMER_START(tid, max_kmer, num_thread) ((uint64_t)(max_kmer) * (uint64_t)(tid) / (uint64_t)(num_thread))
|
||||
#define KMER_END(tid, max_kmer, num_thread) ((uint64_t)(max_kmer) * (uint64_t)((tid) + 1) / (uint64_t)(num_thread))
|
||||
|
||||
///////////////////////////
|
||||
|
||||
typedef struct {
|
||||
uint64_t size;
|
||||
uint64_t cap;
|
||||
uint8_t* addr;
|
||||
} BlockData;
|
||||
|
||||
#define _bd_init(bd) \
|
||||
do { \
|
||||
(bd).size = 0; \
|
||||
(bd).cap = 0; \
|
||||
(bd).addr = 0; \
|
||||
} while (0)
|
||||
|
||||
#define _bd_claim(bd, add_size) \
|
||||
do { \
|
||||
if ((bd).size + add_size >= (bd).cap) { \
|
||||
(bd).cap += add_size + 4L * 1024 * 1024; \
|
||||
(bd).addr = (uint8_t*)realloc((bd).addr, (bd).cap); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define _bd_addr(bd) ((bd).addr + (bd).size)
|
||||
|
||||
#define _bd_destory(bd) \
|
||||
do { \
|
||||
free((bd).addr); \
|
||||
} while (0)
|
||||
|
||||
typedef struct {
|
||||
uint32_t idx;
|
||||
uint32_t cnt;
|
||||
uint32_t bytes;
|
||||
} idx_cnt_t;
|
||||
|
||||
// 构建hyb-index时候用到的节点数据结构
|
||||
typedef struct _Node {
|
||||
uint8_t header; // 1字节头部信息 [0]
|
||||
uint8_t hit_bytes; // 用多少bytes表示hits
|
||||
uint8_t off_bytes; // 用多少bytes表示子节点数据偏移量
|
||||
|
||||
uint8_t hits_neq; // 和子节点hits数量不相等
|
||||
uint8_t hits_neq1; // 子节点和孙子节点hits数量不相等
|
||||
uint8_t max_depth_leaf; // 是否达到最大深度的叶节点
|
||||
uint8_t normal_leaf; // 正常的叶子节点,hits=1
|
||||
|
||||
uint16_t seq_len; // 单一分支碱基序列长度,如果分支,那就是1
|
||||
uint16_t depth; // 当前节点对应的位置,0对应着16(不包含16-kmer)
|
||||
uint16_t min_child_depth; // 最近到hit=1的深度
|
||||
uint16_t max_child_depth; // 最远到hit=1或者到达最大深度时的深度
|
||||
|
||||
uint32_t hits; // 匹配个数
|
||||
uint32_t child_num; // 有多少分支子节点, 第一层子节点个数
|
||||
uint32_t num_offspring; // 所有后代节点个数
|
||||
uint32_t min_offspring_hits; // 所有后代中,最后最少的hits
|
||||
uint32_t node_bytes; // 该节点包含所有后代所占内存
|
||||
uint32_t child_bytes; // 子节点所占内存
|
||||
|
||||
uint8_t type; // 当前节点类型
|
||||
uint64_t mark; // 与type对应,4bit,16bit,64bit
|
||||
uint64_t sa_pos; // 对应的sa起始位置
|
||||
uint8_t seq[320]; // 保存所有单一分支的碱基序列
|
||||
|
||||
uint8_t kmer; // 当前节点对应的kmer,root节点是uint64,用这个不准确
|
||||
|
||||
struct _Node* children[64]; // 子节点
|
||||
struct _Node* parent; // 父节点
|
||||
} Node;
|
||||
|
||||
// 并行构建hyb-index的数据结构
|
||||
typedef struct {
|
||||
pthread_t ptid; // 线程ID
|
||||
int tid; // 线程id
|
||||
uint64_t kmer_start; // [) 左闭右开
|
||||
uint64_t kmer_end;
|
||||
uint64_t bytes; // 当前线程所有kmer数据占用的内存字节数
|
||||
uint8_t* kmer_data; // kmer数据
|
||||
uint64_t prev_bd_size; // 前一个kmer对应的index数据所占内存大小,用来调整多线程kmer数据对应的地址偏移量的
|
||||
BlockData idata; // 二级索引数据
|
||||
bwt_t* bwt; // bwt索引
|
||||
} PartKmer;
|
||||
|
||||
typedef kvec_t(PartKmer) PartKmerVec;
|
||||
|
||||
// 当hit变成1后,继续扩展节点,为了保存2-kmer或者3-kmer对应的匹配信息
|
||||
typedef struct {
|
||||
Node* node;
|
||||
uint64_t mark;
|
||||
} MarkNode;
|
||||
|
||||
/////////////////
|
||||
|
||||
// 正向搜索kmer(hyb的16-kmer)在bwt中的区间信息
|
||||
bwtintv_t bwt_search_kmer(const bwt_t* bwt, const uint8_t* q) {
|
||||
bwtintv_t ik = {0}, ok[4] = {0};
|
||||
int i, x, c;
|
||||
uint32_t qbit = 0;
|
||||
x = HASH_KMER_LEN;
|
||||
for (i = 0; i < x; ++i) {
|
||||
qbit |= q[i] << ((HASH_KMER_LEN - 1 - i) << 1);
|
||||
}
|
||||
bwt_kmer_get(&bwt->kmer_hash, &ik, qbit, x - 1);
|
||||
if (ik.x[2] < 1)
|
||||
return ik;
|
||||
for (i = x; i < HYB_KMER_LEN; ++i) {
|
||||
c = 3 - q[i];
|
||||
bwt_extend(bwt, &ik, ok, 0);
|
||||
if (ok[c].x[2] < 1)
|
||||
return ok[c];
|
||||
ik = ok[c];
|
||||
}
|
||||
return ik;
|
||||
}
|
||||
|
||||
// 根据子节点更新父节点信息
|
||||
static void update_parent(Node* n, Node* c, uint64_t k, int update_max) {
|
||||
// 创建完子节点后,更新一些数据
|
||||
n->children[n->child_num++] = c;
|
||||
c->kmer = k;
|
||||
c->parent = n;
|
||||
n->mark |= (1ULL << k); // 设置当前节点的mark
|
||||
n->min_offspring_hits = MIN(n->min_offspring_hits, c->min_offspring_hits);
|
||||
n->min_child_depth = MIN(n->min_child_depth, c->min_child_depth + 1);
|
||||
if (update_max)
|
||||
n->max_child_depth = MAX(n->max_child_depth, c->max_child_depth + 1);
|
||||
n->num_offspring += c->num_offspring + 1;
|
||||
if (c->hits_neq)
|
||||
n->hits_neq1 = 1; // 子节点和孙子节点hits数量不相等
|
||||
}
|
||||
|
||||
// 清除父节点的一些信息, 不释放内存空间
|
||||
static void clear_parent(Node* n) {
|
||||
uint32_t i = 0;
|
||||
n->max_child_depth = 0;
|
||||
for (; i < n->child_num; ++i) n->children[i] = NULL; // 清除子节点
|
||||
n->child_num = 0;
|
||||
n->mark = 0;
|
||||
n->num_offspring = 0;
|
||||
n->hits_neq1 = 0;
|
||||
}
|
||||
|
||||
// 更新节点信息
|
||||
static void update_node_info(PartKmer* pk, Node* n, bwtintv_t* ik, int depth, int max_depth, int update_min) {
|
||||
n->depth = depth; // 当前节点对应的深度
|
||||
n->min_offspring_hits = ik->x[2]; // 所有子节点中最小的hits数量
|
||||
n->sa_pos = ik->x[0]; // bwt_get_sa(pk->fmt->sa, ik->x[0]); // 对应的sa中第一个位置
|
||||
n->hits = ik->x[2]; // 匹配数量
|
||||
if (update_min)
|
||||
n->min_child_depth = max_depth; // 初始化为当前节点所能达到的最大深度(人为设定的阈值)
|
||||
}
|
||||
|
||||
// 普通的leaf节点扩展,扩展到第2层和第3层
|
||||
static void extend_leaf_node(PartKmer* pk, Node* n, bwtintv_t* ik, int depth, int max_depth) {
|
||||
n->normal_leaf = 1;
|
||||
update_node_info(pk, n, ik, depth, max_depth, 0);
|
||||
if (max_depth == 0) {
|
||||
return;
|
||||
}
|
||||
uint8_t k;
|
||||
int child_hits = 0;
|
||||
bwtintv_t ok[4] = {0};
|
||||
bwt_extend(pk->bwt, ik, ok, 0);
|
||||
for (k = 0; k < 4; ++k) {
|
||||
bwtintv_t iv = ok[3 - k];
|
||||
child_hits += iv.x[2];
|
||||
if (iv.x[2] > 0) { // hits必须大于0,即有匹配的位置,才创建子节点
|
||||
Node* child = (Node*)calloc(1, sizeof(Node));
|
||||
child->seq[child->seq_len++] = k; // 保存序列
|
||||
// 递归构建子节点
|
||||
extend_leaf_node(pk, child, &iv, depth + 1, max_depth - 1);
|
||||
// 创建完子节点后,更新一些数据
|
||||
update_parent(n, child, k, 0); // 更新父节点信息
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (child_hits != (int)n->hits) {
|
||||
n->hits_neq = 1; // 当前碱基有一个匹配到了ref的结尾,导致节点hits和子节点hits总和不等
|
||||
}
|
||||
}
|
||||
|
||||
// 递归构建hyb-tree
|
||||
// n: 父节点
|
||||
// depth: kmer之后该碱基在第几个(位置/深度)
|
||||
// max_depth: 遍历到最大深度停止(最长有多少碱基, 从当前节点算起)
|
||||
static void build_hyb_tree(PartKmer* pk, Node* n, bwtintv_t* ik, int depth, int max_depth) {
|
||||
/* 递归终止条件 */
|
||||
if (ik->x[2] == 1) { // 如果hits为0或者1,则不再需要子节点
|
||||
n->min_child_depth = 0;
|
||||
n->normal_leaf = 1; // 正常的叶子节点,hits=1
|
||||
extend_leaf_node(pk, n, ik, depth, 2); // 继续扩展两个碱基
|
||||
return;
|
||||
}
|
||||
// 给节点n赋值
|
||||
update_node_info(pk, n, ik, depth, max_depth, 1);
|
||||
if (max_depth == 0) { // 达到最大深度,不再进行扩展
|
||||
n->min_child_depth = 0;
|
||||
n->max_depth_leaf = 1; // 达到最大深度的叶节点
|
||||
return;
|
||||
}
|
||||
int child_hits = 0; // 所有子节点hits数量之和
|
||||
uint8_t k; // 碱基
|
||||
bwtintv_t ok[4] = {0};
|
||||
bwt_extend(pk->bwt, ik, ok, 0);
|
||||
for (k = 0; k < 4; ++k) { // 1-kmer
|
||||
bwtintv_t iv = ok[3 - k];
|
||||
child_hits += iv.x[2];
|
||||
if (iv.x[2] > 0) { // hits必须大于0,即有匹配的位置,才创建子节点
|
||||
Node* child = (Node*)calloc(1, sizeof(Node));
|
||||
child->seq[child->seq_len++] = k;
|
||||
// 递归构建子节点
|
||||
build_hyb_tree(pk, child, &iv, depth + 1, max_depth - 1);
|
||||
// 创建完子节点后,更新一些数据
|
||||
update_parent(n, child, k, 1); // 更新父节点信息
|
||||
}
|
||||
}
|
||||
// 子节点hits之和应该和父节点hits数量相等,否则,就是当前父节点有一个匹配到了ref的结尾
|
||||
if (child_hits != (int)n->hits) {
|
||||
n->hits_neq = 1; // 当前碱基有一个匹配到了ref的结尾,导致节点hits和子节点hits总和不等
|
||||
}
|
||||
}
|
||||
|
||||
// 将ptr的子节点赋值给n,并释放ptr节点
|
||||
static void assign_children(Node* n, Node* ptr) {
|
||||
clear_parent(n);
|
||||
n->seq[n->seq_len++] = ptr->seq[0]; // 保存序列
|
||||
uint32_t i = 0;
|
||||
for (; i < ptr->child_num; ++i) {
|
||||
update_parent(n, ptr->children[i], ptr->children[i]->seq[0], 1);
|
||||
}
|
||||
free(ptr); // 释放ptr节点
|
||||
}
|
||||
|
||||
// 处理path node
|
||||
static void handle_path_node(Node** n) {
|
||||
Node* cur = *n;
|
||||
Node* ptr = cur->children[0]; // 只有一个子节点
|
||||
while (ptr->child_num == 1 && cur->hits == ptr->hits) { // 继续向下遍历,直到没有子节点, 或父子hits不一致
|
||||
cur->seq[cur->seq_len++] = ptr->seq[0]; // 保存序列
|
||||
Node* next = ptr->children[0]; // 获取下一个子节点
|
||||
free(ptr); // 释放当前节点
|
||||
ptr = next;
|
||||
}
|
||||
if (ptr->child_num == 1) { // hits不一致
|
||||
clear_parent(cur);
|
||||
update_parent(cur, ptr, ptr->seq[0], 1);
|
||||
cur->hits_neq = 1; // 继承子节点的hits_neq
|
||||
ptr->seq[ptr->seq_len++] = ptr->seq[0]; // 多保存一次该节点的碱基,因为他也是path节点
|
||||
*n = ptr;
|
||||
handle_path_node(n); // 递归处理下一个path节点
|
||||
} else {
|
||||
assign_children(cur, ptr); // 将ptr的子节点赋值给n
|
||||
}
|
||||
}
|
||||
|
||||
// dfs遍历当前节点的子节点,扩展到2或3层,然后去掉中间层节点,将后续节点直接与父节点连接, layer = 2 or 3
|
||||
static void shrink_node_layer(PartKmer* pk, Node* n, int layer) {
|
||||
kvec_t(MarkNode) layer_nodes_stack;
|
||||
kv_init(layer_nodes_stack); // dfs遍历的stack
|
||||
kvec_t(MarkNode) kmer_stack;
|
||||
kv_init(kmer_stack); // 装所有的kmer
|
||||
kvec_t(Node*) to_release;
|
||||
kv_init(to_release); // 装所有的待删除的node
|
||||
|
||||
int i;
|
||||
// init stack
|
||||
for (i = (int)n->child_num - 1; i >= 0; --i) {
|
||||
MarkNode ln;
|
||||
ln.node = n->children[i];
|
||||
ln.mark = 1; // 第一层子节点
|
||||
kv_push(MarkNode, layer_nodes_stack, ln);
|
||||
}
|
||||
// dfs 遍历
|
||||
while (kv_size(layer_nodes_stack) > 0) {
|
||||
MarkNode ln = kv_pop(layer_nodes_stack);
|
||||
if ((int)ln.mark == layer) { // 是第3层的节点,创建对应的kmer
|
||||
uint8_t k = 0;
|
||||
if (layer == 2)
|
||||
k = ln.node->parent->seq[0] << 2 | ln.node->seq[0];
|
||||
else if (layer == 3)
|
||||
k = ln.node->parent->parent->seq[0] << 4 | ln.node->parent->seq[0] << 2 | ln.node->seq[0];
|
||||
MarkNode kmer_node;
|
||||
kmer_node.node = ln.node;
|
||||
kmer_node.mark = k; // 保存kmer
|
||||
kv_push(MarkNode, kmer_stack, kmer_node); // 保存kmer
|
||||
} else {
|
||||
kv_push(Node*, to_release, ln.node); // 保存当前节点,待释放
|
||||
for (i = (int)ln.node->child_num - 1; i >= 0; --i) {
|
||||
MarkNode lnn;
|
||||
lnn.node = ln.node->children[i];
|
||||
lnn.mark = ln.mark + 1;
|
||||
kv_push(MarkNode, layer_nodes_stack, lnn);
|
||||
}
|
||||
}
|
||||
}
|
||||
// 处理所有kmer
|
||||
clear_parent(n);
|
||||
int child_hits = 0;
|
||||
for (i = 0; i < (int)kv_size(kmer_stack); ++i) {
|
||||
MarkNode kmer_node = kv_A(kmer_stack, i);
|
||||
child_hits += kmer_node.node->hits; // 统计子节点的hits数量
|
||||
update_parent(n, kmer_node.node, kmer_node.mark, 1); // 更新父节点信息
|
||||
}
|
||||
if (child_hits != (int)n->hits) {
|
||||
n->hits_neq = 1; // 当前碱基有一个匹配到了ref的结尾,导致节点hits和子节点hits总和不等
|
||||
}
|
||||
for (i = 0; i < (int)kv_size(to_release); ++i) {
|
||||
free(kv_A(to_release, i));
|
||||
}
|
||||
kv_destroy(layer_nodes_stack);
|
||||
kv_destroy(kmer_stack);
|
||||
kv_destroy(to_release);
|
||||
}
|
||||
|
||||
// 构建完hybrid tree之后,从根节点开始遍历,设置节点的类型,构建path节点,3,2,1 kmer节点
|
||||
static void update_hyb_tree(PartKmer* pk, Node* n) {
|
||||
if (n->child_num == 1 && !n->normal_leaf && !n->max_depth_leaf && !(n->max_child_depth <= 3)) { // path类型
|
||||
handle_path_node(&n);
|
||||
}
|
||||
uint32_t i = 0;
|
||||
#define __recursive_update_hyb_tree(pk, n) \
|
||||
for (i = 0; i < n->child_num; ++i) { \
|
||||
update_hyb_tree(pk, n->children[i]); \
|
||||
}
|
||||
// 计算子节点的idata地址,和父节点的match_cnt
|
||||
// 递归终点
|
||||
if (n->max_child_depth == 5) {
|
||||
if (n->hits_neq) {
|
||||
n->type = HYB_BP_1;
|
||||
} else {
|
||||
n->type = HYB_BP_2;
|
||||
shrink_node_layer(pk, n, 2);
|
||||
}
|
||||
__recursive_update_hyb_tree(pk, n);
|
||||
} else if (n->max_child_depth == 4) {
|
||||
n->type = HYB_BP_1;
|
||||
__recursive_update_hyb_tree(pk, n);
|
||||
} else if (n->max_child_depth == 3) {
|
||||
if (n->hits_neq) {
|
||||
n->type = HYB_BP_1;
|
||||
} else if (n->hits_neq1) {
|
||||
n->type = HYB_BP_2;
|
||||
shrink_node_layer(pk, n, 2);
|
||||
} else {
|
||||
n->type = HYB_BP_3;
|
||||
shrink_node_layer(pk, n, 3);
|
||||
}
|
||||
} else if (n->max_child_depth == 2) {
|
||||
if (n->hits_neq) {
|
||||
n->type = HYB_BP_1;
|
||||
} else {
|
||||
n->type = HYB_BP_2;
|
||||
shrink_node_layer(pk, n, 2);
|
||||
}
|
||||
} else if (n->max_child_depth == 1) {
|
||||
n->type = HYB_BP_1;
|
||||
} else {
|
||||
if (n->hits_neq) {
|
||||
n->type = HYB_BP_1;
|
||||
} else if (n->hits_neq1) {
|
||||
n->type = HYB_BP_2;
|
||||
shrink_node_layer(pk, n, 2);
|
||||
} else {
|
||||
n->type = HYB_BP_3;
|
||||
shrink_node_layer(pk, n, 3);
|
||||
}
|
||||
__recursive_update_hyb_tree(pk, n);
|
||||
}
|
||||
}
|
||||
|
||||
// 根据构建的radix tree,创建hyb-index
|
||||
static void calc_node_bytes(Node* n) {
|
||||
// 递归终点
|
||||
if (n->normal_leaf || n->max_depth_leaf) { // 叶子节点没有子节点
|
||||
n->node_bytes = 0;
|
||||
return;
|
||||
}
|
||||
// 先处理子节点
|
||||
uint32_t i = 0;
|
||||
for (; i < n->child_num; ++i) {
|
||||
calc_node_bytes(n->children[i]); // 递归构建子节点
|
||||
n->child_bytes += n->children[i]->node_bytes; // 累加子节点所占内存
|
||||
}
|
||||
// 处理当前节点
|
||||
n->node_bytes = n->child_bytes;
|
||||
if (n->depth == 0) n->node_bytes += 5; // sa_pos占用5字节
|
||||
|
||||
// 计算偏移地址字节
|
||||
int off_bytes = 0;
|
||||
if (n->child_bytes == 0) { // 没有子节点
|
||||
off_bytes = 0;
|
||||
} else if (n->child_bytes < 256) {
|
||||
off_bytes = 1; // 子节点数据偏移量小于256, 1B就行
|
||||
} else if (n->child_bytes < 65536) {
|
||||
off_bytes = 2; // 2B就行
|
||||
} else if (n->child_bytes < 16777216) {
|
||||
off_bytes = 3; // 3B就行
|
||||
} else if (n->child_bytes < UINT32_MAX) {
|
||||
off_bytes = 4; // 使用4B
|
||||
} else {
|
||||
fprintf(stderr, "too many child bytes: %d\n", n->child_bytes);
|
||||
exit(1);
|
||||
}
|
||||
n->off_bytes = off_bytes; // 保存子节点数据偏移量字节数
|
||||
|
||||
// 计算hits字节
|
||||
int hits_bytes = 0;
|
||||
if (n->hits < 256) {
|
||||
hits_bytes = 1; // hits小于256, 占用1字节
|
||||
} else if (n->hits < 65536) {
|
||||
hits_bytes = 2; // hits小于65536, 占用2字节
|
||||
} else if (n->hits < 16777216) {
|
||||
hits_bytes = 3; // hits小于16777216, 占用3字节
|
||||
} else if (n->child_bytes < UINT32_MAX) {
|
||||
hits_bytes = 4; // 使用4B
|
||||
} else {
|
||||
fprintf(stderr, "too many hits: %d\n", n->hits);
|
||||
exit(1);
|
||||
}
|
||||
n->hit_bytes = hits_bytes; // 保存hits字节数
|
||||
|
||||
if (n->depth == 0 && n->hits > HYB_HIT_THRESH) {
|
||||
n->node_bytes += hits_bytes; // 根节点需要保存hits
|
||||
}
|
||||
|
||||
if (n->seq_len > 1) { // path节点
|
||||
n->node_bytes += 1; // header, path节点相当于两个节点,一个path,一个普通的
|
||||
n->node_bytes += 1; // 结合header 记录碱基path长度,代替mark
|
||||
n->node_bytes += ((n->seq_len - 1) * 2 + 7) / 8; // 只有根节点保存本节点的碱基,其他的节点不保存seq[0],因为seq[0]是子节点的碱基
|
||||
}
|
||||
// 有可能一个path下边还是一个path,因为可能父子hits不一致
|
||||
if (n->child_num > 1 || n->seq_len == 1) { // 有多个子节点,或者当前节点不是path节点
|
||||
n->node_bytes += 1; // header_bits占用1字节
|
||||
// 叶节点只用bit-map来计算hits偏移就行了, 不用保存hits start信息
|
||||
int all_child_is_leaf = n->hits == n->child_num || (n->hits_neq && n->hits == n->child_num + 1);
|
||||
if (!all_child_is_leaf) {
|
||||
n->node_bytes += (hits_bytes + off_bytes) * (n->child_num - 1); // 不保存第一个子节点offset,因为是0
|
||||
// for test,首个节点不保存hits start和offset,因为hits start是0或者1,offset是0,这样index-data减少了1GB,总共20GB
|
||||
// 这样不能根据概率调整内存位置了
|
||||
}
|
||||
if (n->type == HYB_BP_1) { // 1-kmer节点
|
||||
n->node_bytes += 1; // 1字节的mark
|
||||
} else if (n->type == HYB_BP_2) { // 2-kmer节点
|
||||
n->node_bytes += 2; // 2字节的mark
|
||||
} else if (n->type == HYB_BP_3) { // 3-kmer节点
|
||||
n->node_bytes += 8; // 8字节的mark
|
||||
} else {
|
||||
fprintf(stderr, "Unknown node type: %d\n", (int)n->type);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 释放所有子节点和当前节点
|
||||
static void release_node(Node* p) {
|
||||
uint32_t k = 0;
|
||||
for (k = 0; k < p->child_num; k++) {
|
||||
release_node(p->children[k]);
|
||||
}
|
||||
free(p);
|
||||
}
|
||||
|
||||
void write_seq_bits(uint8_t* bs, int seq_len, BlockData* idata) {
|
||||
if (seq_len > 0) {
|
||||
uint32_t num_bytes = ((seq_len << 1) + 7) >> 3;
|
||||
uint8_t* bit_seq = (uint8_t*)_bd_addr(*idata);
|
||||
uint8_t bybp = 0;
|
||||
int i = 0, j = 0, idx = 0;
|
||||
for (; i + 3 < seq_len; i += 4) {
|
||||
bybp = bs[i] | bs[i + 1] << 2 | bs[i + 2] << 4 | bs[i + 3] << 6;
|
||||
*(uint8_t*)&bit_seq[idx++] = bybp;
|
||||
}
|
||||
bybp = 0;
|
||||
for (; i < seq_len; ++i, ++j) {
|
||||
bybp |= bs[i] << j * 2;
|
||||
}
|
||||
*(uint8_t*)&bit_seq[idx++] = bybp;
|
||||
idata->size += num_bytes; // 更新数据大小
|
||||
}
|
||||
}
|
||||
|
||||
static void write_normal_node_data(Node* n, BlockData* idata, uint8_t write_sa) {
|
||||
uint8_t* ptr;
|
||||
int i, j;
|
||||
int all_child_is_leaf = 0;
|
||||
|
||||
// 2bit子节点类型, 1bit父子hits是否一致,2bit hits字节数;
|
||||
uint8_t header = n->type << 6 | n->hits_neq << 5 | (n->hit_bytes - 1) << 3;
|
||||
// 叶节点只用bit-map来计算hits偏移就行了
|
||||
all_child_is_leaf = n->hits == n->child_num || (n->hits_neq && n->hits == n->child_num + 1);
|
||||
// 3bit子节点地址偏移字节数
|
||||
if (all_child_is_leaf) {
|
||||
header |= HYB_LEAF_NODE; // 用off_bytes = 7来表示叶节点,不用读取hits和offset信息
|
||||
} else {
|
||||
header |= n->off_bytes;
|
||||
// assert(n->off_bytes != 0); // 这里会出错,为啥有这个断言?
|
||||
}
|
||||
|
||||
// header
|
||||
idata->addr[idata->size++] = header; // 写入header
|
||||
// sa_pos
|
||||
if (n->depth == 0 && write_sa) {
|
||||
ptr = (uint8_t*)&n->sa_pos;
|
||||
for (i = 0; i < 5; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
|
||||
idata->size += i; // 写入sa_pos
|
||||
}
|
||||
// hits
|
||||
if (write_sa && n->depth == 0 && n->hits > HYB_HIT_THRESH) {
|
||||
ptr = (uint8_t*)&n->hits;
|
||||
for (i = 0; i < n->hit_bytes; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
|
||||
idata->size += i; // 写入hits
|
||||
}
|
||||
|
||||
// 处理mark
|
||||
if (n->type == HYB_BP_1) {
|
||||
idata->addr[idata->size++] = (uint8_t)(n->mark & 0xFF); // 1-kmer节点
|
||||
} else if (n->type == HYB_BP_2) {
|
||||
ptr = (uint8_t*)&n->mark;
|
||||
for (i = 0; i < 2; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
|
||||
idata->size += i; // 2-kmer节点
|
||||
} else if (n->type == HYB_BP_3) {
|
||||
ptr = (uint8_t*)&n->mark;
|
||||
for (i = 0; i < 8; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
|
||||
idata->size += i; // 3-kmer节点
|
||||
}
|
||||
|
||||
if (all_child_is_leaf) {
|
||||
return; // 叶节点只用bit-map来计算hits start就行了
|
||||
}
|
||||
|
||||
// 处理子节点
|
||||
uint32_t sa_start = 0;
|
||||
uint32_t offset = 0; // 子节点数据偏移量
|
||||
// 剩下的节点
|
||||
for (j = 1; j < (int)n->child_num; ++j) {
|
||||
sa_start += n->children[j - 1]->hits; // 累加前一个子节点的hits
|
||||
ptr = (uint8_t*)&sa_start;
|
||||
for (i = 0; i < n->hit_bytes; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
|
||||
idata->size += i; // 写入sa_start
|
||||
if (n->children[j]->node_bytes == 0 && n->children[j]->hits > 1 && !n->children[j]->max_depth_leaf) {
|
||||
fprintf(stderr, "Error: no child! hits:%u, depth:%u, max-leaf:%u\n", n->children[j]->hits, n->children[j]->depth,
|
||||
n->children[j]->max_depth_leaf);
|
||||
}
|
||||
offset += n->children[j - 1]->node_bytes;
|
||||
ptr = (uint8_t*)&offset;
|
||||
for (i = 0; i < n->off_bytes; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
|
||||
idata->size += i; // 写入offset
|
||||
}
|
||||
}
|
||||
|
||||
// 写入path node数据,一个path node包含path和normal两种数据
|
||||
static void write_path_node_data(Node* n, BlockData* idata) {
|
||||
// 2bit子节点类型, 1bit父子hits是否一致,2bit hits字节数;
|
||||
uint8_t header = HYB_BP_PATH << 6 | n->hits_neq << 5 | (n->hit_bytes - 1) << 3;
|
||||
int path_len = n->seq_len - 1; // path节点的长度
|
||||
header |= path_len >> 8; // 此时,header最低位保存path_len的高8位(9bit足以表示path的长度)
|
||||
idata->addr[idata->size++] = header; // 写入header
|
||||
int i;
|
||||
uint8_t* ptr;
|
||||
if (n->depth == 0) {
|
||||
ptr = (uint8_t*)&n->sa_pos;
|
||||
for (i = 0; i < 5; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
|
||||
idata->size += i; // 写入sa_pos
|
||||
}
|
||||
// 处理hits
|
||||
if (n->depth == 0 && n->hits > HYB_HIT_THRESH) {
|
||||
ptr = (uint8_t*)&n->hits;
|
||||
for (i = 0; i < n->hit_bytes; ++i) *(_bd_addr(*idata) + i) = *(ptr + i);
|
||||
idata->size += i; // 写入hits
|
||||
}
|
||||
// 处理mark
|
||||
idata->addr[idata->size++] = (uint8_t)(path_len & 0xFF); // 写入mark
|
||||
// 写入seq
|
||||
write_seq_bits(n->seq + 1, path_len, idata); // 跳过seq[0],因为seq[0]是子节点的碱基
|
||||
// 继续把该节点当做正常节点处理
|
||||
if (n->child_num > 1)
|
||||
write_normal_node_data(n, idata, 0); // 写入正常节点数据
|
||||
}
|
||||
|
||||
/*
|
||||
header-1B: 2bit子节点类型,1bit父子hits是否一致,2bit hits字节,3bit子节点数据偏移量字节,
|
||||
*/
|
||||
static void build_index_from_node(Node* n, BlockData* idata) {
|
||||
if (n->normal_leaf || n->max_depth_leaf) { // 叶子节点没有子节点
|
||||
return;
|
||||
}
|
||||
uint32_t i = 0;
|
||||
_bd_claim(*idata, n->node_bytes); // 确保有足够的内存空间
|
||||
|
||||
// 先处理当前节点
|
||||
// 1. 写入header
|
||||
// 2. 如果是根节点,写入sa_pos
|
||||
// 3. 写入hits
|
||||
// 4. 写入mark
|
||||
// 5. 写入子节点sa_pos偏移量和子节点数据偏移量
|
||||
uint32_t node_self_size = idata->size;
|
||||
if (n->seq_len > 1) {
|
||||
write_path_node_data(n, idata); // 写入path节点数据
|
||||
} else {
|
||||
write_normal_node_data(n, idata, 1); // 写入正常节点数据
|
||||
}
|
||||
node_self_size = idata->size - node_self_size; // 当前节点数据大小
|
||||
if (node_self_size != n->node_bytes - n->child_bytes) {
|
||||
fprintf(stderr, "Error: node bytes not match: %u, %u, %u, %u, %u, %u, %u, %u, %u, %u\n", node_self_size,
|
||||
n->node_bytes - n->child_bytes, n->node_bytes, n->depth, n->max_child_depth, n->seq_len, n->hits, n->child_num,
|
||||
n->hits_neq, (int)n->type);
|
||||
Node* p = n;
|
||||
fprintf(stderr, "Info: father neq: %u, %u, %u\n", p->parent->hits_neq, p->parent->child_num, p->parent->hits);
|
||||
while (p != NULL) {
|
||||
fprintf(stderr, "Info: hits: %u, depth: %u, childs: %u, hits_neq: %u, seq_len: %u, max_depth: %u\n", p->hits,
|
||||
p->depth, p->child_num, p->hits_neq, p->seq_len, p->max_child_depth);
|
||||
p = p->children[0]; // 只打印第一个子节点
|
||||
}
|
||||
p = n;
|
||||
while (p->depth > 0) {
|
||||
fprintf(stderr, "Info: hits: %u, depth: %u, childs: %u, hits_neq: %u, seq_len: %u, max_depth: %u\n", p->hits,
|
||||
p->depth, p->child_num, p->hits_neq, p->seq_len, p->max_child_depth);
|
||||
p = p->parent; // 向上打印父节点
|
||||
}
|
||||
fprintf(stderr, "Info: hits: %u, depth: %u, childs: %u, hits_neq: %u, seq_len: %u, max_depth: %u\n", p->hits, p->depth,
|
||||
p->child_num, p->hits_neq, p->seq_len, p->max_child_depth);
|
||||
|
||||
exit(0);
|
||||
}
|
||||
|
||||
for (i = 0; i < n->child_num; ++i) { // 这里需要按照匹配概率来修改
|
||||
build_index_from_node(n->children[i], idata); // 递归构建子节点
|
||||
}
|
||||
}
|
||||
|
||||
// 多线程创建hybrid index
|
||||
static void* thread_build_hyb_index(void* arg) {
|
||||
PartKmer* p = (PartKmer*)arg;
|
||||
uint8_t kmer[HYB_KMER_LEN + 1];
|
||||
uint64_t all_kmer_num = p->kmer_end - p->kmer_start;
|
||||
uint64_t percent_mod = all_kmer_num / 100;
|
||||
int percent = 0;
|
||||
if (percent_mod == 0)
|
||||
percent_mod = 1;
|
||||
|
||||
uint8_t type_hits = 0;
|
||||
uint64_t ref_pos = 0;
|
||||
uint64_t sa_forward = 0;
|
||||
uint64_t idata_offset = 0;
|
||||
uint64_t packed_data = 0;
|
||||
uint8_t* ptr = (uint8_t*)&packed_data;
|
||||
uint64_t i, j, kmer_size;
|
||||
fprintf(stderr, "[thread %d] start building hybrid index for kmer range [%lu, %lu)\n", p->tid, p->kmer_start, p->kmer_end);
|
||||
for (i = p->kmer_start, kmer_size = 0; i < p->kmer_end; ++i, ++kmer_size) {
|
||||
uint8_t* kd = p->kmer_data + i * HYB_KMER_DATA_BYTES;
|
||||
for (j = 0; j < HYB_KMER_LEN; ++j) {
|
||||
kmer[j] = (i >> (j << 1)) & 3;
|
||||
}
|
||||
bwtintv_t iv = bwt_search_kmer(p->bwt, kmer);
|
||||
const uint32_t khits = iv.x[2]; // 有多少个匹配的位置
|
||||
if (khits == 0) {
|
||||
// no extra bytes
|
||||
packed_data = 0;
|
||||
for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off);
|
||||
} else if (khits == 1) { // 只有一个匹配,直接保存对应的参考基因位置
|
||||
// no extra bytes
|
||||
type_hits = 1;
|
||||
ref_pos = bwt_get_sa(p->bwt->byte_sa, iv.x[0]); // 直接记录kmer对应的reference pos
|
||||
packed_data = (ref_pos << HYB_KMER_DATA_TYPE_BITS) | type_hits;
|
||||
// 这里需要注意,packed_data是小端模式的, 5个字节
|
||||
for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off);
|
||||
} else if (khits == 2) { // 只有两个匹配,直接保存对应的suffix array位置
|
||||
// no extra bytes
|
||||
type_hits = 2;
|
||||
sa_forward = iv.x[0]; // 记录kmer对应的后缀数组的位置
|
||||
packed_data = (sa_forward << HYB_KMER_DATA_TYPE_BITS) | type_hits;
|
||||
for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off);
|
||||
} else {
|
||||
Node* root = (Node*)calloc(1, sizeof(Node));
|
||||
root->seq[root->seq_len++] = kmer[HYB_KMER_LEN - 1];
|
||||
build_hyb_tree(p, root, &iv, 0, HYB_MAX_SEQ_LEN - HYB_KMER_LEN); // 创建基数树
|
||||
update_hyb_tree(p, root); // 更新hybrid tree
|
||||
idata_offset = p->idata.size; // 指向当前要用的地址偏移量(起始位置)
|
||||
calc_node_bytes(root); // 计算每个节点所占内存
|
||||
build_index_from_node(root, &p->idata); // 构建hybrid index
|
||||
// 将type和地址偏移信息写入kmer
|
||||
if (khits <= HYB_HIT_THRESH) { // hits数量小于等于30
|
||||
type_hits = khits; // hits数量
|
||||
} else {
|
||||
type_hits = HYB_HIT_THRESH + 1;
|
||||
}
|
||||
packed_data = (idata_offset << HYB_KMER_DATA_TYPE_BITS) | type_hits;
|
||||
for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off);
|
||||
p->bytes += root->node_bytes; // 累加当前节点所占内存
|
||||
release_node(root); // 释放所有子节点和当前节点
|
||||
}
|
||||
|
||||
if (p->tid == 0 && kmer_size % percent_mod == 0) {
|
||||
fprintf(stderr, "[thread %d] %d %% done, bytes: %fM\n", p->tid, percent++, (double)p->bytes / 1024 / 1024);
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "Finish-[thread %d] bytes: %fM\n", p->tid, (double)p->bytes / 1024 / 1024);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// 多线程调整hybrid index中的block data偏移地址,整合成连续的一块地址空间
|
||||
static void* thread_adjust_hyb_index(void* arg) {
|
||||
PartKmer* p = (PartKmer*)arg;
|
||||
uint64_t all_kmer_num = p->kmer_end - p->kmer_start;
|
||||
uint64_t percent_mod = all_kmer_num / 100;
|
||||
int percent = 0;
|
||||
if (percent_mod == 0) percent_mod = 1;
|
||||
uint8_t type_hits = 0;
|
||||
uint64_t bd_offset = 0;
|
||||
uint64_t packed_data = 0;
|
||||
uint8_t* ptr = (uint8_t*)&packed_data;
|
||||
uint64_t i, kmer_size;
|
||||
for (i = p->kmer_start, kmer_size = 0; i < p->kmer_end; ++i, ++kmer_size) {
|
||||
uint8_t* kd = p->kmer_data + i * HYB_KMER_DATA_TYPE_BITS;
|
||||
type_hits = *(kd)&HYB_KMER_DATA_TYPE_MASK;
|
||||
if (type_hits > 2) { // block data index数据,需要改offset
|
||||
// bd_offset = ((*(uint64_t *)kd) >> kTypeBits) & p->KMER_MASK; // 指向当前要用的地址偏移量
|
||||
bd_offset = ((*(uint64_t*)kd) & HYB_KMER_DATA_MASK) >> HYB_KMER_DATA_TYPE_BITS; // 指向当前要用的地址偏移量
|
||||
bd_offset += p->prev_bd_size;
|
||||
packed_data = (bd_offset << HYB_KMER_DATA_TYPE_BITS) | type_hits;
|
||||
for (int off = 0; off < 5; ++off) *(kd + off) = *(ptr + off);
|
||||
}
|
||||
|
||||
if (p->tid == 0 && kmer_size % percent_mod == 0) {
|
||||
fprintf(stderr, "Info: %d%% done adjust.\n", percent++);
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// 保存hybrid index
|
||||
static void save_hybrid_idx(const char* prefix, PartKmerVec* pkArr) {
|
||||
// fprintf(stderr, "size: %ld\n", kv_size(*pkArr));
|
||||
// return;
|
||||
char fn[MAX_PATH];
|
||||
char fn2[MAX_PATH];
|
||||
|
||||
sprintf(fn, "%s.hyb.kmer", prefix);
|
||||
sprintf(fn2, "%s.hyb.data", prefix);
|
||||
|
||||
FILE* fpk = fopen(fn, "wb");
|
||||
FILE* fpbd = fopen(fn2, "wb");
|
||||
|
||||
if (fpk == NULL) {
|
||||
fprintf(stderr, "Error: open file %s failed\n", fn);
|
||||
return;
|
||||
}
|
||||
if (fpbd == NULL) {
|
||||
fprintf(stderr, "Error: open file %s failed", fn2);
|
||||
return;
|
||||
}
|
||||
|
||||
uint64_t max_kmer = (1ULL << (HYB_KMER_LEN << 1));
|
||||
err_fwrite(kv_A(*pkArr, 0).kmer_data, 1, max_kmer * HYB_KMER_DATA_BYTES, fpk); // 小端
|
||||
int i;
|
||||
for (i = 0; i < kv_size(*pkArr); ++i) {
|
||||
err_fwrite(kv_A(*pkArr, i).idata.addr, 1, kv_A(*pkArr, i).idata.size, fpbd);
|
||||
}
|
||||
fclose(fpk);
|
||||
fclose(fpbd);
|
||||
}
|
||||
|
||||
// 构建hybrid-index入口函数
|
||||
int hyb_idx_build_and_dump(int num_threads, bwt_t* bwt, const char* idx_prefix) {
|
||||
PartKmerVec pk_arr;
|
||||
kv_init(pk_arr);
|
||||
kv_resize(PartKmer, pk_arr, num_threads);
|
||||
pk_arr.n = num_threads;
|
||||
|
||||
kvec_t(pthread_t) tid_arr;
|
||||
kv_init(tid_arr);
|
||||
kv_resize(pthread_t, tid_arr, num_threads);
|
||||
tid_arr.n = num_threads;
|
||||
|
||||
uint64_t max_kmer = (1ULL << (HYB_KMER_LEN << 1));
|
||||
uint8_t* kmer_data = (uint8_t*)malloc(max_kmer * HYB_KMER_DATA_BYTES);
|
||||
|
||||
int i;
|
||||
for (i = 0; i < num_threads; ++i) {
|
||||
PartKmer* p = &kv_A(pk_arr, i);
|
||||
p->tid = i;
|
||||
p->kmer_start = KMER_START(i, max_kmer, num_threads);
|
||||
p->kmer_end = KMER_END(i, max_kmer, num_threads);
|
||||
p->bytes = 0;
|
||||
p->kmer_data = kmer_data;
|
||||
p->prev_bd_size = 0;
|
||||
_bd_init(p->idata);
|
||||
p->bwt = bwt;
|
||||
pthread_create(&kv_A(tid_arr, i), NULL, thread_build_hyb_index, p);
|
||||
}
|
||||
|
||||
uint64_t all_bytes = 0;
|
||||
uint64_t prev_bd_size = 0;
|
||||
for (i = 0; i < num_threads; ++i) {
|
||||
pthread_join(kv_A(tid_arr, i), NULL);
|
||||
PartKmer* p = &kv_A(pk_arr, i);
|
||||
all_bytes += p->bytes;
|
||||
p->prev_bd_size = prev_bd_size;
|
||||
prev_bd_size += p->idata.size;
|
||||
}
|
||||
|
||||
for (i = 0; i < num_threads; ++i) {
|
||||
pthread_create(&kv_A(tid_arr, i), NULL, thread_adjust_hyb_index, &kv_A(pk_arr, i));
|
||||
}
|
||||
for (int i = 0; i < num_threads; ++i) {
|
||||
pthread_join(kv_A(tid_arr, i), NULL);
|
||||
}
|
||||
|
||||
fprintf(stderr, "Hybrid index total bytes: %fM\n", (double)all_bytes / 1024 / 1024);
|
||||
|
||||
double time_sava_hybrid_idx = realtime();
|
||||
save_hybrid_idx(idx_prefix, &pk_arr);
|
||||
time_sava_hybrid_idx = realtime() - time_sava_hybrid_idx;
|
||||
|
||||
fprintf(stderr, "Time save hybrid index: %fs\n", time_sava_hybrid_idx);
|
||||
|
||||
kv_destroy(pk_arr);
|
||||
kv_destroy(tid_arr);
|
||||
|
||||
return 0;
|
||||
}
|
||||
170
hyb_idx.h
170
hyb_idx.h
|
|
@ -9,3 +9,173 @@
|
|||
#include "kvec.h"
|
||||
#include "utils.h"
|
||||
|
||||
///////////////////////////////////////////
|
||||
// 宏定义
|
||||
// 数据
|
||||
#define HYB_KMER_LEN 16
|
||||
#define HYB_KMER_MASK 4294967295ULL
|
||||
#define HYB_KMER_DATA_BYTES 5 // 5bytes
|
||||
#define HYB_KMER_DATA_MASK 1099511627775ULL // 5bytes
|
||||
#define HYB_KMER_DATA_TYPE_BITS 5
|
||||
#define HYB_KMER_DATA_TYPE_MASK 31
|
||||
#define HYB_HIT_THRESH 30
|
||||
#define HYB_NODE_SA_MASK 1099511627775ULL
|
||||
#define HYB_MAX_SEQ_LEN 301
|
||||
#define HYB_LEAF_NODE 7
|
||||
|
||||
// 节点类型
|
||||
#define HYB_BP_1 0
|
||||
#define HYB_BP_2 1
|
||||
#define HYB_BP_3 2
|
||||
#define HYB_BP_PATH 3
|
||||
|
||||
static const uint32_t ga_hybHitsMask[5] = {0, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, 0xFFFFFFFF}; // hits掩码
|
||||
static const uint32_t ga_hybOffMask[5] = {0, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, 0xFFFFFFFF}; // offset掩码
|
||||
static const uint32_t ga_hybLearnedMask[5] = {0, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, 0xFFFFFFFF}; // learned counts 掩码
|
||||
|
||||
// 函数
|
||||
#define _kmer_from_pos(bit_seq, start) (((*(uint64_t*)&bit_seq[start >> 2]) >> ((start & 3) << 1)) & HYB_KMER_MASK)
|
||||
|
||||
#define _reverse_range(rs, re, seq_len, new_range) Range new_range = {seq_len - (re), seq_len - (rs), (re) - (rs)};
|
||||
|
||||
#define _rev_ref(hyb, ref_pos) ((hyb->ref_len << 1) - 1 - (ref_pos))
|
||||
///////////////////////////////////////
|
||||
// 结构体
|
||||
|
||||
// hybrid-index数据结构
|
||||
typedef struct {
|
||||
uint8_t* kmer_data; // kmer数据
|
||||
uint8_t* index_data; // 二级索引数据
|
||||
uint8_t* sa; // 后缀数组
|
||||
uint8_t* ref_bits; // 2-bit编码的ref序列
|
||||
uint64_t ref_len; // ref长度
|
||||
} HybridIndex;
|
||||
|
||||
// Read序列需要的数据
|
||||
typedef struct {
|
||||
int len; // read长度
|
||||
uint8_t* seq; // 每个碱基占一个字节(0-A, 1-C, 2-G, 3-G, 4-N)
|
||||
uint8_t* rseq; // 反向互补序列
|
||||
uint8_t* for_bits; // 正向序列 2-bit编码
|
||||
uint8_t* back_bits; // 反向互补序列 2-bit编码
|
||||
int id; // for test;
|
||||
char* seqstr;
|
||||
} ReadSeq;
|
||||
|
||||
typedef kvec_t(ReadSeq) ReadSeqArr;
|
||||
|
||||
// seeding过程生成的SMEM
|
||||
typedef struct {
|
||||
int first_len;
|
||||
int seed_start; // 左闭右开区间
|
||||
int seed_end;
|
||||
int read_start; // 左闭右开区间, 默认是[0, read_len), 如果有N, 则是[N_pos + 1, next_N_pos)
|
||||
int read_end;
|
||||
uint64_v ref_pos_arr;
|
||||
} HybSeed;
|
||||
|
||||
// smem数组
|
||||
typedef kvec_t(HybSeed) HybSeedArr;
|
||||
|
||||
// read区间,用来处理包含N的read
|
||||
// 左闭右开区间, 默认是[0, read_len), 如果有N, 则是[N_pos + 1, next_N_pos)
|
||||
typedef struct {
|
||||
int start;
|
||||
int end;
|
||||
int len;
|
||||
} Range;
|
||||
|
||||
#define _set_range(r, s, e) {(r).start = s; (r).end = e; (r).len = (r).end - (r).start;}
|
||||
|
||||
#define _range_equal(r1, r2) ((r1).start == (r2).start && (r1).end == (r2).end)
|
||||
|
||||
#define _range_cross(r1, r2) \
|
||||
(((r1).start < (r2).start && (r1).end < (r2).end) || ((r1).start > (r2).start && (r1).end > (r2).end))
|
||||
|
||||
// read区间数组
|
||||
typedef kvec_t(Range) RangeArr;
|
||||
|
||||
#define __check_add_seed(seed) \
|
||||
HybSeed* seed = kv_pushp(HybSeed, *seeds); \
|
||||
if (seeds->m > *seeds_cap) { \
|
||||
uint64_t i = *seeds_cap; \
|
||||
for (i = *seeds_cap; i < seeds->m; ++i) kv_init(seeds->a[i].ref_pos_arr); \
|
||||
*seeds_cap = seeds->m; \
|
||||
} \
|
||||
seed->ref_pos_arr.n = 0;
|
||||
|
||||
#define __add_seed_one_pos(seed_var, ref_pos, s_start, s_end) \
|
||||
__check_add_seed(seed_var); \
|
||||
kv_push(uint64_t, seed_var->ref_pos_arr, ref_pos); \
|
||||
seed_var->first_len = 0; \
|
||||
seed_var->seed_start = s_start; \
|
||||
seed_var->seed_end = s_end; \
|
||||
seed_var->read_start = read_range->start; \
|
||||
seed_var->read_end = read_range->end;
|
||||
|
||||
#define __check_add_seed_one_pos(seed_var, ref_pos, s_start, s_end) \
|
||||
if ((s_end) - (s_start) >= min_seed_len) { \
|
||||
__add_seed_one_pos(seed_var, ref_pos, s_start, s_end); \
|
||||
}
|
||||
|
||||
#define __copy_seed(src, dst) \
|
||||
do { \
|
||||
(dst).seed_start = (src).seed_start; \
|
||||
(dst).seed_end = (src).seed_end; \
|
||||
(dst).read_start = (src).read_start; \
|
||||
(dst).read_end = (src).read_end; \
|
||||
int i; \
|
||||
for (i = 0; i < (src).ref_pos_arr.n; ++i) kv_push(uint64_t, (dst).ref_pos_arr, (src).ref_pos_arr.a[i]); \
|
||||
} while (0)
|
||||
|
||||
///////////////////////////////////////////////////////
|
||||
// functions
|
||||
|
||||
// 加载hybrid index
|
||||
HybridIndex* load_hybrid_idx(const char* prefix);
|
||||
|
||||
// 创建正向反向互补bits
|
||||
void create_seq_fb_bits(uint8_t* bs, int len, uint8_t* forward_bits, uint8_t* re);
|
||||
|
||||
// 用于将seq和ref正向直接比较,返回匹配的长度
|
||||
int forward_match_len(uint8_t* seq, int64_t seq_pos, int64_t seq_end, uint8_t* ref, int64_t ref_pos, int64_t ref_len);
|
||||
|
||||
// 用于将seq和ref反向直接比较,返回匹配的长度
|
||||
int backward_match_len(uint8_t* seq, int64_t seq_pos, int64_t seq_start, uint8_t* ref, int64_t ref_pos);
|
||||
|
||||
// 根据sa的行获取对应的ref position(小端模式)
|
||||
uint64_t hyb_sa_to_ref_pos(uint8_t* sa_arr, uint64_t row);
|
||||
|
||||
// 解析第一个节点, 返回后续对应的节点地址
|
||||
uint8_t* parse_first_hyb_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p,
|
||||
uint64_t* sa_start_p, uint32_t* hits_p, uint8_t* cmp_ref, int tid);
|
||||
|
||||
// 解析后续的正常节点
|
||||
uint8_t* parse_one_hyb_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p,
|
||||
uint64_t* sa_start_p, uint32_t* hits_p, uint8_t* cmp_ref, int tid);
|
||||
void parse_one_hyb_node_min_hits(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int min_hits,
|
||||
int is_head, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, int tid);
|
||||
void parse_one_hyb_node_max_hits(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int max_hits, int min_bp,
|
||||
int is_head, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, int tid);
|
||||
void get_leaf_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p, uint32_t* hits_p,
|
||||
uint64_t* sa_start_p, uint8_t* cmp_ref, int tid);
|
||||
void get_kmer_data(const HybridIndex* hyb, uint8_t* seq_bits, int kmer_pos, uint8_t* type_hits, uint64_t* offset);
|
||||
|
||||
void right_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits,
|
||||
int kmer_start, int init_match_len, uint64_t ref_pos, int* right_match);
|
||||
void left_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits,
|
||||
int kmer_start, int init_match_len, uint64_t ref_pos, int* left_match);
|
||||
void both_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits,
|
||||
int kmer_start, int init_match_len, uint64_t ref_pos, int* left_match, int* right_match);
|
||||
|
||||
// 用hybrid-index来寻找smem(seeding-1 & seeding-2)
|
||||
void hyb_first_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const int min_seed_len,
|
||||
HybSeedArr* seeds, int tid);
|
||||
|
||||
int hyb_second_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, int seed_start, int seed_end, int read_start,
|
||||
int read_end, uint64_t first_ref, int min_hits, int pre_pivot, int pre_start, int pre_end,
|
||||
const int min_seed_len, HybSeedArr* seeds, int tid);
|
||||
|
||||
// for seeding-3
|
||||
void hyb_third_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* seeds_range,
|
||||
const int min_seed_len, const int max_hits, HybSeedArr* seeds, int tid);
|
||||
|
|
|
|||
|
|
@ -12,9 +12,10 @@ Date : 2024/04/06
|
|||
#include "profiling.h"
|
||||
#include "debug.h"
|
||||
|
||||
uint64_t proc_freq = 1000;
|
||||
|
||||
#ifdef SHOW_PERF
|
||||
uint64_t tprof[LIM_THREAD_PROF_TYPE][LIM_THREAD] = {0};
|
||||
uint64_t proc_freq = 1000;
|
||||
uint64_t gprof[LIM_GLOBAL_PROF_TYPE] = {0};
|
||||
#endif
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue