hyb-align/hyb_utils.c

820 lines
49 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include <assert.h>
#include <emmintrin.h>
#include <sys/stat.h>
#include "hyb_idx.h"
#include "share_mem.h"
#include "utils.h"
/////////////////////////////////////////////////////
// 使用hybrid-index的工具函数
/*
// 加载hybrid index
HybridIndex* load_hybrid_idx(const char* prefix) {
HybridIndex* hyb = NULL;
hyb = (HybridIndex*)calloc(1, sizeof(HybridIndex));
// return hyb;
int prefix_len = strlen(prefix);
char* fn = (char*)malloc(prefix_len + 30);
FILE* fp = NULL;
struct stat st;
#define __load_hybrid_idx_code(suffix, data) \
sprintf(fn, "%s%s", prefix, suffix); \
err_check_true(stat(fn, &st), 0, fn); \
fp = xopen(fn, "r"); \
data = (uint8_t*)malloc(st.st_size); \
err_fread_noeof(data, 1, st.st_size, fp); \
err_fclose(fp);
// load ref-len
sprintf(fn, "%s.ref-len", prefix);
// fprintf(stderr, "fn: %s\n", fn);
fp = xopen(fn, "r");
err_check_false(fscanf(fp, "%ld", &hyb->ref_len), EOF);
err_fclose(fp);
// fprintf(stderr, "ref-len: %ld\n", hyb->ref_len);
char* kmer_suffix = ".hybrid.kmer";
char* data_suffix = ".hybrid.data";
// char *kmer_suffix = ".hybrid.learned.kmer";
// char *data_suffix = ".hybrid.learned.data";
#if 0
// shm_clear_hyb();
// load 2-bit ref
__load_hybrid_idx_code(".hybrid.pac", hyb->ref_bits);
// load hyb byte-sa
__load_hybrid_idx_code(".hybrid.sa", hyb->sa);
// load hyb kmer data
__load_hybrid_idx_code(kmer_suffix, hyb->kmer_data);
// load hyb index data
__load_hybrid_idx_code(data_suffix, hyb->index_data);
#else
shm_keep_hyb(prefix);
// load 2-bit ref
hyb->ref_bits = (uint8_t*)shm_get_index(strcat(strcpy(fn, prefix), ".hybrid.pac"));
// load hyb byte-sa
hyb->sa = (uint8_t*)shm_get_index(strcat(strcpy(fn, prefix), ".hybrid.sa"));
// load hyb kmer data
hyb->kmer_data = (uint8_t*)shm_get_index(strcat(strcpy(fn, prefix), kmer_suffix));
// load hyb index data
hyb->index_data = (uint8_t*)shm_get_index(strcat(strcpy(fn, prefix), data_suffix));
#endif
// fprintf(stderr, "文件大小为: %ld 字节, %.2f GB\n", st.st_size, (double)st.st_size / (1024 * 1024 * 1024));
return hyb;
}
*/
// 创建正向反向互补bits
void create_seq_fb_bits(uint8_t* bs, int len, uint8_t* fs, uint8_t* rs) {
if (len > 0) {
uint8_t fbp = 0, rbp = 0;
int i = 0, j = 0, idxf = 0, idxr = 0;
for (; i + 3 < len; i += 4) {
fbp = (bs[i] & 3) | (bs[i + 1] & 3) << 2 | (bs[i + 2] & 3) << 4 | (bs[i + 3] & 3) << 6;
rbp = (3 - (bs[len - i - 1] & 3)) | (3 - (bs[len - i - 2] & 3)) << 2 | (3 - (bs[len - i - 3] & 3)) << 4 |
(3 - (bs[len - i - 4] & 3)) << 6;
fs[idxf++] = fbp;
rs[idxr++] = rbp;
}
fbp = 0;
rbp = 0;
for (; i < len; ++i, ++j) {
fbp |= (bs[i] & 3) << j * 2;
rbp |= (3 - (bs[len - i - 1] & 3)) << j * 2;
}
fs[idxf++] = fbp;
rs[idxr++] = rbp;
}
}
// 将seq和ref正向比对看最多有多少匹配的bpseq和ref都是2-bit编码的
inline int forward_match_len(uint8_t* seq, int64_t seq_pos, int64_t seq_end, uint8_t* ref, int64_t ref_pos, int64_t ref_len) {
if (seq_pos >= seq_end)
return 0;
int64_t max_match_len = MIN(ref_len - ref_pos, seq_end - seq_pos);
int ref_odd = ref_pos & 3;
int seq_odd = seq_pos & 3;
int64_t i = seq_pos;
int64_t j = ref_pos;
int match_len = 0;
/////////////
#define __forward_match_code(first_len, first_ref, first_seq, ref_bits, seq_bits) \
uint64_t bp32ref = first_ref; \
uint64_t bp32seq = first_seq; \
uint64_t cmp = bp32ref ^ bp32seq; \
if (cmp > 0) \
return MIN(__builtin_ctzll(cmp) >> 1, max_match_len); \
int first_cmp_len = first_len; \
match_len = MIN(first_cmp_len, max_match_len); \
i += first_cmp_len; \
j += first_cmp_len; \
seq_odd = i & 3; \
ref_odd = j & 3; \
for (; i + 31 < seq_end; i += 32, j += 32, match_len += 32) { \
bp32ref = ref_bits; \
bp32seq = seq_bits; \
cmp = bp32ref ^ bp32seq; \
if (cmp > 0) \
return MIN(match_len + (__builtin_ctzll(cmp) >> 1), max_match_len); \
} \
if (i < seq_end) { \
bp32ref = ref_bits; \
bp32seq = seq_bits; \
cmp = bp32ref ^ bp32seq; \
if (cmp > 0) \
return MIN(match_len + MIN(__builtin_ctzll(cmp) >> 1, seq_end - i), max_match_len); \
match_len = max_match_len; /*match_len += seq_end - i;*/ \
}
/////////
if (seq_odd < ref_odd) { // 调整到ref的整数字节
__forward_match_code(32 - ref_odd, (*(uint64_t*)&ref[j >> 2]) >> (ref_odd << 1),
(*(uint64_t*)&seq[i >> 2]) << ((ref_odd - seq_odd) << 1) >> (ref_odd << 1),
(*(uint64_t*)&ref[j >> 2]),
seq[i >> 2] >> (seq_odd << 1) | (*(uint64_t*)&seq[(i >> 2) + 1]) << ((4 - seq_odd) << 1));
} else if (seq_odd > ref_odd) { // 调整到seq的整数字节
__forward_match_code(32 - seq_odd, (*(uint64_t*)&ref[j >> 2]) << ((seq_odd - ref_odd) << 1) >> (seq_odd << 1),
(*(uint64_t*)&seq[i >> 2]) >> (seq_odd << 1),
ref[j >> 2] >> (ref_odd << 1) | (*(uint64_t*)&ref[(j >> 2) + 1]) << ((4 - ref_odd) << 1),
(*(uint64_t*)&seq[i >> 2]));
} else { // 可以调整到相同的64位地址进行比较了
__forward_match_code(32 - seq_odd, (*(uint64_t*)&ref[j >> 2]) >> (seq_odd << 1),
(*(uint64_t*)&seq[i >> 2]) >> (seq_odd << 1), (*(uint64_t*)&ref[j >> 2]),
(*(uint64_t*)&seq[i >> 2]));
}
return MIN(match_len, max_match_len);
}
// 将seq和ref反向比对看最多有多少匹配的bp
inline int backward_match_len(uint8_t* seq, int64_t seq_pos, int64_t seq_start, uint8_t* ref, int64_t ref_pos) {
if (seq_pos < seq_start)
return 0;
int64_t max_match_len = MIN(ref_pos + 1, seq_pos - seq_start + 1);
int64_t i = seq_pos;
int64_t j = ref_pos;
int seq_odd = 3 - (i & 3);
int ref_odd = 3 - (j & 3);
int match_len = 0;
/////////////
#define __backward_tail_code(last_code) \
int ext_bp = (7 - (i >> 2)) << 2; \
uint64_t bp32ref = *(uint64_t*)(ref + (j >> 2) - 8) >> ((4 - ref_odd) << 1) | (uint64_t)ref[j >> 2] \
<< ((ref_odd + 28) << 1); \
uint64_t bp32seq = (*(uint64_t*)seq) << ((seq_odd + ext_bp) << 1); \
uint64_t cmp = bp32ref ^ bp32seq; \
if (cmp > 0) \
return MIN(match_len + MIN(__builtin_clzll(cmp) >> 1, (int)i + 1 - seq_start), max_match_len); \
last_code
#define __backward_match_code(first_len, first_ref, first_seq, ref_bits, seq_bits) \
uint64_t bp32ref = first_ref; \
uint64_t bp32seq = first_seq; \
uint64_t cmp = bp32ref ^ bp32seq; \
if (cmp > 0) \
return MIN(MIN(__builtin_clzll(cmp) >> 1, (int)i + 1 - seq_start), max_match_len); \
int first_cmp_len = first_len; \
match_len = MIN(first_cmp_len, max_match_len); \
i -= first_cmp_len; \
j -= first_cmp_len; \
seq_odd = 3 - (i & 3); \
ref_odd = 3 - (j & 3); \
for (; i - 31 >= 0; i -= 32, j -= 32, match_len += 32) { \
bp32ref = ref_bits; \
bp32seq = seq_bits; \
cmp = bp32ref ^ bp32seq; \
if (cmp > 0) \
return MIN(match_len + (__builtin_clzll(cmp) >> 1), max_match_len); \
} \
if (i >= seq_start) { \
__backward_tail_code(match_len = max_match_len); \
}
////////////
if (i < 32) { // 只需要一次比较
__backward_tail_code(return max_match_len);
}
if (seq_odd < ref_odd) { // 调整到ref的整数字节
__backward_match_code(
32 - ref_odd, (*(uint64_t*)&ref[(j >> 2) - 7]) << (ref_odd << 1),
(*(uint64_t*)&seq[(i >> 2) - 7]) >> ((ref_odd - seq_odd) << 1) << (ref_odd << 1), (*(uint64_t*)&ref[(j >> 2) - 7]),
(*(uint64_t*)&seq[(i >> 2) - 8] >> ((4 - seq_odd) << 1)) | ((uint64_t)seq[(i >> 2)] << ((seq_odd + 28) << 1)));
} else if (seq_odd > ref_odd) { // 调整到seq的整数字节
__backward_match_code(
32 - seq_odd, (*(uint64_t*)&ref[(j >> 2) - 7]) >> ((seq_odd - ref_odd) << 1) << (seq_odd << 1),
(*(uint64_t*)&seq[(i >> 2) - 7]) << (seq_odd << 1),
(*(uint64_t*)&ref[(j >> 2) - 8] >> ((4 - ref_odd) << 1)) | ((uint64_t)ref[(j >> 2)] << ((ref_odd + 28) << 1)),
(*(uint64_t*)&seq[(i >> 2) - 7]));
} else { // 可以调整到相同的64位地址进行比较了
__backward_match_code(32 - seq_odd, (*(uint64_t*)&ref[(j >> 2) - 7]) << (seq_odd << 1),
(*(uint64_t*)&seq[(i >> 2) - 7]) << (seq_odd << 1), (*(uint64_t*)&ref[(j >> 2) - 7]),
(*(uint64_t*)&seq[(i >> 2) - 7]));
}
return MIN(match_len, max_match_len);
}
// 根据sa的行获取对应的ref position小端模式
uint64_t hyb_sa_to_ref_pos(uint8_t* sa_arr, uint64_t row) {
const uint64_t start_byte = ((row << 5) + row) >> 3; // 存储这个sa数据的起始字节
uint64_t val = *(uint64_t*)(sa_arr + start_byte);
val = (val >> (row & 7)) & 8589934591ULL; // 33-bits mask
return val;
}
#define __parse_node_start_no_addr(idata) \
*cmp_ref = 1; \
uint32_t seq_pos = *seq_pos_p; \
uint8_t header = *idata; \
idata += 1; \
uint8_t node_type = (header >> 6) & 3; \
uint8_t hits_neq = header >> 5 & 1; \
uint32_t hits_bytes = ((header >> 3) & 3) + 1; \
uint32_t off_bytes = header & 7; \
uint32_t child_ptr_bytes = hits_bytes + off_bytes;
// 解析hyb node初始化变量信息
#define __parse_node_start_code(idata) \
uint8_t* addr = NULL; \
__parse_node_start_no_addr(idata)
// 解析单一碱基序列节点
#define __parse_path_node_code(path_len) \
uint32_t path_len = (header & 1); \
path_len = path_len << 8 | *idata; \
idata += 1; \
int match_len = forward_match_len(seq_bits, seq_pos, seq_end, idata, 0, path_len); \
*seq_pos_p = seq_pos + match_len; \
if (match_len == (int)path_len) { \
addr = idata + (((path_len << 1) + 7) >> 3); \
if (hits_neq) { \
*sa_start_p += 1; \
*hits_p -= 1; \
} \
} else \
*cmp_ref = 0;
// 解析正常hyb节点
#define __parse_child_node_code(kmer_len, mark_bytes, int_type, kmer_code, bits_count, one) \
uint8_t kmer = kmer_code; \
int_type mark = *(int_type*)idata; \
int_type child_num = mark & (one << kmer); \
if (child_num) { \
*seq_pos_p += kmer_len; \
uint32_t nth_child = bits_count(mark & ((one << kmer) - 1)); \
uint8_t has_next_child = bits_count(mark >> kmer >> 1); \
if (*seq_pos_p >= HYB_MAX_SEQ_LEN) { \
*cmp_ref = 0; \
} \
if (off_bytes == HYB_LEAF_NODE) { \
*hits_p -= nth_child + hits_neq + has_next_child; \
*sa_start_p += nth_child + hits_neq; \
} else { \
if (nth_child == 0) { \
idata += mark_bytes; \
uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes]; \
addr = idata + has_next_child * child_ptr_bytes; \
*hits_p = hits_start - hits_neq; \
*sa_start_p += hits_neq; \
} else { \
idata += mark_bytes + (nth_child - 1) * child_ptr_bytes; \
uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes]; \
uint32_t child_offset = *(uint32_t*)(idata + hits_bytes) & ga_hybOffMask[off_bytes]; \
addr = idata + child_offset + (has_next_child + 1) * child_ptr_bytes; \
if (has_next_child) { \
*hits_p = (*(uint32_t*)(idata + child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) - hits_start; \
} else { \
*hits_p -= hits_start + hits_neq; \
} \
*sa_start_p += hits_start; \
} \
} \
} else { \
*cmp_ref = 0; \
}
// 当节点不能完全匹配时候,检查是否能匹配该节点包含的部分碱基序列
#define __parse_part_node_code(kmer_len, mark_bytes, int_type, kmer_base_code, bits_range, bits_count, one) \
uint8_t kmer_base = kmer_base_code; \
int_type mark = *(int_type*)idata; \
int_type kmer_mask = ((one << bits_range) - 1) << kmer_base; \
int_type child_num = mark & kmer_mask; \
if (child_num) { \
*seq_pos_p += kmer_len; \
int_type kmer_pre_mask = (one << kmer_base) - 1; \
uint32_t nth_child = bits_count(mark & kmer_pre_mask); \
uint8_t has_next_child = bits_count(mark >> kmer_base >> bits_range); \
if (off_bytes == HYB_LEAF_NODE) { \
*hits_p -= nth_child + hits_neq + has_next_child; \
*sa_start_p += nth_child + hits_neq; \
} else { \
if (nth_child == 0) { \
idata += mark_bytes; \
uint32_t hits_start = hits_neq; \
if (has_next_child) { \
child_num = bits_count(child_num); \
*hits_p = \
(*(uint32_t*)(idata + (child_num - 1) * child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) - hits_start; \
} else { \
*hits_p -= hits_start; \
} \
*sa_start_p += hits_start; \
} else { \
idata += mark_bytes + (nth_child - 1) * child_ptr_bytes; \
uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes]; \
*sa_start_p += hits_start; \
if (has_next_child) { \
child_num = bits_count(child_num); \
*hits_p = (*(uint32_t*)(idata + child_num * child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) - hits_start; \
} else { \
*hits_p -= hits_start + hits_neq; \
} \
} \
} \
} \
*cmp_ref = 0;
// 解析节点主要代码
#define __parse_hyb_node_code(return_code) \
if (node_type == HYB_BP_PATH) { \
__parse_path_node_code(path_len); \
} else if (node_type == HYB_BP_1) { \
__parse_child_node_code(1, 1, uint8_t, seq_bp[seq_pos], __builtin_popcount, 1); \
} else if (node_type == HYB_BP_2) { \
if (seq_pos + 1 < seq_end) { \
__parse_child_node_code(2, 2, uint16_t, seq_bp[seq_pos] << 2 | seq_bp[seq_pos + 1], __builtin_popcount, 1); \
if (!child_num) { \
__parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1); \
} \
} else { \
__parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1); \
} \
} else { \
if (seq_pos + 2 < seq_end) { \
__parse_child_node_code(3, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2 | seq_bp[seq_pos + 2], \
__builtin_popcountll, 1ULL); \
if (!child_num) { \
__parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, \
__builtin_popcountll, 1ULL); \
if (!child_num) { \
__parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL); \
} \
} \
} else if (seq_pos + 1 < seq_end) { \
__parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, __builtin_popcountll, \
1ULL); \
if (!child_num) { \
__parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL); \
} \
} else { \
__parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL); \
} \
} \
/*__builtin_prefetch(addr, 0, 3); */ \
return_code
/////////
// 解析第一个节点, 返回后续对应的节点地址
uint8_t* parse_first_hyb_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p,
uint64_t* sa_start_p, uint32_t* hits_p, uint8_t* cmp_ref, int tid) {
if (*seq_pos_p == seq_end)
return NULL;
__parse_node_start_code(idata);
*sa_start_p = (*(uint64_t*)idata) & HYB_NODE_SA_MASK;
idata += 5;
if (*hits_p > HYB_HIT_THRESH) { // 更新hits
*hits_p = *((uint32_t*)idata) & ga_hybHitsMask[hits_bytes]; // hits数量
idata += hits_bytes;
}
// __parse_hyb_node_code(return addr);
if (node_type == HYB_BP_PATH) {
__parse_path_node_code(path_len);
} else if (node_type == HYB_BP_1) {
__parse_child_node_code(1, 1, uint8_t, seq_bp[seq_pos], __builtin_popcount, 1);
} else if (node_type == HYB_BP_2) {
if (seq_pos + 1 < seq_end) {
__parse_child_node_code(2, 2, uint16_t, seq_bp[seq_pos] << 2 | seq_bp[seq_pos + 1], __builtin_popcount, 1);
if (!child_num) {
__parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1);
}
} else {
__parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1);
}
} else {
if (seq_pos + 2 < seq_end) {
//__parse_child_node_code(3, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2 | seq_bp[seq_pos + 2],
// __builtin_popcountll, 1ULL);
uint8_t kmer = seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2 | seq_bp[seq_pos + 2];
uint64_t mark = *(uint64_t*)idata;
uint64_t child_num = mark & (1ULL << kmer);
if (child_num) {
*seq_pos_p += 3;
uint32_t nth_child = __builtin_popcountll(mark & ((1ULL << kmer) - 1));
uint8_t has_next_child = __builtin_popcountll(mark >> kmer >> 1);
if (*seq_pos_p >= HYB_MAX_SEQ_LEN) {
*cmp_ref = 0;
}
if (off_bytes == HYB_LEAF_NODE) {
*hits_p -= nth_child + hits_neq + has_next_child;
*sa_start_p += nth_child + hits_neq;
} else {
if (nth_child == 0) {
idata += 8;
uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes];
addr = idata + has_next_child * child_ptr_bytes;
*hits_p = hits_start - hits_neq;
*sa_start_p += hits_neq;
} else {
idata += 8 + (nth_child - 1) * child_ptr_bytes;
uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes];
uint32_t child_offset = *(uint32_t*)(idata + hits_bytes) & ga_hybOffMask[off_bytes];
addr = idata + child_offset + (has_next_child + 1) * child_ptr_bytes;
if (has_next_child) {
*hits_p = (*(uint32_t*)(idata + child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) - hits_start;
} else {
*hits_p -= hits_start + hits_neq;
}
*sa_start_p += hits_start;
}
}
} else {
*cmp_ref = 0;
}
if (!child_num) {
//__parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4,
//__builtin_popcountll,
// 1ULL);
uint8_t kmer_base = seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2;
uint64_t mark = *(uint64_t*)idata;
uint64_t kmer_mask = ((1ULL << 4) - 1) << kmer_base;
uint64_t child_num = mark & kmer_mask;
if (child_num) {
*seq_pos_p += 2;
uint64_t kmer_pre_mask = (1ULL << kmer_base) - 1;
uint32_t nth_child = __builtin_popcountll(mark & kmer_pre_mask);
uint8_t has_next_child = __builtin_popcountll(mark >> kmer_base >> 4);
if (off_bytes == HYB_LEAF_NODE) {
*hits_p -= nth_child + hits_neq + has_next_child;
*sa_start_p += nth_child + hits_neq;
} else {
if (nth_child == 0) {
idata += 8;
uint32_t hits_start = hits_neq;
if (has_next_child) {
child_num = __builtin_popcountll(child_num);
*hits_p =
(*(uint32_t*)(idata + (child_num - 1) * child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) -
hits_start;
} else {
*hits_p -= hits_start;
}
*sa_start_p += hits_start;
} else {
idata += 8 + (nth_child - 1) * child_ptr_bytes;
uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes];
*sa_start_p += hits_start;
if (has_next_child) {
child_num = __builtin_popcountll(child_num);
*hits_p = (*(uint32_t*)(idata + child_num * child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) -
hits_start;
} else {
*hits_p -= hits_start + hits_neq;
}
}
}
}
*cmp_ref = 0;
if (!child_num) {
// __parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL);
uint8_t kmer_base = seq_bp[seq_pos] << 4;
uint64_t mark = *(uint64_t*)idata;
uint64_t kmer_mask = ((1ULL << 16) - 1) << kmer_base;
uint64_t child_num = mark & kmer_mask;
if (child_num) {
*seq_pos_p += 1;
uint64_t kmer_pre_mask = (1ULL << kmer_base) - 1;
uint32_t nth_child = __builtin_popcountll(mark & kmer_pre_mask);
uint8_t has_next_child = __builtin_popcountll(mark >> kmer_base >> 16);
if (off_bytes == HYB_LEAF_NODE) {
*hits_p -= nth_child + hits_neq + has_next_child;
*sa_start_p += nth_child + hits_neq;
} else {
if (nth_child == 0) {
idata += 8;
uint32_t hits_start = hits_neq;
if (has_next_child) {
child_num = __builtin_popcountll(child_num);
*hits_p =
(*(uint32_t*)(idata + (child_num - 1) * child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) -
hits_start;
} else {
*hits_p -= hits_start;
}
*sa_start_p += hits_start;
} else {
idata += 8 + (nth_child - 1) * child_ptr_bytes;
uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes];
*sa_start_p += hits_start;
if (has_next_child) {
child_num = __builtin_popcountll(child_num);
*hits_p = (*(uint32_t*)(idata + child_num * child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) -
hits_start;
} else {
*hits_p -= hits_start + hits_neq;
}
}
}
}
*cmp_ref = 0;
}
}
} else if (seq_pos + 1 < seq_end) {
__parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, __builtin_popcountll,
1ULL);
if (!child_num) {
__parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL);
}
} else {
__parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL);
}
}
return addr;
}
// 解析后续的正常节点
uint8_t* parse_one_hyb_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p,
uint64_t* sa_start_p, uint32_t* hits_p, uint8_t* cmp_ref, int tid) {
if (*seq_pos_p == seq_end)
return NULL;
__parse_node_start_code(idata);
// __parse_hyb_node_code(return addr);
if (node_type == HYB_BP_PATH) {
__parse_path_node_code(path_len);
} else if (node_type == HYB_BP_1) {
__parse_child_node_code(1, 1, uint8_t, seq_bp[seq_pos], __builtin_popcount, 1);
} else if (node_type == HYB_BP_2) {
if (seq_pos + 1 < seq_end) {
__parse_child_node_code(2, 2, uint16_t, seq_bp[seq_pos] << 2 | seq_bp[seq_pos + 1], __builtin_popcount, 1);
if (!child_num) {
__parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1);
}
} else {
__parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1);
}
} else {
if (seq_pos + 2 < seq_end) {
//__parse_child_node_code(3, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2 | seq_bp[seq_pos + 2],
// __builtin_popcountll, 1ULL);
uint8_t kmer = seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2 | seq_bp[seq_pos + 2];
uint64_t mark = *(uint64_t*)idata;
uint64_t child_num = mark & (1ULL << kmer);
if (child_num) {
*seq_pos_p += 3;
uint32_t nth_child = __builtin_popcountll(mark & ((1ULL << kmer) - 1));
uint8_t has_next_child = __builtin_popcountll(mark >> kmer >> 1);
if (*seq_pos_p >= HYB_MAX_SEQ_LEN) {
*cmp_ref = 0;
}
if (off_bytes == HYB_LEAF_NODE) {
*hits_p -= nth_child + hits_neq + has_next_child;
*sa_start_p += nth_child + hits_neq;
} else {
if (nth_child == 0) {
idata += 8;
uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes];
addr = idata + has_next_child * child_ptr_bytes;
*hits_p = hits_start - hits_neq;
*sa_start_p += hits_neq;
} else {
idata += 8 + (nth_child - 1) * child_ptr_bytes;
uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes];
uint32_t child_offset = *(uint32_t*)(idata + hits_bytes) & ga_hybOffMask[off_bytes];
addr = idata + child_offset + (has_next_child + 1) * child_ptr_bytes;
if (has_next_child) {
*hits_p = (*(uint32_t*)(idata + child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) - hits_start;
} else {
*hits_p -= hits_start + hits_neq;
}
*sa_start_p += hits_start;
}
}
} else {
*cmp_ref = 0;
}
if (!child_num) {
__parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, __builtin_popcountll,
1ULL);
if (!child_num) {
__parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL);
}
}
} else if (seq_pos + 1 < seq_end) {
__parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, __builtin_popcountll,
1ULL);
if (!child_num) {
__parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL);
}
} else {
__parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL);
}
}
return addr;
}
void parse_one_hyb_node_min_hits(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int min_hits,
int is_head, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, int tid) {
if (*seq_pos_p == seq_end)
return;
uint8_t cmp_ref_val = 0;
uint8_t* cmp_ref = &cmp_ref_val;
__parse_node_start_no_addr(idata);
if (is_head) {
*sa_start_p = (*(uint64_t*)idata) & HYB_NODE_SA_MASK;
idata += 5;
if (*hits_p > HYB_HIT_THRESH) { // 更新hits
*hits_p = *((uint32_t*)idata) & ga_hybHitsMask[hits_bytes]; // hits数量
idata += hits_bytes;
}
}
uint8_t* prev_idata = idata;
uint32_t prev_seq_pos = *seq_pos_p;
uint32_t prev_hits = *hits_p;
uint64_t prev_sa_start = *sa_start_p;
if (node_type == HYB_BP_2) {
__parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1);
if (*hits_p < min_hits) {
*seq_pos_p = prev_seq_pos;
*hits_p = prev_hits;
*sa_start_p = prev_sa_start;
}
} else if (node_type == HYB_BP_3) {
__parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL);
if (*hits_p < min_hits) {
*seq_pos_p = prev_seq_pos;
*hits_p = prev_hits;
*sa_start_p = prev_sa_start;
} else if (seq_pos + 1 < seq_end) {
uint32_t pp_seq_pos = prev_seq_pos;
uint32_t pp_hits = prev_hits;
uint64_t pp_sa_start = prev_sa_start;
prev_seq_pos = *seq_pos_p;
prev_hits = *hits_p;
prev_sa_start = *sa_start_p;
*seq_pos_p = pp_seq_pos;
*hits_p = pp_hits;
*sa_start_p = pp_sa_start;
idata = prev_idata; // 恢复到上一个节点
__parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, __builtin_popcountll,
1ULL);
if (*hits_p < min_hits) {
*seq_pos_p = prev_seq_pos;
*hits_p = prev_hits;
*sa_start_p = prev_sa_start;
}
}
}
}
void parse_one_hyb_node_max_hits(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int max_hits, int min_bp,
int is_head, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, int tid) {
if (*seq_pos_p == seq_end)
return;
uint8_t cmp_ref_val = 0;
uint8_t* cmp_ref = &cmp_ref_val;
__parse_node_start_no_addr(idata);
if (is_head) {
*sa_start_p = (*(uint64_t*)idata) & HYB_NODE_SA_MASK;
idata += 5;
if (*hits_p > HYB_HIT_THRESH) { // 更新hits
*hits_p = *((uint32_t*)idata) & ga_hybHitsMask[hits_bytes]; // hits数量
idata += hits_bytes;
}
}
uint8_t* prev_idata = idata;
uint32_t prev_seq_pos = *seq_pos_p;
uint32_t prev_hits = *hits_p;
uint64_t prev_sa_start = *sa_start_p;
if (node_type == HYB_BP_2) {
__parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1);
} else if (node_type == HYB_BP_3) {
if (min_bp == 2) {
__parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, __builtin_popcountll,
1ULL);
} else {
__parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL);
if (*hits_p >= max_hits) {
*seq_pos_p = prev_seq_pos;
*hits_p = prev_hits;
*sa_start_p = prev_sa_start;
idata = prev_idata; // 恢复到上一个节点
__parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, __builtin_popcountll,
1ULL);
}
}
} else { // path node
if (min_bp > 0)
*seq_pos_p += min_bp;
}
}
// 需要给定初始化的hits和seq_pos
#define CALC_STAT 0
void get_leaf_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p, uint32_t* hits_p,
uint64_t* sa_start_p, uint8_t* cmp_ref, int tid) {
uint8_t* next_addr = parse_first_hyb_node(idata, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid);
#if CALC_STAT
uint8_t* prev_addr = idata;
if (next_addr != NULL) {
// fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr);
uint64_t dist = next_addr - prev_addr;
if (dist < 32)
gdat[0]++;
else if (dist < 64)
gdat[1]++;
else if (dist < 128)
gdat[2]++;
else
gdat[3]++;
}
#endif
while (next_addr != NULL && *hits_p > 1) {
#if CALC_STAT
prev_addr = next_addr;
#endif
next_addr = parse_one_hyb_node(next_addr, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid);
#if CALC_STAT
if (next_addr != NULL) {
// fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr);
uint64_t dist = next_addr - prev_addr;
if (dist < 32)
gdat[0]++;
else if (dist < 64)
gdat[1]++;
else if (dist < 128)
gdat[2]++;
else
gdat[3]++;
}
#endif
}
}
void get_kmer_data(const HybridIndex* hyb, uint8_t* seq_bits, int kmer_pos, uint8_t* type_hits, uint64_t* offset) {
uint64_t kmer = _kmer_from_pos(seq_bits, kmer_pos);
uint8_t* kmer_data_addr = hyb->kmer_data + kmer * HYB_KMER_DATA_BYTES;
*type_hits = *kmer_data_addr & HYB_KMER_DATA_TYPE_MASK;
*offset = (*(uint64_t*)kmer_data_addr & HYB_KMER_DATA_MASK) >> HYB_KMER_DATA_TYPE_BITS;
}
void right_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits,
int kmer_start, int init_match_len, uint64_t ref_pos, int* right_match) {
if (ref_pos < hyb->ref_len) {
*right_match = forward_match_len(for_bits, kmer_start + init_match_len, read_range->end, hyb->ref_bits,
ref_pos + init_match_len, hyb->ref_len);
} else {
ref_pos = (hyb->ref_len << 1) - 1 - ref_pos;
*right_match = backward_match_len(back_bits, seq_len - kmer_start - init_match_len - 1, seq_len - read_range->end,
hyb->ref_bits, ref_pos - init_match_len);
}
*right_match += init_match_len; // 包括kmer的长度
}
void left_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits,
int kmer_start, int init_match_len, uint64_t ref_pos, int* left_match) {
if (ref_pos < hyb->ref_len) {
*left_match = backward_match_len(for_bits, kmer_start - 1, read_range->start, hyb->ref_bits, ref_pos - 1);
} else {
ref_pos = (hyb->ref_len << 1) - 1 - ref_pos;
*left_match = forward_match_len(back_bits, seq_len - kmer_start, seq_len - read_range->start, hyb->ref_bits,
ref_pos + 1, hyb->ref_len);
}
}
void both_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits,
int kmer_start, int init_match_len, uint64_t ref_pos, int* left_match, int* right_match) {
if (ref_pos < hyb->ref_len) {
*right_match = forward_match_len(for_bits, kmer_start + init_match_len, read_range->end, hyb->ref_bits,
ref_pos + init_match_len, hyb->ref_len);
*left_match = backward_match_len(for_bits, kmer_start - 1, read_range->start, hyb->ref_bits, ref_pos - 1);
} else {
ref_pos = (hyb->ref_len << 1) - 1 - ref_pos;
*right_match = backward_match_len(back_bits, seq_len - kmer_start - init_match_len - 1, seq_len - read_range->end,
hyb->ref_bits, ref_pos - init_match_len);
*left_match = forward_match_len(back_bits, seq_len - kmer_start, seq_len - read_range->start, hyb->ref_bits,
ref_pos + 1, hyb->ref_len);
}
*right_match += init_match_len; // 包括kmer的长度
}