sw benchmark,对extend部分做了全部的优化,去除target倒序,减少FIND_MAX次数

This commit is contained in:
zzh 2024-04-11 13:29:28 +08:00
parent d16f8487ce
commit 845a1156b8
36 changed files with 2484 additions and 1200 deletions

View File

@ -16,6 +16,13 @@
"initializer_list": "c",
"__hash_table": "c",
"ios": "c",
"iterator": "c"
"iterator": "c",
"compare": "c",
"tuple": "c",
"type_traits": "c",
"utility": "c",
"byte_alloc.h": "c",
"debug.h": "c",
"limits": "c"
}
}

View File

@ -8,12 +8,9 @@ PROG2= get_line
INCLUDES=
LIBS=
SUBDIRS= .
OBJS= normal.o \
normal_pruning.o \
avx2_u8.o \
avx2_u8_pruning.o \
thread_mem.o \
utils.o
SRCS= $(wildcard *.c)
EXCLUDES= get_line.c
OBJS= $(patsubst %.c, %.o, $(filter-out $(EXCLUDES), $(SRCS)))
ifeq ($(shell uname -s),Linux)
@ -27,16 +24,14 @@ endif
all:$(PROG)
$(PROG):$(OBJS) main.o
$(CC) $(CFLAGS) $(LDFLAGS) $(OBJS) main.o -o $@ -L. $(LIBS)
$(PROG):$(OBJS)
$(CC) $(CFLAGS) $(LDFLAGS) $(OBJS) -o $@ -L. $(LIBS)
$(PROG2): get_line.o
$(CC) $(CFLAGS) $(LDFLAGS) get_line.o -o $@ -L. $(LIBS)
clean:
rm -f *.o a.out $(PROG) *~ *.a
rm -f *.o a.out $(PROG) $(PROG2) *~ *.a
depend:
( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) $(CPPFLAGS) -- *.c )
# DO NOT DELETE THIS LINE -- make depend depends on it.
( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) $(CPPFLAGS) -- *.c )

View File

@ -1,3 +1,4 @@
# sw_perf
- 对bwa的sw extend部分代码进行抽取做成benchmark
- 对bwa的sw相关代码进行抽取和优化实现做成benchmark
- bwa有三个部分包含sw算法分别是seed-extension阶段生成sam阶段的计算pair-end align分值以及最后生成cigar的global
- 实现avx2和cuda版本并进行性能对比

0
align.c 100644
View File

View File

@ -10,16 +10,6 @@
#define __COMMON_H
#include <stdio.h>
#undef MAX
#undef MIN
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define KERNEL_NUM 4
extern FILE *ins_ext_f_arr[KERNEL_NUM],
*del_ext_f_arr[KERNEL_NUM],
*score_f_arr[KERNEL_NUM],
*retval_f_arr[KERNEL_NUM];
#endif

0
align_avx2_i16.c 100644
View File

0
align_avx2_u8.c 100644
View File

0
align_sse_i16.c 100644
View File

0
align_sse_u8.c 100644
View File

View File

@ -1,507 +0,0 @@
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <emmintrin.h>
#include <stdio.h>
#include <immintrin.h>
#include <emmintrin.h>
#include "thread_mem.h"
#include "common.h"
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x), 1)
#define UNLIKELY(x) __builtin_expect((x), 0)
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif
#define KSW_EQUAL
#undef MAX
#undef MIN
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define SIMD_WIDTH 32
typedef struct
{
size_t m;
uint8_t *addr;
} buf_t;
static const uint8_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
{0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}};
// static const uint8_t reverse_mask[SIMD_WIDTH] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
#define permute_mask _MM_SHUFFLE(0, 1, 2, 3)
// const int permute_mask = _MM_SHUFFLE(0, 1, 2, 3);
// 初始化变量
#define SIMD_INIT \
int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \
__m256i zero_vec; \
__m256i max_vec; \
__m256i oe_del_vec; \
__m256i oe_ins_vec; \
__m256i e_del_vec; \
__m256i e_ins_vec; \
__m256i h_vec_mask[SIMD_WIDTH]; \
__m256i reverse_mask_vec; \
zero_vec = _mm256_setzero_si256(); \
oe_del_vec = _mm256_set1_epi8(oe_del); \
oe_ins_vec = _mm256_set1_epi8(oe_ins); \
e_del_vec = _mm256_set1_epi8(e_del); \
e_ins_vec = _mm256_set1_epi8(e_ins); \
__m256i match_sc_vec = _mm256_set1_epi8(a); \
__m256i mis_sc_vec = _mm256_set1_epi8(b); \
__m256i amb_sc_vec = _mm256_set1_epi8(1); \
__m256i amb_vec = _mm256_set1_epi8(4); \
reverse_mask_vec = _mm256_loadu_si256((__m256i *)(reverse_mask)); \
for (i = 0; i < SIMD_WIDTH; ++i) \
h_vec_mask[i] = _mm256_loadu_si256((__m256i *)(&h_vec_int_mask[i]));
/*
* e ref
* f seq
* m
* h
*/
// load向量化数据
#define SIMD_LOAD \
__m256i m1 = _mm256_loadu_si256((__m256i *)(&mA1[j])); \
__m256i e1 = _mm256_loadu_si256((__m256i *)(&eA1[j])); \
__m256i m1j1 = _mm256_loadu_si256((__m256i *)(&mA1[j - 1])); \
__m256i f1j1 = _mm256_loadu_si256((__m256i *)(&fA1[j - 1])); \
__m256i h0j1 = _mm256_loadu_si256((__m256i *)(&hA0[j - 1])); \
__m256i qs_vec = _mm256_loadu_si256((__m256i *)(&seq[j - 1])); \
__m256i ts_vec = _mm256_loadu_si256((__m256i *)(&ref[i]));
// 比对ref和seq的序列计算罚分
#define SIMD_CMP_SEQ \
ts_vec = _mm256_permute4x64_epi64(ts_vec, permute_mask); \
ts_vec = _mm256_shuffle_epi8(ts_vec, reverse_mask_vec); \
__m256i match_mask_vec = _mm256_cmpeq_epi8(qs_vec, ts_vec); \
__m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); \
__m256i match_score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec); \
__m256i q_amb_mask_vec = _mm256_cmpeq_epi8(qs_vec, amb_vec); \
__m256i t_amb_mask_vec = _mm256_cmpeq_epi8(ts_vec, amb_vec); \
__m256i amb_mask_vec = _mm256_or_si256(q_amb_mask_vec, t_amb_mask_vec); \
__m256i amb_score_vec = _mm256_and_si256(amb_mask_vec, amb_sc_vec); \
mis_score_vec = _mm256_andnot_si256(amb_mask_vec, mis_score_vec); \
mis_score_vec = _mm256_or_si256(amb_score_vec, mis_score_vec); \
match_score_vec = _mm256_andnot_si256(amb_mask_vec, match_score_vec);
// 向量化计算h, e, f, m
#define SIMD_COMPUTE \
__m256i en_vec0 = _mm256_max_epu8(m1, oe_del_vec); \
en_vec0 = _mm256_subs_epu8(en_vec0, oe_del_vec); \
__m256i en_vec1 = _mm256_max_epu8(e1, e_del_vec); \
en_vec1 = _mm256_subs_epu8(en_vec1, e_del_vec); \
__m256i en_vec = _mm256_max_epu8(en_vec0, en_vec1); \
__m256i fn_vec0 = _mm256_max_epu8(m1j1, oe_ins_vec); \
fn_vec0 = _mm256_subs_epu8(fn_vec0, oe_ins_vec); \
__m256i fn_vec1 = _mm256_max_epu8(f1j1, e_ins_vec); \
fn_vec1 = _mm256_subs_epu8(fn_vec1, e_ins_vec); \
__m256i fn_vec = _mm256_max_epu8(fn_vec0, fn_vec1); \
__m256i mn_vec0 = _mm256_adds_epu8(h0j1, match_score_vec); \
mn_vec0 = _mm256_max_epu8(mn_vec0, mis_score_vec); \
mn_vec0 = _mm256_subs_epu8(mn_vec0, mis_score_vec); \
__m256i mn_mask = _mm256_cmpeq_epi8(h0j1, zero_vec); \
__m256i mn_vec = _mm256_andnot_si256(mn_mask, mn_vec0); \
__m256i hn_vec0 = _mm256_max_epu8(en_vec, fn_vec); \
__m256i hn_vec = _mm256_max_epu8(hn_vec0, mn_vec);
// 存储向量化结果
#define SIMD_STORE \
max_vec = _mm256_max_epu8(max_vec, hn_vec); \
_mm256_storeu_si256((__m256i *)&eA2[j], en_vec); \
_mm256_storeu_si256((__m256i *)&fA2[j], fn_vec); \
_mm256_storeu_si256((__m256i *)&mA2[j], mn_vec); \
_mm256_storeu_si256((__m256i *)&hA2[j], hn_vec);
// 去除多余的部分
#define SIMD_REMOVE_EXTRA \
en_vec = _mm256_and_si256(en_vec, h_vec_mask[end - j]); \
fn_vec = _mm256_and_si256(fn_vec, h_vec_mask[end - j]); \
mn_vec = _mm256_and_si256(mn_vec, h_vec_mask[end - j]); \
hn_vec = _mm256_and_si256(hn_vec, h_vec_mask[end - j]);
// 找最大值和位置
#define SIMD_FIND_MAX \
uint8_t *maxVal = (uint8_t *)&max_vec; \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 1)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 3)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 5)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 7)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \
m = MAX(m, maxVal[0]); \
if (maxVal[0] > 0) \
{ \
for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \
{ \
__m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \
__m256i vcmp = _mm256_cmpeq_epi8(h2_vec, max_vec); \
uint32_t mask = _mm256_movemask_epi8(vcmp); \
if (mask > 0) \
{ \
int pos = SIMD_WIDTH - 1 - __builtin_clz(mask); \
mj = j - 1 + pos; \
mi = i - 1 - pos; \
} \
} \
}
// 每轮迭代后,交换数组
#define SWAP_DATA_POINTER \
uint8_t *tmp = hA0; \
hA0 = hA1; \
hA1 = hA2; \
hA2 = tmp; \
tmp = eA1; \
eA1 = eA2; \
eA2 = tmp; \
tmp = fA1; \
fA1 = fA2; \
fA2 = tmp; \
tmp = mA1; \
mA1 = mA2; \
mA2 = tmp;
int avx2_u8_pruning(thread_mem_t *tmem,
int qlen, // query length 待匹配段碱基的query长度
const uint8_t *query, // read碱基序列
int tlen, // target length reference的长度
const uint8_t *target, // reference序列
int m, // 碱基种类 (5)
const int8_t *mat, // 每个位置的query和target的匹配得分 m*m
int o_del, // deletion 错配开始的惩罚系数
int e_del, // deletion extension的惩罚系数
int o_ins, // insertion 错配开始的惩罚系数
int e_ins, // insertion extension的惩罚系数
int w, // 提前剪枝系数w =100 匹配位置和beg的最大距离
int end_bonus,
int zdrop,
int h0, // 该seed的初始得分完全匹配query的碱基数
int *_qle, // 匹配得到全局最大得分的碱基在query的位置
int *_tle, // 匹配得到全局最大得分的碱基在reference的位置
int *_gtle, // query全部匹配上的target的长度
int *_gscore, // query的端到端匹配得分
int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值)
{
uint8_t *mA, *hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H其他的保存上个H E F M
uint8_t *seq, *ref;
uint8_t *mem, *qtmem, *vmem;
int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH;
int i, ibeg, D, j, k, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
int Dloop = tlen + qlen; // 循环跳出条件
int span, beg1, end1; // 边界条件计算
int col_size = qlen + 2 + SIMD_WIDTH;
int val_mem_size = (col_size * 9 + 31) >> 5 << 5; // 32字节的整数倍
int mem_size = seq_size + ref_size + val_mem_size;
int is_left = 0; // 是不是向左扩展
int a = 1; // 碱基match时的分数
int b = 4; // 碱基mismatch时的惩罚分数正数
SIMD_INIT; // 初始化simd用的数据
assert(h0 > 0);
// allocate memory
mem = malloc(mem_size);
// if (buf->m < mem_size)
//{
// buf->m = mem_size;
// buf->addr = realloc(buf->addr, mem_size);
// }
// mem = buf->addr;
qtmem = &mem[0];
seq = (uint8_t *)&qtmem[0];
ref = (uint8_t *)&qtmem[seq_size];
if (is_left)
{
for (i = 0; i < qlen; ++i)
seq[i] = query[qlen - 1 - i];
for (i = 0; i < tlen; ++i)
ref[i + SIMD_WIDTH] = target[tlen - 1 - i];
}
else
{
for (i = 0; i < qlen; ++i)
seq[i] = query[i];
for (i = 0; i < tlen; ++i)
ref[i + SIMD_WIDTH] = target[i];
}
vmem = &ref[ref_size];
for (i = 0; i < val_mem_size; i += SIMD_WIDTH)
{
_mm256_storeu_si256((__m256i *)&vmem[i], zero_vec);
}
hA = &vmem[0];
mA = &vmem[col_size * 3];
eA = &vmem[col_size * 5];
fA = &vmem[col_size * 7];
hA0 = &hA[0];
hA1 = &hA[col_size];
hA2 = &hA1[col_size];
mA1 = &mA[0];
mA2 = &mA[col_size];
eA1 = &eA[0];
eA2 = &eA[col_size];
fA1 = &fA[0];
fA2 = &fA[col_size];
// adjust $w if it is too large
k = m * m;
// get the max score
for (i = 0, max = 0; i < k; ++i)
max = max > mat[i] ? max : mat[i];
max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.);
max_ins = max_ins > 1 ? max_ins : 1;
w = w < max_ins ? w : max_ins;
max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.);
max_del = max_del > 1 ? max_del : 1;
w = w < max_del ? w : max_del; // TODO: is this necessary?
if (tlen < qlen)
w = MIN(tlen - 1, w);
// DP loop
max = h0, max_i = max_j = -1;
max_ie = -1, gscore = -1;
;
max_off = 0;
beg = 1;
end = qlen;
// init h0
hA0[0] = h0; // 左上角
if (qlen == 0 || tlen == 0)
Dloop = 0; // 防止意外情况
if (w >= qlen)
{
max_ie = 0;
gscore = 0;
}
int m_last = 0;
int iend;
#ifdef KSW_EQUAL
int midx = 1, icheck = 0, checkspecial = 1;
int m3 = 0, m2 = 0, m1 = 0;
#endif
for (D = 1; LIKELY(D < Dloop); ++D)
{
// 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况
if (D > tlen)
{
span = MIN(Dloop - D, w);
beg1 = MAX(D - tlen + 1, ((D - w) / 2) + 1);
}
else
{
span = MIN(D - 1, w);
beg1 = MAX(1, ((D - w) / 2) + 1);
}
end1 = MIN(qlen, beg1 + span);
if (beg < beg1)
beg = beg1;
if (end > end1)
end = end1;
if (beg > end)
break; // 不用计算了直接跳出否则hA2没有被赋值里边是上一轮hA0的值会出bug
iend = D - (beg - 1); // ref开始计算的位置倒序
span = end - beg;
ibeg = iend - span - 1; // 0开始的ref索引位置
// 每一轮需要记录的数据
int m = 0, mj = -1, mi = -1;
max_vec = zero_vec;
// 要处理边界
// 左边界 处理f (insert)
if (ibeg == 0)
{
hA1[end] = MAX(0, h0 - (o_ins + e_ins * end));
m = hA1[end];
}
// 上边界
if (beg == 1)
{
hA1[0] = MAX(0, h0 - (o_del + e_del * iend));
}
else if (D & 1)
{
hA1[beg - 1] = 0;
hA2[beg - 1] = 0;
}
for (j = beg, i = iend; j <= end + 1 - SIMD_WIDTH; j += SIMD_WIDTH, i -= SIMD_WIDTH)
{
// 取数据
SIMD_LOAD;
// 比对seq计算罚分
SIMD_CMP_SEQ;
// 计算
SIMD_COMPUTE;
// 存储结果
SIMD_STORE;
}
// 剩下的计算单元
if (j <= end)
{
// 取数据
SIMD_LOAD;
// 比对seq计算罚分
SIMD_CMP_SEQ;
// 计算
SIMD_COMPUTE;
// 去除多余计算的部分
SIMD_REMOVE_EXTRA;
// 存储结果
SIMD_STORE;
}
SIMD_FIND_MAX;
#ifdef KSW_EQUAL
if (hA1[0] < 4 && checkspecial)
{ // b == 4
if (hA1[0] == 3)
{
icheck = iend + 1;
}
else if (midx == 2)
{
m2 = MAX(m2, hA2[midx - 1]);
}
else
{
m2 = MAX(m2, hA2[midx - 1]);
m1 = MAX(m1, hA2[midx - 2]);
}
m3 = MAX(m3, hA2[midx]);
midx += 1;
if (ibeg > icheck)
{
if (!m1 || !m2 || !m3)
break;
else
checkspecial = 0;
}
}
#endif
// 注意最后跳出循环j的值
j = end + 1;
if (j == qlen + 1)
{
max_ie = gscore > hA2[qlen] ? max_ie : ibeg;
gscore = gscore > hA2[qlen] ? gscore : hA2[qlen];
}
if (m == 0 && m_last == 0)
break; // 一定要注意,斜对角遍历和按列遍历的不同点
if (m > max)
{
max = m, max_i = mi, max_j = mj;
max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi);
}
else if (m == max && max_i >= mi && mj > max_j)
{
max_i = mi, max_j = mj;
max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi);
}
else if (zdrop > 0)
{
#if 0
if (mi - max_i > mj - max_j)
{
if (max - m - ((mi - max_i) - (mj - max_j)) * e_del > zdrop)
break;
}
else
{
if (max - m - ((mj - max_j) - (mi - max_i)) * e_ins > zdrop)
break;
}
#endif
}
// 调整计算的边界
for (j = beg; LIKELY(j <= end); ++j)
{
int has_val = hA1[j - 1] | hA2[j];
if (has_val)
break;
}
beg = j;
for (j = end + 1; LIKELY(j >= beg); --j)
{
int has_val = hA1[j - 1] | hA2[j];
if (has_val)
break;
else
hA0[j - 1] = 0;
}
end = j + 1 <= qlen ? j + 1 : qlen;
m_last = m;
// swap m, h, e, f
SWAP_DATA_POINTER;
}
free(mem);
if (_qle)
*_qle = max_j + 1;
if (_tle)
*_tle = max_i + 1;
if (_gtle)
*_gtle = max_ie + 1;
if (_gscore)
*_gscore = gscore;
if (_max_off)
*_max_off = max_off;
return max;
}

20
bsw.h
View File

@ -1,20 +0,0 @@
/*********************************************************************************************
Description: Declarations of sw extend functions
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2023/08/23
***********************************************************************************************/
#ifndef __BSW_H
#define __BSW_H
#include <stdint.h>
typedef struct _thread_mem_t thread_mem_t;
int normal(thread_mem_t *tmem, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off);
int normal_pruning(thread_mem_t *tmem, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off);
int avx2_u8(thread_mem_t *tmem, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off);
int avx2_u8_pruning(thread_mem_t *tmem, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off);
#endif

102
byte_alloc.c 100644
View File

@ -0,0 +1,102 @@
/*********************************************************************************************
Description: byte memory allocation with boundary aligned
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2023/08/23
***********************************************************************************************/
#include "byte_alloc.h"
#include <stdio.h>
#include <string.h>
// 创建
byte_mem_t *create_byte_mem()
{
byte_mem_t *bmem = (byte_mem_t *)malloc(sizeof(byte_mem_t));
bmem->occupied = bmem->capacity = 0;
bmem->mem = 0;
return bmem;
}
// 初始化
void init_byte_mem(byte_mem_t *bmem)
{
bmem->occupied = bmem->capacity = 0;
bmem->mem = 0;
}
// 初始化并开辟一定量的内存
void byte_mem_init_alloc(byte_mem_t *bmem, size_t byte_cnt)
{
bmem->capacity = byte_cnt;
bmem->mem = malloc(bmem->capacity);
bmem->occupied = 0;
}
// 请求内存
void *byte_mem_request(byte_mem_t *bmem, size_t byte_cnt)
{
// fprintf(stderr, "capacity:%ld, occupied: %ld, byte_cnt: %ld\n", bmem->capacity, bmem->occupied, byte_cnt);
void *ret_mem = 0;
if (bmem == 0)
{
ret_mem = 0;
}
else if (bmem->capacity == 0)
{
bmem->capacity = byte_cnt;
bmem->mem = malloc(bmem->capacity);
bmem->occupied = byte_cnt;
ret_mem = bmem->mem;
}
else if (bmem->capacity - bmem->occupied >= byte_cnt)
{
ret_mem = bmem->mem + bmem->occupied;
bmem->occupied += byte_cnt;
}
else
{
bmem->capacity = bmem->occupied + byte_cnt;
bmem->mem = realloc(bmem->mem, bmem->capacity);
ret_mem = bmem->mem + bmem->occupied;
bmem->occupied += byte_cnt;
}
return ret_mem;
}
void *byte_mem_request_and_clean(byte_mem_t *bmem, size_t byte_cnt)
{
void *mem = byte_mem_request(bmem, byte_cnt);
memset(mem, 0, byte_cnt);
return mem;
}
// 将不用的内存归还给byte mem
void byte_mem_release(byte_mem_t *bmem, size_t byte_cnt)
{
bmem->occupied -= byte_cnt;
}
void byte_mem_clear(byte_mem_t *bmem)
{
bmem->occupied = 0;
}
// 彻底释放内存
void byte_mem_free(byte_mem_t *bmem)
{
bmem->capacity = bmem->occupied = 0;
free(bmem->mem);
bmem->mem = 0;
}
// 销毁byte_mem_t
void destroy_byte_mem(byte_mem_t *bmem)
{
if (bmem != 0)
{
byte_mem_free(bmem);
free(bmem);
}
}

View File

@ -1,5 +1,5 @@
/*********************************************************************************************
Description: In-thread memory allocation with boundary aligned
Description: Byte memory allocation with boundary aligned
Copyright : All right reserved by NCIC.ICT
@ -16,33 +16,36 @@
#define MEM_ALIGN_BYTE 8
#define MEM_MOVE_BIT 3
typedef struct _thread_mem_t
typedef struct _byte_mem_t
{
size_t occupied; // 已经占用的容量(字节数) 对齐的
size_t capacity; // 总容量(字节数)
void *mem; // 申请的内存首地址
} thread_mem_t;
} byte_mem_t;
// 创建thread_mem_t
thread_mem_t *create_thread_mem();
// 创建byte_mem_t
byte_mem_t *create_byte_mem();
// 初始化
void init_thread_mem(thread_mem_t *tmem);
void init_byte_mem(byte_mem_t *bmem);
// 初始化并开辟一定量的内存
void thread_mem_init_alloc(thread_mem_t *tmem, size_t byte_cnt);
void byte_mem_init_alloc(byte_mem_t *bmem, size_t byte_cnt);
// 请求内存
void *thread_mem_request(thread_mem_t *tmem, size_t byte_cnt);
void *byte_mem_request(byte_mem_t *bmem, size_t byte_cnt);
// 请求内存并初始化为零
void *thread_mem_request_and_clean(thread_mem_t *tmem, size_t byte_cnt);
void *byte_mem_request_and_clean(byte_mem_t *bmem, size_t byte_cnt);
// 将不用的内存归还给thread mem
void thread_mem_release(thread_mem_t *tmem, size_t byte_cnt);
// 将不用的内存归还给byte mem
void byte_mem_release(byte_mem_t *bmem, size_t byte_cnt);
// 清空占用量
void byte_mem_clear(byte_mem_t *bmem);
// 彻底释放内存
void thread_mem_free(thread_mem_t *tmem);
void byte_mem_free(byte_mem_t *bmem);
// 销毁thread_mem_t
void destroy_thread_mem(thread_mem_t *tmem);
// 销毁byte_mem_t
void destroy_byte_mem(byte_mem_t *bmem);
#endif

50
debug.c 100644
View File

@ -0,0 +1,50 @@
/*********************************************************************************************
Description: data and files for debugging
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2024/04/09
***********************************************************************************************/
#include <stdio.h>
#include "debug.h"
FILE *gf[4] = {0},
*gfq[4] = {0},
*gft[4] = {0},
*gfi[4] = {0};
int open_qti_files()
{
char fn[1024] = {0};
int i = 0;
for (; i < 4; ++i)
{
sprintf(fn, "./output/q%d.txt", i); gfq[i] = fopen(fn, "w");
sprintf(fn, "./output/t%d.txt", i); gft[i] = fopen(fn, "w");
sprintf(fn, "./output/i%d.txt", i); gfi[i] = fopen(fn, "w");
}
return 0;
}
int open_debug_files()
{
char fn[1024] = {0};
int i = 0;
for (; i < 4; ++i) {
sprintf(fn, "./output/fp%d.txt", i);
gf[i] = fopen(fn, "w");
}
return 0;
}
int close_files()
{
int i = 0;
for (; i < 4; ++i) {
if (gf[i] != 0) fclose(gf[i]);
if (gfq[i] != 0) fclose(gfq[i]);
if (gft[i] != 0) fclose(gft[i]);
if (gfi[i] != 0) fclose(gfi[i]);
}
return 0;
}

18
debug.h 100644
View File

@ -0,0 +1,18 @@
/*********************************************************************************************
Description: data and files for debugging
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2024/04/09
***********************************************************************************************/
#include <stdio.h>
#define DEBUG_FILE_OUTPUT
extern FILE *gf[4], *gfq[4], *gft[4], *gfi[4];
int open_qti_files();
int open_debug_files();
int close_files();

94
extend.c 100644
View File

@ -0,0 +1,94 @@
/*********************************************************************************************
Description: sw extend functions in bwa-mem
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2024/04/08
***********************************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include "byte_alloc.h"
#include "utils.h"
#include "profiling.h"
#include "extend.h"
#include "debug.h"
#define EXTEND_PERFORMANCE_TEST(kernel_id, func, sp, ep) \
do \
{ \
PROF_START(extend); \
int i, score; \
for (i = sp; i < ep; ++i) \
{ \
score = func( \
&bmem, \
kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, \
kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, \
5, mat, 6, 1, 6, 1, 100, 5, 100, \
kv_A(kv_A(i_arr, i), 2), \
&qle, &tle, &gtle, &gscore, &max_off[0]); \
score_total[kernel_id] += score; \
} \
PROF_END(gprof[kernel_prof_idx[kernel_id]], extend); \
} while (0)
/******
* query.fa, target.fa, info.txt
* query.fa: queryACGTN
* target.fa: referencetargetACGTN
* info.txt: querytargeth0
*/
int main_extend(int argc, char *argv[])
{
if (argc < 3) {
fprintf(stderr, "Need 3 files: query, target, info.\n");
return -1;
}
const char *qf_path = argv[0];
const char *tf_path = argv[1];
const char *if_path = argv[2];
FILE *qfp = fopen(qf_path, "r");
FILE *tfp = fopen(tf_path, "r");
FILE *ifp = fopen(if_path, "r");
buf_t read_buf = {0};
seq_v q_arr = {0};
seq_v t_arr = {0};
qti_v i_arr = {0};
uint64_t score_total[EXTEND_FUNC_NUM] = {0};
int query_read_row = read_seq(&q_arr, &read_buf, 3000000, qfp);
int target_read_row = read_seq(&t_arr, &read_buf, 3000000, tfp);
int info_read_row = read_qt_info(&i_arr, &read_buf, 3000000, 3, ifp);
// fprintf(stderr, "read row: %d\t%d\t%d\n", query_read_row, target_read_row, info_read_row);
int8_t mat[25] = {1, -4, -4, -4, -1,
-4, 1, -4, -4, -1,
-4, -4, 1, -4, -1,
-4, -4, -4, 1, -1,
-1, -1, -1, -1, -1};
int kernel_prof_idx[] = {G_EXT_SCALAR, G_EXT_AVX2_I16, G_EXT_AVX2_U8, G_EXT_AVX2_I16_SP};
byte_mem_t bmem = {0};
byte_mem_init_alloc(&bmem, 1024 * 1024);
int max_off[2], qle, tle, gtle, gscore;
int excute_lines = MIN(MIN(query_read_row, target_read_row), info_read_row);
//open_qti_files();
//open_debug_files();
fprintf(stderr, "excute nums: %d\n", excute_lines);
EXTEND_PERFORMANCE_TEST(0, extend_scalar, 0, excute_lines);
EXTEND_PERFORMANCE_TEST(1, extend_avx2_i16, 0, excute_lines);
EXTEND_PERFORMANCE_TEST(2, extend_avx2_u8, 0, excute_lines);
EXTEND_PERFORMANCE_TEST(3, extend_avx2_i16_sp, 0, excute_lines);
int i = 0; for(; i<ARRAY_SIZE(kernel_prof_idx); ++i) { gdata[kernel_prof_idx[i]] = score_total[i]; }
display_extend_stats();
close_files();
fclose(qfp);
fclose(tfp);
fclose(ifp);
return 0;
}

23
extend.h 100644
View File

@ -0,0 +1,23 @@
/*********************************************************************************************
Description: Declarations of sw extend functions
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2023/08/23
***********************************************************************************************/
#ifndef __EXTEND_H
#define __EXTEND_H
#include <stdint.h>
#define EXTEND_FUNC_NUM 4
typedef struct _byte_mem_t byte_mem_t;
int extend_scalar(byte_mem_t *bmem, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off);
int extend_avx2_i16(byte_mem_t *bmem, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off);
int extend_avx2_u8(byte_mem_t *bmem, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off);
int extend_avx2_i16_sp(byte_mem_t *bmem, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off);
int main_extend(int argc, char *argv[]);
#endif

870
extend_avx2_i16.c 100644
View File

@ -0,0 +1,870 @@
/*********************************************************************************************
Description: sw extend functions in bwa-mem (implemented in avx2, int16_t stands for score scope)
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2024/04/08
***********************************************************************************************/
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include <immintrin.h>
#include <emmintrin.h>
#include "utils.h"
#include "byte_alloc.h"
#include "debug.h"
//#define DEBUG_SW_EXTEND
#define KSW_EQUAL
#define ARR_FOR_EQUAL_CHECK 1
#define ELIMINATE_DIFF_3 0
#define PRUNING 1
#define REVERSE_TARGET 0
#define NO_VAL -1
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x),1)
#define UNLIKELY(x) __builtin_expect((x),0)
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif
#define SIMD_WIDTH 16
static int extend_scalar_inner(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del,
int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off);
static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
{0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}
};
#define permute_mask _MM_SHUFFLE(0, 1, 2, 3)
// 初始化变量
#define SIMD_INIT \
int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \
__m256i zero_vec; \
__m256i max_vec; \
__m256i oe_del_vec; \
__m256i oe_ins_vec; \
__m256i e_del_vec; \
__m256i e_ins_vec; \
__m256i h_vec_mask[SIMD_WIDTH]; \
zero_vec = _mm256_setzero_si256(); \
oe_del_vec = _mm256_set1_epi16(-oe_del); \
oe_ins_vec = _mm256_set1_epi16(-oe_ins); \
e_del_vec = _mm256_set1_epi16(-e_del); \
e_ins_vec = _mm256_set1_epi16(-e_ins); \
__m256i match_sc_vec = _mm256_set1_epi16(a); \
__m256i mis_sc_vec = _mm256_set1_epi16(-b); \
__m256i amb_sc_vec = _mm256_set1_epi16(-1); \
__m256i amb_vec = _mm256_set1_epi16(4); \
for (i=0; i<SIMD_WIDTH; ++i) h_vec_mask[i] = _mm256_loadu_si256((__m256i*) (&h_vec_int_mask[i]));
#if REVERSE_TARGET
#define LOAD_TARGET \
__m256i ts_vec = _mm256_loadu_si256((__m256i *) (&ref[i])); \
ts_vec = _mm256_permute4x64_epi64(ts_vec, permute_mask); \
ts_vec = _mm256_shufflelo_epi16(ts_vec, permute_mask); \
ts_vec = _mm256_shufflehi_epi16(ts_vec, permute_mask);
#else
#define LOAD_TARGET \
__m256i ts_vec = _mm256_loadu_si256((__m256i *)(&ref[tlen - i]));
#endif
/*
* e ref
* f seq
* m
* h
*/
// load向量化数据
#define SIMD_LOAD \
__m256i m1 = _mm256_loadu_si256((__m256i*) (&mA1[j])); \
__m256i e1 = _mm256_loadu_si256((__m256i*) (&eA1[j])); \
__m256i m1j1 = _mm256_loadu_si256((__m256i*) (&mA1[j-1])); \
__m256i f1j1 = _mm256_loadu_si256((__m256i*) (&fA1[j-1])); \
__m256i h0j1 = _mm256_loadu_si256((__m256i*) (&hA0[j-1])); \
__m256i qs_vec = _mm256_loadu_si256((__m256i*) (&seq[j-1])); \
LOAD_TARGET
// 比对ref和seq的序列计算罚分
#define SIMD_CMP_SEQ \
__m256i match_mask_vec = _mm256_cmpeq_epi16(qs_vec, ts_vec); \
__m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); \
__m256i score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec); \
score_vec = _mm256_or_si256(score_vec, mis_score_vec); \
__m256i q_amb_mask_vec = _mm256_cmpeq_epi16(qs_vec, amb_vec); \
__m256i t_amb_mask_vec = _mm256_cmpeq_epi16(ts_vec, amb_vec); \
__m256i amb_mask_vec = _mm256_or_si256(q_amb_mask_vec, t_amb_mask_vec); \
score_vec = _mm256_andnot_si256(amb_mask_vec, score_vec); \
__m256i amb_score_vec = _mm256_and_si256(amb_mask_vec, amb_sc_vec); \
score_vec = _mm256_or_si256(score_vec, amb_score_vec);
// 向量化计算h, e, f, m
#define SIMD_COMPUTE \
__m256i en_vec0 = _mm256_add_epi16(m1, oe_del_vec); \
__m256i en_vec1 = _mm256_add_epi16(e1, e_del_vec); \
__m256i en_vec = _mm256_max_epi16(en_vec0, en_vec1); \
__m256i fn_vec0 = _mm256_add_epi16(m1j1, oe_ins_vec); \
__m256i fn_vec1 = _mm256_add_epi16(f1j1, e_ins_vec); \
__m256i fn_vec = _mm256_max_epi16(fn_vec0, fn_vec1); \
__m256i mn_vec0 = _mm256_add_epi16(h0j1, score_vec); \
__m256i mn_mask = _mm256_cmpgt_epi16(h0j1, zero_vec); \
__m256i mn_vec = _mm256_and_si256(mn_vec0, mn_mask); \
__m256i hn_vec0 = _mm256_max_epi16(en_vec, fn_vec); \
__m256i hn_vec = _mm256_max_epi16(hn_vec0, mn_vec); \
en_vec = _mm256_max_epi16(en_vec, zero_vec); \
fn_vec = _mm256_max_epi16(fn_vec, zero_vec); \
mn_vec = _mm256_max_epi16(mn_vec, zero_vec); \
hn_vec = _mm256_max_epi16(hn_vec, zero_vec);
// 存储向量化结果
#define SIMD_STORE \
max_vec = _mm256_max_epi16(max_vec, hn_vec); \
_mm256_storeu_si256((__m256i*)&eA2[j], en_vec); \
_mm256_storeu_si256((__m256i*)&fA2[j], fn_vec); \
_mm256_storeu_si256((__m256i*)&mA2[j], mn_vec); \
_mm256_storeu_si256((__m256i*)&hA2[j], hn_vec);
// 去除多余的部分
#define SIMD_REMOVE_EXTRA \
en_vec = _mm256_and_si256(en_vec, h_vec_mask[end-j]); \
fn_vec = _mm256_and_si256(fn_vec, h_vec_mask[end-j]); \
mn_vec = _mm256_and_si256(mn_vec, h_vec_mask[end-j]); \
hn_vec = _mm256_and_si256(hn_vec, h_vec_mask[end-j]);
// 找最大值和位置
#define SIMD_FIND_MAX \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \
int16_t *maxVal = (int16_t*)&max_vec; \
m = MAX(m, maxVal[0]); /*用来解决与BSW结果不一样的第二种情况(上边界)*/ \
if (maxVal[0] > 0 && m >= max) { \
for(j=beg, i=iend; j<=end; j+=SIMD_WIDTH, i-=SIMD_WIDTH) { \
__m256i h2_vec = _mm256_loadu_si256((__m256i*) (&hA2[j])); \
__m256i vcmp = _mm256_cmpeq_epi16(h2_vec, max_vec); \
uint32_t mask = _mm256_movemask_epi8(vcmp); \
if (mask > 0) { \
int pos = SIMD_WIDTH - 1 - (( __builtin_clz(mask)) >> 1); \
mj = j - 1 + pos; \
mi = i - 1 - pos; \
/*if (m >= max) fprintf(stderr, "%d %d %d %d %d %d %d\n", iend, beg, mi, mj, mask, pos, m);*/ \
} \
} \
}
// 每轮迭代后,交换数组
#define SWAP_DATA_POINTER \
int16_t * tmp=hA0; \
hA0 = hA1; hA1 = hA2; hA2 = tmp; \
tmp = eA1; eA1 = eA2; eA2 = tmp; \
tmp = fA1; fA1 = fA2; fA2 = tmp; \
tmp = mA1; mA1 = mA2; mA2 = tmp;
static void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int h0, int fnum)
{
#ifdef DEBUG_FILE_OUTPUT
// 写到三个文件里query.fatarget.fa每行一个序列info.txt包含前缀得分h0和长度信息qlentlen
FILE *query_f = gfq[fnum],
*target_f = gft[fnum],
*info_f = gfi[fnum];
const char seq_map[5] = {'A', 'C', 'G', 'T', 'N'};
int i;
// 处理query
for (i = 0; i < qlen; ++i)
fprintf(query_f, "%c", seq_map[query[i]]);
fprintf(query_f, "\n");
// 处理target
for (i = 0; i < tlen; ++i)
fprintf(target_f, "%c", seq_map[target[i]]);
fprintf(target_f, "\n");
// 处理其他信息
fprintf(info_f, "%-8d%-8d%-8d\n", qlen, tlen, h0);
#endif
}
int extend_avx2_i16(byte_mem_t *bmem,
int qlen, // query length 待匹配段碱基的query长度
const uint8_t *query, // read碱基序列
int tlen, // target length reference的长度
const uint8_t *target, // reference序列
int m, // 碱基种类 (5)
const int8_t *mat, // 每个位置的query和target的匹配得分 m*m
int o_del, // deletion 错配开始的惩罚系数
int e_del, // deletion extension的惩罚系数
int o_ins, // insertion 错配开始的惩罚系数
int e_ins, // insertion extension的惩罚系数SIMD_BTYES
int w, // 提前剪枝系数w =100 匹配位置和beg的最大距离
int end_bonus,
int zdrop,
int h0, // 该seed的初始得分完全匹配query的碱基数
int *_qle, // 匹配得到全局最大得分的碱基在query的位置
int *_tle, // 匹配得到全局最大得分的碱基在reference的位置
int *_gtle, // query全部匹配上的target的长度
int *_gscore, // query的端到端匹配得分
int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值
{
#if 0
int scalar_score = extend_scalar_inner(qlen, query, tlen, target, m, mat, o_del, e_del, o_ins, e_ins, w, end_bonus, zdrop, h0, _qle, _tle, _gtle, _gscore, _max_off);
return scalar_score;
#endif
int16_t *mA, *hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H其他的保存上个H E F M
int16_t *seq, *ref;
uint8_t *mem;
int16_t *qtmem, *vmem;
int is_left = 0; // 是不是向左扩展
int a = 1; // 碱基match时的分数
int b = 4; // 碱基mismatch时的惩罚分数正数
int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH;
int i, ibeg, D, j, k, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
int Dloop = tlen + qlen; // 循环跳出条件
int span, beg1, end1; // 边界条件计算
int col_size = qlen + 2 + SIMD_WIDTH;
int val_mem_size = (col_size * 9 * 2 + 31) >> 5 << 5; // 32字节的整数倍
int mem_size = (seq_size + ref_size) * 2 + val_mem_size;
SIMD_INIT; // 初始化simd用的数据
assert(h0 > 0);
// allocate memory
//mem = malloc(mem_size);
mem = byte_mem_request(bmem, mem_size);
qtmem = (int16_t *)&mem[0];
seq=&qtmem[0]; ref=&qtmem[seq_size];
if (is_left) {
for (i=0; i<qlen; ++i) seq[i] = query[qlen - 1 - i];
#if REVERSE_TARGET
for (i = 0; i < tlen; ++i) ref[i + SIMD_WIDTH] = target[tlen - 1 - i];
#else
for (i=0; i<tlen; ++i) ref[i] = target[i];
#endif
} else {
for (i=0; i<qlen; ++i) seq[i] = query[i];
#if REVERSE_TARGET
for (i = 0; i < tlen; ++i) ref[i + SIMD_WIDTH] = target[i];
#else
for (i=0; i<tlen; ++i) ref[i] = target[tlen - 1 - i];
#endif
}
vmem = &ref[ref_size];
for (i=0; i<(val_mem_size>>1); i+=SIMD_WIDTH) {
_mm256_storeu_si256((__m256i*)&vmem[i], zero_vec);
}
hA = &vmem[0];
mA = &vmem[col_size * 3];
eA = &vmem[col_size * 5];
fA = &vmem[col_size * 7];
hA0 = &hA[0]; hA1 = &hA[col_size]; hA2 = &hA1[col_size];
mA1 = &mA[0]; mA2 = &mA[col_size];
eA1 = &eA[0]; eA2 = &eA[col_size];
fA1 = &fA[0]; fA2 = &fA[col_size];
// adjust $w if it is too large
k = m * m;
// get the max score
for (i = 0, max = 0; i < k; ++i) max = max > mat[i]? max : mat[i];
max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.);
max_ins = max_ins > 1? max_ins : 1;
w = w < max_ins? w : max_ins;
max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.);
max_del = max_del > 1? max_del : 1;
w = w < max_del? w : max_del; // TODO: is this necessary?
if (tlen < qlen) w = MIN(tlen - 1, w);
// DP loop
max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;;
max_off = 0;
beg = 1; end = qlen;
// init h0
hA0[0] = h0; // 左上角
if (qlen == 0 || tlen == 0) Dloop = 0; // 防止意外情况
if (w >= qlen) { max_ie = 0; gscore = 0; }
int m_last=0;
int iend;
#if PRUNING
#ifdef KSW_EQUAL
int midx = 1, icheck = 0, checkspecial = 1;
#if ARR_FOR_EQUAL_CHECK
int marr[10] = {0};
// int marr[b]; memset(marr, 0, 4 * b);
#else
int m3 = 0, m2 = 0, m1 = 0;
#endif
#endif
#endif
//int print_flag = 0; //(qlen == 64 && tlen == 123);
#ifdef DEBUG_SW_EXTEND
int dii, djj;
int16_t ins[tlen + 1][qlen + 2];
int16_t del[tlen + 1][qlen + 2];
int16_t score[tlen + 1][qlen + 2];
for (dii = 0; dii <= tlen; ++dii)
{
for (djj = 0; djj <= qlen; ++djj)
{
ins[dii][djj] = del[dii][djj] = score[dii][djj] = NO_VAL;
}
}
for (dii = 1; dii <= tlen; ++dii)
{
del[dii][0] = MAX(0, h0 - o_del - e_del * dii);
score[dii][0] = del[dii][0];
}
for (djj = 1; djj <= qlen; ++djj)
{
ins[0][djj] = MAX(0, h0 - o_ins - e_ins * djj);
score[0][djj] = ins[0][djj];
}
ins[0][0] = del[0][0] = score[0][0] = h0;
#endif
for (D = 1; LIKELY(D < Dloop); ++D) {
// 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况
if (D > tlen) {
span = MIN(Dloop-D, w);
beg1 = MAX(D-tlen+1, ((D-w) / 2) + 1);
} else {
span = MIN(D-1, w);
beg1 = MAX(1, ((D-w) / 2) + 1);
}
end1 = MIN(qlen, beg1+span);
if (beg < beg1) beg = beg1;
if (end > end1) end = end1;
if (beg > end) break; // 不用计算了直接跳出否则hA2没有被赋值里边是上一轮hA0的值会出bug
iend = D - (beg - 1); // ref开始计算的位置倒序
span = end - beg;
ibeg = iend - span - 1; // 0开始的ref索引位置
// 每一轮需要记录的数据
int m = 0, mj = -1, mi = -1;
max_vec = zero_vec;
//if (print_flag)
//{
//fprintf(stderr, "D: %d, iend: %d, jbeg: %d\n", D, iend, beg);
//}
// 要处理边界
// 左边界 处理f (insert)
if (ibeg == 0) { hA1[end] = MAX(0, h0 - (o_ins + e_ins * end)); m = hA1[end];}
// 上边界
if (beg == 1) { hA1[0] = MAX(0, h0 - (o_del + e_del * iend)); }
else if (D & 1) {
hA1[beg - 1] = 0;
hA2[beg - 1] = 0;
}
for (j=beg, i=iend; j<=end+1-SIMD_WIDTH; j+=SIMD_WIDTH, i-=SIMD_WIDTH) {
// 取数据
SIMD_LOAD;
// 比对seq计算罚分
SIMD_CMP_SEQ;
// 计算
SIMD_COMPUTE;
// 存储结果
SIMD_STORE;
}
// 剩下的计算单元
if (j <= end) {
// 取数据
SIMD_LOAD;
// 比对seq计算罚分
SIMD_CMP_SEQ;
// 计算
SIMD_COMPUTE;
// 去除多余计算的部分
SIMD_REMOVE_EXTRA;
// 存储结果
SIMD_STORE;
}
SIMD_FIND_MAX;
#if PRUNING
#ifdef KSW_EQUAL
// 用来解决与BSW结果不一样的第一种情况(左边界)
#if ARR_FOR_EQUAL_CHECK
if (hA1[0] < b && checkspecial) {
int mii;
if (hA1[0] == b - 1) {
icheck = iend + 1;
}
for (mii = 0; mii < b - 1; ++mii) {
if (midx - mii > 0)
marr[mii] = MAX(marr[mii], hA2[midx - mii]);
}
midx += 1;
if (ibeg > icheck)
{
int stopCalc = 0;
for (mii = 0; mii < b - 1; ++mii)
{
stopCalc |= !marr[mii];
}
if (stopCalc)
break;
else
checkspecial = 0;
}
}
#else
if (hA1[0] < 4 && checkspecial) { // b == 4
if (hA1[0] == 3) {
icheck = iend + 1;
} else if (midx == 2) {
m2 = MAX(m2, hA2[midx - 1]);
} else {
m2 = MAX(m2, hA2[midx - 1]);
m1 = MAX(m1, hA2[midx - 2]);
}
m3 = MAX(m3, hA2[midx]);
midx += 1;
if (ibeg > icheck)
{
if (!m1 || !m2 || !m3)
break;
else
checkspecial = 0;
}
//if (print_flag) {
//fprintf(stderr, "jbeg: %d, ibeg: %d, iend: %d, icheck: %d, score: %d %d %d, j: %d\n", beg, ibeg, iend, icheck, hA2[midx + 1], hA2[midx + 2], hA2[midx + 3], midx);
//if (midx > 2) fprintf(stderr, "%d, %d, %d\n", hA2[midx-1], hA2[midx-2], hA2[midx-3]);
//fprintf(stderr, "jbeg: %d, ibeg: %d, iend: %d, icheck: %d, hA1: %d, score: %d %d %d, j: %d\n", beg, ibeg, iend, icheck, hA1[0], m1, m2, m3, midx);
//}
}
#endif
#endif
#endif
#ifdef DEBUG_SW_EXTEND
for (djj = beg; djj <= end; ++djj)
{
dii = D - djj + 1;
ins[dii][djj] = fA2[djj];
del[dii][djj] = eA2[djj];
score[dii][djj] = hA2[djj];
}
//if (print_flag)
//{
//fprintf(stderr, "score: %d %d %d\n", hA2[beg], hA2[beg+1], hA2[beg+2]);
//}
#endif
// 注意最后跳出循环j的值
j = end + 1;
if (j == qlen + 1) {
max_ie = gscore > hA2[qlen] ? max_ie : ibeg;
gscore = gscore > hA2[qlen] ? gscore : hA2[qlen];
}
#if PRUNING
if (m == 0 && m_last==0) break; // 一定要注意,斜对角遍历和按列遍历的不同点
#endif
if (m > max) {
max = m, max_i = mi, max_j = mj;
max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi);
} else if (m == max && max_i >= mi && mj > max_j) {
max_i = mi, max_j = mj;
max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi);
}
else if (zdrop > 0 && mi > -1) {
// fprintf(stderr, "%d, %d, %d %d\n",mi, mj, max - m - ((mi - max_i) - (mj - max_j)) * e_del, max - m - ((mj - max_j) - (mi - max_i)) * e_ins);
#if PRUNING
if (mi - max_i > mj - max_j) {
if (max - m - ((mi - max_i) - (mj - max_j)) * e_del > zdrop) break;
} else {
if (max - m - ((mj - max_j) - (mi - max_i)) * e_ins > zdrop) break;
}
#endif
}
// 调整计算的边界
#if PRUNING
for (j = beg; LIKELY(j <= end); ++j) { int has_val = hA1[j-1] | hA2[j]; if (has_val) break; }
beg = j;
for (j = end+1; LIKELY(j >= beg); --j) { int has_val = hA1[j-1] | hA2[j]; if (has_val) break; else hA0[j-1]=0; }
end = j + 1 <= qlen? j + 1 : qlen;
#else
beg = 1; end = qlen;
#endif
m_last = m;
// swap m, h, e, f
SWAP_DATA_POINTER;
}
#ifdef DEBUG_FILE_OUTPUT
#ifdef DEBUG_SW_EXTEND
fprintf(gf[0], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max);
fprintf(gf[1], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max);
fprintf(gf[2], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max);
fprintf(gf[0], "%-4d", -1);
fprintf(gf[1], "%-4d", -1);
fprintf(gf[2], "%-4d", -1);
fprintf(gf[0], "%-4d", -1);
fprintf(gf[1], "%-4d", -1);
fprintf(gf[2], "%-4d", -1);
for (djj = 0; djj < qlen; ++djj) {
fprintf(gf[0], "%-4c", "ACGTN"[query[djj]]);
fprintf(gf[1], "%-4c", "ACGTN"[query[djj]]);
fprintf(gf[2], "%-4c", "ACGTN"[query[djj]]);
}
fprintf(gf[0], "\n");
fprintf(gf[1], "\n");
fprintf(gf[2], "\n");
for (dii = 0; dii <= tlen; ++dii)
{
if (dii > 0) {
fprintf(gf[0], "%-4c", "ACGTN"[target[dii - 1]]);
fprintf(gf[1], "%-4c", "ACGTN"[target[dii - 1]]);
fprintf(gf[2], "%-4c", "ACGTN"[target[dii - 1]]);
} else {
fprintf(gf[0], "%-4d", -1);
fprintf(gf[1], "%-4d", -1);
fprintf(gf[2], "%-4d", -1);
}
for (djj = 0; djj <= qlen; ++djj)
{
fprintf(gf[0], "%-4d", score[dii][djj]);
fprintf(gf[1], "%-4d", ins[dii][djj]);
fprintf(gf[2], "%-4d", del[dii][djj]);
}
fprintf(gf[0], "\n");
fprintf(gf[1], "\n");
fprintf(gf[2], "\n");
}
#endif
#endif
#if 0
if (max != scalar_score) {
if (qlen <= 30) {
write_query_target_sequence(qlen, query, tlen, target, h0, 0);
} else if (qlen < 60) {
write_query_target_sequence(qlen, query, tlen, target, h0, 1);
} else if (qlen < 90) {
write_query_target_sequence(qlen, query, tlen, target, h0, 2);
} else {
write_query_target_sequence(qlen, query, tlen, target, h0, 3);
}
}
#endif
// free(mem);
byte_mem_clear(bmem);
if (_qle) *_qle = max_j + 1;
if (_tle) *_tle = max_i + 1;
if (_gtle) *_gtle = max_ie + 1;
if (_gscore) *_gscore = gscore;
if (_max_off) *_max_off = max_off;
return max;
}
typedef struct {
int32_t h, e;
} eh_t;
static int extend_scalar_inner(int qlen, // query length 待匹配段碱基的query长度
const uint8_t *query, // read碱基序列
int tlen, // target length reference的长度
const uint8_t *target, // reference序列
int m, // 碱基种类 (5)
const int8_t *mat, // 每个位置的query和target的匹配得分 m*m
int o_del, // deletion 错配开始的惩罚系数
int e_del, // deletion extension的惩罚系数
int o_ins, // insertion 错配开始的惩罚系数
int e_ins, // insertion extension的惩罚系数
int w, // 提前剪枝系数w =100 匹配位置和beg的最大距离
int end_bonus,
int zdrop,
int h0, // 该seed的初始得分完全匹配query的碱基数
int *_qle, // 匹配得到全局最大得分的碱基在query的位置
int *_tle, // 匹配得到全局最大得分的碱基在reference的位置
int *_gtle, // query全部匹配上的target的长度
int *_gscore, // query的端到端匹配得分
int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值
{
eh_t *eh; // score array
int8_t *qp; // query profile
int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
uint8_t *qmem, *ref, *seq;
assert(h0 > 0);
// allocate memory
qp = malloc(qlen * m);
eh = calloc(qlen + 1, 8);
qmem = malloc(qlen + tlen);
seq=(uint8_t*)&qmem[0]; ref=(uint8_t*)&qmem[qlen];
int is_left = 0;
if (is_left) {
for (i=0; i<qlen; ++i) seq[i] = query[qlen - 1 - i];
for (i=0; i<tlen; ++i) ref[i] = target[tlen - 1 - i];
} else {
for (i=0; i<qlen; ++i) seq[i] = query[i];
for (i=0; i<tlen; ++i) ref[i] = target[i];
}
// generate the query profile
for (k = i = 0; k < m; ++k) {
const int8_t *p = &mat[k * m];
for (j = 0; j < qlen; ++j) qp[i++] = p[seq[j]];
}
// fill the first row
eh[0].h = h0; eh[1].h = h0 > oe_ins? h0 - oe_ins : 0;
for (j = 2; j <= qlen && eh[j-1].h > e_ins; ++j)
eh[j].h = eh[j-1].h - e_ins;
// adjust $w if it is too large
k = m * m;
for (i = 0, max = 0; i < k; ++i) // get the max score
max = max > mat[i]? max : mat[i];
max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.);
max_ins = max_ins > 1? max_ins : 1;
w = w < max_ins? w : max_ins;
max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.);
max_del = max_del > 1? max_del : 1;
w = w < max_del? w : max_del; // TODO: is this necessary?
//fprintf(stderr, "%d\n", w);
// DP loop
max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;
max_off = 0;
beg = 0, end = qlen;
//int print_flag = 0; //(qlen == 116 && tlen == 241);
//fprintf(stderr, "%d %d %d\n", print_flag, qlen, tlen);
#ifdef DEBUG_SW_EXTEND
int dii, djj;
int16_t ins[tlen + 1][qlen + 2];
int16_t del[tlen + 1][qlen + 2];
int16_t score[tlen + 1][qlen + 2];
for (dii = 0; dii <= tlen; ++dii)
{
for (djj = 0; djj <= qlen; ++djj)
{
ins[dii][djj] = del[dii][djj] = score[dii][djj] = NO_VAL;
}
}
for (dii = 1; dii <= tlen; ++dii)
{
del[dii][0] = MAX(0, h0 - o_del - e_del * dii);
score[dii][0] = del[dii][0];
}
for (djj = 1; djj <= qlen; ++djj)
{
ins[0][djj] = MAX(0, h0 - o_ins - e_ins * djj);
score[0][djj] = ins[0][djj];
}
ins[0][0] = del[0][0] = score[0][0] = h0;
#endif
#ifdef DEBUG_FILE_OUTPUT
#ifdef COUNT_CALC_NUM
int bsw_cal_num = 0;
int real_cal_num = 0;
for (i = 0; i < tlen; ++i)
{
int beg = MAX(0, i - w);
int end = MIN(qlen, i + w + 1);
if (beg >= end) break;
bsw_cal_num += end - beg;
}
fprintf(gfp1, "start\n%d\n", bsw_cal_num);
#endif
#endif
#if ELIMINATE_DIFF_3
int prun_end = qlen; // for test diff_3
#endif
for (i = 0; LIKELY(i < tlen); ++i) {
int t, f = 0, h1, m = 0, mj = -1;
int8_t *q = &qp[ref[i] * qlen];
// apply the band and the constraint (if provided)
if (beg < i - w) beg = i - w;
if (end > i + w + 1) end = i + w + 1;
if (end > qlen) end = qlen; // 没用
// compute the first column
if (beg == 0) {
h1 = h0 - (o_del + e_del * (i + 1));
if (h1 < 0) h1 = 0;
} else h1 = 0;
//m = h1; // 用来解决和VP-BSW结果不一样的第一种情况(左边界)
for (j = beg; LIKELY(j < end); ++j) {
#ifdef DEBUG_FILE_OUTPUT
#ifdef COUNT_CALC_NUM
real_cal_num++;
#endif
#endif
#ifdef DEBUG_SW_EXTEND
ins[i+1][j+1] = f;
#endif
// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
// Similar to SSE2-SW, cells are computed in the following order:
// H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
// E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape
// F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape
eh_t *p = &eh[j];
int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j)
p->h = h1; // set H(i,j-1) for the next row
M = M? M + q[j] : 0;// separating H and M to disallow a cigar like "100M3I3D20M",保证分值不小于0sw和nw的区别
h = M > e? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0
h = h > f? h : f;
#if ELIMINATE_DIFF_3
if (j >= prun_end && h==0) break; // for test diff_3
#endif
h1 = h; // save H(i,j) to h1 for the next column
#ifdef DEBUG_SW_EXTEND
score[i+1][j+1] = h;
#endif
mj = m > h? mj : j; // record the position where max score is achieved
m = m > h? m : h; // m is stored at eh[mj+1]
t = M - oe_del;
t = t > 0? t : 0;
e -= e_del;
#ifdef DEBUG_SW_EXTEND
del[i + 1][j + 1] = e;
#endif
e = e > t? e : t; // computed E(i+1,j)
#ifdef DEBUG_SW_EXTEND
// del[i+1][j+1] = e;
#endif
p->e = e; // save E(i+1,j) for the next row
t = M - oe_ins;
t = t > 0? t : 0;
f -= e_ins;
f = f > t? f : t; // computed F(i,j+1)
}
eh[end].h = h1; eh[end].e = 0;
if (j == qlen) {
max_ie = gscore > h1? max_ie : i;
gscore = gscore > h1? gscore : h1;
}
#if PRUNING
if (m == 0) break;
#endif
if (m > max) {
max = m, max_i = i, max_j = mj;
max_off = max_off > abs(mj - i)? max_off : abs(mj - i);
//fprintf(stderr, "%d %d %d %d\n", i, mj, max_off, m);
} else if (zdrop > 0) {
#if PRUNING
if (i - max_i > mj - max_j) {
if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) break;
} else {
if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) break;
}
#endif
}
#if PRUNING
// update beg and end for the next round
for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j); // 这里为什么不考虑finsert score
beg = j;
for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j);
#if ELIMINATE_DIFF_3
prun_end = j + 2 < qlen ? j + 2 : qlen; end = qlen; // for test diff_3
#else
end = j + 2 < qlen? j + 2 : qlen;
#endif
#else
beg = 0; end = qlen; // uncomment this line for debugging
#endif
// if (print_flag) {
// fprintf(stderr, "beg: %d; end: %d\n", beg, end);
// }
}
#ifdef DEBUG_FILE_OUTPUT
#ifdef DEBUG_SW_EXTEND
fprintf(gf[0], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max);
fprintf(gf[1], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max);
fprintf(gf[2], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max);
fprintf(gf[0], "%-4d", -1);
fprintf(gf[1], "%-4d", -1);
fprintf(gf[2], "%-4d", -1);
fprintf(gf[0], "%-4d", -1);
fprintf(gf[1], "%-4d", -1);
fprintf(gf[2], "%-4d", -1);
for (djj = 0; djj < qlen; ++djj)
{
fprintf(gf[0], "%-4c", "ACGTN"[query[djj]]);
fprintf(gf[1], "%-4c", "ACGTN"[query[djj]]);
fprintf(gf[2], "%-4c", "ACGTN"[query[djj]]);
}
fprintf(gf[0], "\n");
fprintf(gf[1], "\n");
fprintf(gf[2], "\n");
for (dii = 0; dii <= tlen; ++dii)
{
if (dii > 0)
{
fprintf(gf[0], "%-4c", "ACGTN"[target[dii - 1]]);
fprintf(gf[1], "%-4c", "ACGTN"[target[dii - 1]]);
fprintf(gf[2], "%-4c", "ACGTN"[target[dii - 1]]);
}
else
{
fprintf(gf[0], "%-4d", -1);
fprintf(gf[1], "%-4d", -1);
fprintf(gf[2], "%-4d", -1);
}
for (djj = 0; djj <= qlen; ++djj)
{
fprintf(gf[0], "%-4d", score[dii][djj]);
fprintf(gf[1], "%-4d", ins[dii][djj]);
fprintf(gf[2], "%-4d", del[dii][djj]);
}
fprintf(gf[0], "\n");
fprintf(gf[1], "\n");
fprintf(gf[2], "\n");
}
#endif
#endif
#ifdef DEBUG_FILE_OUTPUT
#ifdef COUNT_CALC_NUM
fprintf(gfp1, "%d\nend\n", real_cal_num);
#endif
#endif
free(eh); free(qp); free(qmem);
if (_qle) *_qle = max_j + 1;
if (_tle) *_tle = max_i + 1;
if (_gtle) *_gtle = max_ie + 1;
if (_gscore) *_gscore = gscore;
if (_max_off) *_max_off = max_off;
return max;
}

View File

@ -0,0 +1,852 @@
/*********************************************************************************************
Description: sw extend functions in bwa-mem (implemented in avx2, int16_t stands for score scope)
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2024/04/08
***********************************************************************************************/
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include <immintrin.h>
#include <emmintrin.h>
#include "utils.h"
#include "byte_alloc.h"
#include "debug.h"
//#define DEBUG_SW_EXTEND
#define KSW_EQUAL
#define ARR_FOR_EQUAL_CHECK 1
#define ELIMINATE_DIFF_3 0
#define PRUNING 1
#define NO_VAL -1
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x),1)
#define UNLIKELY(x) __builtin_expect((x),0)
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif
#define SIMD_WIDTH 16
static int extend_scalar_inner(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del,
int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off);
static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
{0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}
};
#define permute_mask _MM_SHUFFLE(0, 1, 2, 3)
// 初始化变量
#define SIMD_INIT \
int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \
__m256i zero_vec; \
__m256i max_vec; \
__m256i oe_del_vec; \
__m256i oe_ins_vec; \
__m256i e_del_vec; \
__m256i e_ins_vec; \
__m256i h_vec_mask[SIMD_WIDTH]; \
zero_vec = _mm256_setzero_si256(); \
oe_del_vec = _mm256_set1_epi16(-oe_del); \
oe_ins_vec = _mm256_set1_epi16(-oe_ins); \
e_del_vec = _mm256_set1_epi16(-e_del); \
e_ins_vec = _mm256_set1_epi16(-e_ins); \
__m256i match_sc_vec = _mm256_set1_epi16(a); \
__m256i mis_sc_vec = _mm256_set1_epi16(-b); \
__m256i amb_sc_vec = _mm256_set1_epi16(-1); \
__m256i amb_vec = _mm256_set1_epi16(4); \
for (i=0; i<SIMD_WIDTH; ++i) h_vec_mask[i] = _mm256_loadu_si256((__m256i*) (&h_vec_int_mask[i]));
/*
* e ref
* f seq
* m
* h
*/
// load向量化数据
#define SIMD_LOAD \
__m256i m1 = _mm256_loadu_si256((__m256i*) (&mA1[j])); \
__m256i e1 = _mm256_loadu_si256((__m256i*) (&eA1[j])); \
__m256i m1j1 = _mm256_loadu_si256((__m256i*) (&mA1[j-1])); \
__m256i f1j1 = _mm256_loadu_si256((__m256i*) (&fA1[j-1])); \
__m256i h0j1 = _mm256_loadu_si256((__m256i*) (&hA0[j-1])); \
__m256i qs_vec = _mm256_loadu_si256((__m256i*) (&seq[j-1])); \
__m256i ts_vec = _mm256_loadu_si256((__m256i*) (&ref[tlen-i]));
// 比对ref和seq的序列计算罚分
#define SIMD_CMP_SEQ \
__m256i match_mask_vec = _mm256_cmpeq_epi16(qs_vec, ts_vec); \
__m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); \
__m256i score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec); \
score_vec = _mm256_or_si256(score_vec, mis_score_vec); \
__m256i q_amb_mask_vec = _mm256_cmpeq_epi16(qs_vec, amb_vec); \
__m256i t_amb_mask_vec = _mm256_cmpeq_epi16(ts_vec, amb_vec); \
__m256i amb_mask_vec = _mm256_or_si256(q_amb_mask_vec, t_amb_mask_vec); \
score_vec = _mm256_andnot_si256(amb_mask_vec, score_vec); \
__m256i amb_score_vec = _mm256_and_si256(amb_mask_vec, amb_sc_vec); \
score_vec = _mm256_or_si256(score_vec, amb_score_vec);
// 向量化计算h, e, f, m
#define SIMD_COMPUTE \
__m256i en_vec0 = _mm256_add_epi16(m1, oe_del_vec); \
__m256i en_vec1 = _mm256_add_epi16(e1, e_del_vec); \
__m256i en_vec = _mm256_max_epi16(en_vec0, en_vec1); \
__m256i fn_vec0 = _mm256_add_epi16(m1j1, oe_ins_vec); \
__m256i fn_vec1 = _mm256_add_epi16(f1j1, e_ins_vec); \
__m256i fn_vec = _mm256_max_epi16(fn_vec0, fn_vec1); \
__m256i mn_vec0 = _mm256_add_epi16(h0j1, score_vec); \
__m256i mn_mask = _mm256_cmpgt_epi16(h0j1, zero_vec); \
__m256i mn_vec = _mm256_and_si256(mn_vec0, mn_mask); \
__m256i hn_vec0 = _mm256_max_epi16(en_vec, fn_vec); \
__m256i hn_vec = _mm256_max_epi16(hn_vec0, mn_vec); \
en_vec = _mm256_max_epi16(en_vec, zero_vec); \
fn_vec = _mm256_max_epi16(fn_vec, zero_vec); \
mn_vec = _mm256_max_epi16(mn_vec, zero_vec); \
hn_vec = _mm256_max_epi16(hn_vec, zero_vec);
// 存储向量化结果
#define SIMD_STORE \
max_vec = _mm256_max_epi16(max_vec, hn_vec); \
_mm256_storeu_si256((__m256i*)&eA2[j], en_vec); \
_mm256_storeu_si256((__m256i*)&fA2[j], fn_vec); \
_mm256_storeu_si256((__m256i*)&mA2[j], mn_vec); \
_mm256_storeu_si256((__m256i*)&hA2[j], hn_vec);
// 去除多余的部分
#define SIMD_REMOVE_EXTRA \
en_vec = _mm256_and_si256(en_vec, h_vec_mask[end-j]); \
fn_vec = _mm256_and_si256(fn_vec, h_vec_mask[end-j]); \
mn_vec = _mm256_and_si256(mn_vec, h_vec_mask[end-j]); \
hn_vec = _mm256_and_si256(hn_vec, h_vec_mask[end-j]);
// 找最大值和位置
#define SIMD_FIND_MAX \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \
int16_t *maxVal = (int16_t*)&max_vec; \
m = MAX(m, maxVal[0]); /*用来解决与BSW结果不一样的第二种情况(上边界)*/ \
if (maxVal[0] > 0 && m >= max) { \
for(j=beg, i=iend; j<=end; j+=SIMD_WIDTH, i-=SIMD_WIDTH) { \
__m256i h2_vec = _mm256_loadu_si256((__m256i*) (&hA2[j])); \
__m256i vcmp = _mm256_cmpeq_epi16(h2_vec, max_vec); \
uint32_t mask = _mm256_movemask_epi8(vcmp); \
if (mask > 0) { \
int pos = SIMD_WIDTH - 1 - (( __builtin_clz(mask)) >> 1); \
mj = j - 1 + pos; \
mi = i - 1 - pos; \
/*if (m >= max) fprintf(stderr, "%d %d %d %d %d %d %d\n", iend, beg, mi, mj, mask, pos, m);*/ \
} \
} \
}
// 每轮迭代后,交换数组
#define SWAP_DATA_POINTER \
int16_t * tmp=hA0; \
hA0 = hA1; hA1 = hA2; hA2 = tmp; \
tmp = eA1; eA1 = eA2; eA2 = tmp; \
tmp = fA1; fA1 = fA2; fA2 = tmp; \
tmp = mA1; mA1 = mA2; mA2 = tmp;
static void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int h0, int fnum)
{
#ifdef DEBUG_FILE_OUTPUT
// 写到三个文件里query.fatarget.fa每行一个序列info.txt包含前缀得分h0和长度信息qlentlen
FILE *query_f = gfq[fnum],
*target_f = gft[fnum],
*info_f = gfi[fnum];
const char seq_map[5] = {'A', 'C', 'G', 'T', 'N'};
int i;
// 处理query
for (i = 0; i < qlen; ++i)
fprintf(query_f, "%c", seq_map[query[i]]);
fprintf(query_f, "\n");
// 处理target
for (i = 0; i < tlen; ++i)
fprintf(target_f, "%c", seq_map[target[i]]);
fprintf(target_f, "\n");
// 处理其他信息
fprintf(info_f, "%-8d%-8d%-8d\n", qlen, tlen, h0);
#endif
}
int extend_avx2_i16_sp(byte_mem_t *bmem,
int qlen, // query length 待匹配段碱基的query长度
const uint8_t *query, // read碱基序列
int tlen, // target length reference的长度
const uint8_t *target, // reference序列
int m, // 碱基种类 (5)
const int8_t *mat, // 每个位置的query和target的匹配得分 m*m
int o_del, // deletion 错配开始的惩罚系数
int e_del, // deletion extension的惩罚系数
int o_ins, // insertion 错配开始的惩罚系数
int e_ins, // insertion extension的惩罚系数SIMD_BTYES
int w, // 提前剪枝系数w =100 匹配位置和beg的最大距离
int end_bonus,
int zdrop,
int h0, // 该seed的初始得分完全匹配query的碱基数
int *_qle, // 匹配得到全局最大得分的碱基在query的位置
int *_tle, // 匹配得到全局最大得分的碱基在reference的位置
int *_gtle, // query全部匹配上的target的长度
int *_gscore, // query的端到端匹配得分
int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值
{
#if 0
int scalar_score = extend_scalar_inner(qlen, query, tlen, target, m, mat, o_del, e_del, o_ins, e_ins, w, end_bonus, zdrop, h0, _qle, _tle, _gtle, _gscore, _max_off);
return scalar_score;
#endif
int16_t *mA, *hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H其他的保存上个H E F M
int16_t *seq, *ref;
uint8_t *mem;
int16_t *qtmem, *vmem;
int is_left = 0; // 是不是向左扩展
int a = 1; // 碱基match时的分数
int b = 4; // 碱基mismatch时的惩罚分数正数
int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH;
int i, ibeg, D, j, k, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
int Dloop = tlen + qlen; // 循环跳出条件
int span, beg1, end1; // 边界条件计算
int col_size = qlen + 2 + SIMD_WIDTH;
int val_mem_size = (col_size * 9 * 2 + 31) >> 5 << 5; // 32字节的整数倍
int mem_size = (seq_size + ref_size) * 2 + val_mem_size;
SIMD_INIT; // 初始化simd用的数据
assert(h0 > 0);
// allocate memory
//mem = malloc(mem_size);
mem = byte_mem_request(bmem, mem_size);
qtmem = (int16_t *)&mem[0];
seq=&qtmem[0]; ref=&qtmem[seq_size];
if (is_left) {
for (i=0; i<qlen; ++i) seq[i] = query[qlen - 1 - i];
//for (i=0; i<tlen; ++i) ref[i+SIMD_WIDTH] = target[tlen - 1 - i];
for (i=0; i<tlen; ++i) ref[i] = target[i];
} else {
for (i=0; i<qlen; ++i) seq[i] = query[i];
//for (i=0; i<tlen; ++i) ref[i+SIMD_WIDTH] = target[i];
for (i=0; i<tlen; ++i) ref[i] = target[tlen - 1 - i];
}
vmem = &ref[ref_size];
for (i=0; i<(val_mem_size>>1); i+=SIMD_WIDTH) {
_mm256_storeu_si256((__m256i*)&vmem[i], zero_vec);
}
hA = &vmem[0];
mA = &vmem[col_size * 3];
eA = &vmem[col_size * 5];
fA = &vmem[col_size * 7];
hA0 = &hA[0]; hA1 = &hA[col_size]; hA2 = &hA1[col_size];
mA1 = &mA[0]; mA2 = &mA[col_size];
eA1 = &eA[0]; eA2 = &eA[col_size];
fA1 = &fA[0]; fA2 = &fA[col_size];
// adjust $w if it is too large
k = m * m;
// get the max score
for (i = 0, max = 0; i < k; ++i) max = max > mat[i]? max : mat[i];
max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.);
max_ins = max_ins > 1? max_ins : 1;
w = w < max_ins? w : max_ins;
max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.);
max_del = max_del > 1? max_del : 1;
w = w < max_del? w : max_del; // TODO: is this necessary?
if (tlen < qlen) w = MIN(tlen - 1, w);
// DP loop
max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;;
max_off = 0;
beg = 1; end = qlen;
// init h0
hA0[0] = h0; // 左上角
if (qlen == 0 || tlen == 0) Dloop = 0; // 防止意外情况
if (w >= qlen) { max_ie = 0; gscore = 0; }
int m_last = 0;
int iend;
#if PRUNING
#ifdef KSW_EQUAL
int midx = 1, icheck = 0, checkspecial = 1;
#if ARR_FOR_EQUAL_CHECK
int marr[10] = {0};
// int marr[b]; memset(marr, 0, 4 * b);
#else
int m3 = 0, m2 = 0, m1 = 0;
#endif
#endif
#endif
//int print_flag = 0; //(qlen == 64 && tlen == 123);
#ifdef DEBUG_SW_EXTEND
int dii, djj;
int16_t ins[tlen + 1][qlen + 2];
int16_t del[tlen + 1][qlen + 2];
int16_t score[tlen + 1][qlen + 2];
for (dii = 0; dii <= tlen; ++dii)
{
for (djj = 0; djj <= qlen; ++djj)
{
ins[dii][djj] = del[dii][djj] = score[dii][djj] = NO_VAL;
}
}
for (dii = 1; dii <= tlen; ++dii)
{
del[dii][0] = MAX(0, h0 - o_del - e_del * dii);
score[dii][0] = del[dii][0];
}
for (djj = 1; djj <= qlen; ++djj)
{
ins[0][djj] = MAX(0, h0 - o_ins - e_ins * djj);
score[0][djj] = ins[0][djj];
}
ins[0][0] = del[0][0] = score[0][0] = h0;
#endif
for (D = 1; LIKELY(D < Dloop); ++D) {
// 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况
if (D > tlen) {
span = MIN(Dloop-D, w);
beg1 = MAX(D-tlen+1, ((D-w) / 2) + 1);
} else {
span = MIN(D-1, w);
beg1 = MAX(1, ((D-w) / 2) + 1);
}
end1 = MIN(qlen, beg1+span);
if (beg < beg1) beg = beg1;
if (end > end1) end = end1;
if (beg > end) break; // 不用计算了直接跳出否则hA2没有被赋值里边是上一轮hA0的值会出bug
iend = D - (beg - 1); // ref开始计算的位置倒序
span = end - beg;
ibeg = iend - span - 1; // 0开始的ref索引位置
// 每一轮需要记录的数据
int m = 0, mj = -1, mi = -1;
max_vec = zero_vec;
//if (print_flag)
//{
//fprintf(stderr, "D: %d, iend: %d, jbeg: %d\n", D, iend, beg);
//}
// 要处理边界
// 左边界 处理f (insert)
if (ibeg == 0) { hA1[end] = MAX(0, h0 - (o_ins + e_ins * end)); m = hA1[end];}
// 上边界
if (beg == 1) { hA1[0] = MAX(0, h0 - (o_del + e_del * iend)); }
else if (D & 1) {
hA1[beg - 1] = 0;
hA2[beg - 1] = 0;
}
for (j=beg, i=iend; j<=end+1-SIMD_WIDTH; j+=SIMD_WIDTH, i-=SIMD_WIDTH) {
//__builtin_prefetch(&ref[tlen - i], 0, 2);
// 取数据
SIMD_LOAD;
// 比对seq计算罚分
SIMD_CMP_SEQ;
// 计算
SIMD_COMPUTE;
// 存储结果
SIMD_STORE;
}
// 剩下的计算单元
if (j <= end) {
//__builtin_prefetch(&ref[tlen - i], 0, 2);
// 取数据
SIMD_LOAD;
// 比对seq计算罚分
SIMD_CMP_SEQ;
// 计算
SIMD_COMPUTE;
// 去除多余计算的部分
SIMD_REMOVE_EXTRA;
// 存储结果
SIMD_STORE;
}
SIMD_FIND_MAX;
#if PRUNING
#ifdef KSW_EQUAL
// 用来解决与BSW结果不一样的第一种情况(左边界)
#if ARR_FOR_EQUAL_CHECK
if (hA1[0] < b && checkspecial) {
int mii;
if (hA1[0] == b - 1) {
icheck = iend + 1;
}
for (mii = 0; mii < b - 1; ++mii) {
if (midx - mii > 0)
marr[mii] = MAX(marr[mii], hA2[midx - mii]);
}
midx += 1;
if (ibeg > icheck)
{
int stopCalc = 0;
for (mii = 0; mii < b - 1; ++mii)
{
stopCalc |= !marr[mii];
}
if (stopCalc)
break;
else
checkspecial = 0;
}
}
#else
if (hA1[0] < 4 && checkspecial) { // b == 4
if (hA1[0] == 3) {
icheck = iend + 1;
} else if (midx == 2) {
m2 = MAX(m2, hA2[midx - 1]);
} else {
m2 = MAX(m2, hA2[midx - 1]);
m1 = MAX(m1, hA2[midx - 2]);
}
m3 = MAX(m3, hA2[midx]);
midx += 1;
if (ibeg > icheck)
{
if (!m1 || !m2 || !m3)
break;
else
checkspecial = 0;
}
//if (print_flag) {
//fprintf(stderr, "jbeg: %d, ibeg: %d, iend: %d, icheck: %d, score: %d %d %d, j: %d\n", beg, ibeg, iend, icheck, hA2[midx + 1], hA2[midx + 2], hA2[midx + 3], midx);
//if (midx > 2) fprintf(stderr, "%d, %d, %d\n", hA2[midx-1], hA2[midx-2], hA2[midx-3]);
//fprintf(stderr, "jbeg: %d, ibeg: %d, iend: %d, icheck: %d, hA1: %d, score: %d %d %d, j: %d\n", beg, ibeg, iend, icheck, hA1[0], m1, m2, m3, midx);
//}
}
#endif
#endif
#endif
#ifdef DEBUG_SW_EXTEND
for (djj = beg; djj <= end; ++djj)
{
dii = D - djj + 1;
ins[dii][djj] = fA2[djj];
del[dii][djj] = eA2[djj];
score[dii][djj] = hA2[djj];
}
//if (print_flag)
//{
//fprintf(stderr, "score: %d %d %d\n", hA2[beg], hA2[beg+1], hA2[beg+2]);
//}
#endif
// 注意最后跳出循环j的值
j = end + 1;
if (j == qlen + 1) {
max_ie = gscore > hA2[qlen] ? max_ie : ibeg;
gscore = gscore > hA2[qlen] ? gscore : hA2[qlen];
}
#if PRUNING
if (m == 0 && m_last==0) break; // 一定要注意,斜对角遍历和按列遍历的不同点
#endif
if (m > max) {
max = m, max_i = mi, max_j = mj;
max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi);
} else if (m == max && max_i >= mi && mj > max_j) {
max_i = mi, max_j = mj;
max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi);
}
else if (zdrop > 0 && mi > -1) {
#if PRUNING
if (mi - max_i > mj - max_j) {
if (max - m - ((mi - max_i) - (mj - max_j)) * e_del > zdrop) break;
} else {
if (max - m - ((mj - max_j) - (mi - max_i)) * e_ins > zdrop) break;
}
#endif
}
// 调整计算的边界
#if PRUNING
for (j = beg; LIKELY(j <= end); ++j) { int has_val = hA1[j-1] | hA2[j]; if (has_val) break; }
beg = j;
for (j = end+1; LIKELY(j >= beg); --j) { int has_val = hA1[j-1] | hA2[j]; if (has_val) break; else hA0[j-1]=0; }
end = j + 1 <= qlen? j + 1 : qlen;
#else
beg = 1; end = qlen;
#endif
m_last = m;
// swap m, h, e, f
SWAP_DATA_POINTER;
}
#ifdef DEBUG_FILE_OUTPUT
#ifdef DEBUG_SW_EXTEND
fprintf(gf[0], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max);
fprintf(gf[1], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max);
fprintf(gf[2], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max);
fprintf(gf[0], "%-4d", -1);
fprintf(gf[1], "%-4d", -1);
fprintf(gf[2], "%-4d", -1);
fprintf(gf[0], "%-4d", -1);
fprintf(gf[1], "%-4d", -1);
fprintf(gf[2], "%-4d", -1);
for (djj = 0; djj < qlen; ++djj) {
fprintf(gf[0], "%-4c", "ACGTN"[query[djj]]);
fprintf(gf[1], "%-4c", "ACGTN"[query[djj]]);
fprintf(gf[2], "%-4c", "ACGTN"[query[djj]]);
}
fprintf(gf[0], "\n");
fprintf(gf[1], "\n");
fprintf(gf[2], "\n");
for (dii = 0; dii <= tlen; ++dii)
{
if (dii > 0) {
fprintf(gf[0], "%-4c", "ACGTN"[target[dii - 1]]);
fprintf(gf[1], "%-4c", "ACGTN"[target[dii - 1]]);
fprintf(gf[2], "%-4c", "ACGTN"[target[dii - 1]]);
} else {
fprintf(gf[0], "%-4d", -1);
fprintf(gf[1], "%-4d", -1);
fprintf(gf[2], "%-4d", -1);
}
for (djj = 0; djj <= qlen; ++djj)
{
fprintf(gf[0], "%-4d", score[dii][djj]);
fprintf(gf[1], "%-4d", ins[dii][djj]);
fprintf(gf[2], "%-4d", del[dii][djj]);
}
fprintf(gf[0], "\n");
fprintf(gf[1], "\n");
fprintf(gf[2], "\n");
}
#endif
#endif
#if 0
if (max != scalar_score) {
if (qlen <= 30) {
write_query_target_sequence(qlen, query, tlen, target, h0, 0);
} else if (qlen < 60) {
write_query_target_sequence(qlen, query, tlen, target, h0, 1);
} else if (qlen < 90) {
write_query_target_sequence(qlen, query, tlen, target, h0, 2);
} else {
write_query_target_sequence(qlen, query, tlen, target, h0, 3);
}
}
#endif
// free(mem);
byte_mem_clear(bmem);
if (_qle) *_qle = max_j + 1;
if (_tle) *_tle = max_i + 1;
if (_gtle) *_gtle = max_ie + 1;
if (_gscore) *_gscore = gscore;
if (_max_off) *_max_off = max_off;
return max;
}
typedef struct {
int32_t h, e;
} eh_t;
static int extend_scalar_inner(int qlen, // query length 待匹配段碱基的query长度
const uint8_t *query, // read碱基序列
int tlen, // target length reference的长度
const uint8_t *target, // reference序列
int m, // 碱基种类 (5)
const int8_t *mat, // 每个位置的query和target的匹配得分 m*m
int o_del, // deletion 错配开始的惩罚系数
int e_del, // deletion extension的惩罚系数
int o_ins, // insertion 错配开始的惩罚系数
int e_ins, // insertion extension的惩罚系数
int w, // 提前剪枝系数w =100 匹配位置和beg的最大距离
int end_bonus,
int zdrop,
int h0, // 该seed的初始得分完全匹配query的碱基数
int *_qle, // 匹配得到全局最大得分的碱基在query的位置
int *_tle, // 匹配得到全局最大得分的碱基在reference的位置
int *_gtle, // query全部匹配上的target的长度
int *_gscore, // query的端到端匹配得分
int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值
{
eh_t *eh; // score array
int8_t *qp; // query profile
int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
uint8_t *qmem, *ref, *seq;
assert(h0 > 0);
// allocate memory
qp = malloc(qlen * m);
eh = calloc(qlen + 1, 8);
qmem = malloc(qlen + tlen);
seq=(uint8_t*)&qmem[0]; ref=(uint8_t*)&qmem[qlen];
int is_left = 0;
if (is_left) {
for (i=0; i<qlen; ++i) seq[i] = query[qlen - 1 - i];
for (i=0; i<tlen; ++i) ref[i] = target[tlen - 1 - i];
} else {
for (i=0; i<qlen; ++i) seq[i] = query[i];
for (i=0; i<tlen; ++i) ref[i] = target[i];
}
// generate the query profile
for (k = i = 0; k < m; ++k) {
const int8_t *p = &mat[k * m];
for (j = 0; j < qlen; ++j) qp[i++] = p[seq[j]];
}
// fill the first row
eh[0].h = h0; eh[1].h = h0 > oe_ins? h0 - oe_ins : 0;
for (j = 2; j <= qlen && eh[j-1].h > e_ins; ++j)
eh[j].h = eh[j-1].h - e_ins;
// adjust $w if it is too large
k = m * m;
for (i = 0, max = 0; i < k; ++i) // get the max score
max = max > mat[i]? max : mat[i];
max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.);
max_ins = max_ins > 1? max_ins : 1;
w = w < max_ins? w : max_ins;
max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.);
max_del = max_del > 1? max_del : 1;
w = w < max_del? w : max_del; // TODO: is this necessary?
//fprintf(stderr, "%d\n", w);
// DP loop
max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;
max_off = 0;
beg = 0, end = qlen;
//int print_flag = 0; //(qlen == 116 && tlen == 241);
//fprintf(stderr, "%d %d %d\n", print_flag, qlen, tlen);
#ifdef DEBUG_SW_EXTEND
int dii, djj;
int16_t ins[tlen + 1][qlen + 2];
int16_t del[tlen + 1][qlen + 2];
int16_t score[tlen + 1][qlen + 2];
for (dii = 0; dii <= tlen; ++dii)
{
for (djj = 0; djj <= qlen; ++djj)
{
ins[dii][djj] = del[dii][djj] = score[dii][djj] = NO_VAL;
}
}
for (dii = 1; dii <= tlen; ++dii)
{
del[dii][0] = MAX(0, h0 - o_del - e_del * dii);
score[dii][0] = del[dii][0];
}
for (djj = 1; djj <= qlen; ++djj)
{
ins[0][djj] = MAX(0, h0 - o_ins - e_ins * djj);
score[0][djj] = ins[0][djj];
}
ins[0][0] = del[0][0] = score[0][0] = h0;
#endif
#ifdef DEBUG_FILE_OUTPUT
#ifdef COUNT_CALC_NUM
int bsw_cal_num = 0;
int real_cal_num = 0;
for (i = 0; i < tlen; ++i)
{
int beg = MAX(0, i - w);
int end = MIN(qlen, i + w + 1);
if (beg >= end) break;
bsw_cal_num += end - beg;
}
fprintf(gfp1, "start\n%d\n", bsw_cal_num);
#endif
#endif
#if ELIMINATE_DIFF_3
int prun_end = qlen; // for test diff_3
#endif
for (i = 0; LIKELY(i < tlen); ++i) {
int t, f = 0, h1, m = 0, mj = -1;
int8_t *q = &qp[ref[i] * qlen];
// apply the band and the constraint (if provided)
if (beg < i - w) beg = i - w;
if (end > i + w + 1) end = i + w + 1;
if (end > qlen) end = qlen; // 没用
// compute the first column
if (beg == 0) {
h1 = h0 - (o_del + e_del * (i + 1));
if (h1 < 0) h1 = 0;
} else h1 = 0;
//m = h1; // 用来解决和VP-BSW结果不一样的第一种情况(左边界)
for (j = beg; LIKELY(j < end); ++j) {
#ifdef DEBUG_FILE_OUTPUT
#ifdef COUNT_CALC_NUM
real_cal_num++;
#endif
#endif
#ifdef DEBUG_SW_EXTEND
ins[i+1][j+1] = f;
#endif
// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
// Similar to SSE2-SW, cells are computed in the following order:
// H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
// E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape
// F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape
eh_t *p = &eh[j];
int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j)
p->h = h1; // set H(i,j-1) for the next row
M = M? M + q[j] : 0;// separating H and M to disallow a cigar like "100M3I3D20M",保证分值不小于0sw和nw的区别
h = M > e? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0
h = h > f? h : f;
#if ELIMINATE_DIFF_3
if (j >= prun_end && h==0) break; // for test diff_3
#endif
h1 = h; // save H(i,j) to h1 for the next column
#ifdef DEBUG_SW_EXTEND
score[i+1][j+1] = h;
#endif
mj = m > h? mj : j; // record the position where max score is achieved
m = m > h? m : h; // m is stored at eh[mj+1]
t = M - oe_del;
t = t > 0? t : 0;
e -= e_del;
#ifdef DEBUG_SW_EXTEND
del[i + 1][j + 1] = e;
#endif
e = e > t? e : t; // computed E(i+1,j)
#ifdef DEBUG_SW_EXTEND
// del[i+1][j+1] = e;
#endif
p->e = e; // save E(i+1,j) for the next row
t = M - oe_ins;
t = t > 0? t : 0;
f -= e_ins;
f = f > t? f : t; // computed F(i,j+1)
}
eh[end].h = h1; eh[end].e = 0;
if (j == qlen) {
max_ie = gscore > h1? max_ie : i;
gscore = gscore > h1? gscore : h1;
}
#if PRUNING
if (m == 0) break;
#endif
if (m > max) {
max = m, max_i = i, max_j = mj;
max_off = max_off > abs(mj - i)? max_off : abs(mj - i);
//fprintf(stderr, "%d %d %d %d\n", i, mj, max_off, m);
} else if (zdrop > 0) {
#if PRUNING
if (i - max_i > mj - max_j) {
if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) break;
} else {
if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) break;
}
#endif
}
#if PRUNING
// update beg and end for the next round
for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j); // 这里为什么不考虑finsert score
beg = j;
for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j);
#if ELIMINATE_DIFF_3
prun_end = j + 2 < qlen ? j + 2 : qlen; end = qlen; // for test diff_3
#else
end = j + 2 < qlen? j + 2 : qlen;
#endif
#else
beg = 0; end = qlen; // uncomment this line for debugging
#endif
// if (print_flag) {
// fprintf(stderr, "beg: %d; end: %d\n", beg, end);
// }
}
#ifdef DEBUG_FILE_OUTPUT
#ifdef DEBUG_SW_EXTEND
fprintf(gf[0], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max);
fprintf(gf[1], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max);
fprintf(gf[2], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max);
fprintf(gf[0], "%-4d", -1);
fprintf(gf[1], "%-4d", -1);
fprintf(gf[2], "%-4d", -1);
fprintf(gf[0], "%-4d", -1);
fprintf(gf[1], "%-4d", -1);
fprintf(gf[2], "%-4d", -1);
for (djj = 0; djj < qlen; ++djj)
{
fprintf(gf[0], "%-4c", "ACGTN"[query[djj]]);
fprintf(gf[1], "%-4c", "ACGTN"[query[djj]]);
fprintf(gf[2], "%-4c", "ACGTN"[query[djj]]);
}
fprintf(gf[0], "\n");
fprintf(gf[1], "\n");
fprintf(gf[2], "\n");
for (dii = 0; dii <= tlen; ++dii)
{
if (dii > 0)
{
fprintf(gf[0], "%-4c", "ACGTN"[target[dii - 1]]);
fprintf(gf[1], "%-4c", "ACGTN"[target[dii - 1]]);
fprintf(gf[2], "%-4c", "ACGTN"[target[dii - 1]]);
}
else
{
fprintf(gf[0], "%-4d", -1);
fprintf(gf[1], "%-4d", -1);
fprintf(gf[2], "%-4d", -1);
}
for (djj = 0; djj <= qlen; ++djj)
{
fprintf(gf[0], "%-4d", score[dii][djj]);
fprintf(gf[1], "%-4d", ins[dii][djj]);
fprintf(gf[2], "%-4d", del[dii][djj]);
}
fprintf(gf[0], "\n");
fprintf(gf[1], "\n");
fprintf(gf[2], "\n");
}
#endif
#endif
#ifdef DEBUG_FILE_OUTPUT
#ifdef COUNT_CALC_NUM
fprintf(gfp1, "%d\nend\n", real_cal_num);
#endif
#endif
free(eh); free(qp); free(qmem);
if (_qle) *_qle = max_j + 1;
if (_tle) *_tle = max_i + 1;
if (_gtle) *_gtle = max_ie + 1;
if (_gscore) *_gscore = gscore;
if (_max_off) *_max_off = max_off;
return max;
}

View File

@ -1,3 +1,12 @@
/*********************************************************************************************
Description: sw extend functions in bwa-mem (implemented in avx2, uint8_t stands for score scope)
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2024/04/08
***********************************************************************************************/
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
@ -5,8 +14,8 @@
#include <stdio.h>
#include <immintrin.h>
#include <emmintrin.h>
#include "thread_mem.h"
#include "common.h"
#include "byte_alloc.h"
#include "utils.h"
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x), 1)
@ -17,19 +26,13 @@
#endif
#define KSW_EQUAL
#define ARR_FOR_EQUAL_CHECK 1
#define PRUNING 1
#define REVERSE_TARGET 0
#undef MAX
#undef MIN
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define SIMD_WIDTH 32
typedef struct
{
size_t m;
uint8_t *addr;
} buf_t;
static const uint8_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
{0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
@ -65,7 +68,10 @@ static const uint8_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}};
// static const uint8_t reverse_mask[SIMD_WIDTH] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
#if REVERSE_TARGET
static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
#endif
#define permute_mask _MM_SHUFFLE(0, 1, 2, 3)
// const int permute_mask = _MM_SHUFFLE(0, 1, 2, 3);
// 初始化变量
@ -78,7 +84,6 @@ static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
__m256i e_del_vec; \
__m256i e_ins_vec; \
__m256i h_vec_mask[SIMD_WIDTH]; \
__m256i reverse_mask_vec; \
zero_vec = _mm256_setzero_si256(); \
oe_del_vec = _mm256_set1_epi8(oe_del); \
oe_ins_vec = _mm256_set1_epi8(oe_ins); \
@ -88,10 +93,19 @@ static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
__m256i mis_sc_vec = _mm256_set1_epi8(b); \
__m256i amb_sc_vec = _mm256_set1_epi8(1); \
__m256i amb_vec = _mm256_set1_epi8(4); \
reverse_mask_vec = _mm256_loadu_si256((__m256i *)(reverse_mask)); \
for (i = 0; i < SIMD_WIDTH; ++i) \
h_vec_mask[i] = _mm256_loadu_si256((__m256i *)(&h_vec_int_mask[i]));
#if REVERSE_TARGET
#define LOAD_TARGET \
__m256i ts_vec = _mm256_loadu_si256((__m256i *)(&ref[i])); \
ts_vec = _mm256_permute4x64_epi64(ts_vec, permute_mask); \
ts_vec = _mm256_shuffle_epi8(ts_vec, reverse_mask_vec);
#else
#define LOAD_TARGET \
__m256i ts_vec = _mm256_loadu_si256((__m256i *)(&ref[tlen - i]));
#endif
/*
* e ref
* f seq
@ -106,12 +120,10 @@ static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
__m256i f1j1 = _mm256_loadu_si256((__m256i *)(&fA1[j - 1])); \
__m256i h0j1 = _mm256_loadu_si256((__m256i *)(&hA0[j - 1])); \
__m256i qs_vec = _mm256_loadu_si256((__m256i *)(&seq[j - 1])); \
__m256i ts_vec = _mm256_loadu_si256((__m256i *)(&ref[i]));
LOAD_TARGET
// 比对ref和seq的序列计算罚分
#define SIMD_CMP_SEQ \
ts_vec = _mm256_permute4x64_epi64(ts_vec, permute_mask); \
ts_vec = _mm256_shuffle_epi8(ts_vec, reverse_mask_vec); \
__m256i match_mask_vec = _mm256_cmpeq_epi8(qs_vec, ts_vec); \
__m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); \
__m256i match_score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec); \
@ -171,7 +183,7 @@ static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \
m = MAX(m, maxVal[0]); \
if (maxVal[0] > 0) \
if (maxVal[0] > 0 && m >= max) \
{ \
for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \
{ \
@ -203,7 +215,7 @@ static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
mA1 = mA2; \
mA2 = tmp;
int avx2_u8(thread_mem_t *tmem,
int extend_avx2_u8(byte_mem_t *bmem,
int qlen, // query length 待匹配段碱基的query长度
const uint8_t *query, // read碱基序列
int tlen, // target length reference的长度
@ -239,36 +251,35 @@ int avx2_u8(thread_mem_t *tmem,
int mem_size = seq_size + ref_size + val_mem_size;
SIMD_INIT; // 初始化simd用的数据
// fprintf(stdout, "%d\n", h0);
#if REVERSE_TARGET
__m256i reverse_mask_vec;
reverse_mask_vec = _mm256_loadu_si256((__m256i *)(reverse_mask));
#endif
assert(h0 > 0);
// allocate memory
mem = malloc(mem_size);
//if (buf->m < mem_size)
//{
// buf->m = mem_size;
// buf->addr = realloc(buf->addr, mem_size);
//}
//mem = buf->addr;
// mem = malloc(mem_size);
mem = byte_mem_request(bmem, mem_size);
qtmem = &mem[0];
seq = (uint8_t *)&qtmem[0];
ref = (uint8_t *)&qtmem[seq_size];
if (is_left)
{
for (i = 0; i < qlen; ++i)
seq[i] = query[qlen - 1 - i];
for (i = 0; i < tlen; ++i)
ref[i + SIMD_WIDTH] = target[tlen - 1 - i];
}
else
{
for (i = 0; i < qlen; ++i)
seq[i] = query[i];
for (i = 0; i < tlen; ++i)
ref[i + SIMD_WIDTH] = target[i];
for (i = 0; i < qlen; ++i) seq[i] = query[qlen - 1 - i];
#if REVERSE_TARGET
for (i = 0; i < tlen; ++i) ref[i + SIMD_WIDTH] = target[tlen - 1 - i];
#else
for (i=0; i<tlen; ++i) ref[i] = target[i];
#endif
} else {
for (i = 0; i < qlen; ++i) seq[i] = query[i];
#if REVERSE_TARGET
for (i = 0; i < tlen; ++i) ref[i + SIMD_WIDTH] = target[i];
#else
for (i=0; i<tlen; ++i) ref[i] = target[tlen - 1 - i];
#endif
}
vmem = &ref[ref_size];
@ -323,13 +334,20 @@ int avx2_u8(thread_mem_t *tmem,
max_ie = 0;
gscore = 0;
}
int m_last = 0;
int iend;
#if PRUNING
#ifdef KSW_EQUAL
int midx = 1, icheck = 0, checkspecial = 1;
#if ARR_FOR_EQUAL_CHECK
int marr[10] = {0};
// int marr[b]; memset(marr, 0, 4 * b);
#else
int m3 = 0, m2 = 0, m1 = 0;
#endif
#endif
#endif
for (D = 1; LIKELY(D < Dloop); ++D)
{
// 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况
@ -406,7 +424,36 @@ int avx2_u8(thread_mem_t *tmem,
SIMD_FIND_MAX;
#if PRUNING
#ifdef KSW_EQUAL
#if ARR_FOR_EQUAL_CHECK
if (hA1[0] < b && checkspecial)
{
int mi;
if (hA1[0] == b - 1)
{
icheck = iend + 1;
}
for (mi = 0; mi < b - 1; ++mi)
{
if (midx - mi > 0)
marr[mi] = MAX(marr[mi], hA2[midx - mi]);
}
midx += 1;
if (ibeg > icheck)
{
int stopCalc = 0;
for (mi = 0; mi < b - 1; ++mi)
{
stopCalc |= !marr[mi];
}
if (stopCalc)
break;
else
checkspecial = 0;
}
}
#else
if (hA1[0] < 4 && checkspecial)
{ // b == 4
if (hA1[0] == 3)
@ -433,7 +480,8 @@ int avx2_u8(thread_mem_t *tmem,
}
}
#endif
#endif
#endif
// 注意最后跳出循环j的值
j = end + 1;
@ -442,8 +490,9 @@ int avx2_u8(thread_mem_t *tmem,
max_ie = gscore > hA2[qlen] ? max_ie : ibeg;
gscore = gscore > hA2[qlen] ? gscore : hA2[qlen];
}
//if (m == 0 && m_last == 0)
// break; // 一定要注意,斜对角遍历和按列遍历的不同点
#if PRUNING
if (m == 0 && m_last == 0) break; // 一定要注意,斜对角遍历和按列遍历的不同点
#endif
if (m > max)
{
max = m, max_i = mi, max_j = mj;
@ -454,9 +503,9 @@ int avx2_u8(thread_mem_t *tmem,
max_i = mi, max_j = mj;
max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi);
}
else if (zdrop > 0)
else if (zdrop > 0 && mi > -1)
{
#if 0
#if PRUNING
if (mi - max_i > mj - max_j)
{
if (max - m - ((mi - max_i) - (mj - max_j)) * e_del > zdrop)
@ -471,39 +520,34 @@ int avx2_u8(thread_mem_t *tmem,
}
// 调整计算的边界
// for (j = beg; LIKELY(j <= end); ++j)
// {
// int has_val = hA1[j - 1] | hA2[j];
// if (has_val)
// break;
// }
// beg = j;
// for (j = end + 1; LIKELY(j >= beg); --j)
// {
// int has_val = hA1[j - 1] | hA2[j];
// if (has_val)
// break;
// else
// hA0[j - 1] = 0;
// }
// end = j + 1 <= qlen ? j + 1 : qlen;
#if PRUNING
for (j = beg; LIKELY(j <= end); ++j)
{
int has_val = hA1[j - 1] | hA2[j];
if (has_val) break;
}
beg = j;
for (j = end + 1; LIKELY(j >= beg); --j)
{
int has_val = hA1[j - 1] | hA2[j];
if (has_val) break;
else hA0[j - 1] = 0;
}
end = j + 1 <= qlen ? j + 1 : qlen;
#else
beg = 1; end = qlen;
#endif
m_last = m;
// swap m, h, e, f
SWAP_DATA_POINTER;
}
free(mem);
if (_qle)
*_qle = max_j + 1;
if (_tle)
*_tle = max_i + 1;
if (_gtle)
*_gtle = max_ie + 1;
if (_gscore)
*_gscore = gscore;
if (_max_off)
*_max_off = max_off;
// free(mem);
byte_mem_clear(bmem);
if (_qle) *_qle = max_j + 1;
if (_tle) *_tle = max_i + 1;
if (_gtle) *_gtle = max_ie + 1;
if (_gscore) *_gscore = gscore;
if (_max_off) *_max_off = max_off;
return max;
}

View File

@ -1,9 +1,17 @@
/*********************************************************************************************
Description: sw extend functions in bwa-mem (implemented in scalar)
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2024/04/08
***********************************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <assert.h>
#include <stdio.h>
#include "thread_mem.h"
#include "common.h"
#include "byte_alloc.h"
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x), 1)
@ -13,12 +21,14 @@
#define UNLIKELY(x) (x)
#endif
#define PRUNING 1
typedef struct
{
int32_t h, e;
} eh_t;
int normal_pruning(thread_mem_t *tmem, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off)
int extend_scalar(byte_mem_t *bmem, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off)
{
// return h0;
eh_t *eh; // score array
@ -27,8 +37,8 @@ int normal_pruning(thread_mem_t *tmem, int qlen, const uint8_t *query, int tlen,
assert(h0 > 0);
// qp = malloc(qlen * m);
// eh = calloc(qlen + 1, 8);
qp = thread_mem_request(tmem, qlen * m);
eh = thread_mem_request_and_clean(tmem, (qlen + 1) * 8);
qp = byte_mem_request(bmem, qlen * m);
eh = byte_mem_request_and_clean(bmem, (qlen + 1) * 8);
// generate the query profile
for (k = i = 0; k < m; ++k)
{
@ -73,11 +83,8 @@ int normal_pruning(thread_mem_t *tmem, int qlen, const uint8_t *query, int tlen,
if (beg == 0)
{
h1 = h0 - (o_del + e_del * (i + 1)); // 只消耗了target序列query从第一个字符开始匹配第i个target字符
if (h1 < 0)
h1 = 0;
}
else
h1 = 0;
if (h1 < 0) h1 = 0;
} else h1 = 0;
for (j = beg; LIKELY(j < end); ++j) // 遍历query字符序列
{
@ -112,8 +119,9 @@ int normal_pruning(thread_mem_t *tmem, int qlen, const uint8_t *query, int tlen,
max_ie = gscore > h1 ? max_ie : i; // max_ie表示取得全局最大分值时target字符串的位置
gscore = gscore > h1 ? gscore : h1;
}
if (m == 0) // 遍历完query之后当前轮次的最大分值为0则跳出循环
break;
#if PRUNING
if (m == 0) break; // 遍历完query之后当前轮次的最大分值为0则跳出循环
#endif
if (m > max) // 当前轮最大分值大于之前的最大分值
{
max = m, max_i = i, max_j = mj; // 更新取得最大值的target和query的位置
@ -121,7 +129,7 @@ int normal_pruning(thread_mem_t *tmem, int qlen, const uint8_t *query, int tlen,
}
else if (zdrop > 0) // 当前轮匹配之后取得的最大分值没有大于之前的最大值而且zdrop值大于0
{
#if 0
#if PRUNING
if (i - max_i > mj - max_j)
{
if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) // 之前最大分值 -从取得最大值的点出发当前的delete总长度对应的分值 + 当前轮取得的最大值) > zdrop
@ -135,17 +143,17 @@ int normal_pruning(thread_mem_t *tmem, int qlen, const uint8_t *query, int tlen,
#endif
}
// update beg and end for the next round
for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j)
;
#if PRUNING
for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j);
beg = j;
for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j)
;
for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j);
end = j + 2 < qlen ? j + 2 : qlen; // 剪枝没考虑f即insert
// beg = 0, end = qlen; // uncomment this line for debugging
#else
beg = 0, end = qlen; // uncomment this line for debugging
#endif
}
// free(eh);
// free(qp);
thread_mem_release(tmem, qlen * m + (qlen + 1) * 8);
// free(eh); free(qp);
byte_mem_clear(bmem);
if (_qle)
*_qle = max_j + 1;
if (_tle)

BIN
get_line

Binary file not shown.

0
global.c 100644
View File

0
global.h 100644
View File

View File

0
global_avx2_u8.c 100644
View File

0
global_scalar.c 100644
View File

90
kvec.h 100644
View File

@ -0,0 +1,90 @@
/* The MIT License
Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/*
An example:
#include "kvec.h"
int main() {
kvec_t(int) array;
kv_init(array);
kv_push(int, array, 10); // append
kv_a(int, array, 20) = 5; // dynamic
kv_A(array, 20) = 4; // static
kv_destroy(array);
return 0;
}
*/
/*
2008-09-22 (0.1.0):
* The initial version.
*/
#ifndef AC_KVEC_H
#define AC_KVEC_H
#include <stdlib.h>
#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#define kvec_t(type) struct { size_t n, m; type *a; }
#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
#define kv_destroy(v) free((v).a)
#define kv_A(v, i) ((v).a[(i)])
#define kv_pop(v) ((v).a[--(v).n])
#define kv_size(v) ((v).n)
#define kv_max(v) ((v).m)
#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m))
#define kv_copy(type, v1, v0) do { \
if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \
(v1).n = (v0).n; \
memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \
} while (0) \
#define kv_push(type, v, x) do { \
if ((v).n == (v).m) { \
(v).m = (v).m? (v).m<<1 : 2; \
(v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
} \
(v).a[(v).n++] = (x); \
} while (0)
#define kv_pushp(type, v) ((((v).n == (v).m)? \
((v).m = ((v).m? (v).m<<1 : 2), \
(v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
: 0), &(v).a[(v).n++])
#define kv_a(type, v, i) (((v).m <= (size_t)(i)? \
((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
(v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
: (v).n <= (size_t)(i)? (v).n = (i) + 1 \
: 0), (v).a[(i)])
#endif

279
main.c
View File

@ -12,285 +12,22 @@
#include <stdint.h>
#include <assert.h>
#include <time.h>
#include "sys/time.h"
#include "thread_mem.h"
#include "bsw.h"
#include <sys/time.h>
#include "byte_alloc.h"
#include "utils.h"
#include "common.h"
#define BLOCK_BUF_SIZE 1048576
#define READ_BUF_SIZE 2048
#define SEQ_BUF_SIZE (BLOCK_BUF_SIZE + READ_BUF_SIZE)
#define INIT_ALLOC_SIZE 4096
#define DIVIDE_BY (CLOCKS_PER_SEC * 1.0)
#ifdef SHOW_PERF
// 用来调试,计算感兴趣部分的运行时间
// 获取当前毫秒数
int64_t get_mseconds()
{
// struct timeval tv;
// gettimeofday(&tv, NULL);
// return (int64_t)1000 * (tv.tv_sec + ((1e-6) * tv.tv_usec));
return clock();
}
int64_t time_sw[KERNEL_NUM] = {0};
#endif
#include "extend.h"
#include "profiling.h"
#define _PERFORMANCE_TEST(kernel_num, func) \
cur_query_pos = 0; \
cur_target_pos = 0; \
for (i = 0; i < block_line_num; ++i) \
{ \
score[kernel_num] = func( \
&tmem[kernel_num], \
info_arr[i][0], \
(uint8_t *)query_arr + cur_query_pos, \
info_arr[i][1], \
(uint8_t *)target_arr + cur_target_pos, \
5, mat, 6, 1, 6, 1, 100, 5, 100, \
info_arr[i][2], \
&qle, &tle, &gtle, &gscore, &max_off[0]); \
score_total[kernel_num] += score[kernel_num]; \
cur_query_pos += info_arr[i][0]; \
cur_target_pos += info_arr[i][1]; \
}
#ifdef SHOW_PERF
#define PERFORMANCE_TEST(kernel_num, func) \
start_time = get_mseconds(); \
_PERFORMANCE_TEST(kernel_num, func); \
time_sw[kernel_num] += get_mseconds() - start_time
#else
#define PERFORMANCE_TEST(kernel_num, func) _PERFORMANCE_TEST(kernel_num, func)
#endif
// 读取一行序列数据
int read_seq_line(char *read_buf, FILE *f_ptr, char *out_arr)
{
if (fgets(read_buf, READ_BUF_SIZE, f_ptr) == NULL)
return -1;
int line_size = strlen(read_buf);
assert(line_size < READ_BUF_SIZE);
if (read_buf[line_size - 1] == '\n')
{
read_buf[line_size - 1] = '\0';
line_size--;
}
convert_char_to_2bit(read_buf);
memcpy(out_arr, read_buf, line_size);
return line_size;
}
// 全局变量
// 将每次比对的得分等信息写入文件进行debug
FILE *ins_ext_f_arr[KERNEL_NUM] = {0},
*del_ext_f_arr[KERNEL_NUM] = {0},
*score_f_arr[KERNEL_NUM] = {0},
*retval_f_arr[KERNEL_NUM] = {0};
#define TEST_EXTEND
// 程序执行入口
int main(int argc, char *argv[])
{
const char *qf_path = argv[1];
const char *tf_path = argv[2];
const char *if_path = argv[3];
// 初始化一些全局参数
int8_t mat[25] = {1, -4, -4, -4, -1,
-4, 1, -4, -4, -1,
-4, -4, 1, -4, -1,
-4, -4, -4, 1, -1,
-1, -1, -1, -1, -1};
int max_off[2];
int qle, tle, gtle, gscore;
thread_mem_t tmem[KERNEL_NUM];
int i, j;
for (i = 0; i < KERNEL_NUM; ++i)
{
thread_mem_init_alloc(tmem + i, INIT_ALLOC_SIZE);
}
// 记录计算出的分数
int score[KERNEL_NUM] = {0};
int64_t score_total[KERNEL_NUM] = {0};
// 读取测试数据
char *query_arr = (char *)malloc(SEQ_BUF_SIZE);
char *target_arr = (char *)malloc(SEQ_BUF_SIZE);
int *info_buf = (int *)malloc(SEQ_BUF_SIZE * sizeof(int));
int **info_arr = (int **)malloc(SEQ_BUF_SIZE * sizeof(int *));
FILE *query_f = 0, *target_f = 0, *info_f = 0;
// 每次读取一定量的数据,然后执行,直到处理完所有数据
int64_t total_line_num = 0; // 目前处理的总的数据行数
int block_line_num = 0; // 当前循环包含的数据行数
int cur_query_pos, cur_target_pos;
int64_t start_time;
char read_buf[READ_BUF_SIZE]; // 读文件缓存
#ifdef DEBUG_OUT
for (i = 0; i < KERNEL_NUM; ++i)
{
char out_path[64];
sprintf(out_path, "/home/zzh/work/sw_perf/output/ins_%d.txt", i);
ins_ext_f_arr[i] = fopen(out_path, "w");
sprintf(out_path, "/home/zzh/work/sw_perf/output/del_%d.txt", i);
del_ext_f_arr[i] = fopen(out_path, "w");
sprintf(out_path, "/home/zzh/work/sw_perf/output/score_%d.txt", i);
score_f_arr[i] = fopen(out_path, "w");
}
#ifdef TEST_EXTEND
main_extend(argc - 1, argv + 1);
#endif
#ifdef DEBUG_RETURN_VALUE
for (i = 0; i < KERNEL_NUM; ++i)
{
char out_path[64];
sprintf(out_path, "/home/zzh/work/sw_perf/output/retval_%d.txt", i);
retval_f_arr[i] = fopen(out_path, "w");
}
#endif
query_f = fopen(qf_path, "r");
target_f = fopen(tf_path, "r");
info_f = fopen(if_path, "r");
// 初始化info_arr数组
i = 0;
j = 0;
while (1)
{
if (j > BLOCK_BUF_SIZE)
break;
info_arr[i] = &info_buf[j];
i += 1;
j += 3;
}
int64_t all_qlen = 0;
while (!feof(target_f))
{
block_line_num = 0; // 记录每次读取的行数
// target序列一般占用存储最多先读取target看一个buf能读多少行query和info就按照这个行数来读
int cur_read_size = 0;
while (!feof(target_f) && cur_read_size < BLOCK_BUF_SIZE)
{
int line_size = read_seq_line(read_buf, target_f, target_arr + cur_read_size);
// for (j = 0; j < line_size; ++j)
//{
// // fprintf(stderr, "%c", t_2bit2char[(uint8_t)read_buf[j]]);
// fprintf(stderr, "%c", t_2bit2char[(uint8_t)target_arr[j + cur_read_size]]);
// }
// fprintf(stderr, "\n");
// fprintf(retval_f_arr[1], "%d\n", line_size);
if (line_size == -1)
break;
cur_read_size += line_size;
++block_line_num;
++total_line_num;
}
// 读query
cur_read_size = 0;
for (i = 0; i < block_line_num; ++i)
{
int line_size = read_seq_line(read_buf, query_f, query_arr + cur_read_size);
// int j;
// for (j = cur_read_size; j < cur_read_size + line_size; ++j)
//{
// fprintf(retval_f_arr[0], "%c", t_2bit2char[(uint8_t)query_arr[j]]);
// }
// fprintf(retval_f_arr[0], "\n");
if (line_size == -1)
break;
cur_read_size += line_size;
}
// 读info
cur_read_size = 0;
for (i = 0; i < block_line_num; ++i)
{
if (fgets(read_buf, READ_BUF_SIZE, info_f) == NULL)
break;
const int line_size = strlen(read_buf);
assert(line_size < READ_BUF_SIZE);
sscanf(read_buf, "%d %d %d\n", &info_arr[i][0], &info_arr[i][1], &info_arr[i][2]);
cur_read_size += line_size;
// fprintf(stderr, "%-8d%-8d%-8d\n", info_arr[i][0], info_arr[i][1], info_arr[i][2]);
// fprintf(stderr, "%s\n", read_buf);
all_qlen += info_arr[i][0];
}
#ifdef DEBUG_RETURN_VALUE
cur_read_size = 0;
for (i = 0; i < block_line_num; ++i)
{
fprintf(stdout, "%d\n", i);
for (j = cur_read_size; j < cur_read_size + info_arr[i][1]; ++j)
{
fprintf(stdout, "j: %d\n", j);
fprintf(retval_f_arr[0], "%c", t_2bit2char[(uint8_t)target_arr[j]]);
}
fprintf(retval_f_arr[0], "\n");
cur_read_size += info_arr[i][1];
}
#endif
// for (i = 0; i < block_line_num; ++i)
//{
// fprintf(retval_f_arr[0], "%d\n", info_arr[i][1]);
//}
// 性能测试
#if 1
// normal sw
PERFORMANCE_TEST(0, normal);
// normal pruning
PERFORMANCE_TEST(1, normal_pruning);
// avx2
PERFORMANCE_TEST(2, avx2_u8);
// avx2 pruning
PERFORMANCE_TEST(3, avx2_u8_pruning);
#endif
}
fprintf(stderr, "%ld\n", total_line_num);
fprintf(stderr, "all_qlen: %ld\n", all_qlen);
#ifdef SHOW_PERF
char *kernel_names[4] = {
"normal",
"normal_pruning",
"avx2_u8",
"avx2_u8_pruning"};
for (i = 0; i < KERNEL_NUM; ++i)
{
fprintf(stderr, "[%18s] time: %9.6f s; score: %ld\n", kernel_names[i], time_sw[i] / DIVIDE_BY, score_total[i]);
}
#endif
if (query_f != 0)
fclose(query_f);
if (target_f != 0)
fclose(target_f);
if (info_f != 0)
fclose(info_f);
for (i = 0; i < KERNEL_NUM; ++i)
{
if (ins_ext_f_arr[i] != 0)
fclose(ins_ext_f_arr[i]);
if (del_ext_f_arr[i] != 0)
fclose(del_ext_f_arr[i]);
if (score_f_arr[i] != 0)
fclose(score_f_arr[i]);
if (retval_f_arr[i] != 0)
fclose(retval_f_arr[i]);
}
return 0;
}

162
normal.c
View File

@ -1,162 +0,0 @@
#include <stdint.h>
#include <stdlib.h>
#include <assert.h>
#include <stdio.h>
#include "thread_mem.h"
#include "common.h"
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x), 1)
#define UNLIKELY(x) __builtin_expect((x), 0)
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif
typedef struct
{
int32_t h, e;
} eh_t;
int normal(thread_mem_t *tmem, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off)
{
// return h0;
eh_t *eh; // score array
int8_t *qp; // query profile
int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
assert(h0 > 0);
qp = malloc(qlen * m);
eh = calloc(qlen + 1, 8);
//qp = thread_mem_request(tmem, qlen * m);
//eh = thread_mem_request_and_clean(tmem, (qlen + 1) * 8);
// generate the query profile
//fprintf(stdout, "%d\n", h0);
for (k = i = 0; k < m; ++k)
{
const int8_t *p = &mat[k * m];
for (j = 0; j < qlen; ++j)
qp[i++] = p[query[j]]; // 对于qp数组第0到qlen个元素表示和A比对的分值第qlen到2*qlen表示和C比对的分值以此类推
}
// fill the first row
// 初始化第一行分值
eh[0].h = h0;
eh[1].h = h0 > oe_ins ? h0 - oe_ins : 0;
for (j = 2; j <= qlen && eh[j - 1].h > e_ins; ++j)
eh[j].h = eh[j - 1].h - e_ins;
// adjust $w if it is too large
k = m * m; // 字符矩阵
for (i = 0, max = 0; i < k; ++i) // get the max score
max = max > mat[i] ? max : mat[i];
max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); // 最大可插入的长度?
max_ins = max_ins > 1 ? max_ins : 1;
w = w < max_ins ? w : max_ins;
max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.);
max_del = max_del > 1 ? max_del : 1;
w = w < max_del ? w : max_del; // TODO: is this necessary? 上述几行代码都是为了看看能否缩小窗口,减小计算
// DP loop
max = h0, max_i = max_j = -1;
max_ie = -1, gscore = -1;
max_off = 0;
beg = 0, end = qlen;
for (i = 0; LIKELY(i < tlen); ++i) // 对target逐个字符进行遍历
{
int t, f = 0, h1, m = 0, mj = -1;
// 对于target第i个字符query中每个字符的分值只有匹配和不匹配
int8_t *q = &qp[target[i] * qlen];
// apply the band and the constraint (if provided)
if (beg < i - w) // 检查开始点是否可以缩小一些
beg = i - w;
if (end > i + w + 1) // 检查终点是否可以缩小,使得整体的遍历范围缩小
end = i + w + 1;
if (end > qlen) // 终点不超过query长度
end = qlen;
// compute the first column
if (beg == 0)
{
h1 = h0 - (o_del + e_del * (i + 1)); // 只消耗了target序列query从第一个字符开始匹配第i个target字符
if (h1 < 0)
h1 = 0;
}
else
h1 = 0;
for (j = beg; LIKELY(j < end); ++j) // 遍历query字符序列
{
// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
// Similar to SSE2-SW, cells are computed in the following order:
// H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
// E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape // E表示delete只消耗target
// F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape // F表示insert只消耗querytarget的row id固定query的col index一直增加
eh_t *p = &eh[j];
int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) // 获取上一轮h值和e值
p->h = h1; // set H(i,j-1) for the next row // h1是上一轮计算的结果
M = M ? M + q[j] : 0; // separating H and M to disallow a cigar like "100M3I3D20M" // M大于0则当前两个字符进行匹配无论是否相等将分值加到M上此时M可能变为负数
h = M > e ? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0 // e和f保证是非负的所以h肯定非负即使M可能是负数因为h取e,f和M的最大值
h = h > f ? h : f;
h1 = h; // save H(i,j) to h1 for the next column // 用h1来保存当前表格i,j)对应的分值,用来下次计算
mj = m > h ? mj : j; // record the position where max score is achieved // 记录取得最大值时query的字符位置
m = m > h ? m : h; // m is stored at eh[mj+1] 因为eh[mj+1]->h表示的是H(i, mj)及上一轮记录的h
t = M - oe_del; // 用来计算delete假设当前字符(i,j)匹配无论match还是mismatchtarget下一个字符串被空消耗delete)的分值F(i+1, j)
t = t > 0 ? t : 0;
e -= e_del; // 假设当前query字符
e = e > t ? e : t; // computed E(i+1,j) // t表示(i,j)强行匹配,(i+1, j)是delete的分数此前e表示(i+1,j)继续delete的分数
p->e = e; // save E(i+1,j) for the next row
t = M - oe_ins;
t = t > 0 ? t : 0;
f -= e_ins;
f = f > t ? f : t; // computed F(i,j+1)
}
eh[end].h = h1; // end是query序列之外的位置
eh[end].e = 0;
if (j == qlen) // 此轮遍历到了query的最后一个字符
{
max_ie = gscore > h1 ? max_ie : i; // max_ie表示取得全局最大分值时target字符串的位置
gscore = gscore > h1 ? gscore : h1;
}
//if (m == 0) // 遍历完query之后当前轮次的最大分值为0则跳出循环
// break;
if (m > max) // 当前轮最大分值大于之前的最大分值
{
max = m, max_i = i, max_j = mj; // 更新取得最大值的target和query的位置
max_off = max_off > abs(mj - i) ? max_off : abs(mj - i); // 取得最大分值时query和target对应字符串坐标的差值
}
else if (zdrop > 0) // 当前轮匹配之后取得的最大分值没有大于之前的最大值而且zdrop值大于0
{
#if 0
if (i - max_i > mj - max_j)
{
if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) // 之前最大分值 -从取得最大值的点出发当前的delete总长度对应的分值 + 当前轮取得的最大值) > zdrop
break;
}
else
{
if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) // 同上不过这次是insert可能是说明有很多mismatch
break;
}
#endif
}
// update beg and end for the next round
//for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j)
// ;
//beg = j;
//for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j)
// ;
//end = j + 2 < qlen ? j + 2 : qlen; // 剪枝没考虑f即insert
beg = 0, end = qlen; // uncomment this line for debugging
}
free(eh);
free(qp);
// thread_mem_release(tmem, qlen * m + (qlen + 1) * 8);
if (_qle)
*_qle = max_j + 1;
if (_tle)
*_tle = max_i + 1;
if (_gtle)
*_gtle = max_ie + 1;
if (_gscore)
*_gscore = gscore;
if (_max_off)
*_max_off = max_off;
return max;
}

44
profiling.c 100644
View File

@ -0,0 +1,44 @@
/*********************************************************************************************
Description: data for profiling
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2024/04/08
***********************************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <assert.h>
#include <time.h>
#include <sys/time.h>
#include "profiling.h"
uint64_t gprof[LIM_TYPE] = {0};
uint64_t gdata[LIM_TYPE] = {0};
FILE *ins_f_arr[LIM_TYPE],
*del_f_arr[LIM_TYPE],
*score_f_arr[LIM_TYPE],
*retval_f_arr[LIM_TYPE];
/* for profiling time */
uint64_t get_msec()
{
// struct timeval tv;
// gettimeofday(&tv, NULL);
// return (uint64_t)1000 * (tv.tv_sec + ((1e-6) * tv.tv_usec));
return clock();
}
/* show excution time */
int display_extend_stats()
{
#ifdef SHOW_PERF
fprintf(stderr, "[extend scalar ] time: %9.6lf s; score: %ld\n", gprof[G_EXT_SCALAR] / TIME_DIVIDE_BY, gdata[G_EXT_SCALAR]);
fprintf(stderr, "[extend avx i16] time: %9.6lf s; score: %ld\n", gprof[G_EXT_AVX2_I16] / TIME_DIVIDE_BY, gdata[G_EXT_AVX2_I16]);
fprintf(stderr, "[extend avx u8 ] time: %9.6lf s; score: %ld\n", gprof[G_EXT_AVX2_U8] / TIME_DIVIDE_BY, gdata[G_EXT_AVX2_U8]);
fprintf(stderr, "[extend avx sp ] time: %9.6lf s; score: %ld\n", gprof[G_EXT_AVX2_I16_SP] / TIME_DIVIDE_BY, gdata[G_EXT_AVX2_I16_SP]);
#endif
return 0;
}

44
profiling.h 100644
View File

@ -0,0 +1,44 @@
/*********************************************************************************************
Description: data for profiling
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2024/04/08
***********************************************************************************************/
#ifndef __PROFILING_H
#define __PROFILING_H
#include <stdio.h>
#define TIME_DIVIDE_BY (CLOCKS_PER_SEC * 1.0)
#define LIM_TYPE 128
extern uint64_t gprof[LIM_TYPE];
extern uint64_t gdata[LIM_TYPE];
extern FILE *ins_f_arr[LIM_TYPE],
*del_f_arr[LIM_TYPE],
*score_f_arr[LIM_TYPE],
*retval_f_arr[LIM_TYPE];
// GLOBAL
enum
{
G_ALL = 0,
G_EXT_SCALAR,
G_EXT_AVX2_I16,
G_EXT_AVX2_U8,
G_EXT_AVX2_I16_SP,
};
#ifdef SHOW_PERF
#define PROF_START(tmp_time) uint64_t prof_tmp_##tmp_time = get_msec()
#define PROF_END(result, tmp_time) result += get_msec() - prof_tmp_##tmp_time
#else
#define PROF_START(tmp_time)
#define PROF_END(result, tmp_time)
#endif
// get current milli seconds
uint64_t get_msec();
// show excution time
int display_extend_stats();
#endif

9
run.sh
View File

@ -1,5 +1,6 @@
#!/bin/bash
#/home/zzh/work/sw_perf/sw_perf /home/zzh/work/sw_perf/ext_out/q_0.fa /home/zzh/work/sw_perf/ext_out/t_0.fa /home/zzh/work/sw_perf/ext_out/i_0.txt
#/home/zzh/work/sw_perf/sw_perf /home/zzh/work/sw_perf/ext_out/q_1.fa /home/zzh/work/sw_perf/ext_out/t_1.fa /home/zzh/work/sw_perf/ext_out/i_1.txt
/home/zzh/work/sw_perf/sw_perf /home/zzh/work/sw_perf/ext_out/q_2.fa /home/zzh/work/sw_perf/ext_out/t_2.fa /home/zzh/work/sw_perf/ext_out/i_2.txt
#/home/zzh/work/sw_perf/sw_perf /home/zzh/work/sw_perf/ext_out/q_3.fa /home/zzh/work/sw_perf/ext_out/t_3.fa /home/zzh/work/sw_perf/ext_out/i_3.txt
#./sw_perf ./input/diff_ext/q0.txt ./input/diff_ext/t0.txt ./input/diff_ext/i0.txt
./sw_perf ./input/ext/q_0.fa ./input/ext/t_0.fa ./input/ext/i_0.txt
./sw_perf ./input/ext/q_1.fa ./input/ext/t_1.fa ./input/ext/i_1.txt
./sw_perf ./input/ext/q_2.fa ./input/ext/t_2.fa ./input/ext/i_2.txt
./sw_perf ./input/ext/q_3.fa ./input/ext/t_3.fa ./input/ext/i_3.txt

View File

@ -1,98 +0,0 @@
/*********************************************************************************************
Description: In-thread memory allocation with boundary aligned
Copyright : All right reserved by NCIC.ICT
Author : Zhang Zhonghai
Date : 2023/08/23
***********************************************************************************************/
#include "thread_mem.h"
#include <stdio.h>
#include <string.h>
// 创建
thread_mem_t *create_thread_mem()
{
thread_mem_t *tmem = (thread_mem_t *)malloc(sizeof(thread_mem_t));
tmem->occupied = tmem->capacity = 0;
tmem->mem = 0;
return tmem;
}
// 初始化
void init_thread_mem(thread_mem_t *tmem)
{
tmem->occupied = tmem->capacity = 0;
tmem->mem = 0;
}
// 初始化并开辟一定量的内存
void thread_mem_init_alloc(thread_mem_t *tmem, size_t byte_cnt)
{
tmem->capacity = byte_cnt;
tmem->mem = malloc(tmem->capacity);
tmem->occupied = 0;
}
// 请求内存
void *thread_mem_request(thread_mem_t *tmem, size_t byte_cnt)
{
// fprintf(stderr, "capacity:%ld, occupied: %ld, byte_cnt: %ld\n", tmem->capacity, tmem->occupied, byte_cnt);
void *ret_mem = 0;
if (tmem == 0)
{
ret_mem = 0;
}
else if (tmem->capacity == 0)
{
tmem->capacity = byte_cnt;
tmem->mem = malloc(tmem->capacity);
tmem->occupied = byte_cnt;
ret_mem = tmem->mem;
}
else if (tmem->capacity - tmem->occupied >= byte_cnt)
{
ret_mem = tmem->mem + tmem->occupied;
tmem->occupied += byte_cnt;
}
else
{
tmem->capacity = tmem->occupied + byte_cnt;
tmem->mem = realloc(tmem->mem, tmem->capacity);
ret_mem = tmem->mem + tmem->occupied;
tmem->occupied += byte_cnt;
}
return ret_mem;
}
void *thread_mem_request_and_clean(thread_mem_t *tmem, size_t byte_cnt)
{
void *mem = thread_mem_request(tmem, byte_cnt);
memset(mem, 0, byte_cnt);
return mem;
}
// 将不用的内存归还给thread mem
void thread_mem_release(thread_mem_t *tmem, size_t byte_cnt)
{
tmem->occupied -= byte_cnt;
}
// 彻底释放内存
void thread_mem_free(thread_mem_t *tmem)
{
tmem->capacity = tmem->occupied = 0;
free(tmem->mem);
tmem->mem = 0;
}
// 销毁thread_mem_t
void destroy_thread_mem(thread_mem_t *tmem)
{
if (tmem != 0)
{
thread_mem_free(tmem);
free(tmem);
}
}

78
utils.c
View File

@ -6,10 +6,12 @@
Author : Zhang Zhonghai
Date : 2023/08/25
***********************************************************************************************/
#include "utils.h"
#include <string.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "utils.h"
unsigned char nst_nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
@ -29,10 +31,10 @@ unsigned char nst_nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
char t_2bit2char[5] = {'A', 'C', 'G', 'T', 'N'};
const char *BASE = "ACGTN";
// 将碱基字符转成2位编码
void convert_char_to_2bit(char *str)
void base_char_to_int(char *str)
{
int i;
const int slen = strlen(str);
@ -40,4 +42,74 @@ void convert_char_to_2bit(char *str)
{
str[i] = nst_nt4_table[(uint8_t)str[i]];
}
}
/* read row_num_to_read lines of query-target info */
/**
* info_num: numbers of int value in one line of info file (2: qlen, tlen or 3: qlen, tlen, init_score)
*/
int read_qt_info(qti_v *qti_arr, buf_t *buf, int row_num_to_read, int info_num, FILE *fp)
{
if (!qti_arr || !buf || !fp) return -1;
if (info_num !=2 && info_num != 3) return -1;
if (buf->m == 0) { buf->m = BUF_SIZE; buf->addr = malloc(buf->m); }
int read_row_num = 0;
while (row_num_to_read > 0 && fgets((char *)buf->addr, buf->m, fp) != NULL)
{
// if (strlen((char *)buf->addr) == 1) continue; // skip null line
if (qti_arr->m <= read_row_num) { // expend the seq array
int old_m = qti_arr->m;
kv_resize(qti_t, *qti_arr, old_m == 0 ? 128 : old_m * 2);
memset(&qti_arr->a[old_m], 0, (qti_arr->m - old_m) * sizeof(qti_t)); // set the newly allocated mem to zero
}
qti_t *qti = &kv_A(*qti_arr, qti_arr->n++); // get a new query-target info
if (qti->m < info_num) { kv_resize(int, *qti, info_num); }
if (info_num == 2) sscanf((char *)buf->addr, "%d %d", &qti->a[0], &qti->a[1]);
else sscanf((char *)buf->addr, "%d %d %d", &qti->a[0], &qti->a[1], &qti->a[2]);
// test output
// int i; for (i = 0; i < info_num; ++i) { fprintf(stderr, "%d\t", kv_A(*qti, i)); } fprintf(stderr, "\n");
--row_num_to_read;
++read_row_num;
}
return read_row_num;
}
/* read row_num_to_read lines of query or target sequence */
int read_seq(seq_v *seq_arr, buf_t *buf, int row_num_to_read, FILE *fp)
{
if (!seq_arr || !buf || !fp) return -1;
if (buf->m == 0) { buf->m = BUF_SIZE; buf->addr = malloc(buf->m); }
int read_row_num = 0;
while (row_num_to_read > 0 && fgets((char*)buf->addr, buf->m, fp) != NULL)
{
int base_size = strlen((char *)buf->addr);
// trim the '\n' char
if ((char)buf->addr[base_size - 1] == '\n') {
buf->addr[base_size - 1] = (uint8_t)'\0';
--base_size;
}
// if (base_size == 0) continue; // skip null line
// fprintf(stderr, "%d %s\n", base_size, (char *) buf->addr);
// test output
// fprintf(stderr, "%s\n", (char *)buf->addr);
if (seq_arr->m <= read_row_num) { // expend the seq array
int old_m = seq_arr->m;
kv_resize(seq_t, *seq_arr, old_m == 0 ? 128 : old_m * 2);
memset(&seq_arr->a[old_m], 0, (seq_arr->m - old_m) * sizeof(seq_t)); // set the newly allocated mem to zero
}
seq_t *seq = &kv_A(*seq_arr, seq_arr->n++); // get a new query-target sequence
if (seq->m < base_size) { kv_resize(uint8_t, *seq, base_size + 16); }
base_char_to_int((char*)buf->addr);
memcpy(seq->a, buf->addr, base_size);
seq->n = base_size;
// test output
// int i; for (i = 0; i < seq->n; ++i) fprintf(stderr, "%d", kv_A(*seq, i)); fprintf(stderr, "\n");
--row_num_to_read;
++read_row_num;
}
return read_row_num;
}

32
utils.h
View File

@ -9,11 +9,37 @@
#ifndef __UTILS_H
#define __UTILS_H
extern char t_2bit2char[5];
#include <stdlib.h>
#include <stdint.h>
#include "kvec.h"
#undef MAX
#undef MIN
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
#define BUF_SIZE 1048576
typedef kvec_t(uint8_t) seq_t;
typedef kvec_t(seq_t) seq_v;
typedef kvec_t(int) qti_t; // query target info type
typedef kvec_t(qti_t) qti_v;
typedef struct { size_t m; uint8_t *addr; } buf_t;
extern const char *BASE;
extern unsigned char nst_nt4_table[256];
// 将碱基字符转成2位编码
void convert_char_to_2bit(char *str);
// 将碱基字符(ACGTN)转成int编码(0~4)
void base_char_to_int(char *str);
int read_qt_info(qti_v *qti_arr, buf_t *buf, int row_num_to_read, int info_num, FILE *fp);
int read_seq(seq_v *seq_arr, buf_t *buf, int row_num_to_read, FILE *fp);
#endif