解决了读数据的bug,和avx2的bug,保留了一些调试代码

This commit is contained in:
zzh 2023-08-27 01:01:57 +08:00
parent 2aeb566bf7
commit 78f791f3f2
7 changed files with 160 additions and 98 deletions

10
.vscode/settings.json vendored
View File

@ -8,6 +8,14 @@
"__split_buffer": "c",
"string": "c",
"cstdint": "c",
"algorithm": "c"
"algorithm": "c",
"array": "c",
"deque": "c",
"unordered_map": "c",
"string_view": "c",
"initializer_list": "c",
"__hash_table": "c",
"ios": "c",
"iterator": "c"
}
}

View File

@ -1,7 +1,7 @@
CC= gcc
#CFLAGS= -g -Wall -Wno-unused-function -mavx2
CFLAGS= -Wall -Wno-unused-function -O2 -mavx2
DFLAGS= -DSHOW_PERF -DDEBUG_OUT
DFLAGS= -DSHOW_PERF -DDEBUG_RETURN_VALUE
#DFLAGS= -DSHOW_PERF -DDEBUG_OUT -DDEBUG_RETURN_VALUE
PROG= sw_perf
INCLUDES=

View File

@ -109,6 +109,13 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
fn_vec = _mm256_max_epi16(fn_vec, zero_vec); \
mn_vec = _mm256_max_epi16(mn_vec, zero_vec); \
hn_vec = _mm256_max_epi16(hn_vec, zero_vec);
//int16_t *t_ptr = (int16_t *)&ts_vec; \
//fprintf(stderr, "D: %d, ibeg: %d, iend: %d, jbeg: %d, jend: %d, %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d \n", \
// D, ibeg, iend, beg, end, \
// t_ptr[0], t_ptr[1], t_ptr[2], t_ptr[3], \
// t_ptr[4], t_ptr[5], t_ptr[6], t_ptr[7], \
// t_ptr[8], t_ptr[9], t_ptr[10], t_ptr[11], \
// t_ptr[12], t_ptr[13], t_ptr[14], t_ptr[15]);
// 存储向量化结果
#define SIMD_STORE \
@ -192,7 +199,7 @@ int ksw_extend_avx2(thread_mem_t *tmem,
uint8_t *mem;
int16_t *qtmem, *vmem;
int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH;
int i, iStart, D, j, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
int i, ibeg, iend, D, j, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
int Dloop = tlen + qlen; // 循环跳出条件
int span, beg1, end1; // 边界条件计算
int col_size = qlen + 2 + SIMD_WIDTH;
@ -220,7 +227,7 @@ int ksw_extend_avx2(thread_mem_t *tmem,
for (i = 0; i < qlen; ++i)
seq[i] = query[i];
for (i = 0; i < tlen; ++i)
ref[i + SIMD_WIDTH] = target[i];
ref[i + SIMD_WIDTH - 1] = target[i];
}
vmem = &ref[ref_size];
@ -258,13 +265,15 @@ int ksw_extend_avx2(thread_mem_t *tmem,
// DP loop
max = init_score, max_i = max_j = -1;
max_ie = -1, gscore = -1;
;
max_off = 0;
beg = 1;
end = qlen;
// init init_score
hA0[0] = init_score; // 左上角
fA1[1] = MAX(0, init_score - (o_ins + e_ins));
eA2[0] = init_score;
hA1[1] = fA1[1];
if (qlen == 0 || tlen == 0)
Dloop = 0; // 防止意外情况
if (window_size >= qlen)
@ -272,66 +281,70 @@ int ksw_extend_avx2(thread_mem_t *tmem,
max_ie = 0;
gscore = 0;
}
int iend;
// fprintf(stderr, "qlen:%d, tlen:%d\n", qlen, tlen);
#ifdef DEBUG_OUT
int16_t ins[tlen + 1][qlen + 1];
int16_t del[tlen + 1][qlen + 1];
int16_t score[tlen + 1][qlen + 1];
int dii, djj;
int16_t ins[tlen + 1][qlen + 2];
int16_t del[tlen + 1][qlen + 2];
int16_t score[tlen + 1][qlen + 2];
ins[0][0] = del[0][0] = score[0][0] = init_score;
ins[0][1] = MAX(0, init_score - (o_ins + e_ins));
del[1][0] = MAX(0, init_score - (o_del + e_del));
score[0][1] = ins[0][1];
score[1][0] = del[1][0];
// fprintf(stderr, "%d %d\n", del[1][0], score[1][0]);
#endif
for (D = 1; LIKELY(D < Dloop); ++D)
{
// 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况
if (D > tlen)
{
span = MIN(Dloop - D, window_size);
beg1 = MAX(D - tlen + 1, ((D - window_size) / 2) + 1);
}
if (D < tlen)
beg1 = 1;
else
{
span = MIN(D - 1, window_size);
beg1 = MAX(1, ((D - window_size) / 2) + 1);
}
end1 = MIN(qlen, beg1 + span);
beg1 = D - tlen + 1;
if (D < qlen)
end1 = D; // 闭区间
else
end1 = qlen;
// beg1 = MAX(D - window_size, beg1);
// end1 = MIN(D + window_size, end1);
beg = 1;
end = qlen;
// if (beg < beg1)
// beg = beg1;
// if (end > end1)
// end = end1;
beg = MAX(beg1, beg);
end = MIN(end1, end);
// if (beg > end)
// break; // 不用计算了直接跳出否则hA2没有被赋值里边是上一轮hA0的值会出bug
// break;
iend = D - (beg - 1); // ref开始计算的位置倒序
beg = beg1;
end = end1;
iend = D - beg; // ref开始计算的位置倒序
span = end - beg;
iStart = iend - span - 1; // 0开始的ref索引位置
ibeg = iend - span; // 0开始的ref索引位置
// fprintf(stderr, "D:%d, jbeg:%d, jend:%d, ibeg:%d, iend:%d\n", D, beg, end, ibeg, iend);
// 每一轮需要记录的数据
int m = 0, mj = -1, mi = -1;
max_vec = zero_vec;
// 要处理边界
// 左边界 处理f (insert)
if (iStart == 0)
{
hA1[end] = MAX(0, init_score - (o_ins + e_ins * end));
}
// 上边界
// 处理左边界
if (beg == 1)
{
hA1[0] = MAX(0, init_score - (o_del + e_del * iend));
hA0[0] = eA2[0];
mA1[0] = 0;
eA1[0] = MAX(0, init_score - (o_del + e_del * (iend + 1)));
#ifdef DEBUG_OUT
del[iend + 1][0] = eA1[0];
score[iend + 1][0] = eA1[0];
#endif
}
else
{
hA1[beg - 1] = 0;
eA1[beg - 1] = 0;
}
#ifdef DEBUG_OUT
// fprintf(stderr, "eA1: %d\n", eA1[0]);
// for (djj = beg - 1; djj < end; ++djj)
//{
// fprintf(stderr, "%d ", hA0[djj]);
//}
// fprintf(stderr, "\n");
#endif
for (j = beg, i = iend; j <= end + 1 - SIMD_WIDTH; j += SIMD_WIDTH, i -= SIMD_WIDTH)
{
// 取数据
@ -357,15 +370,39 @@ int ksw_extend_avx2(thread_mem_t *tmem,
// 存储结果
SIMD_STORE;
}
// 处理上边界
if (ibeg == 0)
{
fA2[end + 1] = MAX(0, init_score - (o_ins + e_ins * (end + 1)));
hA2[end + 1] = fA2[end + 1];
mA2[end + 1] = 0;
#ifdef DEBUG_OUT
ins[0][end + 1] = fA2[end + 1];
score[0][end + 1] = fA2[end + 1];
#endif
}
SIMD_FIND_MAX;
#ifdef DEBUG_OUT
for (djj = beg; djj <= end; ++djj)
{
dii = D - djj + 1;
// fprintf(stderr, "dii:%d, djj:%d, ", dii, djj);
ins[dii][djj] = fA2[djj];
del[dii][djj] = eA2[djj];
score[dii][djj] = hA2[djj];
}
// fprintf(stderr, "\n");
// fprintf(stderr, "%d, %d\n", hA2[0], hA2[1]);
#endif
// 注意最后跳出循环j的值
j = end + 1;
if (j == qlen + 1)
{
max_ie = gscore > hA2[qlen] ? max_ie : iStart;
max_ie = gscore > hA2[qlen] ? max_ie : ibeg;
gscore = gscore > hA2[qlen] ? gscore : hA2[qlen];
}
@ -399,6 +436,21 @@ int ksw_extend_avx2(thread_mem_t *tmem,
SWAP_DATA_POINTER;
}
#ifdef DEBUG_OUT
for (dii = 0; dii <= tlen; ++dii)
{
for (djj = 0; djj <= qlen; ++djj)
{
fprintf(score_f_arr[1], "%-3d", score[dii][djj]);
fprintf(ins_ext_f_arr[1], "%-3d", ins[dii][djj]);
fprintf(del_ext_f_arr[1], "%-3d", del[dii][djj]);
}
fprintf(score_f_arr[1], "\n");
fprintf(ins_ext_f_arr[1], "\n");
fprintf(del_ext_f_arr[1], "\n");
}
#endif
// free(mem);
thread_mem_release(tmem, mem_size);
if (_qle)

View File

@ -221,7 +221,7 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem,
uint8_t *seq, *ref;
uint8_t *mem, *qtmem, *vmem;
int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH;
int i, iStart, D, j, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
int i, ibeg, iend, D, j, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
int Dloop = tlen + qlen; // 循环跳出条件
int span, beg1, end1; // 边界条件计算
int col_size = qlen + 2 + SIMD_WIDTH;
@ -250,7 +250,7 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem,
for (i = 0; i < qlen; ++i)
seq[i] = query[i];
for (i = 0; i < tlen; ++i)
ref[i + SIMD_WIDTH] = target[i];
ref[i + SIMD_WIDTH - 1] = target[i];
}
vmem = &ref[ref_size];
@ -289,13 +289,15 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem,
// DP loop
max = init_score, max_i = max_j = -1;
max_ie = -1, gscore = -1;
;
max_off = 0;
beg = 1;
end = qlen;
// init init_score
hA0[0] = init_score; // 左上角
fA1[1] = MAX(0, init_score - (o_ins + e_ins));
eA2[0] = init_score;
hA1[1] = fA1[1];
if (qlen == 0 || tlen == 0)
Dloop = 0; // 防止意外情况
if (window_size >= qlen)
@ -304,55 +306,42 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem,
gscore = 0;
}
int iend;
for (D = 1; LIKELY(D < Dloop); ++D)
{
// 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况
if (D > tlen)
{
span = MIN(Dloop - D, window_size);
beg1 = MAX(D - tlen + 1, ((D - window_size) / 2) + 1);
}
if (D < tlen)
beg1 = 1;
else
{
span = MIN(D - 1, window_size);
beg1 = MAX(1, ((D - window_size) / 2) + 1);
}
end1 = MIN(qlen, beg1 + span);
beg1 = D - tlen + 1;
if (D < qlen)
end1 = D; // 闭区间
else
end1 = qlen;
// beg1 = MAX(D - window_size, beg1);
// end1 = MIN(D + window_size, end1);
// if (beg < beg1)
// beg = beg1;
// if (end > end1)
// end = end1;
beg = MAX(beg1, beg);
end = MIN(end1, end);
// if (beg > end)
// break; // 不用计算了直接跳出否则hA2没有被赋值里边是上一轮hA0的值会出bug
// break;
beg = 1;
end = qlen;
iend = D - (beg - 1); // ref开始计算的位置倒序
beg = beg1;
end = end1;
iend = D - beg; // ref开始计算的位置倒序
span = end - beg;
iStart = iend - span - 1; // 0开始的ref索引位置
ibeg = iend - span; // 0开始的ref索引位置
// 每一轮需要记录的数据
int m = 0, mj = -1, mi = -1;
max_vec = zero_vec;
// 要处理边界
// 左边界 处理f (insert)
if (iStart == 0)
{
hA1[end] = MAX(0, init_score - (o_ins + e_ins * end));
}
// 上边界
// 处理左边界
if (beg == 1)
{
hA1[0] = MAX(0, init_score - (o_del + e_del * iend));
}
else
{
hA1[beg - 1] = 0;
eA1[beg - 1] = 0;
hA0[0] = eA2[0];
mA1[0] = 0;
eA1[0] = MAX(0, init_score - (o_del + e_del * (iend + 1)));
}
for (j = beg, i = iend; j <= end + 1 - SIMD_WIDTH; j += SIMD_WIDTH, i -= SIMD_WIDTH)
@ -380,6 +369,13 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem,
// 存储结果
SIMD_STORE;
}
// 处理上边界
if (ibeg == 0)
{
fA2[end + 1] = MAX(0, init_score - (o_ins + e_ins * (end + 1)));
hA2[end + 1] = fA2[end + 1];
mA2[end + 1] = 0;
}
SIMD_FIND_MAX;
@ -388,7 +384,7 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem,
if (j == qlen + 1)
{
max_ie = gscore > hA2[qlen] ? max_ie : iStart;
max_ie = gscore > hA2[qlen] ? max_ie : ibeg;
gscore = gscore > hA2[qlen] ? gscore : hA2[qlen];
}
if (m > max)

View File

@ -74,14 +74,15 @@ int ksw_extend_normal(thread_mem_t *tmem, int qlen, const uint8_t *query, int tl
for (i = 0; LIKELY(i < tlen); ++i) // 对target逐个字符进行遍历
{
int t, f = 0, h1, m = 0, mj = -1;
int8_t *q = &qp[target[i] * qlen]; // 对于target第i个字符query中每个字符的分值只有匹配和不匹配
// apply the band and the constraint (if provided)
// if (beg < i - w) // 检查开始点是否可以缩小一些
// beg = i - w;
// if (end > i + w + 1) // 检查终点是否可以缩小,使得整体的遍历范围缩小
// end = i + w + 1;
// if (end > qlen) // 终点不超过query长度
// end = qlen;
// 对于target第i个字符query中每个字符的分值只有匹配和不匹配
int8_t *q = &qp[target[i] * qlen];
// apply the band and the constraint (if provided)
// if (beg < i - w) // 检查开始点是否可以缩小一些
// beg = i - w;
// if (end > i + w + 1) // 检查终点是否可以缩小,使得整体的遍历范围缩小
// end = i + w + 1;
// if (end > qlen) // 终点不超过query长度
// end = qlen;
beg = 0;
end = qlen;
// compute the first column

5
main.c
View File

@ -107,7 +107,7 @@ int read_seq_line(char *read_buf, FILE *f_ptr, char *out_arr)
line_size--;
}
convert_char_to_2bit(read_buf);
strncpy(out_arr, read_buf, line_size);
memcpy(out_arr, read_buf, line_size);
return line_size;
}
@ -204,6 +204,7 @@ int main(int argc, char *argv[])
while (!feof(target_f) && cur_read_size < BLOCK_BUF_SIZE)
{
int line_size = read_seq_line(read_buf, target_f, target_arr + cur_read_size);
// fprintf(stderr, "line: %d\n", line_size);
if (line_size == 0)
break;
cur_read_size += line_size;
@ -248,7 +249,7 @@ int main(int argc, char *argv[])
// PERFORMANCE_TEST_AVX2(3, ksw_extend_avx2_aligned);
//
// // avx2 u8
// PERFORMANCE_TEST_AVX2(4, ksw_extend_avx2_u8);
PERFORMANCE_TEST_AVX2(4, ksw_extend_avx2_u8);
// // avx2 u8 heuristics
// PERFORMANCE_TEST_AVX2(5, ksw_extend_avx2_u8_heuristics);
// // avx2 u8 mem aligned

View File

@ -9,6 +9,7 @@
#include "utils.h"
#include <string.h>
#include <stdint.h>
#include <stdio.h>
unsigned char nst_nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
@ -34,6 +35,9 @@ char t_2bit2char[5] = {'A', 'C', 'G', 'T', 'N'};
void convert_char_to_2bit(char *str)
{
int i;
for (i = 0; i < strlen(str); ++i)
const int slen = strlen(str);
for (i = 0; i < slen; ++i)
{
str[i] = nst_nt4_table[(uint8_t)str[i]];
}
}