解决了读数据的bug,和avx2的bug,保留了一些调试代码

This commit is contained in:
zzh 2023-08-27 01:01:57 +08:00
parent 2aeb566bf7
commit 78f791f3f2
7 changed files with 160 additions and 98 deletions

10
.vscode/settings.json vendored
View File

@ -8,6 +8,14 @@
"__split_buffer": "c", "__split_buffer": "c",
"string": "c", "string": "c",
"cstdint": "c", "cstdint": "c",
"algorithm": "c" "algorithm": "c",
"array": "c",
"deque": "c",
"unordered_map": "c",
"string_view": "c",
"initializer_list": "c",
"__hash_table": "c",
"ios": "c",
"iterator": "c"
} }
} }

View File

@ -1,7 +1,7 @@
CC= gcc CC= gcc
#CFLAGS= -g -Wall -Wno-unused-function -mavx2 #CFLAGS= -g -Wall -Wno-unused-function -mavx2
CFLAGS= -Wall -Wno-unused-function -O2 -mavx2 CFLAGS= -Wall -Wno-unused-function -O2 -mavx2
DFLAGS= -DSHOW_PERF -DDEBUG_OUT DFLAGS= -DSHOW_PERF -DDEBUG_RETURN_VALUE
#DFLAGS= -DSHOW_PERF -DDEBUG_OUT -DDEBUG_RETURN_VALUE #DFLAGS= -DSHOW_PERF -DDEBUG_OUT -DDEBUG_RETURN_VALUE
PROG= sw_perf PROG= sw_perf
INCLUDES= INCLUDES=

View File

@ -109,6 +109,13 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
fn_vec = _mm256_max_epi16(fn_vec, zero_vec); \ fn_vec = _mm256_max_epi16(fn_vec, zero_vec); \
mn_vec = _mm256_max_epi16(mn_vec, zero_vec); \ mn_vec = _mm256_max_epi16(mn_vec, zero_vec); \
hn_vec = _mm256_max_epi16(hn_vec, zero_vec); hn_vec = _mm256_max_epi16(hn_vec, zero_vec);
//int16_t *t_ptr = (int16_t *)&ts_vec; \
//fprintf(stderr, "D: %d, ibeg: %d, iend: %d, jbeg: %d, jend: %d, %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d \n", \
// D, ibeg, iend, beg, end, \
// t_ptr[0], t_ptr[1], t_ptr[2], t_ptr[3], \
// t_ptr[4], t_ptr[5], t_ptr[6], t_ptr[7], \
// t_ptr[8], t_ptr[9], t_ptr[10], t_ptr[11], \
// t_ptr[12], t_ptr[13], t_ptr[14], t_ptr[15]);
// 存储向量化结果 // 存储向量化结果
#define SIMD_STORE \ #define SIMD_STORE \
@ -192,7 +199,7 @@ int ksw_extend_avx2(thread_mem_t *tmem,
uint8_t *mem; uint8_t *mem;
int16_t *qtmem, *vmem; int16_t *qtmem, *vmem;
int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH; int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH;
int i, iStart, D, j, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; int i, ibeg, iend, D, j, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
int Dloop = tlen + qlen; // 循环跳出条件 int Dloop = tlen + qlen; // 循环跳出条件
int span, beg1, end1; // 边界条件计算 int span, beg1, end1; // 边界条件计算
int col_size = qlen + 2 + SIMD_WIDTH; int col_size = qlen + 2 + SIMD_WIDTH;
@ -220,7 +227,7 @@ int ksw_extend_avx2(thread_mem_t *tmem,
for (i = 0; i < qlen; ++i) for (i = 0; i < qlen; ++i)
seq[i] = query[i]; seq[i] = query[i];
for (i = 0; i < tlen; ++i) for (i = 0; i < tlen; ++i)
ref[i + SIMD_WIDTH] = target[i]; ref[i + SIMD_WIDTH - 1] = target[i];
} }
vmem = &ref[ref_size]; vmem = &ref[ref_size];
@ -258,13 +265,15 @@ int ksw_extend_avx2(thread_mem_t *tmem,
// DP loop // DP loop
max = init_score, max_i = max_j = -1; max = init_score, max_i = max_j = -1;
max_ie = -1, gscore = -1; max_ie = -1, gscore = -1;
;
max_off = 0; max_off = 0;
beg = 1; beg = 1;
end = qlen; end = qlen;
// init init_score // init init_score
hA0[0] = init_score; // 左上角 hA0[0] = init_score; // 左上角
fA1[1] = MAX(0, init_score - (o_ins + e_ins));
eA2[0] = init_score;
hA1[1] = fA1[1];
if (qlen == 0 || tlen == 0) if (qlen == 0 || tlen == 0)
Dloop = 0; // 防止意外情况 Dloop = 0; // 防止意外情况
if (window_size >= qlen) if (window_size >= qlen)
@ -272,66 +281,70 @@ int ksw_extend_avx2(thread_mem_t *tmem,
max_ie = 0; max_ie = 0;
gscore = 0; gscore = 0;
} }
// fprintf(stderr, "qlen:%d, tlen:%d\n", qlen, tlen);
int iend;
#ifdef DEBUG_OUT #ifdef DEBUG_OUT
int16_t ins[tlen + 1][qlen + 1]; int dii, djj;
int16_t del[tlen + 1][qlen + 1]; int16_t ins[tlen + 1][qlen + 2];
int16_t score[tlen + 1][qlen + 1]; int16_t del[tlen + 1][qlen + 2];
int16_t score[tlen + 1][qlen + 2];
ins[0][0] = del[0][0] = score[0][0] = init_score; ins[0][0] = del[0][0] = score[0][0] = init_score;
ins[0][1] = MAX(0, init_score - (o_ins + e_ins));
del[1][0] = MAX(0, init_score - (o_del + e_del));
score[0][1] = ins[0][1];
score[1][0] = del[1][0];
// fprintf(stderr, "%d %d\n", del[1][0], score[1][0]);
#endif #endif
for (D = 1; LIKELY(D < Dloop); ++D) for (D = 1; LIKELY(D < Dloop); ++D)
{ {
// 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况 if (D < tlen)
if (D > tlen) beg1 = 1;
{
span = MIN(Dloop - D, window_size);
beg1 = MAX(D - tlen + 1, ((D - window_size) / 2) + 1);
}
else else
{ beg1 = D - tlen + 1;
span = MIN(D - 1, window_size); if (D < qlen)
beg1 = MAX(1, ((D - window_size) / 2) + 1); end1 = D; // 闭区间
} else
end1 = MIN(qlen, beg1 + span); end1 = qlen;
// beg1 = MAX(D - window_size, beg1);
// end1 = MIN(D + window_size, end1);
beg = 1; beg = MAX(beg1, beg);
end = qlen; end = MIN(end1, end);
// if (beg < beg1)
// beg = beg1;
// if (end > end1)
// end = end1;
// if (beg > end) // if (beg > end)
// break; // 不用计算了直接跳出否则hA2没有被赋值里边是上一轮hA0的值会出bug // break;
iend = D - (beg - 1); // ref开始计算的位置倒序 beg = beg1;
end = end1;
iend = D - beg; // ref开始计算的位置倒序
span = end - beg; span = end - beg;
iStart = iend - span - 1; // 0开始的ref索引位置 ibeg = iend - span; // 0开始的ref索引位置
// fprintf(stderr, "D:%d, jbeg:%d, jend:%d, ibeg:%d, iend:%d\n", D, beg, end, ibeg, iend);
// 每一轮需要记录的数据 // 每一轮需要记录的数据
int m = 0, mj = -1, mi = -1; int m = 0, mj = -1, mi = -1;
max_vec = zero_vec; max_vec = zero_vec;
// 要处理边界 // 处理左边界
// 左边界 处理f (insert)
if (iStart == 0)
{
hA1[end] = MAX(0, init_score - (o_ins + e_ins * end));
}
// 上边界
if (beg == 1) if (beg == 1)
{ {
hA1[0] = MAX(0, init_score - (o_del + e_del * iend)); hA0[0] = eA2[0];
mA1[0] = 0;
eA1[0] = MAX(0, init_score - (o_del + e_del * (iend + 1)));
#ifdef DEBUG_OUT
del[iend + 1][0] = eA1[0];
score[iend + 1][0] = eA1[0];
#endif
} }
else #ifdef DEBUG_OUT
{ // fprintf(stderr, "eA1: %d\n", eA1[0]);
hA1[beg - 1] = 0; // for (djj = beg - 1; djj < end; ++djj)
eA1[beg - 1] = 0; //{
} // fprintf(stderr, "%d ", hA0[djj]);
//}
// fprintf(stderr, "\n");
#endif
for (j = beg, i = iend; j <= end + 1 - SIMD_WIDTH; j += SIMD_WIDTH, i -= SIMD_WIDTH) for (j = beg, i = iend; j <= end + 1 - SIMD_WIDTH; j += SIMD_WIDTH, i -= SIMD_WIDTH)
{ {
// 取数据 // 取数据
@ -357,15 +370,39 @@ int ksw_extend_avx2(thread_mem_t *tmem,
// 存储结果 // 存储结果
SIMD_STORE; SIMD_STORE;
} }
// 处理上边界
if (ibeg == 0)
{
fA2[end + 1] = MAX(0, init_score - (o_ins + e_ins * (end + 1)));
hA2[end + 1] = fA2[end + 1];
mA2[end + 1] = 0;
#ifdef DEBUG_OUT
ins[0][end + 1] = fA2[end + 1];
score[0][end + 1] = fA2[end + 1];
#endif
}
SIMD_FIND_MAX; SIMD_FIND_MAX;
#ifdef DEBUG_OUT
for (djj = beg; djj <= end; ++djj)
{
dii = D - djj + 1;
// fprintf(stderr, "dii:%d, djj:%d, ", dii, djj);
ins[dii][djj] = fA2[djj];
del[dii][djj] = eA2[djj];
score[dii][djj] = hA2[djj];
}
// fprintf(stderr, "\n");
// fprintf(stderr, "%d, %d\n", hA2[0], hA2[1]);
#endif
// 注意最后跳出循环j的值 // 注意最后跳出循环j的值
j = end + 1; j = end + 1;
if (j == qlen + 1) if (j == qlen + 1)
{ {
max_ie = gscore > hA2[qlen] ? max_ie : iStart; max_ie = gscore > hA2[qlen] ? max_ie : ibeg;
gscore = gscore > hA2[qlen] ? gscore : hA2[qlen]; gscore = gscore > hA2[qlen] ? gscore : hA2[qlen];
} }
@ -399,6 +436,21 @@ int ksw_extend_avx2(thread_mem_t *tmem,
SWAP_DATA_POINTER; SWAP_DATA_POINTER;
} }
#ifdef DEBUG_OUT
for (dii = 0; dii <= tlen; ++dii)
{
for (djj = 0; djj <= qlen; ++djj)
{
fprintf(score_f_arr[1], "%-3d", score[dii][djj]);
fprintf(ins_ext_f_arr[1], "%-3d", ins[dii][djj]);
fprintf(del_ext_f_arr[1], "%-3d", del[dii][djj]);
}
fprintf(score_f_arr[1], "\n");
fprintf(ins_ext_f_arr[1], "\n");
fprintf(del_ext_f_arr[1], "\n");
}
#endif
// free(mem); // free(mem);
thread_mem_release(tmem, mem_size); thread_mem_release(tmem, mem_size);
if (_qle) if (_qle)

View File

@ -221,7 +221,7 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem,
uint8_t *seq, *ref; uint8_t *seq, *ref;
uint8_t *mem, *qtmem, *vmem; uint8_t *mem, *qtmem, *vmem;
int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH; int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH;
int i, iStart, D, j, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; int i, ibeg, iend, D, j, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
int Dloop = tlen + qlen; // 循环跳出条件 int Dloop = tlen + qlen; // 循环跳出条件
int span, beg1, end1; // 边界条件计算 int span, beg1, end1; // 边界条件计算
int col_size = qlen + 2 + SIMD_WIDTH; int col_size = qlen + 2 + SIMD_WIDTH;
@ -250,7 +250,7 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem,
for (i = 0; i < qlen; ++i) for (i = 0; i < qlen; ++i)
seq[i] = query[i]; seq[i] = query[i];
for (i = 0; i < tlen; ++i) for (i = 0; i < tlen; ++i)
ref[i + SIMD_WIDTH] = target[i]; ref[i + SIMD_WIDTH - 1] = target[i];
} }
vmem = &ref[ref_size]; vmem = &ref[ref_size];
@ -289,13 +289,15 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem,
// DP loop // DP loop
max = init_score, max_i = max_j = -1; max = init_score, max_i = max_j = -1;
max_ie = -1, gscore = -1; max_ie = -1, gscore = -1;
;
max_off = 0; max_off = 0;
beg = 1; beg = 1;
end = qlen; end = qlen;
// init init_score // init init_score
hA0[0] = init_score; // 左上角 hA0[0] = init_score; // 左上角
fA1[1] = MAX(0, init_score - (o_ins + e_ins));
eA2[0] = init_score;
hA1[1] = fA1[1];
if (qlen == 0 || tlen == 0) if (qlen == 0 || tlen == 0)
Dloop = 0; // 防止意外情况 Dloop = 0; // 防止意外情况
if (window_size >= qlen) if (window_size >= qlen)
@ -304,55 +306,42 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem,
gscore = 0; gscore = 0;
} }
int iend;
for (D = 1; LIKELY(D < Dloop); ++D) for (D = 1; LIKELY(D < Dloop); ++D)
{ {
// 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况 // 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况
if (D > tlen) if (D < tlen)
{ beg1 = 1;
span = MIN(Dloop - D, window_size);
beg1 = MAX(D - tlen + 1, ((D - window_size) / 2) + 1);
}
else else
{ beg1 = D - tlen + 1;
span = MIN(D - 1, window_size); if (D < qlen)
beg1 = MAX(1, ((D - window_size) / 2) + 1); end1 = D; // 闭区间
} else
end1 = MIN(qlen, beg1 + span); end1 = qlen;
// beg1 = MAX(D - window_size, beg1);
// end1 = MIN(D + window_size, end1);
// if (beg < beg1) beg = MAX(beg1, beg);
// beg = beg1; end = MIN(end1, end);
// if (end > end1)
// end = end1;
// if (beg > end) // if (beg > end)
// break; // 不用计算了直接跳出否则hA2没有被赋值里边是上一轮hA0的值会出bug // break;
beg = 1; beg = beg1;
end = qlen; end = end1;
iend = D - (beg - 1); // ref开始计算的位置倒序
iend = D - beg; // ref开始计算的位置倒序
span = end - beg; span = end - beg;
iStart = iend - span - 1; // 0开始的ref索引位置 ibeg = iend - span; // 0开始的ref索引位置
// 每一轮需要记录的数据 // 每一轮需要记录的数据
int m = 0, mj = -1, mi = -1; int m = 0, mj = -1, mi = -1;
max_vec = zero_vec; max_vec = zero_vec;
// 要处理边界 // 处理左边界
// 左边界 处理f (insert)
if (iStart == 0)
{
hA1[end] = MAX(0, init_score - (o_ins + e_ins * end));
}
// 上边界
if (beg == 1) if (beg == 1)
{ {
hA1[0] = MAX(0, init_score - (o_del + e_del * iend)); hA0[0] = eA2[0];
} mA1[0] = 0;
else eA1[0] = MAX(0, init_score - (o_del + e_del * (iend + 1)));
{
hA1[beg - 1] = 0;
eA1[beg - 1] = 0;
} }
for (j = beg, i = iend; j <= end + 1 - SIMD_WIDTH; j += SIMD_WIDTH, i -= SIMD_WIDTH) for (j = beg, i = iend; j <= end + 1 - SIMD_WIDTH; j += SIMD_WIDTH, i -= SIMD_WIDTH)
@ -380,6 +369,13 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem,
// 存储结果 // 存储结果
SIMD_STORE; SIMD_STORE;
} }
// 处理上边界
if (ibeg == 0)
{
fA2[end + 1] = MAX(0, init_score - (o_ins + e_ins * (end + 1)));
hA2[end + 1] = fA2[end + 1];
mA2[end + 1] = 0;
}
SIMD_FIND_MAX; SIMD_FIND_MAX;
@ -388,7 +384,7 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem,
if (j == qlen + 1) if (j == qlen + 1)
{ {
max_ie = gscore > hA2[qlen] ? max_ie : iStart; max_ie = gscore > hA2[qlen] ? max_ie : ibeg;
gscore = gscore > hA2[qlen] ? gscore : hA2[qlen]; gscore = gscore > hA2[qlen] ? gscore : hA2[qlen];
} }
if (m > max) if (m > max)

View File

@ -74,14 +74,15 @@ int ksw_extend_normal(thread_mem_t *tmem, int qlen, const uint8_t *query, int tl
for (i = 0; LIKELY(i < tlen); ++i) // 对target逐个字符进行遍历 for (i = 0; LIKELY(i < tlen); ++i) // 对target逐个字符进行遍历
{ {
int t, f = 0, h1, m = 0, mj = -1; int t, f = 0, h1, m = 0, mj = -1;
int8_t *q = &qp[target[i] * qlen]; // 对于target第i个字符query中每个字符的分值只有匹配和不匹配 // 对于target第i个字符query中每个字符的分值只有匹配和不匹配
// apply the band and the constraint (if provided) int8_t *q = &qp[target[i] * qlen];
// if (beg < i - w) // 检查开始点是否可以缩小一些 // apply the band and the constraint (if provided)
// beg = i - w; // if (beg < i - w) // 检查开始点是否可以缩小一些
// if (end > i + w + 1) // 检查终点是否可以缩小,使得整体的遍历范围缩小 // beg = i - w;
// end = i + w + 1; // if (end > i + w + 1) // 检查终点是否可以缩小,使得整体的遍历范围缩小
// if (end > qlen) // 终点不超过query长度 // end = i + w + 1;
// end = qlen; // if (end > qlen) // 终点不超过query长度
// end = qlen;
beg = 0; beg = 0;
end = qlen; end = qlen;
// compute the first column // compute the first column

5
main.c
View File

@ -107,7 +107,7 @@ int read_seq_line(char *read_buf, FILE *f_ptr, char *out_arr)
line_size--; line_size--;
} }
convert_char_to_2bit(read_buf); convert_char_to_2bit(read_buf);
strncpy(out_arr, read_buf, line_size); memcpy(out_arr, read_buf, line_size);
return line_size; return line_size;
} }
@ -204,6 +204,7 @@ int main(int argc, char *argv[])
while (!feof(target_f) && cur_read_size < BLOCK_BUF_SIZE) while (!feof(target_f) && cur_read_size < BLOCK_BUF_SIZE)
{ {
int line_size = read_seq_line(read_buf, target_f, target_arr + cur_read_size); int line_size = read_seq_line(read_buf, target_f, target_arr + cur_read_size);
// fprintf(stderr, "line: %d\n", line_size);
if (line_size == 0) if (line_size == 0)
break; break;
cur_read_size += line_size; cur_read_size += line_size;
@ -248,7 +249,7 @@ int main(int argc, char *argv[])
// PERFORMANCE_TEST_AVX2(3, ksw_extend_avx2_aligned); // PERFORMANCE_TEST_AVX2(3, ksw_extend_avx2_aligned);
// //
// // avx2 u8 // // avx2 u8
// PERFORMANCE_TEST_AVX2(4, ksw_extend_avx2_u8); PERFORMANCE_TEST_AVX2(4, ksw_extend_avx2_u8);
// // avx2 u8 heuristics // // avx2 u8 heuristics
// PERFORMANCE_TEST_AVX2(5, ksw_extend_avx2_u8_heuristics); // PERFORMANCE_TEST_AVX2(5, ksw_extend_avx2_u8_heuristics);
// // avx2 u8 mem aligned // // avx2 u8 mem aligned

View File

@ -9,6 +9,7 @@
#include "utils.h" #include "utils.h"
#include <string.h> #include <string.h>
#include <stdint.h> #include <stdint.h>
#include <stdio.h>
unsigned char nst_nt4_table[256] = { unsigned char nst_nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
@ -34,6 +35,9 @@ char t_2bit2char[5] = {'A', 'C', 'G', 'T', 'N'};
void convert_char_to_2bit(char *str) void convert_char_to_2bit(char *str)
{ {
int i; int i;
for (i = 0; i < strlen(str); ++i) const int slen = strlen(str);
for (i = 0; i < slen; ++i)
{
str[i] = nst_nt4_table[(uint8_t)str[i]]; str[i] = nst_nt4_table[(uint8_t)str[i]];
}
} }