From 78f791f3f2b784e40c14702eae03b91ecbfa2983 Mon Sep 17 00:00:00 2001 From: zzh Date: Sun, 27 Aug 2023 01:01:57 +0800 Subject: [PATCH] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E4=BA=86=E8=AF=BB=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E7=9A=84bug=EF=BC=8C=E5=92=8Cavx2=E7=9A=84bug?= =?UTF-8?q?=EF=BC=8C=E4=BF=9D=E7=95=99=E4=BA=86=E4=B8=80=E4=BA=9B=E8=B0=83?= =?UTF-8?q?=E8=AF=95=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .vscode/settings.json | 10 ++- Makefile | 2 +- ksw_ext_avx2.c | 144 ++++++++++++++++++++++++++++-------------- ksw_ext_avx2_u8.c | 74 ++++++++++------------ ksw_ext_normal.c | 17 ++--- main.c | 5 +- utils.c | 6 +- 7 files changed, 160 insertions(+), 98 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index ec0fef7..c3a76e1 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -8,6 +8,14 @@ "__split_buffer": "c", "string": "c", "cstdint": "c", - "algorithm": "c" + "algorithm": "c", + "array": "c", + "deque": "c", + "unordered_map": "c", + "string_view": "c", + "initializer_list": "c", + "__hash_table": "c", + "ios": "c", + "iterator": "c" } } \ No newline at end of file diff --git a/Makefile b/Makefile index 428213c..a49d410 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ CC= gcc #CFLAGS= -g -Wall -Wno-unused-function -mavx2 CFLAGS= -Wall -Wno-unused-function -O2 -mavx2 -DFLAGS= -DSHOW_PERF -DDEBUG_OUT +DFLAGS= -DSHOW_PERF -DDEBUG_RETURN_VALUE #DFLAGS= -DSHOW_PERF -DDEBUG_OUT -DDEBUG_RETURN_VALUE PROG= sw_perf INCLUDES= diff --git a/ksw_ext_avx2.c b/ksw_ext_avx2.c index 9ef3d19..c9a1aa0 100644 --- a/ksw_ext_avx2.c +++ b/ksw_ext_avx2.c @@ -109,6 +109,13 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { fn_vec = _mm256_max_epi16(fn_vec, zero_vec); \ mn_vec = _mm256_max_epi16(mn_vec, zero_vec); \ hn_vec = _mm256_max_epi16(hn_vec, zero_vec); +//int16_t *t_ptr = (int16_t *)&ts_vec; \ + //fprintf(stderr, "D: %d, ibeg: %d, iend: %d, jbeg: %d, jend: %d, %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d \n", \ + // D, ibeg, iend, beg, end, \ + // t_ptr[0], t_ptr[1], t_ptr[2], t_ptr[3], \ + // t_ptr[4], t_ptr[5], t_ptr[6], t_ptr[7], \ + // t_ptr[8], t_ptr[9], t_ptr[10], t_ptr[11], \ + // t_ptr[12], t_ptr[13], t_ptr[14], t_ptr[15]); // 存储向量化结果 #define SIMD_STORE \ @@ -192,7 +199,7 @@ int ksw_extend_avx2(thread_mem_t *tmem, uint8_t *mem; int16_t *qtmem, *vmem; int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH; - int i, iStart, D, j, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; + int i, ibeg, iend, D, j, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; int Dloop = tlen + qlen; // 循环跳出条件 int span, beg1, end1; // 边界条件计算 int col_size = qlen + 2 + SIMD_WIDTH; @@ -220,7 +227,7 @@ int ksw_extend_avx2(thread_mem_t *tmem, for (i = 0; i < qlen; ++i) seq[i] = query[i]; for (i = 0; i < tlen; ++i) - ref[i + SIMD_WIDTH] = target[i]; + ref[i + SIMD_WIDTH - 1] = target[i]; } vmem = &ref[ref_size]; @@ -258,13 +265,15 @@ int ksw_extend_avx2(thread_mem_t *tmem, // DP loop max = init_score, max_i = max_j = -1; max_ie = -1, gscore = -1; - ; + max_off = 0; beg = 1; end = qlen; // init init_score hA0[0] = init_score; // 左上角 - + fA1[1] = MAX(0, init_score - (o_ins + e_ins)); + eA2[0] = init_score; + hA1[1] = fA1[1]; if (qlen == 0 || tlen == 0) Dloop = 0; // 防止意外情况 if (window_size >= qlen) @@ -272,66 +281,70 @@ int ksw_extend_avx2(thread_mem_t *tmem, max_ie = 0; gscore = 0; } - - int iend; - +// fprintf(stderr, "qlen:%d, tlen:%d\n", qlen, tlen); #ifdef DEBUG_OUT - int16_t ins[tlen + 1][qlen + 1]; - int16_t del[tlen + 1][qlen + 1]; - int16_t score[tlen + 1][qlen + 1]; + int dii, djj; + int16_t ins[tlen + 1][qlen + 2]; + int16_t del[tlen + 1][qlen + 2]; + int16_t score[tlen + 1][qlen + 2]; ins[0][0] = del[0][0] = score[0][0] = init_score; + ins[0][1] = MAX(0, init_score - (o_ins + e_ins)); + del[1][0] = MAX(0, init_score - (o_del + e_del)); + score[0][1] = ins[0][1]; + score[1][0] = del[1][0]; + // fprintf(stderr, "%d %d\n", del[1][0], score[1][0]); #endif for (D = 1; LIKELY(D < Dloop); ++D) { - // 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况 - if (D > tlen) - { - span = MIN(Dloop - D, window_size); - beg1 = MAX(D - tlen + 1, ((D - window_size) / 2) + 1); - } + if (D < tlen) + beg1 = 1; else - { - span = MIN(D - 1, window_size); - beg1 = MAX(1, ((D - window_size) / 2) + 1); - } - end1 = MIN(qlen, beg1 + span); + beg1 = D - tlen + 1; + if (D < qlen) + end1 = D; // 闭区间 + else + end1 = qlen; + // beg1 = MAX(D - window_size, beg1); + // end1 = MIN(D + window_size, end1); - beg = 1; - end = qlen; - - // if (beg < beg1) - // beg = beg1; - // if (end > end1) - // end = end1; + beg = MAX(beg1, beg); + end = MIN(end1, end); // if (beg > end) - // break; // 不用计算了,直接跳出,否则hA2没有被赋值,里边是上一轮hA0的值,会出bug + // break; - iend = D - (beg - 1); // ref开始计算的位置,倒序 + beg = beg1; + end = end1; + + iend = D - beg; // ref开始计算的位置,倒序 span = end - beg; - iStart = iend - span - 1; // 0开始的ref索引位置 + ibeg = iend - span; // 0开始的ref索引位置 + + // fprintf(stderr, "D:%d, jbeg:%d, jend:%d, ibeg:%d, iend:%d\n", D, beg, end, ibeg, iend); // 每一轮需要记录的数据 int m = 0, mj = -1, mi = -1; max_vec = zero_vec; - // 要处理边界 - // 左边界 处理f (insert) - if (iStart == 0) - { - hA1[end] = MAX(0, init_score - (o_ins + e_ins * end)); - } - // 上边界 + // 处理左边界 if (beg == 1) { - hA1[0] = MAX(0, init_score - (o_del + e_del * iend)); + hA0[0] = eA2[0]; + mA1[0] = 0; + eA1[0] = MAX(0, init_score - (o_del + e_del * (iend + 1))); +#ifdef DEBUG_OUT + del[iend + 1][0] = eA1[0]; + score[iend + 1][0] = eA1[0]; +#endif } - else - { - hA1[beg - 1] = 0; - eA1[beg - 1] = 0; - } - +#ifdef DEBUG_OUT + // fprintf(stderr, "eA1: %d\n", eA1[0]); + // for (djj = beg - 1; djj < end; ++djj) + //{ + // fprintf(stderr, "%d ", hA0[djj]); + //} + // fprintf(stderr, "\n"); +#endif for (j = beg, i = iend; j <= end + 1 - SIMD_WIDTH; j += SIMD_WIDTH, i -= SIMD_WIDTH) { // 取数据 @@ -357,15 +370,39 @@ int ksw_extend_avx2(thread_mem_t *tmem, // 存储结果 SIMD_STORE; } + // 处理上边界 + if (ibeg == 0) + { + fA2[end + 1] = MAX(0, init_score - (o_ins + e_ins * (end + 1))); + hA2[end + 1] = fA2[end + 1]; + mA2[end + 1] = 0; +#ifdef DEBUG_OUT + ins[0][end + 1] = fA2[end + 1]; + score[0][end + 1] = fA2[end + 1]; +#endif + } SIMD_FIND_MAX; +#ifdef DEBUG_OUT + for (djj = beg; djj <= end; ++djj) + { + dii = D - djj + 1; + // fprintf(stderr, "dii:%d, djj:%d, ", dii, djj); + ins[dii][djj] = fA2[djj]; + del[dii][djj] = eA2[djj]; + score[dii][djj] = hA2[djj]; + } + // fprintf(stderr, "\n"); + // fprintf(stderr, "%d, %d\n", hA2[0], hA2[1]); +#endif + // 注意最后跳出循环j的值 j = end + 1; if (j == qlen + 1) { - max_ie = gscore > hA2[qlen] ? max_ie : iStart; + max_ie = gscore > hA2[qlen] ? max_ie : ibeg; gscore = gscore > hA2[qlen] ? gscore : hA2[qlen]; } @@ -399,6 +436,21 @@ int ksw_extend_avx2(thread_mem_t *tmem, SWAP_DATA_POINTER; } +#ifdef DEBUG_OUT + for (dii = 0; dii <= tlen; ++dii) + { + for (djj = 0; djj <= qlen; ++djj) + { + fprintf(score_f_arr[1], "%-3d", score[dii][djj]); + fprintf(ins_ext_f_arr[1], "%-3d", ins[dii][djj]); + fprintf(del_ext_f_arr[1], "%-3d", del[dii][djj]); + } + fprintf(score_f_arr[1], "\n"); + fprintf(ins_ext_f_arr[1], "\n"); + fprintf(del_ext_f_arr[1], "\n"); + } +#endif + // free(mem); thread_mem_release(tmem, mem_size); if (_qle) diff --git a/ksw_ext_avx2_u8.c b/ksw_ext_avx2_u8.c index 83119df..8b38cb3 100644 --- a/ksw_ext_avx2_u8.c +++ b/ksw_ext_avx2_u8.c @@ -221,7 +221,7 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem, uint8_t *seq, *ref; uint8_t *mem, *qtmem, *vmem; int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH; - int i, iStart, D, j, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; + int i, ibeg, iend, D, j, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; int Dloop = tlen + qlen; // 循环跳出条件 int span, beg1, end1; // 边界条件计算 int col_size = qlen + 2 + SIMD_WIDTH; @@ -250,7 +250,7 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem, for (i = 0; i < qlen; ++i) seq[i] = query[i]; for (i = 0; i < tlen; ++i) - ref[i + SIMD_WIDTH] = target[i]; + ref[i + SIMD_WIDTH - 1] = target[i]; } vmem = &ref[ref_size]; @@ -289,13 +289,15 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem, // DP loop max = init_score, max_i = max_j = -1; max_ie = -1, gscore = -1; - ; + max_off = 0; beg = 1; end = qlen; // init init_score hA0[0] = init_score; // 左上角 - + fA1[1] = MAX(0, init_score - (o_ins + e_ins)); + eA2[0] = init_score; + hA1[1] = fA1[1]; if (qlen == 0 || tlen == 0) Dloop = 0; // 防止意外情况 if (window_size >= qlen) @@ -304,55 +306,42 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem, gscore = 0; } - int iend; - for (D = 1; LIKELY(D < Dloop); ++D) { // 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况 - if (D > tlen) - { - span = MIN(Dloop - D, window_size); - beg1 = MAX(D - tlen + 1, ((D - window_size) / 2) + 1); - } + if (D < tlen) + beg1 = 1; else - { - span = MIN(D - 1, window_size); - beg1 = MAX(1, ((D - window_size) / 2) + 1); - } - end1 = MIN(qlen, beg1 + span); + beg1 = D - tlen + 1; + if (D < qlen) + end1 = D; // 闭区间 + else + end1 = qlen; + // beg1 = MAX(D - window_size, beg1); + // end1 = MIN(D + window_size, end1); - // if (beg < beg1) - // beg = beg1; - // if (end > end1) - // end = end1; + beg = MAX(beg1, beg); + end = MIN(end1, end); // if (beg > end) - // break; // 不用计算了,直接跳出,否则hA2没有被赋值,里边是上一轮hA0的值,会出bug + // break; - beg = 1; - end = qlen; - iend = D - (beg - 1); // ref开始计算的位置,倒序 + beg = beg1; + end = end1; + + iend = D - beg; // ref开始计算的位置,倒序 span = end - beg; - iStart = iend - span - 1; // 0开始的ref索引位置 + ibeg = iend - span; // 0开始的ref索引位置 // 每一轮需要记录的数据 int m = 0, mj = -1, mi = -1; max_vec = zero_vec; - // 要处理边界 - // 左边界 处理f (insert) - if (iStart == 0) - { - hA1[end] = MAX(0, init_score - (o_ins + e_ins * end)); - } - // 上边界 + // 处理左边界 if (beg == 1) { - hA1[0] = MAX(0, init_score - (o_del + e_del * iend)); - } - else - { - hA1[beg - 1] = 0; - eA1[beg - 1] = 0; + hA0[0] = eA2[0]; + mA1[0] = 0; + eA1[0] = MAX(0, init_score - (o_del + e_del * (iend + 1))); } for (j = beg, i = iend; j <= end + 1 - SIMD_WIDTH; j += SIMD_WIDTH, i -= SIMD_WIDTH) @@ -380,6 +369,13 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem, // 存储结果 SIMD_STORE; } + // 处理上边界 + if (ibeg == 0) + { + fA2[end + 1] = MAX(0, init_score - (o_ins + e_ins * (end + 1))); + hA2[end + 1] = fA2[end + 1]; + mA2[end + 1] = 0; + } SIMD_FIND_MAX; @@ -388,7 +384,7 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem, if (j == qlen + 1) { - max_ie = gscore > hA2[qlen] ? max_ie : iStart; + max_ie = gscore > hA2[qlen] ? max_ie : ibeg; gscore = gscore > hA2[qlen] ? gscore : hA2[qlen]; } if (m > max) diff --git a/ksw_ext_normal.c b/ksw_ext_normal.c index 5b41049..cc404c7 100644 --- a/ksw_ext_normal.c +++ b/ksw_ext_normal.c @@ -74,14 +74,15 @@ int ksw_extend_normal(thread_mem_t *tmem, int qlen, const uint8_t *query, int tl for (i = 0; LIKELY(i < tlen); ++i) // 对target逐个字符进行遍历 { int t, f = 0, h1, m = 0, mj = -1; - int8_t *q = &qp[target[i] * qlen]; // 对于target第i个字符,query中每个字符的分值,只有匹配和不匹配 - // apply the band and the constraint (if provided) - // if (beg < i - w) // 检查开始点是否可以缩小一些 - // beg = i - w; - // if (end > i + w + 1) // 检查终点是否可以缩小,使得整体的遍历范围缩小 - // end = i + w + 1; - // if (end > qlen) // 终点不超过query长度 - // end = qlen; + // 对于target第i个字符,query中每个字符的分值,只有匹配和不匹配 + int8_t *q = &qp[target[i] * qlen]; + // apply the band and the constraint (if provided) + // if (beg < i - w) // 检查开始点是否可以缩小一些 + // beg = i - w; + // if (end > i + w + 1) // 检查终点是否可以缩小,使得整体的遍历范围缩小 + // end = i + w + 1; + // if (end > qlen) // 终点不超过query长度 + // end = qlen; beg = 0; end = qlen; // compute the first column diff --git a/main.c b/main.c index 211f0e2..137f592 100644 --- a/main.c +++ b/main.c @@ -107,7 +107,7 @@ int read_seq_line(char *read_buf, FILE *f_ptr, char *out_arr) line_size--; } convert_char_to_2bit(read_buf); - strncpy(out_arr, read_buf, line_size); + memcpy(out_arr, read_buf, line_size); return line_size; } @@ -204,6 +204,7 @@ int main(int argc, char *argv[]) while (!feof(target_f) && cur_read_size < BLOCK_BUF_SIZE) { int line_size = read_seq_line(read_buf, target_f, target_arr + cur_read_size); + // fprintf(stderr, "line: %d\n", line_size); if (line_size == 0) break; cur_read_size += line_size; @@ -248,7 +249,7 @@ int main(int argc, char *argv[]) // PERFORMANCE_TEST_AVX2(3, ksw_extend_avx2_aligned); // // // avx2 u8 - // PERFORMANCE_TEST_AVX2(4, ksw_extend_avx2_u8); + PERFORMANCE_TEST_AVX2(4, ksw_extend_avx2_u8); // // avx2 u8 heuristics // PERFORMANCE_TEST_AVX2(5, ksw_extend_avx2_u8_heuristics); // // avx2 u8 mem aligned diff --git a/utils.c b/utils.c index 33f0865..2fef78a 100644 --- a/utils.c +++ b/utils.c @@ -9,6 +9,7 @@ #include "utils.h" #include #include +#include unsigned char nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@ -34,6 +35,9 @@ char t_2bit2char[5] = {'A', 'C', 'G', 'T', 'N'}; void convert_char_to_2bit(char *str) { int i; - for (i = 0; i < strlen(str); ++i) + const int slen = strlen(str); + for (i = 0; i < slen; ++i) + { str[i] = nst_nt4_table[(uint8_t)str[i]]; + } } \ No newline at end of file