对 AVX2 实现的 SW(Smith-Waterman)各个部分的运行时间进行了测试

This commit is contained in:
zzh 2023-08-18 15:42:46 +08:00
parent 2a14c17c68
commit 53332e34e1
5 changed files with 133 additions and 60 deletions

View File

@ -1,8 +1,8 @@
CC= gcc CC= gcc
CFLAGS= -g -Wall -Wno-unused-function -mavx2 #CFLAGS= -g -Wall -Wno-unused-function -mavx2
#CFLAGS= -Wall -Wno-unused-function -O2 -mavx2 CFLAGS= -Wall -Wno-unused-function -O2 -mavx2
DFLAGS= -DSHOW_PERF DFLAGS= -DSHOW_PERF
OBJS= ksw_normal.o ksw_avx2.o ksw_cuda.o ksw_avx2_u8.o OBJS= ksw_normal.o ksw_avx2.o ksw_cuda.o ksw_avx2_u8.o bsw_avx2.o
PROG= sw_perf PROG= sw_perf
PROG2= sw_perf_discrete PROG2= sw_perf_discrete
INCLUDES= INCLUDES=

View File

@ -197,7 +197,7 @@ int bsw_avx2(int qlen, // query length 待匹配段碱基的query
int16_t *qtmem, *vmem; int16_t *qtmem, *vmem;
int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH; int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH;
int i, iStart, D, j, k, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; int i, iStart, D, j, k, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
int Dloop = tlen + qlen; // 循环跳出条件 int Dloop = tlen + qlen; // 循环跳出条件 D从1开始遍历
int span, beg1, end1; // 边界条件计算 int span, beg1, end1; // 边界条件计算
int col_size = qlen + 2 + SIMD_WIDTH; int col_size = qlen + 2 + SIMD_WIDTH;
int val_mem_size = (col_size * 9 * 2 + 31) >> 5 << 5; // 32字节的整数倍 int val_mem_size = (col_size * 9 * 2 + 31) >> 5 << 5; // 32字节的整数倍
@ -287,7 +287,7 @@ int bsw_avx2(int qlen, // query length 待匹配段碱基的query
// 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况 // 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况
if (D > tlen) if (D > tlen)
{ {
span = MIN(Dloop - D, w); span = MIN(Dloop - D, w); // 计算的窗口,或者说范围
beg1 = MAX(D - tlen + 1, ((D - w) / 2) + 1); beg1 = MAX(D - tlen + 1, ((D - w) / 2) + 1);
} }
else else
@ -372,7 +372,7 @@ int bsw_avx2(int qlen, // query length 待匹配段碱基的query
max = m, max_i = mi, max_j = mj; max = m, max_i = mi, max_j = mj;
max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi); max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi);
} }
else if (zdrop > 0) else if (0) //(zdrop > 0)
{ {
if (mi - max_i > mj - max_j) if (mi - max_i > mj - max_j)
{ {
@ -387,24 +387,25 @@ int bsw_avx2(int qlen, // query length 待匹配段碱基的query
} }
// 调整计算的边界 // 调整计算的边界
for (j = beg; LIKELY(j <= end); ++j) /* for (j = beg; LIKELY(j <= end); ++j)
{ {
int has_val = hA1[j - 1] | hA2[j]; int has_val = hA1[j - 1] | hA2[j];
if (has_val) if (has_val)
break; break;
} }
beg = j; beg = j;
for (j = end + 1; LIKELY(j >= beg); --j) for (j = end + 1; LIKELY(j >= beg); --j)
{ {
int has_val = hA1[j - 1] | hA2[j]; int has_val = hA1[j - 1] | hA2[j];
if (has_val) if (has_val)
break; break;
else else
hA0[j - 1] = 0; hA0[j - 1] = 0;
} }
end = j + 1 <= qlen ? j + 1 : qlen; end = j + 1 <= qlen ? j + 1 : qlen;
// beg = 0; */
// end = qlen; // uncomment this line for debugging beg = 0;
end = qlen; // uncomment this line for debugging
m_last = m; m_last = m;
// swap m, h, e, f // swap m, h, e, f
SWAP_DATA_POINTER; SWAP_DATA_POINTER;

View File

@ -217,6 +217,14 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que
int *_gscore, // query的端到端匹配得分 int *_gscore, // query的端到端匹配得分
int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值 int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值
{ {
#ifdef SHOW_PERF
extern int64_t time_bsw_init;
extern int64_t time_bsw_main_loop;
extern int64_t time_bsw_find_max;
extern int64_t time_bsw_adjust_bound;
extern int64_t time_compare;
int64_t start_time = get_mseconds();
#endif
uint8_t *mA, *hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H其他的保存上个H E F M uint8_t *mA, *hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H其他的保存上个H E F M
uint8_t *seq, *ref; uint8_t *seq, *ref;
uint8_t *mem, *qtmem, *vmem; uint8_t *mem, *qtmem, *vmem;
@ -307,9 +315,15 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que
int m_last = 0; int m_last = 0;
int iend; int iend;
#ifdef SHOW_PERF
time_bsw_init += get_mseconds() - start_time;
#endif
for (D = 1; LIKELY(D < Dloop); ++D) for (D = 1; LIKELY(D < Dloop); ++D)
{ {
#ifdef SHOW_PERF
start_time = get_mseconds();
#endif
// 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况 // 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况
if (D > tlen) if (D > tlen)
{ {
@ -380,9 +394,19 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que
// 存储结果 // 存储结果
SIMD_STORE; SIMD_STORE;
} }
#ifdef SHOW_PERF
time_bsw_main_loop += get_mseconds() - start_time;
#endif
#ifdef SHOW_PERF
start_time = get_mseconds();
#endif
SIMD_FIND_MAX; SIMD_FIND_MAX;
#ifdef SHOW_PERF
time_bsw_find_max += get_mseconds() - start_time;
#endif
#ifdef SHOW_PERF
start_time = get_mseconds();
#endif
// 注意最后跳出循环j的值 // 注意最后跳出循环j的值
j = end + 1; j = end + 1;
@ -391,14 +415,14 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que
max_ie = gscore > hA2[qlen] ? max_ie : iStart; max_ie = gscore > hA2[qlen] ? max_ie : iStart;
gscore = gscore > hA2[qlen] ? gscore : hA2[qlen]; gscore = gscore > hA2[qlen] ? gscore : hA2[qlen];
} }
if (m == 0 && m_last == 0) // if (m == 0 && m_last == 0)
break; // 一定要注意,斜对角遍历和按列遍历的不同点 // break; // 一定要注意,斜对角遍历和按列遍历的不同点
if (m > max) if (m > max)
{ {
max = m, max_i = mi, max_j = mj; max = m, max_i = mi, max_j = mj;
max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi); max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi);
} }
else if (zdrop > 0) else if (0) // (zdrop > 0)
{ {
if (mi - max_i > mj - max_j) if (mi - max_i > mj - max_j)
{ {
@ -430,9 +454,17 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que
} }
end = j + 1 <= qlen ? j + 1 : qlen; end = j + 1 <= qlen ? j + 1 : qlen;
// beg = 0;
// end = qlen;
m_last = m; m_last = m;
// swap m, h, e, f // swap m, h, e, f
SWAP_DATA_POINTER; SWAP_DATA_POINTER;
#ifdef SHOW_PERF
time_bsw_adjust_bound += get_mseconds() - start_time;
start_time = get_mseconds();
time_compare += get_mseconds() - start_time;
#endif
} }
free(mem); free(mem);

View File

@ -1,6 +1,7 @@
#include <stdint.h> #include <stdint.h>
#include <stdlib.h> #include <stdlib.h>
#include <assert.h> #include <assert.h>
#include <stdio.h>
#ifdef __GNUC__ #ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x), 1) #define LIKELY(x) __builtin_expect((x), 1)
@ -55,13 +56,13 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
{ {
int t, f = 0, h1, m = 0, mj = -1; int t, f = 0, h1, m = 0, mj = -1;
int8_t *q = &qp[target[i] * qlen]; // 对于target第i个字符query中每个字符的分值只有匹配和不匹配 int8_t *q = &qp[target[i] * qlen]; // 对于target第i个字符query中每个字符的分值只有匹配和不匹配
// apply the band and the constraint (if provided) // apply the band and the constraint (if provided)
// if (beg < i - w) // 检查开始点是否可以缩小一些 if (beg < i - w) // 检查开始点是否可以缩小一些
// beg = i - w; beg = i - w;
// if (end > i + w + 1) // 检查终点是否可以缩小,使得整体的遍历范围缩小 if (end > i + w + 1) // 检查终点是否可以缩小,使得整体的遍历范围缩小
// end = i + w + 1; end = i + w + 1;
// if (end > qlen) // 终点不超过query长度 if (end > qlen) // 终点不超过query长度
// end = qlen; end = qlen;
// compute the first column // compute the first column
if (beg == 0) if (beg == 0)
{ {
@ -73,11 +74,12 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
h1 = 0; h1 = 0;
for (j = beg; LIKELY(j < end); ++j) // 遍历query字符序列 for (j = beg; LIKELY(j < end); ++j) // 遍历query字符序列
{ {
// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) // fprintf(stderr, "%-3d", h1);
// Similar to SSE2-SW, cells are computed in the following order: // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
// H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} // Similar to SSE2-SW, cells are computed in the following order:
// E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape // E表示delete只消耗target // H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
// F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape // F表示insert只消耗query // E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape // E表示delete只消耗target
// F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape // F表示insert只消耗querytarget的row id固定query的col index一直增加
eh_t *p = &eh[j]; eh_t *p = &eh[j];
int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) // 获取上一轮h值和e值 int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) // 获取上一轮h值和e值
p->h = h1; // set H(i,j-1) for the next row // h1是上一轮计算的结果 p->h = h1; // set H(i,j-1) for the next row // h1是上一轮计算的结果
@ -97,7 +99,7 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
f -= e_ins; f -= e_ins;
f = f > t ? f : t; // computed F(i,j+1) f = f > t ? f : t; // computed F(i,j+1)
} }
eh[end].h = h1; eh[end].h = h1; // end是query序列之外的位置
eh[end].e = 0; eh[end].e = 0;
if (j == qlen) // 此轮遍历到了query的最后一个字符 if (j == qlen) // 此轮遍历到了query的最后一个字符
{ {
@ -111,7 +113,7 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
max = m, max_i = i, max_j = mj; // 更新取得最大值的target和query的位置 max = m, max_i = i, max_j = mj; // 更新取得最大值的target和query的位置
max_off = max_off > abs(mj - i) ? max_off : abs(mj - i); // 取得最大分值时query和target对应字符串坐标的差值 max_off = max_off > abs(mj - i) ? max_off : abs(mj - i); // 取得最大分值时query和target对应字符串坐标的差值
} }
else if (0) //(zdrop > 0) // 当前轮匹配之后取得的最大分值没有大于之前的最大值而且zdrop值大于0 else if (0) // zdrop > 0) // 当前轮匹配之后取得的最大分值没有大于之前的最大值而且zdrop值大于0
{ {
if (i - max_i > mj - max_j) if (i - max_i > mj - max_j)
{ {
@ -130,9 +132,10 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
beg = j; beg = j;
for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j) for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j)
; ;
end = j + 2 < qlen ? j + 2 : qlen; end = j + 2 < qlen ? j + 2 : qlen; // 剪枝没考虑f即insert
beg = 0; beg = 0, end = qlen; // uncomment this line for debugging
end = qlen; // uncomment this line for debugging // fprintf(stderr, "\n");
// fprintf(stderr, "%d\n", end);
} }
free(eh); free(eh);
free(qp); free(qp);

65
main.c
View File

@ -26,7 +26,13 @@ int64_t get_mseconds()
int64_t time_sw_normal = 0, int64_t time_sw_normal = 0,
time_sw_avx2 = 0, time_sw_avx2 = 0,
time_sw_avx2_u8 = 0; time_sw_avx2_u8 = 0,
time_bsw_avx2 = 0,
time_bsw_init = 0,
time_bsw_main_loop = 0,
time_bsw_find_max = 0,
time_bsw_adjust_bound = 0,
time_compare = 0;
#endif #endif
@ -111,18 +117,18 @@ int main(int argc, char *argv[])
// const char *qf_path = "q.fa"; // const char *qf_path = "q.fa";
// const char *tf_path = "t.fa"; // const char *tf_path = "t.fa";
// const char *if_path = "i.txt"; // const char *if_path = "i.txt";
const char *qf_path = "bug_q.fa"; // const char *qf_path = "bug_q.fa";
const char *tf_path = "bug_t.fa"; // const char *tf_path = "bug_t.fa";
const char *if_path = "bug_i.txt"; // const char *if_path = "bug_i.txt";
// const char *qf_path = "/public/home/zzh/data/sw/q_s.fa"; // const char *qf_path = "/public/home/zzh/data/sw/q_s.fa";
// const char *tf_path = "/public/home/zzh/data/sw/t_s.fa"; // const char *tf_path = "/public/home/zzh/data/sw/t_s.fa";
// const char *if_path = "/public/home/zzh/data/sw/i_s.txt"; // const char *if_path = "/public/home/zzh/data/sw/i_s.txt";
// const char *qf_path = "/public/home/zzh/data/sw/q_m.fa"; // const char *qf_path = "/public/home/zzh/data/sw/q_m.fa";
// const char *tf_path = "/public/home/zzh/data/sw/t_m.fa"; // const char *tf_path = "/public/home/zzh/data/sw/t_m.fa";
// const char *if_path = "/public/home/zzh/data/sw/i_m.txt"; // const char *if_path = "/public/home/zzh/data/sw/i_m.txt";
// const char *qf_path = "/public/home/zzh/data/sw/q_l.fa"; const char *qf_path = "/public/home/zzh/data/sw/q_l.fa";
// const char *tf_path = "/public/home/zzh/data/sw/t_l.fa"; const char *tf_path = "/public/home/zzh/data/sw/t_l.fa";
// const char *if_path = "/public/home/zzh/data/sw/i_l.txt"; const char *if_path = "/public/home/zzh/data/sw/i_l.txt";
// const char *qf_path = "/public/home/zzh/data/sw/query.fa"; // const char *qf_path = "/public/home/zzh/data/sw/query.fa";
// const char *tf_path = "/public/home/zzh/data/sw/target.fa"; // const char *tf_path = "/public/home/zzh/data/sw/target.fa";
// const char *if_path = "/public/home/zzh/data/sw/info.txt"; // const char *if_path = "/public/home/zzh/data/sw/info.txt";
@ -134,6 +140,7 @@ int main(int argc, char *argv[])
FILE *normal_out_f = fopen("normal_out.txt", "w"); FILE *normal_out_f = fopen("normal_out.txt", "w");
FILE *avx2_out_f = fopen("avx2_out.txt", "w"); FILE *avx2_out_f = fopen("avx2_out.txt", "w");
FILE *avx2_u8_out_f = fopen("avx2_u8_out.txt", "w"); FILE *avx2_u8_out_f = fopen("avx2_u8_out.txt", "w");
FILE *bsw_avx2_out_f = fopen("bsw_avx2_out.txt", "w");
// 每次读取一定量的数据,然后执行,直到处理完所有数据 // 每次读取一定量的数据,然后执行,直到处理完所有数据
int total_line_num = 0; // 目前处理的总的数据行数 int total_line_num = 0; // 目前处理的总的数据行数
@ -155,8 +162,8 @@ int main(int argc, char *argv[])
j += 3; j += 3;
} }
int score_normal = 0, score_avx2 = 0, score_avx2_u8 = 0; int score_normal = 0, score_avx2 = 0, score_avx2_u8 = 0, score_bsw_avx2 = 0;
int score_normal_total = 0, score_avx2_total = 0, score_avx2_u8_total = 0; int score_normal_total = 0, score_avx2_total = 0, score_avx2_u8_total = 0, score_bsw_avx2_total = 0;
while (!feof(target_f)) while (!feof(target_f))
{ {
@ -237,8 +244,28 @@ int main(int argc, char *argv[])
#endif #endif
score_normal_total += score_normal; score_normal_total += score_normal;
// fprintf(normal_out_f, "%d %d\n", info_arr[i][2], score_normal); // fprintf(normal_out_f, "%d %d\n", info_arr[i][2], score_normal);
fprintf(normal_out_f, "%d %d %d %d %d %d %d\n", info_arr[i][2], score_normal, qle, tle, gtle, gscore, max_off[0]); // fprintf(stderr, "%d %d %d %d %d %d %d\n", info_arr[i][2], score_normal, qle, tle, gtle, gscore, max_off[0]);
#ifdef SHOW_PERF
start_time = get_mseconds();
#endif
score_bsw_avx2 = bsw_avx2(
info_arr[i][0],
(uint8_t *)query_arr + cur_query_pos,
info_arr[i][1],
(uint8_t *)target_arr + cur_target_pos,
0, 5, mat, 6, 1, 6, 1,
1, 4,
100, 5, 100,
info_arr[i][2],
&qle, &tle, &gtle, &gscore, &max_off[0]);
#ifdef SHOW_PERF
time_bsw_avx2 += get_mseconds() - start_time;
#endif
score_bsw_avx2_total += score_bsw_avx2;
// fprintf(avx2_out_f, "%d %d\n", info_arr[i][2], score_avx2);
// fprintf(stderr, "%d %d %d %d %d %d %d\n", info_arr[i][2], score_bsw_avx2_total, qle, tle, gtle, gscore, max_off[0]);
/*
#ifdef SHOW_PERF #ifdef SHOW_PERF
start_time = get_mseconds(); start_time = get_mseconds();
#endif #endif
@ -257,7 +284,8 @@ int main(int argc, char *argv[])
#endif #endif
score_avx2_total += score_avx2; score_avx2_total += score_avx2;
// fprintf(avx2_out_f, "%d %d\n", info_arr[i][2], score_avx2); // fprintf(avx2_out_f, "%d %d\n", info_arr[i][2], score_avx2);
fprintf(avx2_out_f, "%d %d %d %d %d %d %d\n", info_arr[i][2], score_avx2, qle, tle, gtle, gscore, max_off[0]); fprintf(stderr, "%d %d %d %d %d %d %d\n", info_arr[i][2], score_avx2, qle, tle, gtle, gscore, max_off[0]);
*/
#ifdef SHOW_PERF #ifdef SHOW_PERF
start_time = get_mseconds(); start_time = get_mseconds();
#endif #endif
@ -275,8 +303,8 @@ int main(int argc, char *argv[])
time_sw_avx2_u8 += get_mseconds() - start_time; time_sw_avx2_u8 += get_mseconds() - start_time;
#endif #endif
score_avx2_u8_total += score_avx2_u8; score_avx2_u8_total += score_avx2_u8;
fprintf(avx2_u8_out_f, "%d %d %d %d %d %d\n", score_avx2_u8, qle, tle, gtle, gscore, max_off[0]); // fprintf(avx2_u8_out_f, "%d %d %d %d %d %d\n", score_avx2_u8, qle, tle, gtle, gscore, max_off[0]);
// 更新query和target位置信息 // 更新query和target位置信息
cur_query_pos += info_arr[i][0]; cur_query_pos += info_arr[i][0];
cur_target_pos += info_arr[i][1]; cur_target_pos += info_arr[i][1];
// fprintf(stderr, "%d %d %d %d %d %d %d\n", score_normal, qle, tle, gtle, gscore, max_off[0], max_off[1]); // fprintf(stderr, "%d %d %d %d %d %d %d\n", score_normal, qle, tle, gtle, gscore, max_off[0], max_off[1]);
@ -343,8 +371,15 @@ int main(int argc, char *argv[])
#ifdef SHOW_PERF #ifdef SHOW_PERF
fprintf(stderr, "time_sw_normal: %f s; score: %d\n", time_sw_normal / 1000.0, score_normal_total); fprintf(stderr, "time_sw_normal: %f s; score: %d\n", time_sw_normal / 1000.0, score_normal_total);
fprintf(stderr, "time_sw_avx2: %f s; score: %d\n", time_sw_avx2 / 1000.0, score_avx2_total); fprintf(stderr, "time_bsw_avx2: %f s; score: %d\n", time_bsw_avx2 / 1000.0, score_bsw_avx2_total);
// fprintf(stderr, "time_sw_avx2: %f s; score: %d\n", time_sw_avx2 / 1000.0, score_avx2_total);
fprintf(stderr, "time_sw_avx2_u8: %f s; score: %d\n", time_sw_avx2_u8 / 1000.0, score_avx2_u8_total); fprintf(stderr, "time_sw_avx2_u8: %f s; score: %d\n", time_sw_avx2_u8 / 1000.0, score_avx2_u8_total);
fprintf(stderr, "time_bsw_init: %f s\n", time_bsw_init / 1000.0);
fprintf(stderr, "time_bsw_main_loop: %f s\n", (time_bsw_main_loop - time_compare) / 1000.0);
fprintf(stderr, "time_bsw_find_max: %f s\n", (time_bsw_find_max - time_compare) / 1000.0);
fprintf(stderr, "time_bsw_adjust_bound: %f s\n", (time_bsw_adjust_bound - time_compare) / 1000.0);
fprintf(stderr, "time_compare: %f s\n", time_compare / 1000.0);
#endif #endif
if (query_f != 0) if (query_f != 0)
@ -359,4 +394,6 @@ int main(int argc, char *argv[])
fclose(avx2_u8_out_f); fclose(avx2_u8_out_f);
if (normal_out_f != 0) if (normal_out_f != 0)
fclose(normal_out_f); fclose(normal_out_f);
if (bsw_avx2_out_f != 0)
fclose(bsw_avx2_out_f);
} }