对avx2实现的sw的各个部分运行时间进行了测试

This commit is contained in:
zzh 2023-08-18 15:42:46 +08:00
parent 2a14c17c68
commit 53332e34e1
5 changed files with 133 additions and 60 deletions

View File

@ -1,8 +1,8 @@
CC= gcc
CFLAGS= -g -Wall -Wno-unused-function -mavx2
#CFLAGS= -Wall -Wno-unused-function -O2 -mavx2
#CFLAGS= -g -Wall -Wno-unused-function -mavx2
CFLAGS= -Wall -Wno-unused-function -O2 -mavx2
DFLAGS= -DSHOW_PERF
OBJS= ksw_normal.o ksw_avx2.o ksw_cuda.o ksw_avx2_u8.o
OBJS= ksw_normal.o ksw_avx2.o ksw_cuda.o ksw_avx2_u8.o bsw_avx2.o
PROG= sw_perf
PROG2= sw_perf_discrete
INCLUDES=

View File

@ -197,7 +197,7 @@ int bsw_avx2(int qlen, // query length 待匹配段碱基的query
int16_t *qtmem, *vmem;
int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH;
int i, iStart, D, j, k, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
int Dloop = tlen + qlen; // 循环跳出条件
int Dloop = tlen + qlen; // 循环跳出条件 D从1开始遍历
int span, beg1, end1; // 边界条件计算
int col_size = qlen + 2 + SIMD_WIDTH;
int val_mem_size = (col_size * 9 * 2 + 31) >> 5 << 5; // 32字节的整数倍
@ -287,7 +287,7 @@ int bsw_avx2(int qlen, // query length 待匹配段碱基的query
// 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况
if (D > tlen)
{
span = MIN(Dloop - D, w);
span = MIN(Dloop - D, w); // 计算的窗口,或者说范围
beg1 = MAX(D - tlen + 1, ((D - w) / 2) + 1);
}
else
@ -372,7 +372,7 @@ int bsw_avx2(int qlen, // query length 待匹配段碱基的query
max = m, max_i = mi, max_j = mj;
max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi);
}
else if (zdrop > 0)
else if (0) //(zdrop > 0)
{
if (mi - max_i > mj - max_j)
{
@ -387,24 +387,25 @@ int bsw_avx2(int qlen, // query length 待匹配段碱基的query
}
// 调整计算的边界
for (j = beg; LIKELY(j <= end); ++j)
{
int has_val = hA1[j - 1] | hA2[j];
if (has_val)
break;
}
beg = j;
for (j = end + 1; LIKELY(j >= beg); --j)
{
int has_val = hA1[j - 1] | hA2[j];
if (has_val)
break;
else
hA0[j - 1] = 0;
}
end = j + 1 <= qlen ? j + 1 : qlen;
// beg = 0;
// end = qlen; // uncomment this line for debugging
/* for (j = beg; LIKELY(j <= end); ++j)
{
int has_val = hA1[j - 1] | hA2[j];
if (has_val)
break;
}
beg = j;
for (j = end + 1; LIKELY(j >= beg); --j)
{
int has_val = hA1[j - 1] | hA2[j];
if (has_val)
break;
else
hA0[j - 1] = 0;
}
end = j + 1 <= qlen ? j + 1 : qlen;
*/
beg = 0;
end = qlen; // uncomment this line for debugging
m_last = m;
// swap m, h, e, f
SWAP_DATA_POINTER;

View File

@ -217,6 +217,14 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que
int *_gscore, // query的端到端匹配得分
int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值
{
#ifdef SHOW_PERF
extern int64_t time_bsw_init;
extern int64_t time_bsw_main_loop;
extern int64_t time_bsw_find_max;
extern int64_t time_bsw_adjust_bound;
extern int64_t time_compare;
int64_t start_time = get_mseconds();
#endif
uint8_t *mA, *hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H其他的保存上个H E F M
uint8_t *seq, *ref;
uint8_t *mem, *qtmem, *vmem;
@ -307,9 +315,15 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que
int m_last = 0;
int iend;
#ifdef SHOW_PERF
time_bsw_init += get_mseconds() - start_time;
#endif
for (D = 1; LIKELY(D < Dloop); ++D)
{
#ifdef SHOW_PERF
start_time = get_mseconds();
#endif
// 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况
if (D > tlen)
{
@ -380,9 +394,19 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que
// 存储结果
SIMD_STORE;
}
#ifdef SHOW_PERF
time_bsw_main_loop += get_mseconds() - start_time;
#endif
#ifdef SHOW_PERF
start_time = get_mseconds();
#endif
SIMD_FIND_MAX;
#ifdef SHOW_PERF
time_bsw_find_max += get_mseconds() - start_time;
#endif
#ifdef SHOW_PERF
start_time = get_mseconds();
#endif
// 注意最后跳出循环j的值
j = end + 1;
@ -391,14 +415,14 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que
max_ie = gscore > hA2[qlen] ? max_ie : iStart;
gscore = gscore > hA2[qlen] ? gscore : hA2[qlen];
}
if (m == 0 && m_last == 0)
break; // 一定要注意,斜对角遍历和按列遍历的不同点
// if (m == 0 && m_last == 0)
// break; // 一定要注意,斜对角遍历和按列遍历的不同点
if (m > max)
{
max = m, max_i = mi, max_j = mj;
max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi);
}
else if (zdrop > 0)
else if (0) // (zdrop > 0)
{
if (mi - max_i > mj - max_j)
{
@ -430,9 +454,17 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que
}
end = j + 1 <= qlen ? j + 1 : qlen;
// beg = 0;
// end = qlen;
m_last = m;
// swap m, h, e, f
SWAP_DATA_POINTER;
#ifdef SHOW_PERF
time_bsw_adjust_bound += get_mseconds() - start_time;
start_time = get_mseconds();
time_compare += get_mseconds() - start_time;
#endif
}
free(mem);

View File

@ -1,6 +1,7 @@
#include <stdint.h>
#include <stdlib.h>
#include <assert.h>
#include <stdio.h>
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x), 1)
@ -55,13 +56,13 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
{
int t, f = 0, h1, m = 0, mj = -1;
int8_t *q = &qp[target[i] * qlen]; // 对于target第i个字符query中每个字符的分值只有匹配和不匹配
// apply the band and the constraint (if provided)
// if (beg < i - w) // 检查开始点是否可以缩小一些
// beg = i - w;
// if (end > i + w + 1) // 检查终点是否可以缩小,使得整体的遍历范围缩小
// end = i + w + 1;
// if (end > qlen) // 终点不超过query长度
// end = qlen;
// apply the band and the constraint (if provided)
if (beg < i - w) // 检查开始点是否可以缩小一些
beg = i - w;
if (end > i + w + 1) // 检查终点是否可以缩小,使得整体的遍历范围缩小
end = i + w + 1;
if (end > qlen) // 终点不超过query长度
end = qlen;
// compute the first column
if (beg == 0)
{
@ -73,11 +74,12 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
h1 = 0;
for (j = beg; LIKELY(j < end); ++j) // 遍历query字符序列
{
// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
// Similar to SSE2-SW, cells are computed in the following order:
// H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
// E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape // E表示delete只消耗target
// F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape // F表示insert只消耗query
// fprintf(stderr, "%-3d", h1);
// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
// Similar to SSE2-SW, cells are computed in the following order:
// H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
// E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape // E表示delete只消耗target
// F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape // F表示insert只消耗querytarget的row id固定query的col index一直增加
eh_t *p = &eh[j];
int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) // 获取上一轮h值和e值
p->h = h1; // set H(i,j-1) for the next row // h1是上一轮计算的结果
@ -97,7 +99,7 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
f -= e_ins;
f = f > t ? f : t; // computed F(i,j+1)
}
eh[end].h = h1;
eh[end].h = h1; // end是query序列之外的位置
eh[end].e = 0;
if (j == qlen) // 此轮遍历到了query的最后一个字符
{
@ -111,7 +113,7 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
max = m, max_i = i, max_j = mj; // 更新取得最大值的target和query的位置
max_off = max_off > abs(mj - i) ? max_off : abs(mj - i); // 取得最大分值时query和target对应字符串坐标的差值
}
else if (0) //(zdrop > 0) // 当前轮匹配之后取得的最大分值没有大于之前的最大值而且zdrop值大于0
else if (0) // zdrop > 0) // 当前轮匹配之后取得的最大分值没有大于之前的最大值而且zdrop值大于0
{
if (i - max_i > mj - max_j)
{
@ -130,9 +132,10 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
beg = j;
for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j)
;
end = j + 2 < qlen ? j + 2 : qlen;
beg = 0;
end = qlen; // uncomment this line for debugging
end = j + 2 < qlen ? j + 2 : qlen; // 剪枝没考虑f即insert
beg = 0, end = qlen; // uncomment this line for debugging
// fprintf(stderr, "\n");
// fprintf(stderr, "%d\n", end);
}
free(eh);
free(qp);

65
main.c
View File

@ -26,7 +26,13 @@ int64_t get_mseconds()
int64_t time_sw_normal = 0,
time_sw_avx2 = 0,
time_sw_avx2_u8 = 0;
time_sw_avx2_u8 = 0,
time_bsw_avx2 = 0,
time_bsw_init = 0,
time_bsw_main_loop = 0,
time_bsw_find_max = 0,
time_bsw_adjust_bound = 0,
time_compare = 0;
#endif
@ -111,18 +117,18 @@ int main(int argc, char *argv[])
// const char *qf_path = "q.fa";
// const char *tf_path = "t.fa";
// const char *if_path = "i.txt";
const char *qf_path = "bug_q.fa";
const char *tf_path = "bug_t.fa";
const char *if_path = "bug_i.txt";
// const char *qf_path = "bug_q.fa";
// const char *tf_path = "bug_t.fa";
// const char *if_path = "bug_i.txt";
// const char *qf_path = "/public/home/zzh/data/sw/q_s.fa";
// const char *tf_path = "/public/home/zzh/data/sw/t_s.fa";
// const char *if_path = "/public/home/zzh/data/sw/i_s.txt";
// const char *qf_path = "/public/home/zzh/data/sw/q_m.fa";
// const char *tf_path = "/public/home/zzh/data/sw/t_m.fa";
// const char *if_path = "/public/home/zzh/data/sw/i_m.txt";
// const char *qf_path = "/public/home/zzh/data/sw/q_l.fa";
// const char *tf_path = "/public/home/zzh/data/sw/t_l.fa";
// const char *if_path = "/public/home/zzh/data/sw/i_l.txt";
const char *qf_path = "/public/home/zzh/data/sw/q_l.fa";
const char *tf_path = "/public/home/zzh/data/sw/t_l.fa";
const char *if_path = "/public/home/zzh/data/sw/i_l.txt";
// const char *qf_path = "/public/home/zzh/data/sw/query.fa";
// const char *tf_path = "/public/home/zzh/data/sw/target.fa";
// const char *if_path = "/public/home/zzh/data/sw/info.txt";
@ -134,6 +140,7 @@ int main(int argc, char *argv[])
FILE *normal_out_f = fopen("normal_out.txt", "w");
FILE *avx2_out_f = fopen("avx2_out.txt", "w");
FILE *avx2_u8_out_f = fopen("avx2_u8_out.txt", "w");
FILE *bsw_avx2_out_f = fopen("bsw_avx2_out.txt", "w");
// 每次读取一定量的数据,然后执行,直到处理完所有数据
int total_line_num = 0; // 目前处理的总的数据行数
@ -155,8 +162,8 @@ int main(int argc, char *argv[])
j += 3;
}
int score_normal = 0, score_avx2 = 0, score_avx2_u8 = 0;
int score_normal_total = 0, score_avx2_total = 0, score_avx2_u8_total = 0;
int score_normal = 0, score_avx2 = 0, score_avx2_u8 = 0, score_bsw_avx2 = 0;
int score_normal_total = 0, score_avx2_total = 0, score_avx2_u8_total = 0, score_bsw_avx2_total = 0;
while (!feof(target_f))
{
@ -237,8 +244,28 @@ int main(int argc, char *argv[])
#endif
score_normal_total += score_normal;
// fprintf(normal_out_f, "%d %d\n", info_arr[i][2], score_normal);
fprintf(normal_out_f, "%d %d %d %d %d %d %d\n", info_arr[i][2], score_normal, qle, tle, gtle, gscore, max_off[0]);
// fprintf(stderr, "%d %d %d %d %d %d %d\n", info_arr[i][2], score_normal, qle, tle, gtle, gscore, max_off[0]);
#ifdef SHOW_PERF
start_time = get_mseconds();
#endif
score_bsw_avx2 = bsw_avx2(
info_arr[i][0],
(uint8_t *)query_arr + cur_query_pos,
info_arr[i][1],
(uint8_t *)target_arr + cur_target_pos,
0, 5, mat, 6, 1, 6, 1,
1, 4,
100, 5, 100,
info_arr[i][2],
&qle, &tle, &gtle, &gscore, &max_off[0]);
#ifdef SHOW_PERF
time_bsw_avx2 += get_mseconds() - start_time;
#endif
score_bsw_avx2_total += score_bsw_avx2;
// fprintf(avx2_out_f, "%d %d\n", info_arr[i][2], score_avx2);
// fprintf(stderr, "%d %d %d %d %d %d %d\n", info_arr[i][2], score_bsw_avx2_total, qle, tle, gtle, gscore, max_off[0]);
/*
#ifdef SHOW_PERF
start_time = get_mseconds();
#endif
@ -257,7 +284,8 @@ int main(int argc, char *argv[])
#endif
score_avx2_total += score_avx2;
// fprintf(avx2_out_f, "%d %d\n", info_arr[i][2], score_avx2);
fprintf(avx2_out_f, "%d %d %d %d %d %d %d\n", info_arr[i][2], score_avx2, qle, tle, gtle, gscore, max_off[0]);
fprintf(stderr, "%d %d %d %d %d %d %d\n", info_arr[i][2], score_avx2, qle, tle, gtle, gscore, max_off[0]);
*/
#ifdef SHOW_PERF
start_time = get_mseconds();
#endif
@ -275,8 +303,8 @@ int main(int argc, char *argv[])
time_sw_avx2_u8 += get_mseconds() - start_time;
#endif
score_avx2_u8_total += score_avx2_u8;
fprintf(avx2_u8_out_f, "%d %d %d %d %d %d\n", score_avx2_u8, qle, tle, gtle, gscore, max_off[0]);
// 更新query和target位置信息
// fprintf(avx2_u8_out_f, "%d %d %d %d %d %d\n", score_avx2_u8, qle, tle, gtle, gscore, max_off[0]);
// 更新query和target位置信息
cur_query_pos += info_arr[i][0];
cur_target_pos += info_arr[i][1];
// fprintf(stderr, "%d %d %d %d %d %d %d\n", score_normal, qle, tle, gtle, gscore, max_off[0], max_off[1]);
@ -343,8 +371,15 @@ int main(int argc, char *argv[])
#ifdef SHOW_PERF
fprintf(stderr, "time_sw_normal: %f s; score: %d\n", time_sw_normal / 1000.0, score_normal_total);
fprintf(stderr, "time_sw_avx2: %f s; score: %d\n", time_sw_avx2 / 1000.0, score_avx2_total);
fprintf(stderr, "time_bsw_avx2: %f s; score: %d\n", time_bsw_avx2 / 1000.0, score_bsw_avx2_total);
// fprintf(stderr, "time_sw_avx2: %f s; score: %d\n", time_sw_avx2 / 1000.0, score_avx2_total);
fprintf(stderr, "time_sw_avx2_u8: %f s; score: %d\n", time_sw_avx2_u8 / 1000.0, score_avx2_u8_total);
fprintf(stderr, "time_bsw_init: %f s\n", time_bsw_init / 1000.0);
fprintf(stderr, "time_bsw_main_loop: %f s\n", (time_bsw_main_loop - time_compare) / 1000.0);
fprintf(stderr, "time_bsw_find_max: %f s\n", (time_bsw_find_max - time_compare) / 1000.0);
fprintf(stderr, "time_bsw_adjust_bound: %f s\n", (time_bsw_adjust_bound - time_compare) / 1000.0);
fprintf(stderr, "time_compare: %f s\n", time_compare / 1000.0);
#endif
if (query_f != 0)
@ -359,4 +394,6 @@ int main(int argc, char *argv[])
fclose(avx2_u8_out_f);
if (normal_out_f != 0)
fclose(normal_out_f);
if (bsw_avx2_out_f != 0)
fclose(bsw_avx2_out_f);
}