sw_perf/main.c

428 lines
17 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <assert.h>
#include <time.h>
#include "sys/time.h"
#include "thread_mem.h"
#define SW_NORMAL 0
#define SW_AVX2 1
#define SW_CUDA 2
#define SW_ALL 3
#define BLOCK_BUF_SIZE 1048576
#define READ_BUF_SIZE 2048
#define SEQ_BUF_SIZE (BLOCK_BUF_SIZE + READ_BUF_SIZE)
#define DIVIDE_BY (CLOCKS_PER_SEC * 1.0)
#ifdef SHOW_PERF
// Debug-only timing helper, used to measure the run time of sections of
// interest.
//
// Despite its name, the value returned is the raw clock() reading (processor
// time in clock ticks), not milliseconds. Callers in this file convert a
// difference of two readings to seconds by dividing by DIVIDE_BY
// (CLOCKS_PER_SEC * 1.0). A gettimeofday()-based wall-clock variant existed
// previously but is not in use.
int64_t get_mseconds()
{
    const clock_t ticks = clock();
    return ticks;
}
// Accumulated timing counters, in the units returned by get_mseconds().
// main() adds to the time_sw_* counters around each kernel call and prints
// time_bsw_avx2 and time_bsw_init; the remaining time_bsw_* counters and
// time_compare are not written in this file (presumably updated by the kernel
// implementations - verify against those sources).
int64_t time_sw_normal = 0;
int64_t time_sw_avx2 = 0;
int64_t time_sw_avx2_u8 = 0;
int64_t time_sw_avx2_u8_aligned = 0;
int64_t time_bsw_avx2 = 0;
int64_t time_bsw_init = 0;
int64_t time_bsw_main_loop = 0;
int64_t time_bsw_find_max = 0;
int64_t time_bsw_adjust_bound = 0;
int64_t time_compare = 0;
#endif
// Alignment kernels exercised by main(). They are defined in other translation
// units and only declared here. Each returns an int that main() accumulates as
// a score total; the trailing int* parameters are presumably outputs (main()
// passes addresses of otherwise uninitialised locals) - confirm against the
// kernel sources.
//
// NOTE(review): main() also calls bsw_avx2() and ksw_avx2_u8_aligned(), which
// have no prototype in this file. Confirm that thread_mem.h (or another header)
// declares them; otherwise those calls rely on implicit declarations, which are
// not valid since C99.

// Scalar kernel (takes no is_left / a / b parameters).
extern int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off);
// AVX2 kernel. Declared but not called by the active code in main().
extern int ksw_avx2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int is_left, int m, const int8_t *mat, int o_del, int e_del,
int o_ins, int e_ins, int a, int b, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off);
// AVX2 kernel, "u8" variant (same parameter list as ksw_avx2).
extern int ksw_avx2_u8(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int is_left, int m, const int8_t *mat, int o_del, int e_del,
int o_ins, int e_ins, int a, int b, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off);
// Lookup table from an ASCII byte to a base code:
//   'A'/'a' -> 0, 'C'/'c' -> 1, 'G'/'g' -> 2, 'T'/'t' -> 3,
//   '-' -> 5, every other byte -> 4.
unsigned char nst_nt4_table[256] = {
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
// Reverse mapping from codes 0..3 to base letters. The array has five slots
// but only four initialisers, so index 4 holds '\0'.
// NOTE(review): confirm whether slot 4 was meant to be 'N'.
char t_2bit2char[5] = {'A', 'C', 'G', 'T'};

// Convert the NUL-terminated base string `str` in place to the codes of
// nst_nt4_table (the "2-bit" encoding, although codes 4 and 5 also occur).
//
// The length is measured once, before any byte is rewritten: 'A'/'a' encode to
// 0, so re-measuring with strlen() during the loop would stop the conversion
// at the first 'A'. For the same reason the result is no longer a C string;
// callers must remember the length they had before the call and must copy the
// data with memcpy(), not strcpy()/strncpy().
//
// Bytes are indexed as unsigned char so values >= 0x80 cannot produce a
// negative table index. A NULL `str` is ignored.
void convert_char_to_2bit(char *str)
{
    if (str == NULL)
        return;

    const size_t len = strlen(str);
    for (size_t i = 0; i < len; ++i)
        str[i] = (char)nst_nt4_table[(unsigned char)str[i]];
}
// Read test data.
// Currently a placeholder: it performs no I/O and always returns 0.
int read_data()
{
    enum { READ_DATA_OK = 0 };
    return READ_DATA_OK;
}
// Timing helpers: with SHOW_PERF they sample get_mseconds() around a kernel
// call and add the elapsed ticks to the given accumulator; without it they
// expand to nothing, so the accumulator is never evaluated.
#ifdef SHOW_PERF
#define SW_PERF_BEGIN() (perf_start = get_mseconds())
#define SW_PERF_END(acc) ((acc) += get_mseconds() - perf_start)
#else
#define SW_PERF_BEGIN() ((void)0)
#define SW_PERF_END(acc) ((void)0)
#endif

// Read one line from `fp` into `line_buf` (capacity READ_BUF_SIZE), drop the
// trailing newline if present, and write the nst_nt4_table code of every
// remaining byte to `dst`.
//
// Returns the number of codes written (0 for an empty line), or -1 when no
// more data can be read (end of file or read error).
//
// The codes are written with an explicit length because 'A' encodes to 0:
// string functions such as strlen()/strncpy() would truncate the sequence at
// the first 'A'. A line longer than READ_BUF_SIZE - 1 bytes is returned in
// several pieces, one per call.
static int read_encoded_line(FILE *fp, char *line_buf, char *dst)
{
    // Defined earlier in this file; redeclared so the helper names everything
    // it depends on.
    extern unsigned char nst_nt4_table[256];

    if (fgets(line_buf, READ_BUF_SIZE, fp) == NULL)
        return -1;

    size_t len = strlen(line_buf);
    if (len > 0 && line_buf[len - 1] == '\n')
        --len;

    for (size_t k = 0; k < len; ++k)
        dst[k] = (char)nst_nt4_table[(unsigned char)line_buf[k]];

    return (int)len;
}

// Program entry point.
//
// Streams three line-aligned input files block by block: query sequences,
// target sequences, and an info file with "qlen tlen h0" per line. Every
// record is run through each alignment kernel, accumulating the returned
// scores and, with SHOW_PERF, the time spent in each kernel.
//
// Usage: prog [query_file target_file info_file]
// Without three path arguments the built-in default paths are used.
//
// Returns 0 on success and EXIT_FAILURE when an input file cannot be opened,
// memory cannot be allocated, or the input is inconsistent. If the query or
// info file has fewer lines than the target file, a warning is printed and
// only the complete records are processed.
int main(int argc, char *argv[])
{
    // 5x5 substitution matrix passed to the kernels together with m = 5.
    int8_t mat[25] = {1, -4, -4, -4, -1,
                      -4, 1, -4, -4, -1,
                      -4, -4, 1, -4, -1,
                      -4, -4, -4, 1, -1,
                      -1, -1, -1, -1, -1};
    int max_off[2];
    int qle, tle, gtle, gscore;

    int rc = EXIT_FAILURE;
    int input_done = 0;  // set once the query or info file runs short
    long pairs_done = 0; // records processed so far, for diagnostics
    int i;

    // Upper bound on records per block; this is the number of records the
    // info table could index in the original layout.
    const int max_block_lines = BLOCK_BUF_SIZE / 3 + 1;

    char read_buf[READ_BUF_SIZE]; // line buffer for file reads
    char *query_arr = NULL;       // encoded query bases of the current block
    char *target_arr = NULL;      // encoded target bases of the current block
    int(*info_arr)[3] = NULL;     // per record: {qlen, tlen, h0}

    FILE *query_f = NULL, *target_f = NULL, *info_f = NULL;
    FILE *normal_out_f = NULL, *avx2_out_f = NULL;
    FILE *avx2_u8_out_f = NULL, *bsw_avx2_out_f = NULL;

    int score_normal_total = 0, score_avx2_total = 0, score_avx2_u8_total = 0;
    int score_avx2_u8_aligned_total = 0;
    // No active kernel call feeds this total; it is reported as 0.
    int score_bsw_avx2_total = 0;

    const char *qf_path = "/home/zzh/data/sw/q_l.fa";
    const char *tf_path = "/home/zzh/data/sw/t_l.fa";
    const char *if_path = "/home/zzh/data/sw/i_l.txt";

#ifdef SHOW_PERF
    int64_t perf_start = 0;
#endif

    thread_mem_t tmem, tmem_u8;
    init_thread_mem(&tmem);
    init_thread_mem(&tmem_u8);

    if (argc >= 4)
    {
        qf_path = argv[1];
        tf_path = argv[2];
        if_path = argv[3];
    }

    query_arr = malloc(SEQ_BUF_SIZE);
    target_arr = malloc(SEQ_BUF_SIZE);
    info_arr = malloc((size_t)max_block_lines * sizeof *info_arr);
    if (query_arr == NULL || target_arr == NULL || info_arr == NULL)
    {
        fprintf(stderr, "error: out of memory\n");
        goto cleanup;
    }

    query_f = fopen(qf_path, "r");
    if (query_f == NULL)
    {
        perror(qf_path);
        goto cleanup;
    }
    target_f = fopen(tf_path, "r");
    if (target_f == NULL)
    {
        perror(tf_path);
        goto cleanup;
    }
    info_f = fopen(if_path, "r");
    if (info_f == NULL)
    {
        perror(if_path);
        goto cleanup;
    }

    // Files for per-alignment debug dumps. They are created/truncated here but
    // no active code writes to them; failing to open one is not an error.
    normal_out_f = fopen("normal_out.txt", "w");
    avx2_out_f = fopen("avx2_out.txt", "w");
    avx2_u8_out_f = fopen("avx2_u8_out.txt", "w");
    bsw_avx2_out_f = fopen("bsw_avx2_out.txt", "w");

    // Process the input one block at a time until the target file is used up.
    while (!input_done)
    {
        // Targets usually need the most storage, so they decide how many
        // records go into this block; queries and info follow that count.
        int block_line_num = 0;
        int target_bytes = 0;
        while (block_line_num < max_block_lines && target_bytes < BLOCK_BUF_SIZE)
        {
            const int n = read_encoded_line(target_f, read_buf, target_arr + target_bytes);
            if (n < 0)
                break;
            target_bytes += n;
            ++block_line_num;
        }
        if (block_line_num == 0)
            break; // end of input (or read error): nothing left to process

        // Queries.
        int query_bytes = 0;
        for (i = 0; i < block_line_num; ++i)
        {
            // The next line may need up to READ_BUF_SIZE - 1 bytes.
            if (query_bytes > SEQ_BUF_SIZE - READ_BUF_SIZE)
            {
                fprintf(stderr, "error: query sequences of one block do not fit in %d bytes\n",
                        SEQ_BUF_SIZE);
                goto cleanup;
            }
            const int n = read_encoded_line(query_f, read_buf, query_arr + query_bytes);
            if (n < 0)
            {
                fprintf(stderr, "warning: query file ends after %ld records; stopping there\n",
                        pairs_done + i);
                block_line_num = i;
                input_done = 1;
                break;
            }
            query_bytes += n;
        }

        // Info: "qlen tlen h0" per record.
        for (i = 0; i < block_line_num; ++i)
        {
            if (fgets(read_buf, READ_BUF_SIZE, info_f) == NULL)
            {
                fprintf(stderr, "warning: info file ends after %ld records; stopping there\n",
                        pairs_done + i);
                block_line_num = i;
                input_done = 1;
                break;
            }
            if (sscanf(read_buf, "%d %d %d\n", &info_arr[i][0], &info_arr[i][1], &info_arr[i][2]) != 3)
            {
                fprintf(stderr, "error: malformed info line for record %ld\n", pairs_done + i + 1);
                goto cleanup;
            }
        }

        // Run every kernel on every record of the block.
        int cur_query_pos = 0;
        int cur_target_pos = 0;
        for (i = 0; i < block_line_num; ++i, ++pairs_done)
        {
            const int qlen = info_arr[i][0];
            const int tlen = info_arr[i][1];
            const int h0 = info_arr[i][2];

            // The lengths come from the info file and drive both the kernel
            // reads and the position bookkeeping below, so reject values that
            // would run past the bases actually read for this block.
            if (qlen < 0 || tlen < 0 ||
                qlen > query_bytes - cur_query_pos ||
                tlen > target_bytes - cur_target_pos)
            {
                fprintf(stderr, "error: record %ld: lengths %d/%d exceed the sequence data read\n",
                        pairs_done + 1, qlen, tlen);
                goto cleanup;
            }

            uint8_t *query = (uint8_t *)query_arr + cur_query_pos;
            uint8_t *target = (uint8_t *)target_arr + cur_target_pos;

            // Scalar kernel: m=5, o_del=6, e_del=1, o_ins=6, e_ins=1, w=100,
            // end_bonus=5, zdrop=100.
            SW_PERF_BEGIN();
            const int score_normal = ksw_normal(
                qlen, query, tlen, target,
                5, mat, 6, 1, 6, 1, 100, 5, 100,
                h0,
                &qle, &tle, &gtle, &gscore, &max_off[0]);
            SW_PERF_END(time_sw_normal);
            score_normal_total += score_normal;

            // NOTE(review): bsw_avx2 has no prototype in this file; the
            // argument meanings are not visible here. Its time is booked under
            // time_sw_avx2, as before.
            SW_PERF_BEGIN();
            const int score_avx2 = bsw_avx2(
                qlen, query, tlen, target,
                0, 6, 1, 6, 1,
                1, 4,
                100, 5,
                h0,
                &qle, &tle, &gtle, &gscore, &max_off[0]);
            SW_PERF_END(time_sw_avx2);
            score_avx2_total += score_avx2;

            // AVX2 u8 kernel: is_left=0, m=5, o_del=6, e_del=1, o_ins=6,
            // e_ins=1, a=1, b=4, w=100, end_bonus=5, zdrop=100.
            SW_PERF_BEGIN();
            const int score_avx2_u8 = ksw_avx2_u8(
                qlen, query, tlen, target,
                0, 5, mat, 6, 1, 6, 1,
                1, 4,
                100, 5, 100,
                h0,
                &qle, &tle, &gtle, &gscore, &max_off[0]);
            SW_PERF_END(time_sw_avx2_u8);
            score_avx2_u8_total += score_avx2_u8;

            // NOTE(review): ksw_avx2_u8_aligned has no prototype in this file
            // either; it takes the thread memory pool as first argument.
            SW_PERF_BEGIN();
            const int score_avx2_u8_aligned = ksw_avx2_u8_aligned(
                &tmem_u8,
                qlen, query, tlen, target,
                0, 5, mat, 6, 1, 6, 1,
                1, 4,
                100, 5, 100,
                h0,
                &qle, &tle, &gtle, &gscore, &max_off[0]);
            SW_PERF_END(time_sw_avx2_u8_aligned);
            score_avx2_u8_aligned_total += score_avx2_u8_aligned;

            // Advance to the next record's bases.
            cur_query_pos += qlen;
            cur_target_pos += tlen;
        }
    }

#ifdef SHOW_PERF
    fprintf(stderr, "time_sw_normal: %f s; score: %d\n", time_sw_normal / DIVIDE_BY, score_normal_total);
    fprintf(stderr, "time_bsw_avx2: %f s; score: %d\n", time_bsw_avx2 / DIVIDE_BY, score_bsw_avx2_total);
    fprintf(stderr, "time_sw_avx2: %f s; score: %d\n", time_sw_avx2 / DIVIDE_BY, score_avx2_total);
    fprintf(stderr, "time_sw_avx2_u8: %f s; score: %d\n", time_sw_avx2_u8 / DIVIDE_BY, score_avx2_u8_total);
    fprintf(stderr, "time_sw_avx2_u8_aligned: %f s; score: %d\n", time_sw_avx2_u8_aligned / DIVIDE_BY, score_avx2_u8_aligned_total);
    fprintf(stderr, "thread mem capacity: %d\t%d\n", tmem.capacity, tmem_u8.capacity);
    fprintf(stderr, "time_bsw_init: %f s\n", time_bsw_init / DIVIDE_BY);
#endif

    rc = 0;

cleanup:
    if (query_f != NULL)
        fclose(query_f);
    if (target_f != NULL)
        fclose(target_f);
    if (info_f != NULL)
        fclose(info_f);
    if (avx2_out_f != NULL)
        fclose(avx2_out_f);
    if (avx2_u8_out_f != NULL)
        fclose(avx2_u8_out_f);
    if (normal_out_f != NULL)
        fclose(normal_out_f);
    if (bsw_avx2_out_f != NULL)
        fclose(bsw_avx2_out_f);
    free(query_arr);
    free(target_arr);
    free(info_arr);
    return rc;
}