diff --git a/Makefile b/Makefile index c9b3a6f..fba64a4 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ CC= gcc -CFLAGS= -g -Wall -Wno-unused-function -mavx2 -#CFLAGS= -Wall -Wno-unused-function -O2 -mavx2 +#CFLAGS= -g -Wall -Wno-unused-function -mavx2 +CFLAGS= -Wall -Wno-unused-function -O2 -mavx2 DFLAGS= -DSHOW_PERF -OBJS= ksw_normal.o ksw_avx2.o ksw_cuda.o ksw_avx2_u8.o +OBJS= ksw_normal.o ksw_avx2.o ksw_cuda.o ksw_avx2_u8.o bsw_avx2.o PROG= sw_perf PROG2= sw_perf_discrete INCLUDES= diff --git a/bsw_avx2.c b/bsw_avx2.c index 1054f7c..3c76c0e 100644 --- a/bsw_avx2.c +++ b/bsw_avx2.c @@ -197,7 +197,7 @@ int bsw_avx2(int qlen, // query length 待匹配段碱基的query int16_t *qtmem, *vmem; int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH; int i, iStart, D, j, k, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; - int Dloop = tlen + qlen; // 循环跳出条件 + int Dloop = tlen + qlen; // 循环跳出条件 D从1开始遍历 int span, beg1, end1; // 边界条件计算 int col_size = qlen + 2 + SIMD_WIDTH; int val_mem_size = (col_size * 9 * 2 + 31) >> 5 << 5; // 32字节的整数倍 @@ -287,7 +287,7 @@ int bsw_avx2(int qlen, // query length 待匹配段碱基的query // 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况 if (D > tlen) { - span = MIN(Dloop - D, w); + span = MIN(Dloop - D, w); // 计算的窗口,或者说范围 beg1 = MAX(D - tlen + 1, ((D - w) / 2) + 1); } else @@ -372,7 +372,7 @@ int bsw_avx2(int qlen, // query length 待匹配段碱基的query max = m, max_i = mi, max_j = mj; max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi); } - else if (zdrop > 0) + else if (0) //(zdrop > 0) { if (mi - max_i > mj - max_j) { @@ -387,24 +387,25 @@ int bsw_avx2(int qlen, // query length 待匹配段碱基的query } // 调整计算的边界 - for (j = beg; LIKELY(j <= end); ++j) - { - int has_val = hA1[j - 1] | hA2[j]; - if (has_val) - break; - } - beg = j; - for (j = end + 1; LIKELY(j >= beg); --j) - { - int has_val = hA1[j - 1] | hA2[j]; - if (has_val) - break; - else - hA0[j - 1] = 0; - } - end = j + 1 <= qlen ? j + 1 : qlen; - // beg = 0; - // end = qlen; // uncomment this line for debugging + /* for (j = beg; LIKELY(j <= end); ++j) + { + int has_val = hA1[j - 1] | hA2[j]; + if (has_val) + break; + } + beg = j; + for (j = end + 1; LIKELY(j >= beg); --j) + { + int has_val = hA1[j - 1] | hA2[j]; + if (has_val) + break; + else + hA0[j - 1] = 0; + } + end = j + 1 <= qlen ? j + 1 : qlen; + */ + beg = 0; + end = qlen; // uncomment this line for debugging m_last = m; // swap m, h, e, f SWAP_DATA_POINTER; diff --git a/ksw_avx2_u8.c b/ksw_avx2_u8.c index aeb1554..0b2b7cb 100644 --- a/ksw_avx2_u8.c +++ b/ksw_avx2_u8.c @@ -217,6 +217,14 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que int *_gscore, // query的端到端匹配得分 int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值 { +#ifdef SHOW_PERF + extern int64_t time_bsw_init; + extern int64_t time_bsw_main_loop; + extern int64_t time_bsw_find_max; + extern int64_t time_bsw_adjust_bound; + extern int64_t time_compare; + int64_t start_time = get_mseconds(); +#endif uint8_t *mA, *hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H,其他的保存上个H E F M uint8_t *seq, *ref; uint8_t *mem, *qtmem, *vmem; @@ -307,9 +315,15 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que int m_last = 0; int iend; +#ifdef SHOW_PERF + time_bsw_init += get_mseconds() - start_time; +#endif for (D = 1; LIKELY(D < Dloop); ++D) { +#ifdef SHOW_PERF + start_time = get_mseconds(); +#endif // 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况 if (D > tlen) { @@ -380,9 +394,19 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que // 存储结果 SIMD_STORE; } - +#ifdef SHOW_PERF + time_bsw_main_loop += get_mseconds() - start_time; +#endif +#ifdef SHOW_PERF + start_time = get_mseconds(); +#endif SIMD_FIND_MAX; - +#ifdef SHOW_PERF + time_bsw_find_max += get_mseconds() - start_time; +#endif +#ifdef SHOW_PERF + start_time = get_mseconds(); +#endif // 注意最后跳出循环j的值 j = end + 1; @@ -391,14 +415,14 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que max_ie = gscore > hA2[qlen] ? max_ie : iStart; gscore = gscore > hA2[qlen] ? gscore : hA2[qlen]; } - if (m == 0 && m_last == 0) - break; // 一定要注意,斜对角遍历和按列遍历的不同点 + // if (m == 0 && m_last == 0) + // break; // 一定要注意,斜对角遍历和按列遍历的不同点 if (m > max) { max = m, max_i = mi, max_j = mj; max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi); } - else if (zdrop > 0) + else if (0) // (zdrop > 0) { if (mi - max_i > mj - max_j) { @@ -430,9 +454,17 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que } end = j + 1 <= qlen ? j + 1 : qlen; + // beg = 0; + // end = qlen; + m_last = m; // swap m, h, e, f SWAP_DATA_POINTER; +#ifdef SHOW_PERF + time_bsw_adjust_bound += get_mseconds() - start_time; + start_time = get_mseconds(); + time_compare += get_mseconds() - start_time; +#endif } free(mem); diff --git a/ksw_normal.c b/ksw_normal.c index 1ef1682..d90b755 100644 --- a/ksw_normal.c +++ b/ksw_normal.c @@ -1,6 +1,7 @@ #include #include #include +#include #ifdef __GNUC__ #define LIKELY(x) __builtin_expect((x), 1) @@ -55,13 +56,13 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target, { int t, f = 0, h1, m = 0, mj = -1; int8_t *q = &qp[target[i] * qlen]; // 对于target第i个字符,query中每个字符的分值,只有匹配和不匹配 - // apply the band and the constraint (if provided) - // if (beg < i - w) // 检查开始点是否可以缩小一些 - // beg = i - w; - // if (end > i + w + 1) // 检查终点是否可以缩小,使得整体的遍历范围缩小 - // end = i + w + 1; - // if (end > qlen) // 终点不超过query长度 - // end = qlen; + // apply the band and the constraint (if provided) + if (beg < i - w) // 检查开始点是否可以缩小一些 + beg = i - w; + if (end > i + w + 1) // 检查终点是否可以缩小,使得整体的遍历范围缩小 + end = i + w + 1; + if (end > qlen) // 终点不超过query长度 + end = qlen; // compute the first column if (beg == 0) { @@ -73,11 +74,12 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target, h1 = 0; for (j = beg; LIKELY(j < end); ++j) // 遍历query字符序列 { - // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) - // Similar to SSE2-SW, cells are computed in the following order: - // H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} - // E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape // E表示delete,只消耗target - // F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape // F表示insert,只消耗query + // fprintf(stderr, "%-3d", h1); + // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) + // Similar to SSE2-SW, cells are computed in the following order: + // H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} + // E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape // E表示delete,只消耗target + // F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape // F表示insert,只消耗query,target的row id固定,query的col index一直增加 eh_t *p = &eh[j]; int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) // 获取上一轮h值和e值 p->h = h1; // set H(i,j-1) for the next row // h1是上一轮计算的结果 @@ -97,7 +99,7 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target, f -= e_ins; f = f > t ? f : t; // computed F(i,j+1) } - eh[end].h = h1; + eh[end].h = h1; // end是query序列之外的位置 eh[end].e = 0; if (j == qlen) // 此轮遍历到了query的最后一个字符 { @@ -111,7 +113,7 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target, max = m, max_i = i, max_j = mj; // 更新取得最大值的target和query的位置 max_off = max_off > abs(mj - i) ? max_off : abs(mj - i); // 取得最大分值时,query和target对应字符串坐标的差值 } - else if (0) //(zdrop > 0) // 当前轮匹配之后取得的最大分值没有大于之前的最大值,而且zdrop值大于0 + else if (0) // zdrop > 0) // 当前轮匹配之后取得的最大分值没有大于之前的最大值,而且zdrop值大于0 { if (i - max_i > mj - max_j) { @@ -130,9 +132,10 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target, beg = j; for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j) ; - end = j + 2 < qlen ? j + 2 : qlen; - beg = 0; - end = qlen; // uncomment this line for debugging + end = j + 2 < qlen ? j + 2 : qlen; // 剪枝没考虑f,即insert + beg = 0, end = qlen; // uncomment this line for debugging + // fprintf(stderr, "\n"); + // fprintf(stderr, "%d\n", end); } free(eh); free(qp); diff --git a/main.c b/main.c index 20db7cb..d6d6ac5 100644 --- a/main.c +++ b/main.c @@ -26,7 +26,13 @@ int64_t get_mseconds() int64_t time_sw_normal = 0, time_sw_avx2 = 0, - time_sw_avx2_u8 = 0; + time_sw_avx2_u8 = 0, + time_bsw_avx2 = 0, + time_bsw_init = 0, + time_bsw_main_loop = 0, + time_bsw_find_max = 0, + time_bsw_adjust_bound = 0, + time_compare = 0; #endif @@ -111,18 +117,18 @@ int main(int argc, char *argv[]) // const char *qf_path = "q.fa"; // const char *tf_path = "t.fa"; // const char *if_path = "i.txt"; - const char *qf_path = "bug_q.fa"; - const char *tf_path = "bug_t.fa"; - const char *if_path = "bug_i.txt"; + // const char *qf_path = "bug_q.fa"; + // const char *tf_path = "bug_t.fa"; + // const char *if_path = "bug_i.txt"; // const char *qf_path = "/public/home/zzh/data/sw/q_s.fa"; // const char *tf_path = "/public/home/zzh/data/sw/t_s.fa"; // const char *if_path = "/public/home/zzh/data/sw/i_s.txt"; // const char *qf_path = "/public/home/zzh/data/sw/q_m.fa"; // const char *tf_path = "/public/home/zzh/data/sw/t_m.fa"; // const char *if_path = "/public/home/zzh/data/sw/i_m.txt"; - // const char *qf_path = "/public/home/zzh/data/sw/q_l.fa"; - // const char *tf_path = "/public/home/zzh/data/sw/t_l.fa"; - // const char *if_path = "/public/home/zzh/data/sw/i_l.txt"; + const char *qf_path = "/public/home/zzh/data/sw/q_l.fa"; + const char *tf_path = "/public/home/zzh/data/sw/t_l.fa"; + const char *if_path = "/public/home/zzh/data/sw/i_l.txt"; // const char *qf_path = "/public/home/zzh/data/sw/query.fa"; // const char *tf_path = "/public/home/zzh/data/sw/target.fa"; // const char *if_path = "/public/home/zzh/data/sw/info.txt"; @@ -134,6 +140,7 @@ int main(int argc, char *argv[]) FILE *normal_out_f = fopen("normal_out.txt", "w"); FILE *avx2_out_f = fopen("avx2_out.txt", "w"); FILE *avx2_u8_out_f = fopen("avx2_u8_out.txt", "w"); + FILE *bsw_avx2_out_f = fopen("bsw_avx2_out.txt", "w"); // 每次读取一定量的数据,然后执行,直到处理完所有数据 int total_line_num = 0; // 目前处理的总的数据行数 @@ -155,8 +162,8 @@ int main(int argc, char *argv[]) j += 3; } - int score_normal = 0, score_avx2 = 0, score_avx2_u8 = 0; - int score_normal_total = 0, score_avx2_total = 0, score_avx2_u8_total = 0; + int score_normal = 0, score_avx2 = 0, score_avx2_u8 = 0, score_bsw_avx2 = 0; + int score_normal_total = 0, score_avx2_total = 0, score_avx2_u8_total = 0, score_bsw_avx2_total = 0; while (!feof(target_f)) { @@ -237,8 +244,28 @@ int main(int argc, char *argv[]) #endif score_normal_total += score_normal; // fprintf(normal_out_f, "%d %d\n", info_arr[i][2], score_normal); - fprintf(normal_out_f, "%d %d %d %d %d %d %d\n", info_arr[i][2], score_normal, qle, tle, gtle, gscore, max_off[0]); + // fprintf(stderr, "%d %d %d %d %d %d %d\n", info_arr[i][2], score_normal, qle, tle, gtle, gscore, max_off[0]); +#ifdef SHOW_PERF + start_time = get_mseconds(); +#endif + score_bsw_avx2 = bsw_avx2( + info_arr[i][0], + (uint8_t *)query_arr + cur_query_pos, + info_arr[i][1], + (uint8_t *)target_arr + cur_target_pos, + 0, 5, mat, 6, 1, 6, 1, + 1, 4, + 100, 5, 100, + info_arr[i][2], + &qle, &tle, >le, &gscore, &max_off[0]); +#ifdef SHOW_PERF + time_bsw_avx2 += get_mseconds() - start_time; +#endif + score_bsw_avx2_total += score_bsw_avx2; + // fprintf(avx2_out_f, "%d %d\n", info_arr[i][2], score_avx2); + // fprintf(stderr, "%d %d %d %d %d %d %d\n", info_arr[i][2], score_bsw_avx2_total, qle, tle, gtle, gscore, max_off[0]); +/* #ifdef SHOW_PERF start_time = get_mseconds(); #endif @@ -257,7 +284,8 @@ int main(int argc, char *argv[]) #endif score_avx2_total += score_avx2; // fprintf(avx2_out_f, "%d %d\n", info_arr[i][2], score_avx2); - fprintf(avx2_out_f, "%d %d %d %d %d %d %d\n", info_arr[i][2], score_avx2, qle, tle, gtle, gscore, max_off[0]); + fprintf(stderr, "%d %d %d %d %d %d %d\n", info_arr[i][2], score_avx2, qle, tle, gtle, gscore, max_off[0]); +*/ #ifdef SHOW_PERF start_time = get_mseconds(); #endif @@ -275,8 +303,8 @@ int main(int argc, char *argv[]) time_sw_avx2_u8 += get_mseconds() - start_time; #endif score_avx2_u8_total += score_avx2_u8; - fprintf(avx2_u8_out_f, "%d %d %d %d %d %d\n", score_avx2_u8, qle, tle, gtle, gscore, max_off[0]); - // 更新query和target位置信息 + // fprintf(avx2_u8_out_f, "%d %d %d %d %d %d\n", score_avx2_u8, qle, tle, gtle, gscore, max_off[0]); + // 更新query和target位置信息 cur_query_pos += info_arr[i][0]; cur_target_pos += info_arr[i][1]; // fprintf(stderr, "%d %d %d %d %d %d %d\n", score_normal, qle, tle, gtle, gscore, max_off[0], max_off[1]); @@ -343,8 +371,15 @@ int main(int argc, char *argv[]) #ifdef SHOW_PERF fprintf(stderr, "time_sw_normal: %f s; score: %d\n", time_sw_normal / 1000.0, score_normal_total); - fprintf(stderr, "time_sw_avx2: %f s; score: %d\n", time_sw_avx2 / 1000.0, score_avx2_total); + fprintf(stderr, "time_bsw_avx2: %f s; score: %d\n", time_bsw_avx2 / 1000.0, score_bsw_avx2_total); + // fprintf(stderr, "time_sw_avx2: %f s; score: %d\n", time_sw_avx2 / 1000.0, score_avx2_total); fprintf(stderr, "time_sw_avx2_u8: %f s; score: %d\n", time_sw_avx2_u8 / 1000.0, score_avx2_u8_total); + + fprintf(stderr, "time_bsw_init: %f s\n", time_bsw_init / 1000.0); + fprintf(stderr, "time_bsw_main_loop: %f s\n", (time_bsw_main_loop - time_compare) / 1000.0); + fprintf(stderr, "time_bsw_find_max: %f s\n", (time_bsw_find_max - time_compare) / 1000.0); + fprintf(stderr, "time_bsw_adjust_bound: %f s\n", (time_bsw_adjust_bound - time_compare) / 1000.0); + fprintf(stderr, "time_compare: %f s\n", time_compare / 1000.0); #endif if (query_f != 0) @@ -359,4 +394,6 @@ int main(int argc, char *argv[]) fclose(avx2_u8_out_f); if (normal_out_f != 0) fclose(normal_out_f); + if (bsw_avx2_out_f != 0) + fclose(bsw_avx2_out_f); } \ No newline at end of file