diff --git a/align.c b/align.c index a723a77..c9370e7 100644 --- a/align.c +++ b/align.c @@ -25,31 +25,31 @@ const kswr_t g_defr = {0, -1, -1, -1, -1, -1, -1}; int i; \ kswr_t score; \ for (i = sp; i < ep; ++i) \ - { if (kv_A(kv_A(i_arr, i), 0) < 144) continue; \ + { if (kv_A(kv_A(i_arr, i), 0) < 144) continue; \ kswq_sse_t *q = aln_sse_qinit(&bmem[0], nbyte, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat); \ - score = func(&bmem[1], q, \ + score = func(&bmem[1], q, \ kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, \ 6, 1, 6, 1, xtra); \ - score_total[kernel_id] += score.score; \ + score_total[kernel_id] += score.score; \ byte_mem_clear(&bmem[0]); /* free(q); */ \ } \ PROF_END(gprof[kernel_prof_idx[kernel_id]], align); \ } while (0) -#define ALIGN_PERFORMANCE_TEST_AVX2(kernel_id, nbyte, func, sp, ep) \ - do { \ - PROF_START(align); \ - int i; \ - kswr_t score; \ - for (i = sp; i < ep; ++i) { \ - /*if (kv_A(kv_A(i_arr, i), 0) < 144) \ - continue; */ \ +#define ALIGN_PERFORMANCE_TEST_AVX2(kernel_id, nbyte, func, sp, ep) \ + do { \ + PROF_START(align); \ + int i; \ + kswr_t score; \ + for (i = sp; i < ep; ++i) { \ + if (kv_A(kv_A(i_arr, i), 0) < 144) \ + continue; \ kswq_avx2_t *q = aln_avx2_qinit(&bmem[0], nbyte, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat); \ - score = func(&bmem[1], q, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 6, 1, 6, 1, xtra); \ - score_total[kernel_id] += score.score; \ - byte_mem_clear(&bmem[0]); /* free(q); */ \ - } \ - PROF_END(gprof[kernel_prof_idx[kernel_id]], align); \ + score = func(&bmem[1], q, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 6, 1, 6, 1, xtra); \ + score_total[kernel_id] += score.score; \ + byte_mem_clear(&bmem[0]); /* free(q); */ \ + } \ + PROF_END(gprof[kernel_prof_idx[kernel_id]], align); \ } while (0) // sse ksw init @@ -233,6 +233,7 @@ int main_align(int argc, char *argv[]) ALIGN_PERFORMANCE_TEST(1, 1, align_sse_u8, 0, align_lines); ALIGN_PERFORMANCE_TEST_AVX2(2, 2, align_avx2_i16, 0, align_lines); ALIGN_PERFORMANCE_TEST_AVX2(3, 1, align_avx2_u8, 0, align_lines); + #if 0 // compare the score2 of i16 and u8 { @@ -262,8 +263,8 @@ int main_align(int argc, char *argv[]) } #ifdef SHOW_PERF - fprintf(stderr, "[align sse i16] time: %9.6lf s; score: %ld\n", gprof[G_ALN_I16] / TIME_DIVIDE_BY, gdata[G_ALN_I16]); - fprintf(stderr, "[align sse u8 ] time: %9.6lf s; score: %ld\n", gprof[G_ALN_U8] / TIME_DIVIDE_BY, gdata[G_ALN_U8]); + fprintf(stderr, "[align avx i16] time: %9.6lf s; score: %ld\n", gprof[G_ALN_I16] / TIME_DIVIDE_BY, gdata[G_ALN_I16]); + fprintf(stderr, "[align avx u8 ] time: %9.6lf s; score: %ld\n", gprof[G_ALN_U8] / TIME_DIVIDE_BY, gdata[G_ALN_U8]); fprintf(stderr, "[align avx2 i16] time: %9.6lf s; score: %ld\n", gprof[G_ALN_AVX2_I16] / TIME_DIVIDE_BY, gdata[G_ALN_AVX2_I16]); fprintf(stderr, "[align avx2 u8 ] time: %9.6lf s; score: %ld\n", gprof[G_ALN_AVX2_U8] / TIME_DIVIDE_BY, gdata[G_ALN_AVX2_U8]); #endif diff --git a/align.h b/align.h index b20d467..d0aa789 100644 --- a/align.h +++ b/align.h @@ -51,6 +51,5 @@ kswr_t align_avx2_u8(byte_mem_t *bmem, kswq_avx2_t *q, int tlen, const uint8_t * int _e_ins, int xtra); kswr_t align_avx2_i16(byte_mem_t *bmem, kswq_avx2_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra); - int main_align(int argc, char *argv[]); -#endif \ No newline at end of file +#endif diff --git a/extend_avx2_i16.c b/extend_avx2_i16.c index 6a2c2bb..d6770e5 100644 --- a/extend_avx2_i16.c +++ b/extend_avx2_i16.c @@ -321,6 +321,7 @@ int extend_avx2_i16(byte_mem_t *bmem, } ins[0][0] = del[0][0] = score[0][0] = h0; #endif + w = 1000; for (D = 1; LIKELY(D < Dloop); ++D) { // 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况 diff --git a/extend_avx2_u8.c b/extend_avx2_u8.c index 82f3591..cfb2244 100644 --- a/extend_avx2_u8.c +++ b/extend_avx2_u8.c @@ -175,7 +175,7 @@ static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \ max_vec = _mm256_max_epu8(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \ m = MAX(m, maxVal[0]); \ - if (maxVal[0] > 0 && m >= max) \ + /*if (maxVal[0] > 0 && m >= max) \ { \ for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \ { \ @@ -189,7 +189,7 @@ static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, mi = i - 1 - pos; \ } \ } \ - } + }*/ // 每轮迭代后,交换数组 #define SWAP_DATA_POINTER \ @@ -340,6 +340,7 @@ int extend_avx2_u8(byte_mem_t *bmem, #endif #endif + w = 1000; for (D = 1; LIKELY(D < Dloop); ++D) { // 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况 diff --git a/extend_scalar.c b/extend_scalar.c index 1848090..15714cb 100644 --- a/extend_scalar.c +++ b/extend_scalar.c @@ -70,6 +70,7 @@ int extend_scalar(byte_mem_t *bmem, int qlen, const uint8_t *query, int tlen, co max_ie = -1, gscore = -1; max_off = 0; beg = 0, end = qlen; + w = 1000; for (i = 0; LIKELY(i < tlen); ++i) // 对target逐个字符进行遍历 { int t, f = 0, h1, m = 0, mj = -1; diff --git a/profiling.h b/profiling.h index 3bb55e8..5427f26 100644 --- a/profiling.h +++ b/profiling.h @@ -44,4 +44,4 @@ enum // get current milli seconds uint64_t get_msec(); -#endif \ No newline at end of file +#endif