/********************************************************************************************* Description: stripped sw align functions in bwa-mem Copyright : All right reserved by NCIC.ICT Author : Zhang Zhonghai Date : 2024/04/11 ***********************************************************************************************/ // u8和i16 score2 结果有不同,不是算法问题,而是边界未清零 #include #include #include "byte_alloc.h" #include "utils.h" #include "profiling.h" #include "align.h" #include "debug.h" const kswr_t g_defr = {0, -1, -1, -1, -1, -1, -1}; #define ALIGN_PERFORMANCE_TEST(kernel_id, nbyte, func, sp, ep) \ do \ { \ PROF_START(align); \ int i; \ kswr_t score; \ for (i = sp; i < ep; ++i) \ { if (kv_A(kv_A(i_arr, i), 0) < 144) continue; \ kswq_sse_t *q = aln_sse_qinit(&bmem[0], nbyte, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat); \ score = func(&bmem[1], q, \ kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, \ 6, 1, 6, 1, xtra); \ score_total[kernel_id] += score.score; \ byte_mem_clear(&bmem[0]); /* free(q); */ \ } \ PROF_END(gprof[kernel_prof_idx[kernel_id]], align); \ } while (0) #define ALIGN_PERFORMANCE_TEST_AVX2(kernel_id, nbyte, func, sp, ep) \ do { \ PROF_START(align); \ int i; \ kswr_t score; \ for (i = sp; i < ep; ++i) { \ /*if (kv_A(kv_A(i_arr, i), 0) < 144) \ continue; */ \ kswq_avx2_t *q = aln_avx2_qinit(&bmem[0], nbyte, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat); \ score = func(&bmem[1], q, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 6, 1, 6, 1, xtra); \ score_total[kernel_id] += score.score; \ byte_mem_clear(&bmem[0]); /* free(q); */ \ } \ PROF_END(gprof[kernel_prof_idx[kernel_id]], align); \ } while (0) // sse ksw init /** * Initialize the query data structure * * @param size Number of bytes used to store a score; valid valures are 1 or 2 * @param qlen Length of the query sequence * @param query Query sequence * @param m Size of the alphabet * @param mat Scoring matrix in a one-dimension array * * @return Query data structure */ kswq_sse_t *aln_sse_qinit(byte_mem_t *bmem, int size, int qlen, const uint8_t *query, int m, const int8_t *mat) { kswq_sse_t *q; int slen, a, tmp, p, memsize; size = size > 1? 2 : 1; p = 8 * (3 - size); // # values per __m128i qlen = 144; slen = (qlen + p - 1) / p; // segmented length memsize = sizeof(kswq_sse_t) + 256 + 16 * slen * (m + 4); q = (kswq_sse_t *)malloc(memsize); // a single block of memory //q = (kswq_sse_t *)byte_mem_request_and_clean(bmem, memsize); //q = (kswq_sse_t *)byte_mem_request(bmem, memsize); q->qp = (__m128i *)(((size_t)q + sizeof(kswq_sse_t) + 15) >> 4 << 4); // align memory q->H0 = q->qp + slen * m; q->H1 = q->H0 + slen; q->E = q->H1 + slen; q->Hmax = q->E + slen; q->slen = slen; q->qlen = qlen; q->size = size; //fprintf(stderr, "%d %d\n", slen, qlen); // compute shift tmp = m * m; for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score if (mat[a] < (int8_t)q->shift) q->shift = mat[a]; if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a]; } q->max = q->mdiff; q->shift = 256 - q->shift; // NB: q->shift is uint8_t q->mdiff += q->shift; // this is the difference between the min and max scores // An example: p=8, qlen=19, slen=3 and segmentation: // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}} if (size == 1) { int8_t *t = (int8_t*)q->qp; for (a = 0; a < m; ++a) { int i, k, nlen = slen * p; const int8_t *ma = mat + a * m; for (i = 0; i < slen; ++i) for (k = i; k < nlen; k += slen) // p iterations *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift; } } else { int16_t *t = (int16_t*)q->qp; for (a = 0; a < m; ++a) { int i, k, nlen = slen * p; const int8_t *ma = mat + a * m; for (i = 0; i < slen; ++i) for (k = i; k < nlen; k += slen) // p iterations *t++ = (k >= qlen? 0 : ma[query[k]]); } } return q; } /** * Initialize the query data structure * * @param size Number of bytes used to store a score; valid valures are 1 or 2 * @param qlen Length of the query sequence * @param query Query sequence * @param m Size of the alphabet (ACGTN) * @param mat Scoring matrix in a one-dimension array * * @return Query data structure */ kswq_avx2_t *aln_avx2_qinit(byte_mem_t *bmem, int size, int qlen, const uint8_t *query, int m, const int8_t *mat) { kswq_avx2_t *q; int slen, a, tmp, p; size = size > 1 ? 2 : 1; p = 16 * (3 - size); // # values per __m256i,i16是16个,u8是32个 slen = (qlen + p - 1) / p; // segmented length q = (kswq_avx2_t *)malloc(sizeof(kswq_avx2_t) + 512 + 32 * slen * (m + 4)); // a single block of memory q->qp = (__m256i *)(((size_t)q + sizeof(kswq_avx2_t) + 31) >> 5 << 5); // align memory,32字节对齐 q->H0 = q->qp + slen * m; q->H1 = q->H0 + slen; q->E = q->H1 + slen; q->Hmax = q->E + slen; q->slen = slen; q->qlen = qlen; q->size = size; // compute shift tmp = m * m; for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score if (mat[a] < (int8_t)q->shift) q->shift = mat[a]; if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a]; } q->max = q->mdiff; // (=1) q->shift = 256 - q->shift; // NB: q->shift is uint8_t // 主要用来处理uint8_t类型数据,保证score不小于0,i16不用 (=4) q->mdiff += q->shift; // this is the difference between the min and max scores (=1) // An example: p=8, qlen=19, slen=3 and segmentation: // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}} if (size == 1) { int8_t *t = (int8_t *)q->qp; for (a = 0; a < m; ++a) { int i, k, nlen = slen * p; const int8_t *ma = mat + a * m; for (i = 0; i < slen; ++i) for (k = i; k < nlen; k += slen) // p iterations *t++ = (k >= qlen ? 0 : ma[query[k]]) + q->shift; } } else { int16_t *t = (int16_t *)q->qp; for (a = 0; a < m; ++a) { int i, k, nlen = slen * p; const int8_t *ma = mat + a * m; for (i = 0; i < slen; ++i) for (k = i; k < nlen; k += slen) // p iterations *t++ = (k >= qlen ? 0 : ma[query[k]]); } } return q; } /****** * 输入说明:三个文件,query.fa, target.fa, info.txt * query.fa: 每一行代表一个query序列,由ACGTN组成 * target.fa: 每一行代表一个reference(target)序列,由ACGTN组成 * info.txt: 每一行由两个数字组成,分别代表query序列长度,target序列长度(align不需要初始分数) */ int main_align(int argc, char *argv[]) { if (argc < 3) { fprintf(stderr, "Need 3 files: query, target, info.\n"); return -1; } const char *qf_path = argv[0]; const char *tf_path = argv[1]; const char *if_path = argv[2]; FILE *qfp = fopen(qf_path, "r"); FILE *tfp = fopen(tf_path, "r"); FILE *ifp = fopen(if_path, "r"); buf_t read_buf = {0}; seq_v q_arr = {0}; seq_v t_arr = {0}; qti_v i_arr = {0}; uint64_t score_total[ALIGN_FUNC_NUM] = {0}; const int kmax_row = 3000000; /* read input files */ int query_read_row = read_seq(&q_arr, &read_buf, kmax_row, qfp); int target_read_row = read_seq(&t_arr, &read_buf, kmax_row, tfp); int info_read_row = read_qt_info(&i_arr, &read_buf, kmax_row, 2, ifp); // fprintf(stderr, "read row: %d\t%d\t%d\n", query_read_row, target_read_row, info_read_row); /* init parameters */ int8_t mat[25] = {1, -4, -4, -4, -1, -4, 1, -4, -4, -1, -4, -4, 1, -4, -1, -4, -4, -4, 1, -1, -1, -1, -1, -1, -1}; int xtra = 851987; int kernel_prof_idx[] = {G_ALN_I16, G_ALN_U8, G_ALN_AVX2_I16, G_ALN_AVX2_U8}; byte_mem_t bmem[2] = {{0}, {0}}; byte_mem_init_alloc(&bmem[0], 1024*4); byte_mem_init_alloc(&bmem[1], 1024*4); int align_lines = MIN(MIN(query_read_row, target_read_row), info_read_row); // open_qti_files(); // open_debug_files(); /* convert query sequence for sse or avx2 */ fprintf(stderr, "excute nums: %d\n", align_lines); ALIGN_PERFORMANCE_TEST(0, 2, align_sse_i16, 0, align_lines); ALIGN_PERFORMANCE_TEST(1, 1, align_sse_u8, 0, align_lines); ALIGN_PERFORMANCE_TEST_AVX2(2, 2, align_avx2_i16, 0, align_lines); ALIGN_PERFORMANCE_TEST_AVX2(3, 1, align_avx2_u8, 0, align_lines); #if 0 // compare the score2 of i16 and u8 { int i; kswr_t si16, su8; for (i = 0; i < align_lines; ++i) { kswq_sse_t *qi16 = aln_sse_qinit(&bmem[0], 2, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat); si16 = align_sse_i16(&bmem[1], qi16, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 6, 1, 6, 1, xtra); byte_mem_clear(&bmem[0]); fprintf(stderr, "split\n\n"); kswq_sse_t *qu8 = aln_sse_qinit(&bmem[0], 1, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat); su8 = align_sse_u8(&bmem[1], qu8, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 6, 1, 6, 1, xtra); byte_mem_clear(&bmem[0]); if (si16.score2 != su8.score2) { fprintf(stderr, "%d %d\n", si16.score2, su8.score2); // write_query_target_sequence(kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 0, 0); } } } #endif #if 1 int i = 0; for (; i < ARRAY_SIZE(kernel_prof_idx); ++i) { gdata[kernel_prof_idx[i]] = score_total[i]; } #ifdef SHOW_PERF fprintf(stderr, "[align sse i16] time: %9.6lf s; score: %ld\n", gprof[G_ALN_I16] / TIME_DIVIDE_BY, gdata[G_ALN_I16]); fprintf(stderr, "[align sse u8 ] time: %9.6lf s; score: %ld\n", gprof[G_ALN_U8] / TIME_DIVIDE_BY, gdata[G_ALN_U8]); fprintf(stderr, "[align avx2 i16] time: %9.6lf s; score: %ld\n", gprof[G_ALN_AVX2_I16] / TIME_DIVIDE_BY, gdata[G_ALN_AVX2_I16]); fprintf(stderr, "[align avx2 u8 ] time: %9.6lf s; score: %ld\n", gprof[G_ALN_AVX2_U8] / TIME_DIVIDE_BY, gdata[G_ALN_AVX2_U8]); #endif #endif // close_files(); fclose(qfp); fclose(tfp); fclose(ifp); return 0; }