diff --git a/.vscode/settings.json b/.vscode/settings.json index 8fa25b5..9fa5b3b 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -23,6 +23,8 @@ "utility": "c", "byte_alloc.h": "c", "debug.h": "c", - "limits": "c" + "limits": "c", + "align.h": "c", + "extend.h": "c" } } \ No newline at end of file diff --git a/align.c b/align.c index e69de29..a723a77 100644 --- a/align.c +++ b/align.c @@ -0,0 +1,276 @@ +/********************************************************************************************* + Description: stripped sw align functions in bwa-mem + + Copyright : All right reserved by NCIC.ICT + + Author : Zhang Zhonghai + Date : 2024/04/11 +***********************************************************************************************/ +// u8和i16 score2 结果有不同,不是算法问题,而是边界未清零 +#include +#include + +#include "byte_alloc.h" +#include "utils.h" +#include "profiling.h" +#include "align.h" +#include "debug.h" + +const kswr_t g_defr = {0, -1, -1, -1, -1, -1, -1}; + +#define ALIGN_PERFORMANCE_TEST(kernel_id, nbyte, func, sp, ep) \ + do \ + { \ + PROF_START(align); \ + int i; \ + kswr_t score; \ + for (i = sp; i < ep; ++i) \ + { if (kv_A(kv_A(i_arr, i), 0) < 144) continue; \ + kswq_sse_t *q = aln_sse_qinit(&bmem[0], nbyte, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat); \ + score = func(&bmem[1], q, \ + kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, \ + 6, 1, 6, 1, xtra); \ + score_total[kernel_id] += score.score; \ + byte_mem_clear(&bmem[0]); /* free(q); */ \ + } \ + PROF_END(gprof[kernel_prof_idx[kernel_id]], align); \ + } while (0) + +#define ALIGN_PERFORMANCE_TEST_AVX2(kernel_id, nbyte, func, sp, ep) \ + do { \ + PROF_START(align); \ + int i; \ + kswr_t score; \ + for (i = sp; i < ep; ++i) { \ + /*if (kv_A(kv_A(i_arr, i), 0) < 144) \ + continue; */ \ + kswq_avx2_t *q = aln_avx2_qinit(&bmem[0], nbyte, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat); \ + score = func(&bmem[1], q, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 6, 1, 6, 1, xtra); \ + score_total[kernel_id] += score.score; \ + byte_mem_clear(&bmem[0]); /* free(q); */ \ + } \ + PROF_END(gprof[kernel_prof_idx[kernel_id]], align); \ + } while (0) + +// sse ksw init +/** + * Initialize the query data structure + * + * @param size Number of bytes used to store a score; valid valures are 1 or 2 + * @param qlen Length of the query sequence + * @param query Query sequence + * @param m Size of the alphabet + * @param mat Scoring matrix in a one-dimension array + * + * @return Query data structure + */ +kswq_sse_t *aln_sse_qinit(byte_mem_t *bmem, int size, int qlen, const uint8_t *query, int m, const int8_t *mat) +{ + kswq_sse_t *q; + int slen, a, tmp, p, memsize; + + size = size > 1? 2 : 1; + p = 8 * (3 - size); // # values per __m128i + qlen = 144; + slen = (qlen + p - 1) / p; // segmented length + memsize = sizeof(kswq_sse_t) + 256 + 16 * slen * (m + 4); + q = (kswq_sse_t *)malloc(memsize); // a single block of memory + //q = (kswq_sse_t *)byte_mem_request_and_clean(bmem, memsize); + //q = (kswq_sse_t *)byte_mem_request(bmem, memsize); + q->qp = (__m128i *)(((size_t)q + sizeof(kswq_sse_t) + 15) >> 4 << 4); // align memory + q->H0 = q->qp + slen * m; + q->H1 = q->H0 + slen; + q->E = q->H1 + slen; + q->Hmax = q->E + slen; + q->slen = slen; q->qlen = qlen; q->size = size; + //fprintf(stderr, "%d %d\n", slen, qlen); + // compute shift + tmp = m * m; + for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score + if (mat[a] < (int8_t)q->shift) q->shift = mat[a]; + if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a]; + } + q->max = q->mdiff; + q->shift = 256 - q->shift; // NB: q->shift is uint8_t + q->mdiff += q->shift; // this is the difference between the min and max scores + // An example: p=8, qlen=19, slen=3 and segmentation: + // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}} + if (size == 1) { + int8_t *t = (int8_t*)q->qp; + for (a = 0; a < m; ++a) { + int i, k, nlen = slen * p; + const int8_t *ma = mat + a * m; + for (i = 0; i < slen; ++i) + for (k = i; k < nlen; k += slen) // p iterations + *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift; + } + } else { + int16_t *t = (int16_t*)q->qp; + for (a = 0; a < m; ++a) { + int i, k, nlen = slen * p; + const int8_t *ma = mat + a * m; + for (i = 0; i < slen; ++i) + for (k = i; k < nlen; k += slen) // p iterations + *t++ = (k >= qlen? 0 : ma[query[k]]); + } + } + return q; +} + +/** + * Initialize the query data structure + * + * @param size Number of bytes used to store a score; valid valures are 1 or 2 + * @param qlen Length of the query sequence + * @param query Query sequence + * @param m Size of the alphabet (ACGTN) + * @param mat Scoring matrix in a one-dimension array + * + * @return Query data structure + */ +kswq_avx2_t *aln_avx2_qinit(byte_mem_t *bmem, int size, int qlen, const uint8_t *query, int m, const int8_t *mat) { + kswq_avx2_t *q; + int slen, a, tmp, p; + size = size > 1 ? 2 : 1; + p = 16 * (3 - size); // # values per __m256i,i16是16个,u8是32个 + slen = (qlen + p - 1) / p; // segmented length + q = (kswq_avx2_t *)malloc(sizeof(kswq_avx2_t) + 512 + 32 * slen * (m + 4)); // a single block of memory + q->qp = (__m256i *)(((size_t)q + sizeof(kswq_avx2_t) + 31) >> 5 << 5); // align memory,32字节对齐 + q->H0 = q->qp + slen * m; + q->H1 = q->H0 + slen; + q->E = q->H1 + slen; + q->Hmax = q->E + slen; + q->slen = slen; + q->qlen = qlen; + q->size = size; + // compute shift + tmp = m * m; + for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score + if (mat[a] < (int8_t)q->shift) + q->shift = mat[a]; + if (mat[a] > (int8_t)q->mdiff) + q->mdiff = mat[a]; + } + q->max = q->mdiff; // (=1) + q->shift = 256 - q->shift; // NB: q->shift is uint8_t // 主要用来处理uint8_t类型数据,保证score不小于0,i16不用 (=4) + q->mdiff += q->shift; // this is the difference between the min and max scores (=1) + // An example: p=8, qlen=19, slen=3 and segmentation: + // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}} + if (size == 1) { + int8_t *t = (int8_t *)q->qp; + for (a = 0; a < m; ++a) { + int i, k, nlen = slen * p; + const int8_t *ma = mat + a * m; + for (i = 0; i < slen; ++i) + for (k = i; k < nlen; k += slen) // p iterations + *t++ = (k >= qlen ? 0 : ma[query[k]]) + q->shift; + } + } else { + int16_t *t = (int16_t *)q->qp; + for (a = 0; a < m; ++a) { + int i, k, nlen = slen * p; + const int8_t *ma = mat + a * m; + for (i = 0; i < slen; ++i) + for (k = i; k < nlen; k += slen) // p iterations + *t++ = (k >= qlen ? 0 : ma[query[k]]); + } + } + return q; +} + +/****** + * 输入说明:三个文件,query.fa, target.fa, info.txt + * query.fa: 每一行代表一个query序列,由ACGTN组成 + * target.fa: 每一行代表一个reference(target)序列,由ACGTN组成 + * info.txt: 每一行由两个数字组成,分别代表query序列长度,target序列长度(align不需要初始分数) + */ +int main_align(int argc, char *argv[]) +{ + if (argc < 3) + { + fprintf(stderr, "Need 3 files: query, target, info.\n"); + return -1; + } + const char *qf_path = argv[0]; + const char *tf_path = argv[1]; + const char *if_path = argv[2]; + + FILE *qfp = fopen(qf_path, "r"); + FILE *tfp = fopen(tf_path, "r"); + FILE *ifp = fopen(if_path, "r"); + + buf_t read_buf = {0}; + seq_v q_arr = {0}; + seq_v t_arr = {0}; + qti_v i_arr = {0}; + uint64_t score_total[ALIGN_FUNC_NUM] = {0}; + const int kmax_row = 3000000; + + /* read input files */ + int query_read_row = read_seq(&q_arr, &read_buf, kmax_row, qfp); + int target_read_row = read_seq(&t_arr, &read_buf, kmax_row, tfp); + int info_read_row = read_qt_info(&i_arr, &read_buf, kmax_row, 2, ifp); + // fprintf(stderr, "read row: %d\t%d\t%d\n", query_read_row, target_read_row, info_read_row); + /* init parameters */ + int8_t mat[25] = {1, -4, -4, -4, -1, + -4, 1, -4, -4, -1, + -4, -4, 1, -4, -1, + -4, -4, -4, 1, -1, + -1, -1, -1, -1, -1}; + int xtra = 851987; + int kernel_prof_idx[] = {G_ALN_I16, G_ALN_U8, G_ALN_AVX2_I16, G_ALN_AVX2_U8}; + byte_mem_t bmem[2] = {{0}, {0}}; + byte_mem_init_alloc(&bmem[0], 1024*4); + byte_mem_init_alloc(&bmem[1], 1024*4); + int align_lines = MIN(MIN(query_read_row, target_read_row), info_read_row); + // open_qti_files(); + // open_debug_files(); + /* convert query sequence for sse or avx2 */ + + + fprintf(stderr, "excute nums: %d\n", align_lines); + ALIGN_PERFORMANCE_TEST(0, 2, align_sse_i16, 0, align_lines); + ALIGN_PERFORMANCE_TEST(1, 1, align_sse_u8, 0, align_lines); + ALIGN_PERFORMANCE_TEST_AVX2(2, 2, align_avx2_i16, 0, align_lines); + ALIGN_PERFORMANCE_TEST_AVX2(3, 1, align_avx2_u8, 0, align_lines); +#if 0 + // compare the score2 of i16 and u8 + { + int i; + kswr_t si16, su8; + for (i = 0; i < align_lines; ++i) + { + kswq_sse_t *qi16 = aln_sse_qinit(&bmem[0], 2, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat); + si16 = align_sse_i16(&bmem[1], qi16, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 6, 1, 6, 1, xtra); + byte_mem_clear(&bmem[0]); + fprintf(stderr, "split\n\n"); + kswq_sse_t *qu8 = aln_sse_qinit(&bmem[0], 1, kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, 5, mat); + su8 = align_sse_u8(&bmem[1], qu8, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 6, 1, 6, 1, xtra); + byte_mem_clear(&bmem[0]); + if (si16.score2 != su8.score2) { + fprintf(stderr, "%d %d\n", si16.score2, su8.score2); +// write_query_target_sequence(kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, 0, 0); + } + } + } +#endif +#if 1 + int i = 0; + for (; i < ARRAY_SIZE(kernel_prof_idx); ++i) + { + gdata[kernel_prof_idx[i]] = score_total[i]; + } + +#ifdef SHOW_PERF + fprintf(stderr, "[align sse i16] time: %9.6lf s; score: %ld\n", gprof[G_ALN_I16] / TIME_DIVIDE_BY, gdata[G_ALN_I16]); + fprintf(stderr, "[align sse u8 ] time: %9.6lf s; score: %ld\n", gprof[G_ALN_U8] / TIME_DIVIDE_BY, gdata[G_ALN_U8]); + fprintf(stderr, "[align avx2 i16] time: %9.6lf s; score: %ld\n", gprof[G_ALN_AVX2_I16] / TIME_DIVIDE_BY, gdata[G_ALN_AVX2_I16]); + fprintf(stderr, "[align avx2 u8 ] time: %9.6lf s; score: %ld\n", gprof[G_ALN_AVX2_U8] / TIME_DIVIDE_BY, gdata[G_ALN_AVX2_U8]); +#endif +#endif + // close_files(); + fclose(qfp); + fclose(tfp); + fclose(ifp); + return 0; +} diff --git a/align.h b/align.h index e9be248..b20d467 100644 --- a/align.h +++ b/align.h @@ -1,15 +1,56 @@ /********************************************************************************************* - Description: common defines and declarations + Description: stripped sw align functions in bwa-mem Copyright : All right reserved by NCIC.ICT Author : Zhang Zhonghai - Date : 2023/08/26 + Date : 2024/04/11 ***********************************************************************************************/ -#ifndef __COMMON_H -#define __COMMON_H +#ifndef __ALIGN_H +#define __ALIGN_H #include +#include +#include +#include "byte_alloc.h" +#define KSW_XBYTE 0x10000 +#define KSW_XSTOP 0x20000 +#define KSW_XSUBO 0x40000 +#define KSW_XSTART 0x80000 +typedef struct +{ + int score; // best score + int te, qe; // target end and query end + int score2, te2; // second best score and ending position on the target + int tb, qb; // target start and query start +} kswr_t; +extern const kswr_t g_defr; + +typedef struct _kswq_t +{ + int qlen, slen; + uint8_t shift, mdiff, max, size; + __m128i *qp, *H0, *H1, *E, *Hmax; +} kswq_sse_t; + +typedef struct _kswq_avx2_t { + int qlen, slen; + uint8_t shift, mdiff, max, size; + __m256i *qp, *H0, *H1, *E, *Hmax; +} kswq_avx2_t; + +#define ALIGN_FUNC_NUM 4 + +kswq_sse_t *aln_sse_qinit(byte_mem_t *bmem, int size, int qlen, const uint8_t *query, int m, const int8_t *mat); +kswr_t align_sse_i16(byte_mem_t *bmem, kswq_sse_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra); // the first gap costs -(_o+_e) +kswr_t align_sse_u8(byte_mem_t *bmem, kswq_sse_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra); +kswq_avx2_t *aln_avx2_qinit(byte_mem_t *bmem, int size, int qlen, const uint8_t *query, int m, const int8_t *mat); +kswr_t align_avx2_u8(byte_mem_t *bmem, kswq_avx2_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, + int _e_ins, int xtra); +kswr_t align_avx2_i16(byte_mem_t *bmem, kswq_avx2_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, + int _e_ins, int xtra); + +int main_align(int argc, char *argv[]); #endif \ No newline at end of file diff --git a/align_avx2_i16.c b/align_avx2_i16.c index e69de29..3715228 100644 --- a/align_avx2_i16.c +++ b/align_avx2_i16.c @@ -0,0 +1,137 @@ + +#include +#include + +#include "align.h" +#include "utils.h" + +kswr_t align_avx2_i16(byte_mem_t *bmem, kswq_avx2_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, + int _e_ins, int xtra) { + int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; + uint64_t *b; + __m256i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax; + kswr_t r; + +#define __max_16(ret, xx) \ + do { \ + int16_t *maxVal = (int16_t *)&(xx); \ + (xx) = _mm256_max_epi16((xx), _mm256_srli_si256((xx), 8)); \ + (xx) = _mm256_max_epi16((xx), _mm256_srli_si256((xx), 4)); \ + (xx) = _mm256_max_epi16((xx), _mm256_srli_si256((xx), 2)); \ + (ret) = MAX(maxVal[0], maxVal[8]); \ + } while (0) + +// Given entries all either 0x0000 or 0xffff, return whether they are all 0x0000 +#define allzero_16(xx) (!_mm256_movemask_epi8((xx))) + +#define avx2_slli_i16(xx, imm) \ + do { \ + int16_t *arr = (int16_t *)&(xx); \ + int16_t val = arr[7]; \ + (xx) = _mm256_slli_si256((xx), (imm)); \ + arr[8] = val; \ + } while (0) + + // initialization + r = g_defr; + minsc = (xtra & KSW_XSUBO) ? xtra & 0xffff : 0x10000; + endsc = (xtra & KSW_XSTOP) ? xtra & 0xffff : 0x10000; + m_b = n_b = 0; + b = 0; + zero = _mm256_set1_epi32(0); + oe_del = _mm256_set1_epi16(_o_del + _e_del); + e_del = _mm256_set1_epi16(_e_del); + oe_ins = _mm256_set1_epi16(_o_ins + _e_ins); + e_ins = _mm256_set1_epi16(_e_ins); + H0 = q->H0; + H1 = q->H1; + E = q->E; + Hmax = q->Hmax; + slen = q->slen; + for (i = 0; i < slen; ++i) { + _mm256_store_si256(E + i, zero); + _mm256_store_si256(H0 + i, zero); + _mm256_store_si256(Hmax + i, zero); + } + // the core loop + for (i = 0; i < tlen; ++i) { + int j, k, imax; + __m256i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector + h = _mm256_load_si256(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example + avx2_slli_i16(h, 2); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm256_adds_epi16(h, _mm256_load_si256(S++)); + e = _mm256_load_si256(E + j); + h = _mm256_max_epi16(h, e); + h = _mm256_max_epi16(h, f); + max = _mm256_max_epi16(max, h); + _mm256_store_si256(H1 + j, h); + e = _mm256_subs_epu16(e, e_del); + t = _mm256_subs_epu16(h, oe_del); + e = _mm256_max_epi16(e, t); + _mm256_store_si256(E + j, e); + f = _mm256_subs_epu16(f, e_ins); + t = _mm256_subs_epu16(h, oe_ins); + f = _mm256_max_epi16(f, t); + h = _mm256_load_si256(H0 + j); + } + + for (k = 0; LIKELY(k < 16); ++k) { + avx2_slli_i16(f, 2); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm256_load_si256(H1 + j); + h = _mm256_max_epi16(h, f); + _mm256_store_si256(H1 + j, h); + h = _mm256_subs_epu16(h, oe_ins); + f = _mm256_subs_epu16(f, e_ins); + if (UNLIKELY(allzero_16(_mm256_cmpgt_epi16(f, h)))) + goto end_loop16; + } + } + end_loop16: + __max_16(imax, max); + if (imax >= minsc) { + if (n_b == 0 || (int32_t)b[n_b - 1] + 1 != i) { + if (n_b == m_b) { + m_b = m_b ? m_b << 1 : 8; + b = (uint64_t *)realloc(b, 8 * m_b); + } + b[n_b++] = (uint64_t)imax << 32 | i; + } else if ((int)(b[n_b - 1] >> 32) < imax) + b[n_b - 1] = (uint64_t)imax << 32 | i; // modify the last + } + if (imax > gmax) { + gmax = imax; + te = i; + for (j = 0; LIKELY(j < slen); ++j) _mm256_store_si256(Hmax + j, _mm256_load_si256(H1 + j)); + if (gmax >= endsc) + break; + } + S = H1; + H1 = H0; + H0 = S; + } + r.score = gmax; + r.te = te; + { + int max = -1, tmp, low, high, qlen = slen * 16; + uint16_t *t = (uint16_t *)Hmax; + for (i = 0, r.qe = -1; i < qlen; ++i, ++t) + if ((int)*t > max) + max = *t, r.qe = i / 16 + i % 16 * slen; + else if ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe) + r.qe = tmp; + if (b) { + i = (r.score + q->max - 1) / q->max; + low = te - i; + high = te + i; + for (i = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && (int)(b[i] >> 32) > r.score2) + r.score2 = b[i] >> 32, r.te2 = e; + } + } + } + free(b); + return r; +} \ No newline at end of file diff --git a/align_avx2_u8.c b/align_avx2_u8.c index e69de29..179be83 100644 --- a/align_avx2_u8.c +++ b/align_avx2_u8.c @@ -0,0 +1,154 @@ + +#include +#include + +#include "align.h" +#include "utils.h" + +kswr_t align_avx2_u8(byte_mem_t *bmem, kswq_avx2_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, + int _e_ins, int xtra) // the first gap costs -(_o+_e) +{ + int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; + uint64_t *b; + __m256i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax; + kswr_t r; + +#define __max_32(ret, xx) \ + do { \ + uint8_t *maxVal = (uint8_t *)&(xx); \ + (xx) = _mm256_max_epu8((xx), _mm256_srli_si256((xx), 8)); \ + (xx) = _mm256_max_epu8((xx), _mm256_srli_si256((xx), 4)); \ + (xx) = _mm256_max_epu8((xx), _mm256_srli_si256((xx), 2)); \ + (xx) = _mm256_max_epu8((xx), _mm256_srli_si256((xx), 1)); \ + (ret) = MAX(maxVal[0], maxVal[16]); \ + } while (0) + +// Given entries with arbitrary values, return whether they are all 0x00 +#define allzero_32(xx) (_mm256_movemask_epi8(_mm256_cmpeq_epi8((xx), zero)) == 0xffffffff) + +#define avx2_slli_wrap(xx, imm) \ + do { \ + uint8_t *arr = (uint8_t *)&(xx); \ + uint8_t val = arr[15]; \ + (xx) = _mm256_slli_si256((xx), (imm)); \ + arr[16] = val; \ + } while (0) + + // initialization + r = g_defr; + minsc = (xtra & KSW_XSUBO) ? xtra & 0xffff : 0x10000; + endsc = (xtra & KSW_XSTOP) ? xtra & 0xffff : 0x10000; + m_b = n_b = 0; + b = 0; + zero = _mm256_set1_epi32(0); + oe_del = _mm256_set1_epi8(_o_del + _e_del); + e_del = _mm256_set1_epi8(_e_del); + oe_ins = _mm256_set1_epi8(_o_ins + _e_ins); + e_ins = _mm256_set1_epi8(_e_ins); + shift = _mm256_set1_epi8(q->shift); + H0 = q->H0; + H1 = q->H1; + E = q->E; + Hmax = q->Hmax; + slen = q->slen; + for (i = 0; i < slen; ++i) { + _mm256_store_si256(E + i, zero); + _mm256_store_si256(H0 + i, zero); + _mm256_store_si256(Hmax + i, zero); + } + // the core loop + for (i = 0; i < tlen; ++i) { + int j, k, imax; + __m256i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector + h = _mm256_load_si256(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example + avx2_slli_wrap(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian + for (j = 0; LIKELY(j < slen); ++j) { + /* SW cells are computed in the following order: + * H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} + * E(i+1,j) = max{H(i,j)-q, E(i,j)-r} + * F(i,j+1) = max{H(i,j)-q, F(i,j)-r} + */ + // compute H'(i,j); note that at the beginning, h=H'(i-1,j-1) + h = _mm256_adds_epu8(h, _mm256_load_si256(S + j)); + h = _mm256_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j) + e = _mm256_load_si256(E + j); // e=E'(i,j) + h = _mm256_max_epu8(h, e); + h = _mm256_max_epu8(h, f); // h=H'(i,j) + max = _mm256_max_epu8(max, h); // set max + _mm256_store_si256(H1 + j, h); // save to H'(i,j) + // now compute E'(i+1,j) + e = _mm256_subs_epu8(e, e_del); // e=E'(i,j) - e_del + t = _mm256_subs_epu8(h, oe_del); // h=H'(i,j) - o_del - e_del + e = _mm256_max_epu8(e, t); // e=E'(i+1,j) + _mm256_store_si256(E + j, e); // save to E'(i+1,j) + // now compute F'(i,j+1) + f = _mm256_subs_epu8(f, e_ins); + t = _mm256_subs_epu8(h, oe_ins); // h=H'(i,j) - o_ins - e_ins + f = _mm256_max_epu8(f, t); + // get H'(i-1,j) and prepare for the next j + h = _mm256_load_si256(H0 + j); // h=H'(i-1,j) + } + // NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion + for (k = 0; LIKELY(k < 32); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max + avx2_slli_wrap(f, 1); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm256_load_si256(H1 + j); + h = _mm256_max_epu8(h, f); // h=H'(i,j) + _mm256_store_si256(H1 + j, h); + h = _mm256_subs_epu8(h, oe_ins); + f = _mm256_subs_epu8(f, e_ins); + if (UNLIKELY(allzero_32(_mm256_subs_epu8(f, h)))) + goto end_loop32; + } + } + end_loop32: + __max_32(imax, max); // imax is the maximum number in max + if (imax >= minsc) { // write the b array; this condition adds branching unfornately + if (n_b == 0 || + (int32_t)b[n_b - 1] + 1 != + i) { // then append 新的max用新的数值记录,用来记录次优分值,不能与最优分值相邻(否则就是连续匹配的) + if (n_b == m_b) { + m_b = m_b ? m_b << 1 : 8; + b = (uint64_t *)realloc(b, 8 * m_b); + } + b[n_b++] = (uint64_t)imax << 32 | i; + } else if ((int)(b[n_b - 1] >> 32) < imax) + b[n_b - 1] = (uint64_t)imax << 32 | i; // modify the last + } + if (imax > gmax) { + gmax = imax; + te = i; // te is the end position on the target + for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector + _mm256_store_si256(Hmax + j, _mm256_load_si256(H1 + j)); + if (gmax + q->shift >= 255 || gmax >= endsc) + break; // 最大分值溢出了,结束运算 + } + S = H1; + H1 = H0; + H0 = S; // swap H0 and H1 + } + // fprintf(gf[0], "%d\n", i); + r.score = gmax + q->shift < 255 ? gmax : 255; + r.te = te; + if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score + int max = -1, tmp, low, high, qlen = slen * 32; + uint8_t *t = (uint8_t *)Hmax; + for (i = 0; i < qlen; ++i, ++t) + if ((int)*t > max) + max = *t, r.qe = i / 32 + i % 32 * slen; + else if ((int)*t == max && (tmp = i / 32 + i % 32 * slen) < r.qe) + r.qe = tmp; + if (b) { + i = (r.score + q->max - 1) / q->max; + low = te - i; + high = te + i; + for (i = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && (int)(b[i] >> 32) > r.score2) + r.score2 = b[i] >> 32, r.te2 = e; + } + } + } + free(b); + return r; +} diff --git a/align_sse_i16.c b/align_sse_i16.c index e69de29..b76c9a8 100644 --- a/align_sse_i16.c +++ b/align_sse_i16.c @@ -0,0 +1,134 @@ +/********************************************************************************************* + Description: sw align functions in bwa-mem + + Copyright : All right reserved by NCIC.ICT + + Author : Zhang Zhonghai + Date : 2024/04/11 +***********************************************************************************************/ +#include +#include +#include "utils.h" +#include "align.h" + +kswr_t align_sse_i16(byte_mem_t *bmem, kswq_sse_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e) +{ + int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; + uint64_t *b; + __m128i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax; + kswr_t r; + +#if defined __SSE2__ +#define __max_8(ret, xx) do { \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \ + (ret) = _mm_extract_epi16((xx), 0); \ + } while (0) + +// Given entries all either 0x0000 or 0xffff, return whether they are all 0x0000 +#define allzero_0f_8(xx) (!_mm_movemask_epi8((xx))) + +#elif defined __ARM_NEON +#define __max_8(ret, xx) (ret) = vmaxvq_s16(vreinterpretq_s16_u8((xx))) +#define allzero_0f_8(xx) (vmaxvq_u16(vreinterpretq_u16_u8((xx))) == 0) + +#else +#define __max_8(ret, xx) (ret) = m128i_max_s16((xx)) +#define allzero_0f_8(xx) (m128i_allzero((xx))) +#endif + + // initialization + r = g_defr; + minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; + endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000; + m_b = n_b = 0; b = 0; + zero = _mm_set1_epi32(0); + oe_del = _mm_set1_epi16(_o_del + _e_del); + e_del = _mm_set1_epi16(_e_del); + oe_ins = _mm_set1_epi16(_o_ins + _e_ins); + e_ins = _mm_set1_epi16(_e_ins); + H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; + slen = q->slen; + for (i = 0; i < slen; ++i) { + _mm_store_si128(E + i, zero); + _mm_store_si128(H0 + i, zero); + _mm_store_si128(Hmax + i, zero); + } + // the core loop + for (i = 0; i < tlen; ++i) { + int j, k, imax; + __m128i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector + h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example + h = _mm_slli_si128(h, 2); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_adds_epi16(h, _mm_load_si128(S++)); + e = _mm_load_si128(E + j); + h = _mm_max_epi16(h, e); + h = _mm_max_epi16(h, f); + max = _mm_max_epi16(max, h); + _mm_store_si128(H1 + j, h); + e = _mm_subs_epu16(e, e_del); + t = _mm_subs_epu16(h, oe_del); + e = _mm_max_epi16(e, t); + _mm_store_si128(E + j, e); + f = _mm_subs_epu16(f, e_ins); + t = _mm_subs_epu16(h, oe_ins); + f = _mm_max_epi16(f, t); + h = _mm_load_si128(H0 + j); + } + for (k = 0; LIKELY(k < 16); ++k) { + f = _mm_slli_si128(f, 2); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_load_si128(H1 + j); + h = _mm_max_epi16(h, f); + _mm_store_si128(H1 + j, h); + h = _mm_subs_epu16(h, oe_ins); + f = _mm_subs_epu16(f, e_ins); + if(UNLIKELY(allzero_0f_8(_mm_cmpgt_epi16(f, h)))) goto end_loop8; + } + } +end_loop8: + __max_8(imax, max); + //fprintf(stderr, "%d\n", imax); + if (imax >= minsc) { + //fprintf(stderr, "%d\n", imax); + if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { + if (n_b == m_b) { + m_b = m_b? m_b<<1 : 8; + b = (uint64_t*)realloc(b, 8 * m_b); + // b = (uint64_t *)byte_mem_request(bmem, 8 * m_b); + } + b[n_b++] = (uint64_t)imax<<32 | i; + } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last + } + if (imax > gmax) { + gmax = imax; te = i; + for (j = 0; LIKELY(j < slen); ++j) + _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); + if (gmax >= endsc) break; + } + S = H1; H1 = H0; H0 = S; + + } + r.score = gmax; r.te = te; + { + int max = -1, tmp, low, high, qlen = slen * 8; + uint16_t *t = (uint16_t*)Hmax; + for (i = 0, r.qe = -1; i < qlen; ++i, ++t) + if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen; + else if ((int)*t == max && (tmp = i / 8 + i % 8 * slen) < r.qe) r.qe = tmp; + if (b) { + i = (r.score + q->max - 1) / q->max; + low = te - i; high = te + i; + for (i = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && (int)(b[i]>>32) > r.score2) + r.score2 = b[i]>>32, r.te2 = e; + } + } + } + free(b); + // byte_mem_clear(bmem); + return r; +} diff --git a/align_sse_u8.c b/align_sse_u8.c index e69de29..ce71dd7 100644 --- a/align_sse_u8.c +++ b/align_sse_u8.c @@ -0,0 +1,149 @@ +/********************************************************************************************* + Description: sw align functions in bwa-mem + + Copyright : All right reserved by NCIC.ICT + + Author : Zhang Zhonghai + Date : 2024/04/11 +***********************************************************************************************/ +#include +#include +#include "utils.h" +#include "align.h" + +kswr_t align_sse_u8(byte_mem_t *bmem, kswq_sse_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e) +{ + int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; + uint64_t *b; + __m128i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax; + kswr_t r; + +#if defined __SSE2__ +#define __max_16(ret, xx) do { \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \ + (ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \ + } while (0) + +// Given entries with arbitrary values, return whether they are all 0x00 +#define allzero_16(xx) (_mm_movemask_epi8(_mm_cmpeq_epi8((xx), zero)) == 0xffff) + +#elif defined __ARM_NEON +#define __max_16(ret, xx) (ret) = vmaxvq_u8((xx)) +#define allzero_16(xx) (vmaxvq_u8((xx)) == 0) + +#else +#define __max_16(ret, xx) (ret) = m128i_max_u8((xx)) +#define allzero_16(xx) (m128i_allzero((xx))) +#endif + + // initialization + r = g_defr; + minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; + endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000; + m_b = n_b = 0; b = 0; + zero = _mm_set1_epi32(0); + oe_del = _mm_set1_epi8(_o_del + _e_del); + e_del = _mm_set1_epi8(_e_del); + oe_ins = _mm_set1_epi8(_o_ins + _e_ins); + e_ins = _mm_set1_epi8(_e_ins); + shift = _mm_set1_epi8(q->shift); + H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; + slen = q->slen; + for (i = 0; i < slen; ++i) { + _mm_store_si128(E + i, zero); + _mm_store_si128(H0 + i, zero); + _mm_store_si128(Hmax + i, zero); + } + // the core loop + for (i = 0; i < tlen; ++i) { + int j, k, imax; + __m128i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector + h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example + h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian + for (j = 0; LIKELY(j < slen); ++j) { + /* SW cells are computed in the following order: + * H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} + * E(i+1,j) = max{H(i,j)-q, E(i,j)-r} + * F(i,j+1) = max{H(i,j)-q, F(i,j)-r} + */ + // compute H'(i,j); note that at the beginning, h=H'(i-1,j-1) + h = _mm_adds_epu8(h, _mm_load_si128(S + j)); + h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j) + e = _mm_load_si128(E + j); // e=E'(i,j) + h = _mm_max_epu8(h, e); + h = _mm_max_epu8(h, f); // h=H'(i,j) + max = _mm_max_epu8(max, h); // set max + _mm_store_si128(H1 + j, h); // save to H'(i,j) + // now compute E'(i+1,j) + e = _mm_subs_epu8(e, e_del); // e=E'(i,j) - e_del + t = _mm_subs_epu8(h, oe_del); // h=H'(i,j) - o_del - e_del + e = _mm_max_epu8(e, t); // e=E'(i+1,j) + _mm_store_si128(E + j, e); // save to E'(i+1,j) + // now compute F'(i,j+1) + f = _mm_subs_epu8(f, e_ins); + t = _mm_subs_epu8(h, oe_ins); // h=H'(i,j) - o_ins - e_ins + f = _mm_max_epu8(f, t); + // get H'(i-1,j) and prepare for the next j + h = _mm_load_si128(H0 + j); // h=H'(i-1,j) + } + // NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion + for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max + f = _mm_slli_si128(f, 1); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_load_si128(H1 + j); + h = _mm_max_epu8(h, f); // h=H'(i,j) + _mm_store_si128(H1 + j, h); + h = _mm_subs_epu8(h, oe_ins); + f = _mm_subs_epu8(f, e_ins); + if (UNLIKELY(allzero_16(_mm_subs_epu8(f, h)))) goto end_loop16; + } + } +end_loop16: + //int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n"); + __max_16(imax, max); // imax is the maximum number in max + //fprintf(stderr, "%d\n", imax); + if (imax >= minsc) { // write the b array; this condition adds branching unfornately + //fprintf(stderr, "%d\n", imax); + if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append + if (n_b == m_b) { + m_b = m_b? m_b<<1 : 8; + b = (uint64_t*)realloc(b, 8 * m_b); + //b = (uint64_t *)byte_mem_request(bmem, 8 * m_b); + } + b[n_b++] = (uint64_t)imax<<32 | i; + } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last + } + if (imax > gmax) { + gmax = imax; te = i; // te is the end position on the target + for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector + _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); + if (gmax + q->shift >= 255 || gmax >= endsc) break; + } + S = H1; H1 = H0; H0 = S; // swap H0 and H1 + } + r.score = gmax + q->shift < 255? gmax : 255; + r.te = te; + if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score + int max = -1, tmp, low, high, qlen = slen * 16; + uint8_t *t = (uint8_t*)Hmax; + for (i = 0; i < qlen; ++i, ++t) + if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen; + else if ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe) r.qe = tmp; + //printf("%d,%d\n", max, gmax); + if (b) { + i = (r.score + q->max - 1) / q->max; + low = te - i; high = te + i; + for (i = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && (int)(b[i]>>32) > r.score2) + r.score2 = b[i]>>32, r.te2 = e; + } + } + } + free(b); + //byte_mem_clear(bmem); + return r; +} diff --git a/byte_alloc.c b/byte_alloc.c index ae9f1f1..179750b 100644 --- a/byte_alloc.c +++ b/byte_alloc.c @@ -39,31 +39,33 @@ void byte_mem_init_alloc(byte_mem_t *bmem, size_t byte_cnt) void *byte_mem_request(byte_mem_t *bmem, size_t byte_cnt) { // fprintf(stderr, "capacity:%ld, occupied: %ld, byte_cnt: %ld\n", bmem->capacity, bmem->occupied, byte_cnt); - void *ret_mem = 0; + //void *ret_mem = 0; if (bmem == 0) { - ret_mem = 0; + // ret_mem = 0; + return 0; } else if (bmem->capacity == 0) { bmem->capacity = byte_cnt; bmem->mem = malloc(bmem->capacity); bmem->occupied = byte_cnt; - ret_mem = bmem->mem; + //ret_mem = bmem->mem; } else if (bmem->capacity - bmem->occupied >= byte_cnt) { - ret_mem = bmem->mem + bmem->occupied; + //ret_mem = bmem->mem + bmem->occupied; bmem->occupied += byte_cnt; } else { bmem->capacity = bmem->occupied + byte_cnt; bmem->mem = realloc(bmem->mem, bmem->capacity); - ret_mem = bmem->mem + bmem->occupied; + //ret_mem = bmem->mem + bmem->occupied; bmem->occupied += byte_cnt; } - return ret_mem; + // return ret_mem; + return bmem->mem; } void *byte_mem_request_and_clean(byte_mem_t *bmem, size_t byte_cnt) diff --git a/extend.c b/extend.c index 3562b69..6820047 100644 --- a/extend.c +++ b/extend.c @@ -28,7 +28,7 @@ kv_A(kv_A(i_arr, i), 0), kv_A(q_arr, i).a, \ kv_A(kv_A(i_arr, i), 1), kv_A(t_arr, i).a, \ 5, mat, 6, 1, 6, 1, 100, 5, 100, \ - kv_A(kv_A(i_arr, i), 2), \ + /*kv_A(kv_A(i_arr, i), 2)*/10, \ &qle, &tle, >le, &gscore, &max_off[0]); \ score_total[kernel_id] += score; \ } \ @@ -60,10 +60,11 @@ int main_extend(int argc, char *argv[]) seq_v t_arr = {0}; qti_v i_arr = {0}; uint64_t score_total[EXTEND_FUNC_NUM] = {0}; + const int kmax_row = 3000000; - int query_read_row = read_seq(&q_arr, &read_buf, 3000000, qfp); - int target_read_row = read_seq(&t_arr, &read_buf, 3000000, tfp); - int info_read_row = read_qt_info(&i_arr, &read_buf, 3000000, 3, ifp); + int query_read_row = read_seq(&q_arr, &read_buf, kmax_row, qfp); + int target_read_row = read_seq(&t_arr, &read_buf, kmax_row, tfp); + int info_read_row = read_qt_info(&i_arr, &read_buf, kmax_row, 3, ifp); // fprintf(stderr, "read row: %d\t%d\t%d\n", query_read_row, target_read_row, info_read_row); int8_t mat[25] = {1, -4, -4, -4, -1, @@ -84,9 +85,14 @@ int main_extend(int argc, char *argv[]) EXTEND_PERFORMANCE_TEST(2, extend_avx2_u8, 0, excute_lines); EXTEND_PERFORMANCE_TEST(3, extend_avx2_i16_sp, 0, excute_lines); int i = 0; for(; i 0) - marr[mi] = MAX(marr[mi], hA2[midx - mi]); + if (midx - mii > 0) + marr[mii] = MAX(marr[mii], hA2[midx - mii]); } midx += 1; if (ibeg > icheck) { int stopCalc = 0; - for (mi = 0; mi < b - 1; ++mi) + for (mii = 0; mii < b - 1; ++mii) { - stopCalc |= !marr[mi]; + stopCalc |= !marr[mii]; } if (stopCalc) break; diff --git a/extend_scalar.c b/extend_scalar.c index 3813f2c..1848090 100644 --- a/extend_scalar.c +++ b/extend_scalar.c @@ -21,7 +21,7 @@ #define UNLIKELY(x) (x) #endif -#define PRUNING 1 +#define PRUNING 0 typedef struct { @@ -37,8 +37,11 @@ int extend_scalar(byte_mem_t *bmem, int qlen, const uint8_t *query, int tlen, co assert(h0 > 0); // qp = malloc(qlen * m); // eh = calloc(qlen + 1, 8); - qp = byte_mem_request(bmem, qlen * m); - eh = byte_mem_request_and_clean(bmem, (qlen + 1) * 8); + int qp_size = qlen * m; + int eh_size = (qlen + 1) * 8; + qp = byte_mem_request(bmem, qp_size + eh_size); + eh = (eh_t*)((uint8_t*)qp + qp_size); + memset(eh, 0, eh_size); // generate the query profile for (k = i = 0; k < m; ++k) { diff --git a/main.c b/main.c index 9b792b6..4405adb 100644 --- a/main.c +++ b/main.c @@ -15,19 +15,24 @@ #include #include "byte_alloc.h" #include "utils.h" -#include "extend.h" #include "profiling.h" +#include "extend.h" +#include "align.h" - -#define TEST_EXTEND +#define TEST_EXTEND 0 +#define TEST_ALIGN 1 // 程序执行入口 int main(int argc, char *argv[]) { -#ifdef TEST_EXTEND +#if TEST_EXTEND main_extend(argc - 1, argv + 1); #endif +#if TEST_ALIGN + main_align(argc - 1, argv + 1); +#endif + return 0; } diff --git a/profiling.c b/profiling.c index bcb79e0..4c4efd1 100644 --- a/profiling.c +++ b/profiling.c @@ -28,17 +28,4 @@ uint64_t get_msec() // gettimeofday(&tv, NULL); // return (uint64_t)1000 * (tv.tv_sec + ((1e-6) * tv.tv_usec)); return clock(); -} - -/* show excution time */ -int display_extend_stats() -{ -#ifdef SHOW_PERF - fprintf(stderr, "[extend scalar ] time: %9.6lf s; score: %ld\n", gprof[G_EXT_SCALAR] / TIME_DIVIDE_BY, gdata[G_EXT_SCALAR]); - fprintf(stderr, "[extend avx i16] time: %9.6lf s; score: %ld\n", gprof[G_EXT_AVX2_I16] / TIME_DIVIDE_BY, gdata[G_EXT_AVX2_I16]); - fprintf(stderr, "[extend avx u8 ] time: %9.6lf s; score: %ld\n", gprof[G_EXT_AVX2_U8] / TIME_DIVIDE_BY, gdata[G_EXT_AVX2_U8]); - fprintf(stderr, "[extend avx sp ] time: %9.6lf s; score: %ld\n", gprof[G_EXT_AVX2_I16_SP] / TIME_DIVIDE_BY, gdata[G_EXT_AVX2_I16_SP]); -#endif - - return 0; } \ No newline at end of file diff --git a/profiling.h b/profiling.h index f8e5fe0..3bb55e8 100644 --- a/profiling.h +++ b/profiling.h @@ -8,6 +8,7 @@ #ifndef __PROFILING_H #define __PROFILING_H #include +#include #define TIME_DIVIDE_BY (CLOCKS_PER_SEC * 1.0) @@ -19,7 +20,7 @@ extern FILE *ins_f_arr[LIM_TYPE], *score_f_arr[LIM_TYPE], *retval_f_arr[LIM_TYPE]; -// GLOBAL +// GLOBAL performance info enum { G_ALL = 0, @@ -27,6 +28,10 @@ enum G_EXT_AVX2_I16, G_EXT_AVX2_U8, G_EXT_AVX2_I16_SP, + G_ALN_I16, + G_ALN_U8, + G_ALN_AVX2_I16, + G_ALN_AVX2_U8 }; #ifdef SHOW_PERF @@ -39,6 +44,4 @@ enum // get current milli seconds uint64_t get_msec(); -// show excution time -int display_extend_stats(); #endif \ No newline at end of file diff --git a/run.sh b/run.sh index efa5d4f..ea37374 100755 --- a/run.sh +++ b/run.sh @@ -1,6 +1,12 @@ #!/bin/bash #./sw_perf ./input/diff_ext/q0.txt ./input/diff_ext/t0.txt ./input/diff_ext/i0.txt -./sw_perf ./input/ext/q_0.fa ./input/ext/t_0.fa ./input/ext/i_0.txt -./sw_perf ./input/ext/q_1.fa ./input/ext/t_1.fa ./input/ext/i_1.txt -./sw_perf ./input/ext/q_2.fa ./input/ext/t_2.fa ./input/ext/i_2.txt -./sw_perf ./input/ext/q_3.fa ./input/ext/t_3.fa ./input/ext/i_3.txt +#./sw_perf ./input/ext/q_0.fa ./input/ext/t_0.fa ./input/ext/i_0.txt +#./sw_perf ./input/ext/q_1.fa ./input/ext/t_1.fa ./input/ext/i_1.txt +#./sw_perf ./input/ext/q_2.fa ./input/ext/t_2.fa ./input/ext/i_2.txt +#./sw_perf ./input/ext/q_3.fa ./input/ext/t_3.fa ./input/ext/i_3.txt + +#./sw_perf ./input/align/q_0.fa ./input/align/t_0.fa ./input/align/i_0.txt +./sw_perf ./input/align/q_0.fa ./input/align/t_0.fa ./input/align/i_0_ext.txt + +# debug +#./sw_perf ./input/diff_align/q0.txt ./input/diff_align/t0.txt ./input/diff_align/i0.txt diff --git a/utils.c b/utils.c index a5cdf25..4987af0 100644 --- a/utils.c +++ b/utils.c @@ -12,6 +12,7 @@ #include #include "utils.h" +#include "debug.h" unsigned char nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@ -112,4 +113,29 @@ int read_seq(seq_v *seq_arr, buf_t *buf, int row_num_to_read, FILE *fp) ++read_row_num; } return read_row_num; +} + +void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int h0, int fnum) +{ +#ifdef DEBUG_FILE_OUTPUT + // 写到三个文件里,query.fa,target.fa,每行一个序列,info.txt,包含前缀得分h0,和长度信息qlen,tlen + FILE *query_f = gfq[fnum], + *target_f = gft[fnum], + *info_f = gfi[fnum]; + const char seq_map[5] = {'A', 'C', 'G', 'T', 'N'}; + int i; + // 处理query + for (i = 0; i < qlen; ++i) + fprintf(query_f, "%c", seq_map[query[i]]); + fprintf(query_f, "\n"); + // 处理target + for (i = 0; i < tlen; ++i) + fprintf(target_f, "%c", seq_map[target[i]]); + fprintf(target_f, "\n"); + // 处理其他信息 + if (h0 > 0) + fprintf(info_f, "%-8d%-8d%-8d\n", qlen, tlen, h0); + else + fprintf(info_f, "%-8d%-8d\n", qlen, tlen); +#endif } \ No newline at end of file diff --git a/utils.h b/utils.h index 2f24850..afdd38a 100644 --- a/utils.h +++ b/utils.h @@ -10,6 +10,7 @@ #define __UTILS_H #include +#include #include #include "kvec.h" @@ -18,6 +19,14 @@ #define MAX(x, y) ((x) > (y) ? (x) : (y)) #define MIN(x, y) ((x) < (y) ? (x) : (y)) +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect((x), 1) +#define UNLIKELY(x) __builtin_expect((x), 0) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif + #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) #define BUF_SIZE 1048576 @@ -42,4 +51,6 @@ int read_qt_info(qti_v *qti_arr, buf_t *buf, int row_num_to_read, int info_num, int read_seq(seq_v *seq_arr, buf_t *buf, int row_num_to_read, FILE *fp); +void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int h0, int fnum); + #endif \ No newline at end of file