From d16f8487ce832eba813e4c97c75d82ed56d5bf7a Mon Sep 17 00:00:00 2001 From: zzh Date: Mon, 8 Apr 2024 20:00:48 +0800 Subject: [PATCH] =?UTF-8?q?sc=E6=B6=88=E8=9E=8D=E5=AE=9E=E9=AA=8C=E7=89=88?= =?UTF-8?q?=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + .vscode/launch.json | 6 +- Makefile | 18 +- ksw_ext_avx2_u8.c => avx2_u8.c | 257 ++++++---- ..._avx2_u8_heuristics.c => avx2_u8_pruning.c | 310 +++++++----- bsw.h | 20 + common.h | 2 +- ksw_ext.h | 164 ------ ksw_ext_avx2.c | 473 ------------------ ksw_ext_avx2_aligned.c | 445 ---------------- ksw_ext_avx2_heuristics.c | 431 ---------------- ksw_ext_avx2_u8_aligned.c | 459 ----------------- ksw_ext_cuda.c | 1 - ksw_ext_normal.c | 190 ------- main.c | 129 +---- normal.c | 162 ++++++ normal_pruning.c | 160 ++++++ run.sh | 5 + run_all.sh | 2 - run_debug.sh | 4 - run_l.sh | 2 - run_m.sh | 2 - run_s.sh | 2 - 23 files changed, 734 insertions(+), 2511 deletions(-) rename ksw_ext_avx2_u8.c => avx2_u8.c (76%) rename ksw_ext_avx2_u8_heuristics.c => avx2_u8_pruning.c (68%) create mode 100644 bsw.h delete mode 100644 ksw_ext.h delete mode 100644 ksw_ext_avx2.c delete mode 100644 ksw_ext_avx2_aligned.c delete mode 100644 ksw_ext_avx2_heuristics.c delete mode 100644 ksw_ext_avx2_u8_aligned.c delete mode 100644 ksw_ext_cuda.c delete mode 100644 ksw_ext_normal.c create mode 100644 normal.c create mode 100644 normal_pruning.c create mode 100755 run.sh delete mode 100755 run_all.sh delete mode 100755 run_debug.sh delete mode 100755 run_l.sh delete mode 100755 run_m.sh delete mode 100755 run_s.sh diff --git a/.gitignore b/.gitignore index 747183d..3a4dbe0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +*.sam output/ input/ *.[oa] diff --git a/.vscode/launch.json b/.vscode/launch.json index dc09bb2..d037877 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -11,9 +11,9 @@ "request": "launch", "program": "${workspaceRoot}/sw_perf", "args": [ - "/home/zzh/data/sw/q_s.fa", - "/home/zzh/data/sw/t_s.fa", - "/home/zzh/data/sw/i_s.txt" + "/home/zzh/sw/small/q_s.fa", + "/home/zzh/sw/small/t_s.fa", + "/home/zzh/sw/small/i_s.txt" ], "cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间 }, diff --git a/Makefile b/Makefile index a41c8fa..e0dd27a 100644 --- a/Makefile +++ b/Makefile @@ -1,21 +1,17 @@ CC= gcc #CFLAGS= -g -Wall -Wno-unused-function -mavx2 -CFLAGS= -Wall -Wno-unused-function -O2 -mavx2 -DFLAGS= -DSHOW_PERF -DDEBUG_RETURN_VALUE +CFLAGS= -Wall -Wno-unused-function -mavx2 -g -O2 +DFLAGS= -DSHOW_PERF #-DDEBUG_RETURN_VALUE #DFLAGS= -DSHOW_PERF -DDEBUG_OUT -DDEBUG_RETURN_VALUE PROG= sw_perf PROG2= get_line INCLUDES= LIBS= SUBDIRS= . -OBJS= ksw_ext_normal.o \ - ksw_ext_avx2.o \ - ksw_ext_avx2_u8.o \ - ksw_ext_cuda.o \ - ksw_ext_avx2_heuristics.o \ - ksw_ext_avx2_u8_heuristics.o \ - ksw_ext_avx2_aligned.o \ - ksw_ext_avx2_u8_aligned.o \ +OBJS= normal.o \ + normal_pruning.o \ + avx2_u8.o \ + avx2_u8_pruning.o \ thread_mem.o \ utils.o @@ -43,4 +39,4 @@ clean: depend: ( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) $(CPPFLAGS) -- *.c ) -# DO NOT DELETE THIS LINE -- make depend depends on it. \ No newline at end of file +# DO NOT DELETE THIS LINE -- make depend depends on it. diff --git a/ksw_ext_avx2_u8.c b/avx2_u8.c similarity index 76% rename from ksw_ext_avx2_u8.c rename to avx2_u8.c index 2a3ba43..a299103 100644 --- a/ksw_ext_avx2_u8.c +++ b/avx2_u8.c @@ -16,12 +16,20 @@ #define UNLIKELY(x) (x) #endif +#define KSW_EQUAL + #undef MAX #undef MIN #define MAX(x, y) ((x) > (y) ? (x) : (y)) #define MIN(x, y) ((x) < (y) ? (x) : (y)) #define SIMD_WIDTH 32 +typedef struct +{ + size_t m; + uint8_t *addr; +} buf_t; + static const uint8_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { {0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, {0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, @@ -58,10 +66,9 @@ static const uint8_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { // static const uint8_t reverse_mask[SIMD_WIDTH] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; - -// const int permute_mask = _MM_SHUFFLE(0, 1, 2, 3); #define permute_mask _MM_SHUFFLE(0, 1, 2, 3) -// 初始化变量 +// const int permute_mask = _MM_SHUFFLE(0, 1, 2, 3); +// 初始化变量 #define SIMD_INIT \ int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \ __m256i zero_vec; \ @@ -77,8 +84,8 @@ static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, oe_ins_vec = _mm256_set1_epi8(oe_ins); \ e_del_vec = _mm256_set1_epi8(e_del); \ e_ins_vec = _mm256_set1_epi8(e_ins); \ - __m256i match_sc_vec = _mm256_set1_epi8(base_match_score); \ - __m256i mis_sc_vec = _mm256_set1_epi8(base_mis_score); \ + __m256i match_sc_vec = _mm256_set1_epi8(a); \ + __m256i mis_sc_vec = _mm256_set1_epi8(b); \ __m256i amb_sc_vec = _mm256_set1_epi8(1); \ __m256i amb_vec = _mm256_set1_epi8(4); \ reverse_mask_vec = _mm256_loadu_si256((__m256i *)(reverse_mask)); \ @@ -163,8 +170,8 @@ static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 7)); \ max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \ max_vec = _mm256_max_epu8(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \ - m = maxVal[0]; \ - if (m > 0) \ + m = MAX(m, maxVal[0]); \ + if (maxVal[0] > 0) \ { \ for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \ { \ @@ -196,32 +203,35 @@ static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, mA1 = mA2; \ mA2 = tmp; -int ksw_extend_avx2_u8(thread_mem_t *tmem, - int qlen, // query length 待匹配段碱基的query长度 - const uint8_t *query, // read碱基序列 - int tlen, // target length reference的长度 - const uint8_t *target, // reference序列 - int extend_left, // 是不是向左扩展 - int o_del, // deletion 错配开始的惩罚系数 - int e_del, // deletion extension的惩罚系数 - int o_ins, // insertion 错配开始的惩罚系数 - int e_ins, // insertion extension的惩罚系数SIMD_BTYES - int base_match_score, // 碱基match时的分数 - int base_mis_score, // 碱基mismatch时的惩罚分数(正数) - int window_size, // 提前剪枝系数,w =100 匹配位置和beg的最大距离 - int end_bonus, // 如果query比对到了最后一个字符,额外奖励分值 - int init_score, // 该seed的初始得分(完全匹配query的碱基数) - int *_qle, // 匹配得到全局最大得分的碱基在query的位置 - int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 - int *_gtle, // query全部匹配上的target的长度 - int *_gscore, // query的端到端匹配得分 - int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值 +int avx2_u8(thread_mem_t *tmem, + int qlen, // query length 待匹配段碱基的query长度 + const uint8_t *query, // read碱基序列 + int tlen, // target length reference的长度 + const uint8_t *target, // reference序列 + int m, // 碱基种类 (5) + const int8_t *mat, // 每个位置的query和target的匹配得分 m*m + int o_del, // deletion 错配开始的惩罚系数 + int e_del, // deletion extension的惩罚系数 + int o_ins, // insertion 错配开始的惩罚系数 + int e_ins, // insertion extension的惩罚系数 + int w, // 提前剪枝系数,w =100 匹配位置和beg的最大距离 + int end_bonus, + int zdrop, + int h0, // 该seed的初始得分(完全匹配query的碱基数) + int *_qle, // 匹配得到全局最大得分的碱基在query的位置 + int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 + int *_gtle, // query全部匹配上的target的长度 + int *_gscore, // query的端到端匹配得分 + int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值) { uint8_t *mA, *hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H,其他的保存上个H E F M uint8_t *seq, *ref; uint8_t *mem, *qtmem, *vmem; + int is_left = 0; // 是不是向左扩展 + int a = 1; // 碱基match时的分数 + int b = 4; // 碱基mismatch时的惩罚分数(正数) int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH; - int i, ibeg, iend, D, j, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; + int i, ibeg, D, j, k, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; int Dloop = tlen + qlen; // 循环跳出条件 int span, beg1, end1; // 边界条件计算 int col_size = qlen + 2 + SIMD_WIDTH; @@ -230,27 +240,35 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem, SIMD_INIT; // 初始化simd用的数据 - assert(init_score > 0); + // fprintf(stdout, "%d\n", h0); + + assert(h0 > 0); // allocate memory - // mem = malloc(mem_size); - mem = thread_mem_request(tmem, mem_size); + mem = malloc(mem_size); + //if (buf->m < mem_size) + //{ + // buf->m = mem_size; + // buf->addr = realloc(buf->addr, mem_size); + //} + //mem = buf->addr; + qtmem = &mem[0]; seq = (uint8_t *)&qtmem[0]; ref = (uint8_t *)&qtmem[seq_size]; - if (extend_left) + if (is_left) { for (i = 0; i < qlen; ++i) seq[i] = query[qlen - 1 - i]; for (i = 0; i < tlen; ++i) - ref[i + SIMD_WIDTH - 1] = target[tlen - 1 - i]; + ref[i + SIMD_WIDTH] = target[tlen - 1 - i]; } else { for (i = 0; i < qlen; ++i) seq[i] = query[i]; for (i = 0; i < tlen; ++i) - ref[i + SIMD_WIDTH - 1] = target[i]; + ref[i + SIMD_WIDTH] = target[i]; } vmem = &ref[ref_size]; @@ -274,73 +292,90 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem, fA1 = &fA[0]; fA2 = &fA[col_size]; - // adjust $window_size if it is too large + // adjust $w if it is too large + k = m * m; // get the max score - max = base_match_score; + for (i = 0, max = 0; i < k; ++i) + max = max > mat[i] ? max : mat[i]; max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); max_ins = max_ins > 1 ? max_ins : 1; - window_size = window_size < max_ins ? window_size : max_ins; + w = w < max_ins ? w : max_ins; max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.); max_del = max_del > 1 ? max_del : 1; - window_size = window_size < max_del ? window_size : max_del; // TODO: is this necessary? + w = w < max_del ? w : max_del; // TODO: is this necessary? if (tlen < qlen) - window_size = MIN(tlen - 1, window_size); + w = MIN(tlen - 1, w); // DP loop - max = init_score, max_i = max_j = -1; + max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1; - + ; max_off = 0; beg = 1; end = qlen; - // init init_score - hA0[0] = init_score; // 左上角 - fA1[1] = MAX(0, init_score - (o_ins + e_ins)); - eA2[0] = init_score; - hA1[1] = fA1[1]; + // init h0 + hA0[0] = h0; // 左上角 + if (qlen == 0 || tlen == 0) Dloop = 0; // 防止意外情况 - if (window_size >= qlen) + if (w >= qlen) { max_ie = 0; gscore = 0; } + int m_last = 0; + int iend; +#ifdef KSW_EQUAL + int midx = 1, icheck = 0, checkspecial = 1; + int m3 = 0, m2 = 0, m1 = 0; +#endif for (D = 1; LIKELY(D < Dloop); ++D) { // 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况 - if (D < tlen) - beg1 = 1; + if (D > tlen) + { + span = MIN(Dloop - D, w); + beg1 = MAX(D - tlen + 1, ((D - w) / 2) + 1); + } else - beg1 = D - tlen + 1; - if (D < qlen) - end1 = D; // 闭区间 - else - end1 = qlen; - // beg1 = MAX(D - window_size, beg1); - // end1 = MIN(D + window_size, end1); - // beg = MAX(beg1, beg); - // end = MIN(end1, end); - // if (beg > end) - // break; + { + span = MIN(D - 1, w); + beg1 = MAX(1, ((D - w) / 2) + 1); + } + end1 = MIN(qlen, beg1 + span); - beg = beg1; - end = end1; + if (beg < beg1) + beg = beg1; + if (end > end1) + end = end1; + if (beg > end) + break; // 不用计算了,直接跳出,否则hA2没有被赋值,里边是上一轮hA0的值,会出bug - iend = D - beg; // ref开始计算的位置,倒序 + iend = D - (beg - 1); // ref开始计算的位置,倒序 span = end - beg; - ibeg = iend - span; // 0开始的ref索引位置 + ibeg = iend - span - 1; // 0开始的ref索引位置 // 每一轮需要记录的数据 int m = 0, mj = -1, mi = -1; max_vec = zero_vec; - // 处理左边界 + // 要处理边界 + // 左边界 处理f (insert) + if (ibeg == 0) + { + hA1[end] = MAX(0, h0 - (o_ins + e_ins * end)); + m = hA1[end]; + } + // 上边界 if (beg == 1) { - hA0[0] = eA2[0]; - mA1[0] = 0; - eA1[0] = MAX(0, init_score - (o_del + e_del * (iend + 1))); + hA1[0] = MAX(0, h0 - (o_del + e_del * iend)); + } + else if (D & 1) + { + hA1[beg - 1] = 0; + hA2[beg - 1] = 0; } for (j = beg, i = iend; j <= end + 1 - SIMD_WIDTH; j += SIMD_WIDTH, i -= SIMD_WIDTH) @@ -368,16 +403,37 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem, // 存储结果 SIMD_STORE; } - // 处理上边界 - if (ibeg == 0) - { - fA2[end + 1] = MAX(0, init_score - (o_ins + e_ins * (end + 1))); - hA2[end + 1] = fA2[end + 1]; - mA2[end + 1] = 0; - } SIMD_FIND_MAX; +#ifdef KSW_EQUAL + if (hA1[0] < 4 && checkspecial) + { // b == 4 + if (hA1[0] == 3) + { + icheck = iend + 1; + } + else if (midx == 2) + { + m2 = MAX(m2, hA2[midx - 1]); + } + else + { + m2 = MAX(m2, hA2[midx - 1]); + m1 = MAX(m1, hA2[midx - 2]); + } + m3 = MAX(m3, hA2[midx]); + midx += 1; + if (ibeg > icheck) + { + if (!m1 || !m2 || !m3) + break; + else + checkspecial = 0; + } + } +#endif + // 注意最后跳出循环j的值 j = end + 1; @@ -386,36 +442,59 @@ int ksw_extend_avx2_u8(thread_mem_t *tmem, max_ie = gscore > hA2[qlen] ? max_ie : ibeg; gscore = gscore > hA2[qlen] ? gscore : hA2[qlen]; } + //if (m == 0 && m_last == 0) + // break; // 一定要注意,斜对角遍历和按列遍历的不同点 if (m > max) { max = m, max_i = mi, max_j = mj; max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi); } + else if (m == max && max_i >= mi && mj > max_j) + { + max_i = mi, max_j = mj; + max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi); + } + else if (zdrop > 0) + { +#if 0 + if (mi - max_i > mj - max_j) + { + if (max - m - ((mi - max_i) - (mj - max_j)) * e_del > zdrop) + break; + } + else + { + if (max - m - ((mj - max_j) - (mi - max_i)) * e_ins > zdrop) + break; + } +#endif + } // 调整计算的边界 // for (j = beg; LIKELY(j <= end); ++j) - //{ - // int has_val = hA1[j - 1] | hA2[j]; - // if (has_val) - // break; - //} + // { + // int has_val = hA1[j - 1] | hA2[j]; + // if (has_val) + // break; + // } // beg = j; // for (j = end + 1; LIKELY(j >= beg); --j) - //{ - // int has_val = hA1[j - 1] | hA2[j]; - // if (has_val) - // break; - // else - // hA0[j - 1] = 0; - //} + // { + // int has_val = hA1[j - 1] | hA2[j]; + // if (has_val) + // break; + // else + // hA0[j - 1] = 0; + // } // end = j + 1 <= qlen ? j + 1 : qlen; + beg = 1; end = qlen; + m_last = m; // swap m, h, e, f SWAP_DATA_POINTER; } - // free(mem); - thread_mem_release(tmem, mem_size); + free(mem); if (_qle) *_qle = max_j + 1; if (_tle) diff --git a/ksw_ext_avx2_u8_heuristics.c b/avx2_u8_pruning.c similarity index 68% rename from ksw_ext_avx2_u8_heuristics.c rename to avx2_u8_pruning.c index 4a85d3d..d020d88 100644 --- a/ksw_ext_avx2_u8_heuristics.c +++ b/avx2_u8_pruning.c @@ -16,12 +16,20 @@ #define UNLIKELY(x) (x) #endif +#define KSW_EQUAL + #undef MAX #undef MIN #define MAX(x, y) ((x) > (y) ? (x) : (y)) #define MIN(x, y) ((x) < (y) ? (x) : (y)) #define SIMD_WIDTH 32 +typedef struct +{ + size_t m; + uint8_t *addr; +} buf_t; + static const uint8_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { {0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, {0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, @@ -58,15 +66,13 @@ static const uint8_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { // static const uint8_t reverse_mask[SIMD_WIDTH] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; - +#define permute_mask _MM_SHUFFLE(0, 1, 2, 3) // const int permute_mask = _MM_SHUFFLE(0, 1, 2, 3); -// #define permute_mask _MM_SHUFFLE(0, 1, 2, 3) -#define permute_mask 27 -// 初始化变量 +// 初始化变量 #define SIMD_INIT \ int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \ __m256i zero_vec; \ - __m256i max_vec, last_max_vec = _mm256_set1_epi8(init_score); \ + __m256i max_vec; \ __m256i oe_del_vec; \ __m256i oe_ins_vec; \ __m256i e_del_vec; \ @@ -78,8 +84,8 @@ static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, oe_ins_vec = _mm256_set1_epi8(oe_ins); \ e_del_vec = _mm256_set1_epi8(e_del); \ e_ins_vec = _mm256_set1_epi8(e_ins); \ - __m256i match_sc_vec = _mm256_set1_epi8(base_match_score); \ - __m256i mis_sc_vec = _mm256_set1_epi8(base_mis_score); \ + __m256i match_sc_vec = _mm256_set1_epi8(a); \ + __m256i mis_sc_vec = _mm256_set1_epi8(b); \ __m256i amb_sc_vec = _mm256_set1_epi8(1); \ __m256i amb_vec = _mm256_set1_epi8(4); \ reverse_mask_vec = _mm256_loadu_si256((__m256i *)(reverse_mask)); \ @@ -152,51 +158,33 @@ static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, mn_vec = _mm256_and_si256(mn_vec, h_vec_mask[end - j]); \ hn_vec = _mm256_and_si256(hn_vec, h_vec_mask[end - j]); -// cmp_max = _mm256_xor_si256(last_max_vec, cmp_max); -// last_max_vec = _mm256_set1_epi8(m); // 找最大值和位置 -#define SIMD_FIND_MAX \ - __m256i cmp_max = _mm256_max_epu8(max_vec, last_max_vec); \ - cmp_max = _mm256_xor_si256(cmp_max, last_max_vec); \ - cmp_max = _mm256_cmpeq_epi8(cmp_max, zero_vec); \ - uint32_t cmp_result = _mm256_movemask_epi8(cmp_max); \ - if (cmp_result != 4294967295) \ - { \ - uint8_t *maxVal = (uint8_t *)&max_vec; \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 1)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 3)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 5)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 7)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \ - m = maxVal[0]; \ - for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \ - { \ - __m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \ - __m256i vcmp = _mm256_cmpeq_epi8(h2_vec, max_vec); \ - uint32_t mask = _mm256_movemask_epi8(vcmp); \ - if (mask > 0) \ - { \ - int pos = SIMD_WIDTH - 1 - __builtin_clz(mask); \ - mj = j - 1 + pos; \ - mi = i - 1 - pos; \ - for (; mj + 1 < qlen && mi + 1 < tlen; mj++, mi++) \ - { \ - if (seq[mj + 1] == ref[mi + 1 + SIMD_WIDTH]) \ - { \ - m += base_match_score; \ - } \ - else \ - { \ - break; \ - } \ - } \ - } \ - } \ - last_max_vec = _mm256_set1_epi8(m); \ +#define SIMD_FIND_MAX \ + uint8_t *maxVal = (uint8_t *)&max_vec; \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 1)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 3)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 5)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 7)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \ + m = MAX(m, maxVal[0]); \ + if (maxVal[0] > 0) \ + { \ + for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \ + { \ + __m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \ + __m256i vcmp = _mm256_cmpeq_epi8(h2_vec, max_vec); \ + uint32_t mask = _mm256_movemask_epi8(vcmp); \ + if (mask > 0) \ + { \ + int pos = SIMD_WIDTH - 1 - __builtin_clz(mask); \ + mj = j - 1 + pos; \ + mi = i - 1 - pos; \ + } \ + } \ } // 每轮迭代后,交换数组 @@ -215,64 +203,71 @@ static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, mA1 = mA2; \ mA2 = tmp; -// uint8_t mem1[102400]; - -int ksw_extend_avx2_u8_heuristics(thread_mem_t *tmem, - int qlen, // query length 待匹配段碱基的query长度 - const uint8_t *query, // read碱基序列 - int tlen, // target length reference的长度 - const uint8_t *target, // reference序列 - int extend_left, // 是不是向左扩展 - int o_del, // deletion 错配开始的惩罚系数 - int e_del, // deletion extension的惩罚系数 - int o_ins, // insertion 错配开始的惩罚系数 - int e_ins, // insertion extension的惩罚系数SIMD_BTYES - int base_match_score, // 碱基match时的分数 - int base_mis_score, // 碱基mismatch时的惩罚分数(正数) - int window_size, // 提前剪枝系数,w =100 匹配位置和beg的最大距离 - int end_bonus, // 如果query比对到了最后一个字符,额外奖励分值 - int init_score, // 该seed的初始得分(完全匹配query的碱基数) - int *_qle, // 匹配得到全局最大得分的碱基在query的位置 - int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 - int *_gtle, // query全部匹配上的target的长度 - int *_gscore, // query的端到端匹配得分 - int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值 +int avx2_u8_pruning(thread_mem_t *tmem, + int qlen, // query length 待匹配段碱基的query长度 + const uint8_t *query, // read碱基序列 + int tlen, // target length reference的长度 + const uint8_t *target, // reference序列 + int m, // 碱基种类 (5) + const int8_t *mat, // 每个位置的query和target的匹配得分 m*m + int o_del, // deletion 错配开始的惩罚系数 + int e_del, // deletion extension的惩罚系数 + int o_ins, // insertion 错配开始的惩罚系数 + int e_ins, // insertion extension的惩罚系数 + int w, // 提前剪枝系数,w =100 匹配位置和beg的最大距离 + int end_bonus, + int zdrop, + int h0, // 该seed的初始得分(完全匹配query的碱基数) + int *_qle, // 匹配得到全局最大得分的碱基在query的位置 + int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 + int *_gtle, // query全部匹配上的target的长度 + int *_gscore, // query的端到端匹配得分 + int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值) { - // return init_score; uint8_t *mA, *hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H,其他的保存上个H E F M uint8_t *seq, *ref; uint8_t *mem, *qtmem, *vmem; int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH; - int i, ibeg, iend, D, j, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; + int i, ibeg, D, j, k, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; int Dloop = tlen + qlen; // 循环跳出条件 int span, beg1, end1; // 边界条件计算 int col_size = qlen + 2 + SIMD_WIDTH; int val_mem_size = (col_size * 9 + 31) >> 5 << 5; // 32字节的整数倍 int mem_size = seq_size + ref_size + val_mem_size; + int is_left = 0; // 是不是向左扩展 + int a = 1; // 碱基match时的分数 + int b = 4; // 碱基mismatch时的惩罚分数(正数) + SIMD_INIT; // 初始化simd用的数据 - assert(init_score > 0); + assert(h0 > 0); // allocate memory - // mem = malloc(mem_size); - mem = thread_mem_request(tmem, mem_size); + mem = malloc(mem_size); + // if (buf->m < mem_size) + //{ + // buf->m = mem_size; + // buf->addr = realloc(buf->addr, mem_size); + // } + // mem = buf->addr; + qtmem = &mem[0]; seq = (uint8_t *)&qtmem[0]; ref = (uint8_t *)&qtmem[seq_size]; - if (extend_left) + if (is_left) { for (i = 0; i < qlen; ++i) seq[i] = query[qlen - 1 - i]; for (i = 0; i < tlen; ++i) - ref[i + SIMD_WIDTH - 1] = target[tlen - 1 - i]; + ref[i + SIMD_WIDTH] = target[tlen - 1 - i]; } else { for (i = 0; i < qlen; ++i) seq[i] = query[i]; for (i = 0; i < tlen; ++i) - ref[i + SIMD_WIDTH - 1] = target[i]; + ref[i + SIMD_WIDTH] = target[i]; } vmem = &ref[ref_size]; @@ -296,74 +291,90 @@ int ksw_extend_avx2_u8_heuristics(thread_mem_t *tmem, fA1 = &fA[0]; fA2 = &fA[col_size]; - // adjust $window_size if it is too large + // adjust $w if it is too large + k = m * m; // get the max score - max = base_match_score; + for (i = 0, max = 0; i < k; ++i) + max = max > mat[i] ? max : mat[i]; max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); max_ins = max_ins > 1 ? max_ins : 1; - window_size = window_size < max_ins ? window_size : max_ins; + w = w < max_ins ? w : max_ins; max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.); max_del = max_del > 1 ? max_del : 1; - window_size = window_size < max_del ? window_size : max_del; // TODO: is this necessary? + w = w < max_del ? w : max_del; // TODO: is this necessary? if (tlen < qlen) - window_size = MIN(tlen - 1, window_size); + w = MIN(tlen - 1, w); // DP loop - max = init_score, max_i = max_j = -1; + max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1; - + ; max_off = 0; beg = 1; end = qlen; - // init init_score - hA0[0] = init_score; // 左上角 - fA1[1] = MAX(0, init_score - (o_ins + e_ins)); - eA2[0] = init_score; - hA1[1] = fA1[1]; + // init h0 + hA0[0] = h0; // 左上角 if (qlen == 0 || tlen == 0) Dloop = 0; // 防止意外情况 - if (window_size >= qlen) + if (w >= qlen) { max_ie = 0; gscore = 0; } + + int m_last = 0; + int iend; +#ifdef KSW_EQUAL + int midx = 1, icheck = 0, checkspecial = 1; + int m3 = 0, m2 = 0, m1 = 0; +#endif for (D = 1; LIKELY(D < Dloop); ++D) { // 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况 - if (D < tlen) - beg1 = 1; + if (D > tlen) + { + span = MIN(Dloop - D, w); + beg1 = MAX(D - tlen + 1, ((D - w) / 2) + 1); + } else - beg1 = D - tlen + 1; - if (D < qlen) - end1 = D; // 闭区间 - else - end1 = qlen; - beg1 = MAX(D - window_size, beg1); - end1 = MIN(D + window_size, end1); + { + span = MIN(D - 1, w); + beg1 = MAX(1, ((D - w) / 2) + 1); + } + end1 = MIN(qlen, beg1 + span); - beg = MAX(beg1, beg); - end = MIN(end1, end); + if (beg < beg1) + beg = beg1; + if (end > end1) + end = end1; if (beg > end) - break; + break; // 不用计算了,直接跳出,否则hA2没有被赋值,里边是上一轮hA0的值,会出bug - // beg = beg1; - // end = end1; - - iend = D - beg; // ref开始计算的位置,倒序 + iend = D - (beg - 1); // ref开始计算的位置,倒序 span = end - beg; - ibeg = iend - span; // 0开始的ref索引位置 + ibeg = iend - span - 1; // 0开始的ref索引位置 // 每一轮需要记录的数据 int m = 0, mj = -1, mi = -1; max_vec = zero_vec; - // 处理左边界 + // 要处理边界 + // 左边界 处理f (insert) + if (ibeg == 0) + { + hA1[end] = MAX(0, h0 - (o_ins + e_ins * end)); + m = hA1[end]; + } + // 上边界 if (beg == 1) { - hA0[0] = eA2[0]; - mA1[0] = 0; - eA1[0] = MAX(0, init_score - (o_del + e_del * (iend + 1))); + hA1[0] = MAX(0, h0 - (o_del + e_del * iend)); + } + else if (D & 1) + { + hA1[beg - 1] = 0; + hA2[beg - 1] = 0; } for (j = beg, i = iend; j <= end + 1 - SIMD_WIDTH; j += SIMD_WIDTH, i -= SIMD_WIDTH) @@ -391,16 +402,37 @@ int ksw_extend_avx2_u8_heuristics(thread_mem_t *tmem, // 存储结果 SIMD_STORE; } - // 处理上边界 - if (ibeg == 0) - { - fA2[end + 1] = MAX(0, init_score - (o_ins + e_ins * (end + 1))); - hA2[end + 1] = fA2[end + 1]; - mA2[end + 1] = 0; - } SIMD_FIND_MAX; +#ifdef KSW_EQUAL + if (hA1[0] < 4 && checkspecial) + { // b == 4 + if (hA1[0] == 3) + { + icheck = iend + 1; + } + else if (midx == 2) + { + m2 = MAX(m2, hA2[midx - 1]); + } + else + { + m2 = MAX(m2, hA2[midx - 1]); + m1 = MAX(m1, hA2[midx - 2]); + } + m3 = MAX(m3, hA2[midx]); + midx += 1; + if (ibeg > icheck) + { + if (!m1 || !m2 || !m3) + break; + else + checkspecial = 0; + } + } +#endif + // 注意最后跳出循环j的值 j = end + 1; @@ -409,12 +441,33 @@ int ksw_extend_avx2_u8_heuristics(thread_mem_t *tmem, max_ie = gscore > hA2[qlen] ? max_ie : ibeg; gscore = gscore > hA2[qlen] ? gscore : hA2[qlen]; } - + if (m == 0 && m_last == 0) + break; // 一定要注意,斜对角遍历和按列遍历的不同点 if (m > max) { max = m, max_i = mi, max_j = mj; max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi); } + else if (m == max && max_i >= mi && mj > max_j) + { + max_i = mi, max_j = mj; + max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi); + } + else if (zdrop > 0) + { +#if 0 + if (mi - max_i > mj - max_j) + { + if (max - m - ((mi - max_i) - (mj - max_j)) * e_del > zdrop) + break; + } + else + { + if (max - m - ((mj - max_j) - (mi - max_i)) * e_ins > zdrop) + break; + } +#endif + } // 调整计算的边界 for (j = beg; LIKELY(j <= end); ++j) @@ -429,20 +482,17 @@ int ksw_extend_avx2_u8_heuristics(thread_mem_t *tmem, int has_val = hA1[j - 1] | hA2[j]; if (has_val) break; + else + hA0[j - 1] = 0; } end = j + 1 <= qlen ? j + 1 : qlen; + m_last = m; // swap m, h, e, f SWAP_DATA_POINTER; -#ifdef SHOW_PERF -// time_bsw_adjust_bound += get_mseconds() - start_time; -// start_time = get_mseconds(); -// time_compare += get_mseconds() - start_time; -#endif } - // free(mem); - thread_mem_release(tmem, mem_size); + free(mem); if (_qle) *_qle = max_j + 1; if (_tle) diff --git a/bsw.h b/bsw.h new file mode 100644 index 0000000..7582c6c --- /dev/null +++ b/bsw.h @@ -0,0 +1,20 @@ +/********************************************************************************************* + Description: Declarations of sw extend functions + + Copyright : All right reserved by NCIC.ICT + + Author : Zhang Zhonghai + Date : 2023/08/23 +***********************************************************************************************/ +#ifndef __BSW_H +#define __BSW_H +#include + +typedef struct _thread_mem_t thread_mem_t; + +int normal(thread_mem_t *tmem, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off); +int normal_pruning(thread_mem_t *tmem, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off); +int avx2_u8(thread_mem_t *tmem, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off); +int avx2_u8_pruning(thread_mem_t *tmem, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off); + +#endif \ No newline at end of file diff --git a/common.h b/common.h index b8a39b6..bea5aa6 100644 --- a/common.h +++ b/common.h @@ -15,7 +15,7 @@ #define MAX(x, y) ((x) > (y) ? (x) : (y)) #define MIN(x, y) ((x) < (y) ? (x) : (y)) -#define KERNEL_NUM 7 +#define KERNEL_NUM 4 extern FILE *ins_ext_f_arr[KERNEL_NUM], *del_ext_f_arr[KERNEL_NUM], diff --git a/ksw_ext.h b/ksw_ext.h deleted file mode 100644 index 9f931d8..0000000 --- a/ksw_ext.h +++ /dev/null @@ -1,164 +0,0 @@ -/********************************************************************************************* - Description: Declarations of sw extend functions - - Copyright : All right reserved by NCIC.ICT - - Author : Zhang Zhonghai - Date : 2023/08/23 -***********************************************************************************************/ -#ifndef __KSW_EXT_H -#define __KSW_EXT_H -#include - -typedef struct _thread_mem_t thread_mem_t; - -// declaration of ksw functions - -int ksw_extend_normal(thread_mem_t *tmem, // 内存池 - int qlen, // query length 待匹配段碱基的query长度 - const uint8_t *query, // read碱基序列 - int tlen, // target length reference的长度 - const uint8_t *target, // reference序列 - int m, // 碱基种类 (5) - const int8_t *mat, // 每个位置的query和target的匹配得分 m*m - int o_del, // deletion 错配开始的惩罚系数 - int e_del, // deletion extension的惩罚系数 - int o_ins, // insertion 错配开始的惩罚系数 - int e_ins, // insertion extension的惩罚系数SIMD_BTYES - int w, // 提前剪枝系数,w =100 匹配位置和beg的最大距离 - int end_bonus, // 如果query比对到了最后一个字符,额外奖励分值 - int zdrop, // 如果比对过程中,太多mismatch,提前结束比对 - int h0, // 该seed的初始得分(完全匹配query的碱基数) - int *_qle, // 匹配得到全局最大得分的碱基在query的位置 - int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 - int *_gtle, // query全部匹配上的target的长度 - int *_gscore, // query的端到端匹配得分 - int *_max_off); // 取得最大得分时在query和reference上位置差的 最大值 - -int ksw_extend_avx2(thread_mem_t *tmem, // 内存池 - int qlen, // query length 待匹配段碱基的query长度 - const uint8_t *query, // read碱基序列 - int tlen, // target length reference的长度 - const uint8_t *target, // reference序列 - int extend_left, // 是不是向左扩展 - int o_del, // deletion 错配开始的惩罚系数 - int e_del, // deletion extension的惩罚系数 - int o_ins, // insertion 错配开始的惩罚系数 - int e_ins, // insertion extension的惩罚系数SIMD_BTYES - int base_match_score, // 碱基match时的分数 - int base_mis_score, // 碱基mismatch时的惩罚分数(正数) - int window_size, // 提前剪枝系数,w =100 匹配位置和beg的最大距离 - int end_bonus, // 如果query比对到了最后一个字符,额外奖励分值 - int init_score, // 该seed的初始得分(完全匹配query的碱基数) - int *_qle, // 匹配得到全局最大得分的碱基在query的位置 - int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 - int *_gtle, // query全部匹配上的target的长度 - int *_gscore, // query的端到端匹配得分 - int *_max_off); // 取得最大得分时在query和reference上位置差的 最大值 - -int ksw_extend_avx2_u8(thread_mem_t *tmem, // 内存池 - int qlen, // query length 待匹配段碱基的query长度 - const uint8_t *query, // read碱基序列 - int tlen, // target length reference的长度 - const uint8_t *target, // reference序列 - int extend_left, // 是不是向左扩展 - int o_del, // deletion 错配开始的惩罚系数 - int e_del, // deletion extension的惩罚系数 - int o_ins, // insertion 错配开始的惩罚系数 - int e_ins, // insertion extension的惩罚系数SIMD_BTYES - int base_match_score, // 碱基match时的分数 - int base_mis_score, // 碱基mismatch时的惩罚分数(正数) - int window_size, // 提前剪枝系数,w =100 匹配位置和beg的最大距离 - int end_bonus, // 如果query比对到了最后一个字符,额外奖励分值 - int init_score, // 该seed的初始得分(完全匹配query的碱基数) - int *_qle, // 匹配得到全局最大得分的碱基在query的位置 - int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 - int *_gtle, // query全部匹配上的target的长度 - int *_gscore, // query的端到端匹配得分 - int *_max_off); // 取得最大得分时在query和reference上位置差的 最大值 - -int ksw_extend_avx2_heuristics(thread_mem_t *tmem, // 内存池 - int qlen, // query length 待匹配段碱基的query长度 - const uint8_t *query, // read碱基序列 - int tlen, // target length reference的长度 - const uint8_t *target, // reference序列 - int extend_left, // 是不是向左扩展 - int o_del, // deletion 错配开始的惩罚系数 - int e_del, // deletion extension的惩罚系数 - int o_ins, // insertion 错配开始的惩罚系数 - int e_ins, // insertion extension的惩罚系数SIMD_BTYES - int base_match_score, // 碱基match时的分数 - int base_mis_score, // 碱基mismatch时的惩罚分数(正数) - int window_size, // 提前剪枝系数,w =100 匹配位置和beg的最大距离 - int end_bonus, // 如果query比对到了最后一个字符,额外奖励分值 - int init_score, // 该seed的初始得分(完全匹配query的碱基数) - int *_qle, // 匹配得到全局最大得分的碱基在query的位置 - int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 - int *_gtle, // query全部匹配上的target的长度 - int *_gscore, // query的端到端匹配得分 - int *_max_off); // 取得最大得分时在query和reference上位置差的 最大值 - -int ksw_extend_avx2_u8_heuristics(thread_mem_t *tmem, // 内存池 - int qlen, // query length 待匹配段碱基的query长度 - const uint8_t *query, // read碱基序列 - int tlen, // target length reference的长度 - const uint8_t *target, // reference序列 - int extend_left, // 是不是向左扩展 - int o_del, // deletion 错配开始的惩罚系数 - int e_del, // deletion extension的惩罚系数 - int o_ins, // insertion 错配开始的惩罚系数 - int e_ins, // insertion extension的惩罚系数SIMD_BTYES - int base_match_score, // 碱基match时的分数 - int base_mis_score, // 碱基mismatch时的惩罚分数(正数) - int window_size, // 提前剪枝系数,w =100 匹配位置和beg的最大距离 - int end_bonus, // 如果query比对到了最后一个字符,额外奖励分值 - int init_score, // 该seed的初始得分(完全匹配query的碱基数) - int *_qle, // 匹配得到全局最大得分的碱基在query的位置 - int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 - int *_gtle, // query全部匹配上的target的长度 - int *_gscore, // query的端到端匹配得分 - int *_max_off); // 取得最大得分时在query和reference上位置差的 最大值 - -int ksw_extend_avx2_aligned(thread_mem_t *tmem, // 内存池 - int qlen, // query length 待匹配段碱基的query长度 - const uint8_t *query, // read碱基序列 - int tlen, // target length reference的长度 - const uint8_t *target, // reference序列 - int extend_left, // 是不是向左扩展 - int o_del, // deletion 错配开始的惩罚系数 - int e_del, // deletion extension的惩罚系数 - int o_ins, // insertion 错配开始的惩罚系数 - int e_ins, // insertion extension的惩罚系数SIMD_BTYES - int base_match_score, // 碱基match时的分数 - int base_mis_score, // 碱基mismatch时的惩罚分数(正数) - int window_size, // 提前剪枝系数,w =100 匹配位置和beg的最大距离 - int end_bonus, // 如果query比对到了最后一个字符,额外奖励分值 - int init_score, // 该seed的初始得分(完全匹配query的碱基数) - int *_qle, // 匹配得到全局最大得分的碱基在query的位置 - int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 - int *_gtle, // query全部匹配上的target的长度 - int *_gscore, // query的端到端匹配得分 - int *_max_off); // 取得最大得分时在query和reference上位置差的 最大值 - -int ksw_extend_avx2_u8_aligned(thread_mem_t *tmem, // 内存池 - int qlen, // query length 待匹配段碱基的query长度 - const uint8_t *query, // read碱基序列 - int tlen, // target length reference的长度 - const uint8_t *target, // reference序列 - int extend_left, // 是不是向左扩展 - int o_del, // deletion 错配开始的惩罚系数 - int e_del, // deletion extension的惩罚系数 - int o_ins, // insertion 错配开始的惩罚系数 - int e_ins, // insertion extension的惩罚系数SIMD_BTYES - int base_match_score, // 碱基match时的分数 - int base_mis_score, // 碱基mismatch时的惩罚分数(正数) - int window_size, // 提前剪枝系数,w =100 匹配位置和beg的最大距离 - int end_bonus, // 如果query比对到了最后一个字符,额外奖励分值 - int init_score, // 该seed的初始得分(完全匹配query的碱基数) - int *_qle, // 匹配得到全局最大得分的碱基在query的位置 - int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 - int *_gtle, // query全部匹配上的target的长度 - int *_gscore, // query的端到端匹配得分 - int *_max_off); // 取得最大得分时在query和reference上位置差的 最大值 - -#endif \ No newline at end of file diff --git a/ksw_ext_avx2.c b/ksw_ext_avx2.c deleted file mode 100644 index a35cff8..0000000 --- a/ksw_ext_avx2.c +++ /dev/null @@ -1,473 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include "thread_mem.h" -#include "common.h" - -#ifdef __GNUC__ -#define LIKELY(x) __builtin_expect((x), 1) -#define UNLIKELY(x) __builtin_expect((x), 0) -#else -#define LIKELY(x) (x) -#define UNLIKELY(x) (x) -#endif - -#define SIMD_WIDTH 16 - -static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { - {0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}}; - -// #define permute_mask _MM_SHUFFLE(0, 1, 2, 3) -#define permute_mask 27 -// 初始化变量 -#define SIMD_INIT \ - int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \ - __m256i zero_vec; \ - __m256i max_vec; \ - __m256i oe_del_vec; \ - __m256i oe_ins_vec; \ - __m256i e_del_vec; \ - __m256i e_ins_vec; \ - __m256i h_vec_mask[SIMD_WIDTH]; \ - zero_vec = _mm256_setzero_si256(); \ - oe_del_vec = _mm256_set1_epi16(-oe_del); \ - oe_ins_vec = _mm256_set1_epi16(-oe_ins); \ - e_del_vec = _mm256_set1_epi16(-e_del); \ - e_ins_vec = _mm256_set1_epi16(-e_ins); \ - __m256i match_sc_vec = _mm256_set1_epi16(base_match_score); \ - __m256i mis_sc_vec = _mm256_set1_epi16(-base_mis_score); \ - __m256i amb_sc_vec = _mm256_set1_epi16(-1); \ - __m256i amb_vec = _mm256_set1_epi16(4); \ - for (i = 0; i < SIMD_WIDTH; ++i) \ - h_vec_mask[i] = _mm256_loadu_si256((__m256i *)(&h_vec_int_mask[i])); - -/* - * e 表示当前ref的碱基被删除 - * f 表示当前seq的碱基插入 - * m 表示当前碱基匹配(可以相等,也可以不想等) - * h 表示最大值 - */ -// load向量化数据 -#define SIMD_LOAD \ - __m256i m1 = _mm256_loadu_si256((__m256i *)(&mA1[j])); \ - __m256i e1 = _mm256_loadu_si256((__m256i *)(&eA1[j])); \ - __m256i m1j1 = _mm256_loadu_si256((__m256i *)(&mA1[j - 1])); \ - __m256i f1j1 = _mm256_loadu_si256((__m256i *)(&fA1[j - 1])); \ - __m256i h0j1 = _mm256_loadu_si256((__m256i *)(&hA0[j - 1])); \ - __m256i qs_vec = _mm256_loadu_si256((__m256i *)(&seq[j - 1])); \ - __m256i ts_vec = _mm256_loadu_si256((__m256i *)(&ref[i])); - -// 比对ref和seq的序列,计算罚分 -#define SIMD_CMP_SEQ \ - ts_vec = _mm256_permute4x64_epi64(ts_vec, permute_mask); \ - ts_vec = _mm256_shufflelo_epi16(ts_vec, permute_mask); \ - ts_vec = _mm256_shufflehi_epi16(ts_vec, permute_mask); \ - __m256i match_mask_vec = _mm256_cmpeq_epi16(qs_vec, ts_vec); \ - __m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); \ - __m256i score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec); \ - score_vec = _mm256_or_si256(score_vec, mis_score_vec); \ - __m256i q_amb_mask_vec = _mm256_cmpeq_epi16(qs_vec, amb_vec); \ - __m256i t_amb_mask_vec = _mm256_cmpeq_epi16(ts_vec, amb_vec); \ - __m256i amb_mask_vec = _mm256_or_si256(q_amb_mask_vec, t_amb_mask_vec); \ - score_vec = _mm256_andnot_si256(amb_mask_vec, score_vec); \ - __m256i amb_score_vec = _mm256_and_si256(amb_mask_vec, amb_sc_vec); \ - score_vec = _mm256_or_si256(score_vec, amb_score_vec); - -// 向量化计算h, e, f, m -#define SIMD_COMPUTE \ - __m256i en_vec0 = _mm256_add_epi16(m1, oe_del_vec); \ - __m256i en_vec1 = _mm256_add_epi16(e1, e_del_vec); \ - __m256i en_vec = _mm256_max_epi16(en_vec0, en_vec1); \ - __m256i fn_vec0 = _mm256_add_epi16(m1j1, oe_ins_vec); \ - __m256i fn_vec1 = _mm256_add_epi16(f1j1, e_ins_vec); \ - __m256i fn_vec = _mm256_max_epi16(fn_vec0, fn_vec1); \ - __m256i mn_vec0 = _mm256_add_epi16(h0j1, score_vec); \ - __m256i mn_mask = _mm256_cmpgt_epi16(h0j1, zero_vec); \ - __m256i mn_vec = _mm256_and_si256(mn_vec0, mn_mask); \ - __m256i hn_vec0 = _mm256_max_epi16(en_vec, fn_vec); \ - __m256i hn_vec = _mm256_max_epi16(hn_vec0, mn_vec); \ - en_vec = _mm256_max_epi16(en_vec, zero_vec); \ - fn_vec = _mm256_max_epi16(fn_vec, zero_vec); \ - mn_vec = _mm256_max_epi16(mn_vec, zero_vec); \ - hn_vec = _mm256_max_epi16(hn_vec, zero_vec); - -/* int16_t *t_ptr = (int16_t *)&ts_vec; \ - fprintf(stderr, "D: %d, ibeg: %d, iend: %d, jbeg: %d, jend: %d, %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d \n", \ - D, ibeg, iend, beg, end, \ - t_ptr[0], t_ptr[1], t_ptr[2], t_ptr[3], \ - t_ptr[4], t_ptr[5], t_ptr[6], t_ptr[7], \ - t_ptr[8], t_ptr[9], t_ptr[10], t_ptr[11], \ - t_ptr[12], t_ptr[13], t_ptr[14], t_ptr[15]); -*/ - -// 存储向量化结果 -#define SIMD_STORE \ - max_vec = _mm256_max_epi16(max_vec, hn_vec); \ - _mm256_storeu_si256((__m256i *)&eA2[j], en_vec); \ - _mm256_storeu_si256((__m256i *)&fA2[j], fn_vec); \ - _mm256_storeu_si256((__m256i *)&mA2[j], mn_vec); \ - _mm256_storeu_si256((__m256i *)&hA2[j], hn_vec); - -// 去除多余的部分 -#define SIMD_REMOVE_EXTRA \ - en_vec = _mm256_and_si256(en_vec, h_vec_mask[end - j]); \ - fn_vec = _mm256_and_si256(fn_vec, h_vec_mask[end - j]); \ - mn_vec = _mm256_and_si256(mn_vec, h_vec_mask[end - j]); \ - hn_vec = _mm256_and_si256(hn_vec, h_vec_mask[end - j]); - -// 找最大值和位置 -#define SIMD_FIND_MAX \ - max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \ - max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \ - max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \ - max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \ - max_vec = _mm256_max_epu16(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \ - int16_t *maxVal = (int16_t *)&max_vec; \ - m = maxVal[0]; \ - if (m > 0) \ - { \ - for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \ - { \ - __m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \ - __m256i vcmp = _mm256_cmpeq_epi16(h2_vec, max_vec); \ - uint32_t mask = _mm256_movemask_epi8(vcmp); \ - if (mask > 0) \ - { \ - int pos = SIMD_WIDTH - 1 - ((__builtin_clz(mask)) >> 1); \ - mj = j - 1 + pos; \ - mi = i - 1 - pos; \ - } \ - } \ - } - -// 每轮迭代后,交换数组 -#define SWAP_DATA_POINTER \ - int16_t *tmp = hA0; \ - hA0 = hA1; \ - hA1 = hA2; \ - hA2 = tmp; \ - tmp = eA1; \ - eA1 = eA2; \ - eA2 = tmp; \ - tmp = fA1; \ - fA1 = fA2; \ - fA2 = tmp; \ - tmp = mA1; \ - mA1 = mA2; \ - mA2 = tmp; - -int ksw_extend_avx2(thread_mem_t *tmem, - int qlen, // query length 待匹配段碱基的query长度 - const uint8_t *query, // read碱基序列 - int tlen, // target length reference的长度 - const uint8_t *target, // reference序列 - int extend_left, // 是不是向左扩展 - int o_del, // deletion 错配开始的惩罚系数 - int e_del, // deletion extension的惩罚系数 - int o_ins, // insertion 错配开始的惩罚系数 - int e_ins, // insertion extension的惩罚系数SIMD_BTYES - int base_match_score, // 碱基match时的分数 - int base_mis_score, // 碱基mismatch时的惩罚分数(正数) - int window_size, // 提前剪枝系数,w =100 匹配位置和beg的最大距离 - int end_bonus, // 如果query比对到了最后一个字符,额外奖励分值 - int init_score, // 该seed的初始得分(完全匹配query的碱基数) - int *_qle, // 匹配得到全局最大得分的碱基在query的位置 - int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 - int *_gtle, // query全部匹配上的target的长度 - int *_gscore, // query的端到端匹配得分 - int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值 -{ - int16_t *mA, *hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H,其他的保存上个H E F M - int16_t *seq, *ref; - uint8_t *mem; - int16_t *qtmem, *vmem; - int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH; - int i, ibeg, iend, D, j, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; - int Dloop = tlen + qlen; // 循环跳出条件 - int span, beg1, end1; // 边界条件计算 - int col_size = qlen + 2 + SIMD_WIDTH; - int val_mem_size = (col_size * 9 * 2 + 31) >> 5 << 5; // 32字节的整数倍 - int mem_size = (seq_size + ref_size) * 2 + val_mem_size; - - SIMD_INIT; // 初始化simd用的数据 - - assert(init_score > 0); - - // allocate memory - mem = thread_mem_request(tmem, mem_size); - qtmem = (int16_t *)&mem[0]; - seq = &qtmem[0]; - ref = &qtmem[seq_size]; - if (extend_left) - { - for (i = 0; i < qlen; ++i) - seq[i] = query[qlen - 1 - i]; - for (i = 0; i < tlen; ++i) - ref[i + SIMD_WIDTH - 1] = target[tlen - 1 - i]; - } - else - { - for (i = 0; i < qlen; ++i) - seq[i] = query[i]; - for (i = 0; i < tlen; ++i) - ref[i + SIMD_WIDTH - 1] = target[i]; - } - - vmem = &ref[ref_size]; - for (i = 0; i < (val_mem_size >> 1); i += SIMD_WIDTH) - { - _mm256_storeu_si256((__m256i *)&vmem[i], zero_vec); - } - hA = &vmem[0]; - mA = &vmem[col_size * 3]; - eA = &vmem[col_size * 5]; - fA = &vmem[col_size * 7]; - - hA0 = &hA[0]; - hA1 = &hA[col_size]; - hA2 = &hA1[col_size]; - mA1 = &mA[0]; - mA2 = &mA[col_size]; - eA1 = &eA[0]; - eA2 = &eA[col_size]; - fA1 = &fA[0]; - fA2 = &fA[col_size]; - - // adjust $window_size if it is too large - // get the max score - max = base_match_score; - max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); - max_ins = max_ins > 1 ? max_ins : 1; - window_size = window_size < max_ins ? window_size : max_ins; - max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.); - max_del = max_del > 1 ? max_del : 1; - window_size = window_size < max_del ? window_size : max_del; // TODO: is this necessary? - if (tlen < qlen) - window_size = MIN(tlen - 1, window_size); - - // DP loop - max = init_score, max_i = max_j = -1; - max_ie = -1, gscore = -1; - - max_off = 0; - beg = 1; - end = qlen; - // init init_score - hA0[0] = init_score; // 左上角 - fA1[1] = MAX(0, init_score - (o_ins + e_ins)); - eA2[0] = init_score; - hA1[1] = fA1[1]; - if (qlen == 0 || tlen == 0) - Dloop = 0; // 防止意外情况 - if (window_size >= qlen) - { - max_ie = 0; - gscore = 0; - } -// fprintf(stderr, "qlen:%d, tlen:%d\n", qlen, tlen); -#ifdef DEBUG_OUT - int dii, djj; - int16_t ins[tlen + 1][qlen + 2]; - int16_t del[tlen + 1][qlen + 2]; - int16_t score[tlen + 1][qlen + 2]; - for (dii = 0; dii <= tlen; ++dii) - { - for (djj = 0; djj <= qlen; ++djj) - { - ins[dii][djj] = del[dii][djj] = score[dii][djj] = 0; - } - } - ins[0][0] = del[0][0] = score[0][0] = init_score; - ins[0][1] = MAX(0, init_score - (o_ins + e_ins)); - del[1][0] = MAX(0, init_score - (o_del + e_del)); - score[0][1] = ins[0][1]; - score[1][0] = del[1][0]; - // fprintf(stderr, "%d %d\n", del[1][0], score[1][0]); -#endif - - for (D = 1; LIKELY(D < Dloop); ++D) - { - if (D < tlen) - beg1 = 1; - else - beg1 = D - tlen + 1; - if (D < qlen) - end1 = D; // 闭区间 - else - end1 = qlen; - // beg1 = MAX(D - window_size, beg1); - // end1 = MIN(D + window_size, end1); - // beg = MAX(beg1, beg); - // end = MIN(end1, end); - // if (beg > end) - // break; - - beg = beg1; - end = end1; - - iend = D - beg; // ref开始计算的位置,倒序 - span = end - beg; - ibeg = iend - span; // 0开始的ref索引位置 - - // fprintf(stderr, "D:%d, jbeg:%d, jend:%d, ibeg:%d, iend:%d\n", D, beg, end, ibeg, iend); - - // 每一轮需要记录的数据 - int m = 0, mj = -1, mi = -1; - max_vec = zero_vec; - - // 处理左边界 - if (beg == 1) - { - hA0[0] = eA2[0]; - mA1[0] = 0; - eA1[0] = MAX(0, init_score - (o_del + e_del * (iend + 1))); -#ifdef DEBUG_OUT - del[iend + 1][0] = eA1[0]; - score[iend + 1][0] = eA1[0]; -#endif - } -#ifdef DEBUG_OUT - // fprintf(stderr, "eA1: %d\n", eA1[0]); - // for (djj = beg - 1; djj < end; ++djj) - //{ - // fprintf(stderr, "%d ", hA0[djj]); - //} - // fprintf(stderr, "\n"); -#endif - for (j = beg, i = iend; j <= end + 1 - SIMD_WIDTH; j += SIMD_WIDTH, i -= SIMD_WIDTH) - { - // 取数据 - SIMD_LOAD; - // 比对seq,计算罚分 - SIMD_CMP_SEQ; - // 计算 - SIMD_COMPUTE; - // 存储结果 - SIMD_STORE; - } - // 剩下的计算单元 - if (j <= end) - { - // 取数据 - SIMD_LOAD; - // 比对seq,计算罚分 - SIMD_CMP_SEQ; - // 计算 - SIMD_COMPUTE; - // 去除多余计算的部分 - SIMD_REMOVE_EXTRA; - // 存储结果 - SIMD_STORE; - } - // 处理上边界 - if (ibeg == 0) - { - fA2[end + 1] = MAX(0, init_score - (o_ins + e_ins * (end + 1))); - hA2[end + 1] = fA2[end + 1]; - mA2[end + 1] = 0; -#ifdef DEBUG_OUT - ins[0][end + 1] = fA2[end + 1]; - score[0][end + 1] = fA2[end + 1]; -#endif - } - - SIMD_FIND_MAX; - -#ifdef DEBUG_OUT - for (djj = beg; djj <= end; ++djj) - { - dii = D - djj + 1; - // fprintf(stderr, "dii:%d, djj:%d, ", dii, djj); - ins[dii][djj] = fA2[djj]; - del[dii][djj] = eA2[djj]; - score[dii][djj] = hA2[djj]; - } - // fprintf(stderr, "\n"); - // fprintf(stderr, "%d, %d\n", hA2[0], hA2[1]); -#endif - - // 注意最后跳出循环j的值 - j = end + 1; - - if (j == qlen + 1) - { - max_ie = gscore > hA2[qlen] ? max_ie : ibeg; - gscore = gscore > hA2[qlen] ? gscore : hA2[qlen]; - } - - if (m > max) - { - max = m, max_i = mi, max_j = mj; - max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi); - } - - // 调整计算的边界 - // for (j = beg; LIKELY(j <= end); ++j) - //{ - // int has_val = hA1[j - 1] | hA2[j]; - // if (has_val) - // break; - //} - // beg = j; - // for (j = end + 1; LIKELY(j >= beg); --j) - //{ - // int has_val = hA1[j - 1] | hA2[j]; - // if (has_val) - // break; - //} - // end = j + 1 <= qlen ? j + 1 : qlen; - // beg = 0; - // end = qlen; // uncomment this line for debugging - - // swap m, h, e, f - SWAP_DATA_POINTER; - } - -#ifdef DEBUG_OUT - for (dii = 0; dii <= tlen; ++dii) - { - for (djj = 0; djj <= qlen; ++djj) - { - fprintf(score_f_arr[1], "%-4d", score[dii][djj]); - fprintf(ins_ext_f_arr[1], "%-4d", ins[dii][djj]); - fprintf(del_ext_f_arr[1], "%-4d", del[dii][djj]); - } - fprintf(score_f_arr[1], "\n"); - fprintf(ins_ext_f_arr[1], "\n"); - fprintf(del_ext_f_arr[1], "\n"); - } -#endif - - // free(mem); - thread_mem_release(tmem, mem_size); - if (_qle) - *_qle = max_j + 1; - if (_tle) - *_tle = max_i + 1; - if (_gtle) - *_gtle = max_ie + 1; - if (_gscore) - *_gscore = gscore; - if (_max_off) - *_max_off = max_off; - return max; -} diff --git a/ksw_ext_avx2_aligned.c b/ksw_ext_avx2_aligned.c deleted file mode 100644 index 9951ee9..0000000 --- a/ksw_ext_avx2_aligned.c +++ /dev/null @@ -1,445 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "thread_mem.h" -#include "common.h" - -#ifdef __GNUC__ -#define LIKELY(x) __builtin_expect((x), 1) -#define UNLIKELY(x) __builtin_expect((x), 0) -#else -#define LIKELY(x) (x) -#define UNLIKELY(x) (x) -#endif - -#undef MAX -#undef MIN -#define MAX(x, y) ((x) > (y) ? (x) : (y)) -#define MIN(x, y) ((x) < (y) ? (x) : (y)) -#define SIMD_WIDTH 16 -#define BASE_BYTES 2 -#define SCORE_BYTES 2 -#define BOUNDARY_SCORE_NUM 2 -#define TMP_SCORE_ARRAY_NUM 9 -#define MEM_ALIGN_BYTES 32 -#define ALIGN_SHIFT_BITS 5 -#define SIMD_BYTES 32 - -#define AMBIGUOUS_BASE_CODE 4 -#define AMBIGUOUS_BASE_SCORE -1 - -// 32字节对齐(256位) -#define align_mem(x) (((x) + 31) >> 5 << 5) -#define align_number(x) align_mem(x) - -/* 去掉多余计算的值 */ -static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { - {0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}}; - -// #define permute_mask _MM_SHUFFLE(0, 1, 2, 3) -#define permute_mask 27 -// 初始化变量 -#define SIMD_INIT \ - int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \ - __m256i zero_vec; \ - __m256i max_vec, last_max_vec = _mm256_set1_epi16(init_score); \ - __m256i oe_del_vec; \ - __m256i oe_ins_vec; \ - __m256i e_del_vec; \ - __m256i e_ins_vec; \ - __m256i h_vec_mask[SIMD_WIDTH]; \ - zero_vec = _mm256_setzero_si256(); \ - oe_del_vec = _mm256_set1_epi16(-oe_del); \ - oe_ins_vec = _mm256_set1_epi16(-oe_ins); \ - e_del_vec = _mm256_set1_epi16(-e_del); \ - e_ins_vec = _mm256_set1_epi16(-e_ins); \ - __m256i match_sc_vec = _mm256_set1_epi16(base_match_score); \ - __m256i mis_sc_vec = _mm256_set1_epi16(-base_mis_score); \ - __m256i amb_sc_vec = _mm256_set1_epi16(AMBIGUOUS_BASE_SCORE); \ - __m256i amb_vec = _mm256_set1_epi16(AMBIGUOUS_BASE_CODE); \ - for (i = 0; i < SIMD_WIDTH; ++i) \ - h_vec_mask[i] = _mm256_loadu_si256((__m256i *)(&h_vec_int_mask[i])); - -/* - * e 表示当前ref的碱基被删除 - * f 表示当前seq的碱基插入 - * m 表示当前碱基匹配(可以相等,也可以不想等) - * h 表示最大值 - */ -// load向量化数据 -#define SIMD_LOAD \ - __m256i m1 = _mm256_loadu_si256((__m256i *)(&mA1[j])); \ - __m256i e1 = _mm256_loadu_si256((__m256i *)(&eA1[j])); \ - __m256i m1j1 = _mm256_loadu_si256((__m256i *)(&mA1[j - 1])); \ - __m256i f1j1 = _mm256_loadu_si256((__m256i *)(&fA1[j - 1])); \ - __m256i h0j1 = _mm256_loadu_si256((__m256i *)(&hA0[j - 1])); \ - __m256i qs_vec = _mm256_loadu_si256((__m256i *)(&seq[j])); \ - __m256i ts_vec = _mm256_loadu_si256((__m256i *)(&ref[i])); - -// 比对ref和seq的序列,计算罚分 -#define SIMD_CMP_SEQ \ - /* 将待比对的target序列逆序排列 */ \ - ts_vec = _mm256_permute4x64_epi64(ts_vec, permute_mask); \ - ts_vec = _mm256_shufflelo_epi16(ts_vec, permute_mask); \ - ts_vec = _mm256_shufflehi_epi16(ts_vec, permute_mask); \ - __m256i match_mask_vec = _mm256_cmpeq_epi16(qs_vec, ts_vec); /* 比对query和target字符序列 */ \ - __m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); /* 未匹配上的位置赋值mismatch分数 */ \ - __m256i score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec); /* 匹配上的位置赋值match分数 */ \ - score_vec = _mm256_or_si256(score_vec, mis_score_vec); \ - /* 计算模棱两可的字符(N)的位置的分数 */ \ - __m256i q_amb_mask_vec = _mm256_cmpeq_epi16(qs_vec, amb_vec); \ - __m256i t_amb_mask_vec = _mm256_cmpeq_epi16(ts_vec, amb_vec); \ - __m256i amb_mask_vec = _mm256_or_si256(q_amb_mask_vec, t_amb_mask_vec); \ - score_vec = _mm256_andnot_si256(amb_mask_vec, score_vec); \ - __m256i amb_score_vec = _mm256_and_si256(amb_mask_vec, amb_sc_vec); \ - score_vec = _mm256_or_si256(score_vec, amb_score_vec); - -// 向量化计算h, e, f, m -#define SIMD_COMPUTE \ - __m256i en_vec0 = _mm256_add_epi16(m1, oe_del_vec); \ - __m256i en_vec1 = _mm256_add_epi16(e1, e_del_vec); \ - __m256i en_vec = _mm256_max_epi16(en_vec0, en_vec1); \ - __m256i fn_vec0 = _mm256_add_epi16(m1j1, oe_ins_vec); \ - __m256i fn_vec1 = _mm256_add_epi16(f1j1, e_ins_vec); \ - __m256i fn_vec = _mm256_max_epi16(fn_vec0, fn_vec1); \ - __m256i mn_vec0 = _mm256_add_epi16(h0j1, score_vec); \ - __m256i mn_mask = _mm256_cmpgt_epi16(h0j1, zero_vec); \ - __m256i mn_vec = _mm256_and_si256(mn_vec0, mn_mask); \ - __m256i hn_vec0 = _mm256_max_epi16(en_vec, fn_vec); \ - __m256i hn_vec = _mm256_max_epi16(hn_vec0, mn_vec); \ - en_vec = _mm256_max_epi16(en_vec, zero_vec); \ - fn_vec = _mm256_max_epi16(fn_vec, zero_vec); \ - mn_vec = _mm256_max_epi16(mn_vec, zero_vec); \ - hn_vec = _mm256_max_epi16(hn_vec, zero_vec); - -#define SIMD_STORE \ - max_vec = _mm256_max_epu8(max_vec, hn_vec); \ - _mm256_storeu_si256((__m256i *)&eA2[j], en_vec); \ - _mm256_storeu_si256((__m256i *)&fA2[j], fn_vec); \ - _mm256_storeu_si256((__m256i *)&mA2[j], mn_vec); \ - _mm256_storeu_si256((__m256i *)&hA2[j], hn_vec); - -// 去除多余的部分 -#define SIMD_REMOVE_EXTRA \ - en_vec = _mm256_and_si256(en_vec, h_vec_mask[end - j]); \ - fn_vec = _mm256_and_si256(fn_vec, h_vec_mask[end - j]); \ - mn_vec = _mm256_and_si256(mn_vec, h_vec_mask[end - j]); \ - hn_vec = _mm256_and_si256(hn_vec, h_vec_mask[end - j]); - -// 找最大值和位置 -#define SIMD_FIND_MAX \ - __m256i cmp_max = _mm256_cmpgt_epi16(max_vec, last_max_vec); \ - uint32_t cmp_result = _mm256_movemask_epi8(cmp_max); \ - if (cmp_result > 0) \ - { \ - max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \ - max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \ - max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \ - max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \ - max_vec = _mm256_max_epu16(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \ - int16_t *maxVal = (int16_t *)&max_vec; \ - m = maxVal[0]; \ - for (j = aligned_read_start_pos, i = aligned_ref_end_pos; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \ - { \ - __m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \ - __m256i vcmp = _mm256_cmpeq_epi16(h2_vec, max_vec); \ - uint32_t mask = _mm256_movemask_epi8(vcmp); \ - if (mask > 0) \ - { \ - int pos = SIMD_WIDTH - 1 - ((__builtin_clz(mask)) >> 1); \ - mj = j - 1 + pos; \ - mi = i - 1 - pos; \ - for (; mj + 1 < qlen && mi + 1 < tlen; mj++, mi++) \ - { \ - if (seq[mj + 2] == ref[mi + 1 + SIMD_WIDTH]) \ - { \ - m += base_match_score; \ - } \ - else \ - { \ - break; \ - } \ - } \ - } \ - } \ - last_max_vec = _mm256_set1_epi16(m); \ - } - -// 每轮迭代后,交换数组 -#define SWAP_DATA_POINTER \ - int16_t *tmp = hA0; \ - hA0 = hA1; \ - hA1 = hA2; \ - hA2 = tmp; \ - tmp = eA1; \ - eA1 = eA2; \ - eA2 = tmp; \ - tmp = fA1; \ - fA1 = fA2; \ - fA2 = tmp; \ - tmp = mA1; \ - mA1 = mA2; \ - mA2 = tmp; - -int ksw_extend_avx2_aligned(thread_mem_t *tmem, - int qlen, // query length 待匹配段碱基的query长度 - const uint8_t *query, // read碱基序列 - int tlen, // target length reference的长度 - const uint8_t *target, // reference序列 - int extend_left, // 是不是向左扩展 - int o_del, // deletion 错配开始的惩罚系数 - int e_del, // deletion extension的惩罚系数 - int o_ins, // insertion 错配开始的惩罚系数 - int e_ins, // insertion extension的惩罚系数SIMD_BTYES - int base_match_score, // 碱基match时的分数 - int base_mis_score, // 碱基mismatch时的惩罚分数(正数) - int window_size, // 提前剪枝系数,w =100 匹配位置和beg的最大距离 - int end_bonus, // 如果query比对到了最后一个字符,额外奖励分值 - int init_score, // 该seed的初始得分(完全匹配query的碱基数) - int *_qle, // 匹配得到全局最大得分的碱基在query的位置 - int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 - int *_gtle, // query全部匹配上的target的长度 - int *_gscore, // query的端到端匹配得分 - int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值 -{ - int16_t *mA1, *mA2, - *hA0, *hA1, *hA2, - *eA1, *eA2, - *fA1, *fA2; // hA0保存上上个col的H,其他的保存上个H E F M - int16_t *seq, *ref; - uint8_t *mem_addr; - - int read_size = align_number(qlen * BASE_BYTES + MEM_ALIGN_BYTES); - int ref_size = align_number((tlen + SIMD_WIDTH) * BASE_BYTES); - int back_diagnal_num = tlen + qlen; // 循环跳出条件 D从1开始遍历 - int score_array_size = align_number((qlen + BOUNDARY_SCORE_NUM) * SCORE_BYTES); - int score_element_num = score_array_size / SCORE_BYTES; - int score_mem_size = score_array_size * TMP_SCORE_ARRAY_NUM; - int request_mem_size = read_size + ref_size + score_mem_size + MEM_ALIGN_BYTES * 3; // 左侧内存地址对齐 + 数据向左偏移一个元素 + 末尾SIMD补齐 - - int i, ibeg, D, j, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; - int span, beg1, end1; // 边界条件计算 - int aligned_read_start_pos, aligned_ref_end_pos; - int iend; - - SIMD_INIT; // 初始化simd用的数据 - - assert(init_score > 0); - - // allocate memory - mem_addr = thread_mem_request(tmem, request_mem_size); - mem_addr = (void *)align_mem((uint64_t)mem_addr); - ref = (int16_t *)&mem_addr[0]; - seq = (int16_t *)(mem_addr + ref_size + SIMD_BYTES - BASE_BYTES); - if (extend_left) - { - for (i = 0; i < qlen; ++i) - seq[i + 1] = query[qlen - 1 - i]; - for (i = 0; i < tlen; ++i) - ref[i + SIMD_WIDTH - 1] = target[tlen - 1 - i]; - } - else - { - for (i = 0; i < qlen; ++i) - seq[i + 1] = query[i]; - for (i = 0; i < tlen; ++i) - ref[i + SIMD_WIDTH - 1] = target[i]; - } - - mem_addr += read_size + ref_size + (SIMD_BYTES - SCORE_BYTES); - for (i = 0; i < score_mem_size; i += SIMD_BYTES) - { - _mm256_storeu_si256((__m256i *)&mem_addr[i], zero_vec); - } - - hA0 = (int16_t *)&mem_addr[0]; - hA1 = &hA0[score_element_num]; - hA2 = &hA1[score_element_num]; - mA1 = &hA2[score_element_num]; - mA2 = &mA1[score_element_num]; - eA1 = &mA2[score_element_num]; - eA2 = &eA1[score_element_num]; - fA1 = &eA2[score_element_num]; - fA2 = &fA1[score_element_num]; - - // adjust $window_size if it is too large - // get the max score - max = base_match_score; - max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); - max_ins = max_ins > 1 ? max_ins : 1; - window_size = window_size < max_ins ? window_size : max_ins; - max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.); - max_del = max_del > 1 ? max_del : 1; - window_size = window_size < max_del ? window_size : max_del; // TODO: is this necessary? - if (tlen < qlen) - window_size = MIN(tlen - 1, window_size); - - // DP loop - max = init_score, max_i = max_j = -1; - max_ie = -1, gscore = -1; - ; - max_off = 0; - beg = 1; - end = qlen; - // init init_score - hA0[0] = init_score; // 左上角 - fA1[1] = MAX(0, init_score - (o_ins + e_ins)); - eA2[0] = init_score; - hA1[1] = fA1[1]; - - if (qlen == 0 || tlen == 0) - back_diagnal_num = 0; // 防止意外情况 - if (window_size >= qlen) - { - max_ie = 0; - gscore = 0; - } - - for (D = 1; LIKELY(D < back_diagnal_num); ++D) - { - if (D < tlen) - beg1 = 1; - else - beg1 = D - tlen + 1; - if (D < qlen) - end1 = D; // 闭区间 - else - end1 = qlen; - beg1 = MAX(D - window_size, beg1); - end1 = MIN(D + window_size, end1); - - beg = MAX(beg1, beg); - end = MIN(end1, end); - if (beg > end) - break; - - // beg = beg1; - // end = end1; - - iend = D - beg; // ref开始计算的位置,倒序 - span = end - beg; - ibeg = iend - span; // 0开始的ref索引位置 - - // 每一轮需要记录的数据 - int m = 0, mj = -1, mi = -1; - max_vec = zero_vec; - // 处理左边界 - if (beg == 1) - { - hA0[0] = eA2[0]; - mA1[0] = 0; - eA1[0] = MAX(0, init_score - (o_del + e_del * (iend + 1))); - } - - // aligned_read_start_pos = (beg >> ALIGN_SHIFT_BITS << ALIGN_SHIFT_BITS) + 1; - // aligned_ref_end_pos = iend + (beg - aligned_read_start_pos); - - aligned_read_start_pos = beg; - aligned_ref_end_pos = iend; - - // fprintf(stderr, "%d\t%d\n", beg, aligned_read_start_pos); - - for (j = aligned_read_start_pos, i = aligned_ref_end_pos; j <= end + 1 - SIMD_WIDTH; j += SIMD_WIDTH, i -= SIMD_WIDTH) - { - // 取数据 - SIMD_LOAD; - // 比对seq,计算罚分 - SIMD_CMP_SEQ; - // 计算 - SIMD_COMPUTE; - // 存储结果 - SIMD_STORE; - } - // 剩下的计算单元 - if (j <= end) - { - // 取数据 - SIMD_LOAD; - // 比对seq,计算罚分 - SIMD_CMP_SEQ; - // 计算 - SIMD_COMPUTE; - // 去除多余计算的部分 - SIMD_REMOVE_EXTRA; - // 存储结果 - SIMD_STORE; - } - // 处理上边界 - if (ibeg == 0) - { - fA2[end + 1] = MAX(0, init_score - (o_ins + e_ins * (end + 1))); - hA2[end + 1] = fA2[end + 1]; - mA2[end + 1] = 0; - } - - SIMD_FIND_MAX; - - // 注意最后跳出循环j的值 - j = end + 1; - - if (j == qlen + 1) // 遍历到了query最后一个碱基,此时next_max_arr[qlen]为全局匹配的最大分值 - { - max_ie = gscore > hA2[qlen] ? max_ie : ibeg; - gscore = gscore > hA2[qlen] ? gscore : hA2[qlen]; - } - if (m > max) - { - max = m, max_i = mi, max_j = mj; - max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi); - } - - // 调整计算的边界 - for (j = beg; LIKELY(j <= end); ++j) - { - int has_val = hA1[j - 1] | hA2[j]; - if (has_val) - { - break; - } - } - beg = j; - - for (j = end + 1; LIKELY(j >= beg); --j) - { - int has_val = hA1[j - 1] | hA2[j]; - if (has_val) - { - break; - } - } - end = j + 1 <= qlen ? j + 1 : qlen; - // swap m, h, e, f - SWAP_DATA_POINTER; - } - - thread_mem_release(tmem, request_mem_size); - if (_qle) - *_qle = max_j + 1; - if (_tle) - *_tle = max_i + 1; - if (_gtle) - *_gtle = max_ie + 1; - if (_gscore) - *_gscore = gscore; - if (_max_off) - *_max_off = max_off; - return max; -} \ No newline at end of file diff --git a/ksw_ext_avx2_heuristics.c b/ksw_ext_avx2_heuristics.c deleted file mode 100644 index 9c2050a..0000000 --- a/ksw_ext_avx2_heuristics.c +++ /dev/null @@ -1,431 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include "thread_mem.h" -#include "common.h" - -#ifdef __GNUC__ -#define LIKELY(x) __builtin_expect((x), 1) -#define UNLIKELY(x) __builtin_expect((x), 0) -#else -#define LIKELY(x) (x) -#define UNLIKELY(x) (x) -#endif - -#undef MAX -#undef MIN -#define MAX(x, y) ((x) > (y) ? (x) : (y)) -#define MIN(x, y) ((x) < (y) ? (x) : (y)) -#define SIMD_WIDTH 16 - -#define AMBIGUOUS_BASE_CODE 4 -#define AMBIGUOUS_BASE_SCORE -1 - -/* 去掉多余计算的值 */ -static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { - {0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0}, - {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}}; - -// #define permute_mask _MM_SHUFFLE(0, 1, 2, 3) -#define permute_mask 27 -// 初始化变量 -#define SIMD_INIT \ - int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \ - __m256i zero_vec; \ - __m256i max_vec, last_max_vec = _mm256_set1_epi16(init_score); \ - __m256i oe_del_vec; \ - __m256i oe_ins_vec; \ - __m256i e_del_vec; \ - __m256i e_ins_vec; \ - __m256i h_vec_mask[SIMD_WIDTH]; \ - zero_vec = _mm256_setzero_si256(); \ - oe_del_vec = _mm256_set1_epi16(-oe_del); \ - oe_ins_vec = _mm256_set1_epi16(-oe_ins); \ - e_del_vec = _mm256_set1_epi16(-e_del); \ - e_ins_vec = _mm256_set1_epi16(-e_ins); \ - __m256i match_sc_vec = _mm256_set1_epi16(base_match_score); \ - __m256i mis_sc_vec = _mm256_set1_epi16(-base_mis_score); \ - __m256i amb_sc_vec = _mm256_set1_epi16(AMBIGUOUS_BASE_SCORE); \ - __m256i amb_vec = _mm256_set1_epi16(AMBIGUOUS_BASE_CODE); \ - for (i = 0; i < SIMD_WIDTH; ++i) \ - h_vec_mask[i] = _mm256_loadu_si256((__m256i *)(&h_vec_int_mask[i])); - -/* - * e 表示当前ref的碱基被删除 - * f 表示当前seq的碱基插入 - * m 表示当前碱基匹配(可以相等,也可以不想等) - * h 表示最大值 - */ -// load向量化数据 -#define SIMD_LOAD \ - __m256i m1 = _mm256_loadu_si256((__m256i *)(&cur_match_arr[j])); \ - __m256i e1 = _mm256_loadu_si256((__m256i *)(&cur_del_arr[j])); \ - __m256i m1j1 = _mm256_loadu_si256((__m256i *)(&cur_match_arr[j - 1])); \ - __m256i f1j1 = _mm256_loadu_si256((__m256i *)(&cur_ins_arr[j - 1])); \ - __m256i h0j1 = _mm256_loadu_si256((__m256i *)(&last_max_arr[j - 1])); \ - __m256i qs_vec = _mm256_loadu_si256((__m256i *)(&seq[j - 1])); \ - __m256i ts_vec = _mm256_loadu_si256((__m256i *)(&ref[i])); - -// 比对ref和seq的序列,计算罚分 -#define SIMD_CMP_SEQ \ - /* 将待比对的target序列逆序排列 */ \ - ts_vec = _mm256_permute4x64_epi64(ts_vec, permute_mask); \ - ts_vec = _mm256_shufflelo_epi16(ts_vec, permute_mask); \ - ts_vec = _mm256_shufflehi_epi16(ts_vec, permute_mask); \ - __m256i match_mask_vec = _mm256_cmpeq_epi16(qs_vec, ts_vec); /* 比对query和target字符序列 */ \ - __m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); /* 未匹配上的位置赋值mismatch分数 */ \ - __m256i score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec); /* 匹配上的位置赋值match分数 */ \ - score_vec = _mm256_or_si256(score_vec, mis_score_vec); \ - /* 计算模棱两可的字符(N)的位置的分数 */ \ - __m256i q_amb_mask_vec = _mm256_cmpeq_epi16(qs_vec, amb_vec); \ - __m256i t_amb_mask_vec = _mm256_cmpeq_epi16(ts_vec, amb_vec); \ - __m256i amb_mask_vec = _mm256_or_si256(q_amb_mask_vec, t_amb_mask_vec); \ - score_vec = _mm256_andnot_si256(amb_mask_vec, score_vec); \ - __m256i amb_score_vec = _mm256_and_si256(amb_mask_vec, amb_sc_vec); \ - score_vec = _mm256_or_si256(score_vec, amb_score_vec); - -// 向量化计算h, e, f, m -#define SIMD_COMPUTE \ - __m256i en_vec0 = _mm256_add_epi16(m1, oe_del_vec); \ - __m256i en_vec1 = _mm256_add_epi16(e1, e_del_vec); \ - __m256i en_vec = _mm256_max_epi16(en_vec0, en_vec1); \ - __m256i fn_vec0 = _mm256_add_epi16(m1j1, oe_ins_vec); \ - __m256i fn_vec1 = _mm256_add_epi16(f1j1, e_ins_vec); \ - __m256i fn_vec = _mm256_max_epi16(fn_vec0, fn_vec1); \ - __m256i mn_vec0 = _mm256_add_epi16(h0j1, score_vec); \ - __m256i mn_mask = _mm256_cmpgt_epi16(h0j1, zero_vec); \ - __m256i mn_vec = _mm256_and_si256(mn_vec0, mn_mask); \ - __m256i hn_vec0 = _mm256_max_epi16(en_vec, fn_vec); \ - __m256i hn_vec = _mm256_max_epi16(hn_vec0, mn_vec); \ - en_vec = _mm256_max_epi16(en_vec, zero_vec); \ - fn_vec = _mm256_max_epi16(fn_vec, zero_vec); \ - mn_vec = _mm256_max_epi16(mn_vec, zero_vec); \ - hn_vec = _mm256_max_epi16(hn_vec, zero_vec); - -// 存储向量化结果 -#define SIMD_STORE \ - max_vec = _mm256_max_epu8(max_vec, hn_vec); \ - _mm256_storeu_si256((__m256i *)&next_del_arr[j], en_vec); \ - _mm256_storeu_si256((__m256i *)&next_ins_arr[j], fn_vec); \ - _mm256_storeu_si256((__m256i *)&next_match_arr[j], mn_vec); \ - _mm256_storeu_si256((__m256i *)&next_max_arr[j], hn_vec); - -// 去除多余的部分 -#define SIMD_REMOVE_EXTRA \ - en_vec = _mm256_and_si256(en_vec, h_vec_mask[end - j]); \ - fn_vec = _mm256_and_si256(fn_vec, h_vec_mask[end - j]); \ - mn_vec = _mm256_and_si256(mn_vec, h_vec_mask[end - j]); \ - hn_vec = _mm256_and_si256(hn_vec, h_vec_mask[end - j]); - -// 找最大值和位置 -// last_max_vec = max_vec; -#define SIMD_FIND_MAX \ - __m256i cmp_max = _mm256_cmpgt_epi16(max_vec, last_max_vec); \ - uint32_t cmp_result = _mm256_movemask_epi8(cmp_max); \ - if (cmp_result > 0) \ - { \ - max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \ - max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \ - max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \ - max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \ - max_vec = _mm256_max_epu16(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \ - int16_t *maxVal = (int16_t *)&max_vec; \ - m = maxVal[0]; \ - for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \ - { \ - __m256i h2_vec = _mm256_loadu_si256((__m256i *)(&next_max_arr[j])); \ - __m256i vcmp = _mm256_cmpeq_epi16(h2_vec, max_vec); \ - uint32_t mask = _mm256_movemask_epi8(vcmp); \ - if (mask > 0) \ - { \ - int pos = SIMD_WIDTH - 1 - ((__builtin_clz(mask)) >> 1); \ - mj = j - 1 + pos; \ - mi = i - 1 - pos; \ - for (; mj + 1 < qlen && mi + 1 < tlen; mj++, mi++) \ - { \ - if (seq[mj + 1] == ref[mi + 1 + SIMD_WIDTH]) \ - { \ - m += base_match_score; \ - } \ - else \ - { \ - break; \ - } \ - } \ - } \ - } \ - last_max_vec = _mm256_set1_epi16(m); \ - } - -// 每轮迭代后,交换数组 -#define SWAP_DATA_POINTER \ - int16_t *tmp = last_max_arr; \ - last_max_arr = cur_max_arr; \ - cur_max_arr = next_max_arr; \ - next_max_arr = tmp; \ - tmp = cur_del_arr; \ - cur_del_arr = next_del_arr; \ - next_del_arr = tmp; \ - tmp = cur_ins_arr; \ - cur_ins_arr = next_ins_arr; \ - next_ins_arr = tmp; \ - tmp = cur_match_arr; \ - cur_match_arr = next_match_arr; \ - next_match_arr = tmp; - -int ksw_extend_avx2_heuristics(thread_mem_t *tmem, - int qlen, // query length 待匹配段碱基的query长度 - const uint8_t *query, // read碱基序列 - int tlen, // target length reference的长度 - const uint8_t *target, // reference序列 - int extend_left, // 是不是向左扩展 - int o_del, // deletion 错配开始的惩罚系数 - int e_del, // deletion extension的惩罚系数 - int o_ins, // insertion 错配开始的惩罚系数 - int e_ins, // insertion extension的惩罚系数SIMD_BTYES - int base_match_score, // 碱基match时的分数 - int base_mis_score, // 碱基mismatch时的惩罚分数(正数) - int window_size, // 提前剪枝系数,w =100 匹配位置和beg的最大距离 - int end_bonus, // 如果query比对到了最后一个字符,额外奖励分值 - int init_score, // 该seed的初始得分(完全匹配query的碱基数) - int *_qle, // 匹配得到全局最大得分的碱基在query的位置 - int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 - int *_gtle, // query全部匹配上的target的长度 - int *_gscore, // query的端到端匹配得分 - int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值 -{ - // return 0; - int16_t *mA, *eA, *hA, *fA, - *cur_match_arr, *next_match_arr, - *last_max_arr, *cur_max_arr, *next_max_arr, - *cur_del_arr, *next_del_arr, - *cur_ins_arr, *next_ins_arr; // hA0保存上上个col的H,其他的保存上个H E F M - int16_t *seq, *ref; - uint8_t *mem; - int16_t *qtmem, *vmem; - int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH; - int i, ibeg, iend, D, j, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; - int Dloop = tlen + qlen; // 循环跳出条件 D从1开始遍历 - int span, beg1, end1; // 边界条件计算 - int col_size = qlen + 2 + SIMD_WIDTH; - int val_mem_size = (col_size * 9 * 2 + 31) >> 5 << 5; // 32字节的整数倍 - int mem_size = (seq_size + ref_size) * 2 + val_mem_size; - - SIMD_INIT; // 初始化simd用的数据 - - assert(init_score > 0); - - // allocate memory - mem = thread_mem_request(tmem, mem_size); - qtmem = (int16_t *)&mem[0]; - seq = &qtmem[0]; - ref = &qtmem[seq_size]; - if (extend_left) - { - for (i = 0; i < qlen; ++i) - seq[i] = query[qlen - 1 - i]; - for (i = 0; i < tlen; ++i) - ref[i + SIMD_WIDTH - 1] = target[tlen - 1 - i]; - } - else - { - for (i = 0; i < qlen; ++i) - seq[i] = query[i]; - for (i = 0; i < tlen; ++i) - ref[i + SIMD_WIDTH - 1] = target[i]; - } - - vmem = &ref[ref_size]; - for (i = 0; i < (val_mem_size >> 1); i += SIMD_WIDTH) - { - _mm256_storeu_si256((__m256i *)&vmem[i], zero_vec); - } - hA = &vmem[0]; - mA = &vmem[col_size * 3]; - eA = &vmem[col_size * 5]; - fA = &vmem[col_size * 7]; - - last_max_arr = &hA[0]; - cur_max_arr = &hA[col_size]; - next_max_arr = &cur_max_arr[col_size]; - cur_match_arr = &mA[0]; - next_match_arr = &mA[col_size]; - cur_del_arr = &eA[0]; - next_del_arr = &eA[col_size]; - cur_ins_arr = &fA[0]; - next_ins_arr = &fA[col_size]; - - // adjust $window_size if it is too large - // get the max score - max = base_match_score; - max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); - max_ins = max_ins > 1 ? max_ins : 1; - window_size = window_size < max_ins ? window_size : max_ins; - max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.); - max_del = max_del > 1 ? max_del : 1; - window_size = window_size < max_del ? window_size : max_del; // TODO: is this necessary? - if (tlen < qlen) - window_size = MIN(tlen - 1, window_size); - - // DP loop - max = init_score, max_i = max_j = -1; - max_ie = -1, gscore = -1; - - max_off = 0; - beg = 1; - end = qlen; - // init init_score - last_max_arr[0] = init_score; // 左上角 - cur_ins_arr[1] = MAX(0, init_score - (o_ins + e_ins)); - next_del_arr[0] = init_score; - cur_max_arr[1] = cur_ins_arr[1]; - - if (qlen == 0 || tlen == 0) - Dloop = 0; // 防止意外情况 - if (window_size >= qlen) - { - max_ie = 0; - gscore = 0; - } - - for (D = 1; LIKELY(D < Dloop); ++D) - { - if (D < tlen) - beg1 = 1; - else - beg1 = D - tlen + 1; - if (D < qlen) - end1 = D; // 闭区间 - else - end1 = qlen; - beg1 = MAX(D - window_size, beg1); - end1 = MIN(D + window_size, end1); - - beg = MAX(beg1, beg); - end = MIN(end1, end); - if (beg > end) - break; - - // beg = beg1; - // end = end1; - - iend = D - beg; // ref开始计算的位置,倒序 - span = end - beg; - ibeg = iend - span; // 0开始的ref索引位置 - - // 每一轮需要记录的数据 - int m = 0, mj = -1, mi = -1; - max_vec = zero_vec; - - // 处理左边界 - if (beg == 1) - { - last_max_arr[0] = next_del_arr[0]; - cur_match_arr[0] = 0; - cur_del_arr[0] = MAX(0, init_score - (o_del + e_del * (iend + 1))); - } - - for (j = beg, i = iend; j <= end + 1 - SIMD_WIDTH; j += SIMD_WIDTH, i -= SIMD_WIDTH) - { - // 取数据 - SIMD_LOAD; - // 比对seq,计算罚分 - SIMD_CMP_SEQ; - // 计算 - SIMD_COMPUTE; - // 存储结果 - SIMD_STORE; - } - // 剩下的计算单元 - if (j <= end) - { - // 取数据 - SIMD_LOAD; - // 比对seq,计算罚分 - SIMD_CMP_SEQ; - // 计算 - SIMD_COMPUTE; - // 去除多余计算的部分 - SIMD_REMOVE_EXTRA; - // 存储结果 - SIMD_STORE; - } - // 处理上边界 - if (ibeg == 0) - { - next_ins_arr[end + 1] = MAX(0, init_score - (o_ins + e_ins * (end + 1))); - next_max_arr[end + 1] = next_ins_arr[end + 1]; - next_match_arr[end + 1] = 0; - } - - SIMD_FIND_MAX; - - // 注意最后跳出循环j的值 - j = end + 1; - - if (j == qlen + 1) - { - max_ie = gscore > next_max_arr[qlen] ? max_ie : ibeg; - gscore = gscore > next_max_arr[qlen] ? gscore : next_max_arr[qlen]; - } - if (m > max) - { - max = m, max_i = mi, max_j = mj; - max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi); - } - - for (j = beg; LIKELY(j <= end); ++j) - { - int has_val = cur_max_arr[j - 1] | next_max_arr[j]; - if (has_val) - { - break; - } - } - beg = j; - for (j = end + 1; LIKELY(j >= beg); --j) - { - int has_val = cur_max_arr[j - 1] | next_max_arr[j]; - if (has_val) - { - break; - } - } - end = j + 1 <= qlen ? j + 1 : qlen; - - // swap m, h, e, f - SWAP_DATA_POINTER; - } - - // free(mem); - thread_mem_release(tmem, mem_size); - if (_qle) - *_qle = max_j + 1; - if (_tle) - *_tle = max_i + 1; - if (_gtle) - *_gtle = max_ie + 1; - if (_gscore) - *_gscore = gscore; - if (_max_off) - *_max_off = max_off; - return max; -} diff --git a/ksw_ext_avx2_u8_aligned.c b/ksw_ext_avx2_u8_aligned.c deleted file mode 100644 index 1aad002..0000000 --- a/ksw_ext_avx2_u8_aligned.c +++ /dev/null @@ -1,459 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "thread_mem.h" -#include "common.h" - -#ifdef __GNUC__ -#define LIKELY(x) __builtin_expect((x), 1) -#define UNLIKELY(x) __builtin_expect((x), 0) -#else -#define LIKELY(x) (x) -#define UNLIKELY(x) (x) -#endif - -#undef MAX -#undef MIN -#define MAX(x, y) ((x) > (y) ? (x) : (y)) -#define MIN(x, y) ((x) < (y) ? (x) : (y)) -#define SIMD_WIDTH 32 - -#define BASE_BYTES 1 -#define SCORE_BYTES 1 -#define BOUNDARY_SCORE_NUM 2 -#define TMP_SCORE_ARRAY_NUM 9 -#define MEM_ALIGN_BYTES 32 -#define ALIGN_SHIFT_BITS 5 -#define SIMD_BYTES 32 - -#define AMBIGUOUS_BASE_CODE 4 -#define AMBIGUOUS_BASE_SCORE -1 - -// 32字节对齐(256位) -#define align_mem(x) (((x) + 31) >> 5 << 5) -#define align_number(x) align_mem(x) - -static const uint8_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { - {0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; - -static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; - -// #define permute_mask _MM_SHUFFLE(0, 1, 2, 3) -#define permute_mask 27 -// 初始化变量 -#define SIMD_INIT \ - int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \ - __m256i zero_vec; \ - __m256i max_vec, last_max_vec = _mm256_set1_epi8(init_score); \ - __m256i oe_del_vec; \ - __m256i oe_ins_vec; \ - __m256i e_del_vec; \ - __m256i e_ins_vec; \ - __m256i h_vec_mask[SIMD_WIDTH]; \ - __m256i reverse_mask_vec; \ - zero_vec = _mm256_setzero_si256(); \ - oe_del_vec = _mm256_set1_epi8(oe_del); \ - oe_ins_vec = _mm256_set1_epi8(oe_ins); \ - e_del_vec = _mm256_set1_epi8(e_del); \ - e_ins_vec = _mm256_set1_epi8(e_ins); \ - __m256i match_sc_vec = _mm256_set1_epi8(base_match_score); \ - __m256i mis_sc_vec = _mm256_set1_epi8(base_mis_score); \ - __m256i amb_sc_vec = _mm256_set1_epi8(1); \ - __m256i amb_vec = _mm256_set1_epi8(4); \ - reverse_mask_vec = _mm256_loadu_si256((__m256i *)(reverse_mask)); \ - for (i = 0; i < SIMD_WIDTH; ++i) \ - h_vec_mask[i] = _mm256_loadu_si256((__m256i *)(&h_vec_int_mask[i])); - -/* - * e 表示当前ref的碱基被删除 - * f 表示当前seq的碱基插入 - * m 表示当前碱基匹配(可以相等,也可以不想等) - * h 表示最大值 - */ -// load向量化数据 -#define SIMD_LOAD \ - __m256i m1 = _mm256_loadu_si256((__m256i *)(&mA1[j])); \ - __m256i e1 = _mm256_loadu_si256((__m256i *)(&eA1[j])); \ - __m256i m1j1 = _mm256_loadu_si256((__m256i *)(&mA1[j - 1])); \ - __m256i f1j1 = _mm256_loadu_si256((__m256i *)(&fA1[j - 1])); \ - __m256i h0j1 = _mm256_loadu_si256((__m256i *)(&hA0[j - 1])); \ - __m256i qs_vec = _mm256_loadu_si256((__m256i *)(&read_seq[j])); \ - __m256i ts_vec = _mm256_loadu_si256((__m256i *)(&ref_seq[i])); - -// 比对ref和seq的序列,计算罚分 -#define SIMD_CMP_SEQ \ - ts_vec = _mm256_permute4x64_epi64(ts_vec, permute_mask); \ - ts_vec = _mm256_shuffle_epi8(ts_vec, reverse_mask_vec); \ - __m256i match_mask_vec = _mm256_cmpeq_epi8(qs_vec, ts_vec); \ - __m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); \ - __m256i match_score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec); \ - __m256i q_amb_mask_vec = _mm256_cmpeq_epi8(qs_vec, amb_vec); \ - __m256i t_amb_mask_vec = _mm256_cmpeq_epi8(ts_vec, amb_vec); \ - __m256i amb_mask_vec = _mm256_or_si256(q_amb_mask_vec, t_amb_mask_vec); \ - __m256i amb_score_vec = _mm256_and_si256(amb_mask_vec, amb_sc_vec); \ - mis_score_vec = _mm256_andnot_si256(amb_mask_vec, mis_score_vec); \ - mis_score_vec = _mm256_or_si256(amb_score_vec, mis_score_vec); \ - match_score_vec = _mm256_andnot_si256(amb_mask_vec, match_score_vec); - -// 向量化计算h, e, f, m -#define SIMD_COMPUTE \ - __m256i en_vec0 = _mm256_max_epu8(m1, oe_del_vec); \ - en_vec0 = _mm256_subs_epu8(en_vec0, oe_del_vec); \ - __m256i en_vec1 = _mm256_max_epu8(e1, e_del_vec); \ - en_vec1 = _mm256_subs_epu8(en_vec1, e_del_vec); \ - __m256i en_vec = _mm256_max_epu8(en_vec0, en_vec1); \ - __m256i fn_vec0 = _mm256_max_epu8(m1j1, oe_ins_vec); \ - fn_vec0 = _mm256_subs_epu8(fn_vec0, oe_ins_vec); \ - __m256i fn_vec1 = _mm256_max_epu8(f1j1, e_ins_vec); \ - fn_vec1 = _mm256_subs_epu8(fn_vec1, e_ins_vec); \ - __m256i fn_vec = _mm256_max_epu8(fn_vec0, fn_vec1); \ - __m256i mn_vec0 = _mm256_adds_epu8(h0j1, match_score_vec); \ - mn_vec0 = _mm256_max_epu8(mn_vec0, mis_score_vec); \ - mn_vec0 = _mm256_subs_epu8(mn_vec0, mis_score_vec); \ - __m256i mn_mask = _mm256_cmpeq_epi8(h0j1, zero_vec); \ - __m256i mn_vec = _mm256_andnot_si256(mn_mask, mn_vec0); \ - __m256i hn_vec0 = _mm256_max_epu8(en_vec, fn_vec); \ - __m256i hn_vec = _mm256_max_epu8(hn_vec0, mn_vec); - -// 存储向量化结果 -#define SIMD_STORE \ - max_vec = _mm256_max_epu8(max_vec, hn_vec); \ - _mm256_storeu_si256((__m256i *)&eA2[j], en_vec); \ - _mm256_storeu_si256((__m256i *)&fA2[j], fn_vec); \ - _mm256_storeu_si256((__m256i *)&mA2[j], mn_vec); \ - _mm256_storeu_si256((__m256i *)&hA2[j], hn_vec); - -// 去除多余的部分 -#define SIMD_REMOVE_EXTRA \ - en_vec = _mm256_and_si256(en_vec, h_vec_mask[end - j]); \ - fn_vec = _mm256_and_si256(fn_vec, h_vec_mask[end - j]); \ - mn_vec = _mm256_and_si256(mn_vec, h_vec_mask[end - j]); \ - hn_vec = _mm256_and_si256(hn_vec, h_vec_mask[end - j]); - -// cmp_max = _mm256_xor_si256(last_max_vec, cmp_max); -// last_max_vec = _mm256_set1_epi8(m); -// 找最大值和位置 -#define SIMD_FIND_MAX \ - __m256i cmp_max = _mm256_max_epu8(max_vec, last_max_vec); \ - cmp_max = _mm256_xor_si256(cmp_max, last_max_vec); \ - cmp_max = _mm256_cmpeq_epi8(cmp_max, zero_vec); \ - uint32_t cmp_result = _mm256_movemask_epi8(cmp_max); \ - if (cmp_result != 4294967295) \ - { \ - uint8_t *maxVal = (uint8_t *)&max_vec; \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 1)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 3)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 5)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 7)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \ - m = maxVal[0]; \ - for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \ - { \ - __m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \ - __m256i vcmp = _mm256_cmpeq_epi8(h2_vec, max_vec); \ - uint32_t mask = _mm256_movemask_epi8(vcmp); \ - if (mask > 0) \ - { \ - int pos = SIMD_WIDTH - 1 - __builtin_clz(mask); \ - mj = j - 1 + pos; \ - mi = i - 1 - pos; \ - for (; mj + 1 < qlen && mi + 1 < tlen; mj++, mi++) \ - { \ - if (read_seq[mj + 2] == ref_seq[mi + 1 + SIMD_WIDTH]) \ - { \ - m += base_match_score; \ - } \ - else \ - { \ - break; \ - } \ - } \ - } \ - } \ - last_max_vec = _mm256_set1_epi8(m); \ - } - -// 每轮迭代后,交换数组 -#define SWAP_DATA_POINTER \ - uint8_t *tmp = hA0; \ - hA0 = hA1; \ - hA1 = hA2; \ - hA2 = tmp; \ - tmp = eA1; \ - eA1 = eA2; \ - eA2 = tmp; \ - tmp = fA1; \ - fA1 = fA2; \ - fA2 = tmp; \ - tmp = mA1; \ - mA1 = mA2; \ - mA2 = tmp; - -int ksw_extend_avx2_u8_aligned(thread_mem_t *tmem, - int qlen, // query length 待匹配段碱基的query长度 - const uint8_t *query, // read碱基序列 - int tlen, // target length reference的长度 - const uint8_t *target, // reference序列 - int extend_left, // 是不是向左扩展 - int o_del, // deletion 错配开始的惩罚系数 - int e_del, // deletion extension的惩罚系数 - int o_ins, // insertion 错配开始的惩罚系数 - int e_ins, // insertion extension的惩罚系数SIMD_BTYES - int base_match_score, // 碱基match时的分数 - int base_mis_score, // 碱基mismatch时的惩罚分数(正数) - int window_size, // 提前剪枝系数,w =100 匹配位置和beg的最大距离 - int end_bonus, // 如果query比对到了最后一个字符,额外奖励分值 - int init_score, // 该seed的初始得分(完全匹配query的碱基数) - int *_qle, // 匹配得到全局最大得分的碱基在query的位置 - int *_tle, // 匹配得到全局最大得分的碱基在reference的位置 - int *_gtle, // query全部匹配上的target的长度 - int *_gscore, // query的端到端匹配得分 - int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值 -{ - uint8_t *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H,其他的保存上个H E F M - uint8_t *read_seq, *ref_seq; - int i, ibeg, iend, D, j, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; - int span, beg1, end1; // 边界条件计算 - - uint8_t *mem_addr; - int read_size = align_number(qlen * BASE_BYTES + MEM_ALIGN_BYTES); - int ref_size = align_number((tlen + SIMD_WIDTH) * BASE_BYTES); - int back_diagnal_num = tlen + qlen; // 循环跳出条件 D从1开始遍历 - int score_array_size = align_number((qlen + BOUNDARY_SCORE_NUM) * SCORE_BYTES); - int score_element_num = score_array_size / SCORE_BYTES; - int score_mem_size = score_array_size * TMP_SCORE_ARRAY_NUM; - int request_mem_size = read_size + ref_size + score_mem_size + MEM_ALIGN_BYTES * 3; - - SIMD_INIT; // 初始化simd用的数据 - - assert(init_score > 0); - - mem_addr = thread_mem_request(tmem, request_mem_size); - mem_addr = (void *)align_mem((uint64_t)mem_addr); - ref_seq = (uint8_t *)&mem_addr[0]; - read_seq = (uint8_t *)(mem_addr + ref_size + SIMD_BYTES - BASE_BYTES); - - if (extend_left) - { - for (i = 0; i < qlen; ++i) - read_seq[i + 1] = query[qlen - 1 - i]; - for (i = 0; i < tlen; ++i) - ref_seq[i + SIMD_WIDTH - 1] = target[tlen - 1 - i]; - } - else - { - for (i = 0; i < qlen; ++i) - read_seq[i + 1] = query[i]; - for (i = 0; i < tlen; ++i) - ref_seq[i + SIMD_WIDTH - 1] = target[i]; - } - - mem_addr += read_size + ref_size; - for (i = 0; i <= score_mem_size; i += SIMD_BYTES) - { - _mm256_storeu_si256((__m256i *)&mem_addr[i], zero_vec); - } - mem_addr += SIMD_BYTES - SCORE_BYTES; - - hA0 = (uint8_t *)&mem_addr[0]; - hA1 = &hA0[score_element_num]; - hA2 = &hA1[score_element_num]; - mA1 = &hA2[score_element_num]; - mA2 = &mA1[score_element_num]; - eA1 = &mA2[score_element_num]; - eA2 = &eA1[score_element_num]; - fA1 = &eA2[score_element_num]; - fA2 = &fA1[score_element_num]; - - // adjust $window_size if it is too large - - max = base_match_score; - max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); - max_ins = max_ins > 1 ? max_ins : 1; - window_size = window_size < max_ins ? window_size : max_ins; - max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.); - max_del = max_del > 1 ? max_del : 1; - window_size = window_size < max_del ? window_size : max_del; // TODO: is this necessary? - if (tlen < qlen) - window_size = MIN(tlen - 1, window_size); - - // DP loop - max = init_score, max_i = max_j = -1; - max_ie = -1, gscore = -1; - - max_off = 0; - beg = 1; - end = qlen; - // init init_score - hA0[0] = init_score; // 左上角 - fA1[1] = MAX(0, init_score - (o_ins + e_ins)); - eA2[0] = init_score; - hA1[1] = fA1[1]; - - if (qlen == 0 || tlen == 0) - back_diagnal_num = 0; // 防止意外情况 - if (window_size >= qlen) - { - max_ie = 0; - gscore = 0; - } - - for (D = 1; LIKELY(D < back_diagnal_num); ++D) - { - // 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况 - if (D < tlen) - beg1 = 1; - else - beg1 = D - tlen + 1; - if (D < qlen) - end1 = D; // 闭区间 - else - end1 = qlen; - beg1 = MAX(D - window_size, beg1); - end1 = MIN(D + window_size, end1); - - beg = MAX(beg1, beg); - end = MIN(end1, end); - if (beg > end) - break; - - // beg = beg1; - // end = end1; - - iend = D - beg; // ref开始计算的位置,倒序 - span = end - beg; - ibeg = iend - span; // 0开始的ref索引位置 - - // 每一轮需要记录的数据 - int m = 0, mj = -1, mi = -1; - max_vec = zero_vec; - - // 处理左边界 - if (beg == 1) - { - hA0[0] = eA2[0]; - mA1[0] = 0; - eA1[0] = MAX(0, init_score - (o_del + e_del * (iend + 1))); - } - - for (j = beg, i = iend; j <= end + 1 - SIMD_WIDTH; j += SIMD_WIDTH, i -= SIMD_WIDTH) - { - // 取数据 - SIMD_LOAD; - // 比对seq,计算罚分 - SIMD_CMP_SEQ; - // 计算 - SIMD_COMPUTE; - // 存储结果 - SIMD_STORE; - } - // 剩下的计算单元 - if (j <= end) - { - // 取数据 - SIMD_LOAD; - // 比对seq,计算罚分 - SIMD_CMP_SEQ; - // 计算 - SIMD_COMPUTE; - // 去除多余计算的部分 - SIMD_REMOVE_EXTRA; - // 存储结果 - SIMD_STORE; - } - - // 处理上边界 - if (ibeg == 0) - { - fA2[end + 1] = MAX(0, init_score - (o_ins + e_ins * (end + 1))); - hA2[end + 1] = fA2[end + 1]; - mA2[end + 1] = 0; - } - SIMD_FIND_MAX; - - // 注意最后跳出循环j的值 - j = end + 1; - - if (j == qlen + 1) - { - max_ie = gscore > hA2[qlen] ? max_ie : ibeg; - gscore = gscore > hA2[qlen] ? gscore : hA2[qlen]; - } - - if (m > max) - { - max = m, max_i = mi, max_j = mj; - max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi); - } - - // 调整计算的边界 - for (j = beg; LIKELY(j <= end); ++j) - { - int has_val = hA1[j - 1] | hA2[j]; - if (has_val) - break; - } - beg = j; - for (j = end + 1; LIKELY(j >= beg); --j) - { - int has_val = hA1[j - 1] | hA2[j]; - if (has_val) - break; - } - end = j + 1 <= qlen ? j + 1 : qlen; - - // swap m, h, e, f - SWAP_DATA_POINTER; - } - - thread_mem_release(tmem, request_mem_size); - if (_qle) - *_qle = max_j + 1; - if (_tle) - *_tle = max_i + 1; - if (_gtle) - *_gtle = max_ie + 1; - if (_gscore) - *_gscore = gscore; - if (_max_off) - *_max_off = max_off; - return max; -} diff --git a/ksw_ext_cuda.c b/ksw_ext_cuda.c deleted file mode 100644 index 6ec9cdb..0000000 --- a/ksw_ext_cuda.c +++ /dev/null @@ -1 +0,0 @@ -#include "common.h" \ No newline at end of file diff --git a/ksw_ext_normal.c b/ksw_ext_normal.c deleted file mode 100644 index c0facc2..0000000 --- a/ksw_ext_normal.c +++ /dev/null @@ -1,190 +0,0 @@ -#include -#include -#include -#include -#include "thread_mem.h" -#include "common.h" - -#ifdef __GNUC__ -#define LIKELY(x) __builtin_expect((x), 1) -#define UNLIKELY(x) __builtin_expect((x), 0) -#else -#define LIKELY(x) (x) -#define UNLIKELY(x) (x) -#endif - -typedef struct -{ - int32_t h, e; -} eh_t; - -int ksw_extend_normal(thread_mem_t *tmem, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) -{ - // return h0; - eh_t *eh; // score array - int8_t *qp; // query profile - int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; - assert(h0 > 0); - // qp = malloc(qlen * m); - // eh = calloc(qlen + 1, 8); - qp = thread_mem_request(tmem, qlen * m); - eh = thread_mem_request_and_clean(tmem, (qlen + 1) * 8); - // generate the query profile - for (k = i = 0; k < m; ++k) - { - const int8_t *p = &mat[k * m]; - for (j = 0; j < qlen; ++j) - qp[i++] = p[query[j]]; // 对于qp数组,第0到qlen个元素,表示和A比对的分值,第qlen到2*qlen,表示和C比对的分值,以此类推 - } - // fill the first row - // 初始化第一行分值 - eh[0].h = h0; - eh[1].h = h0 > oe_ins ? h0 - oe_ins : 0; - for (j = 2; j <= qlen && eh[j - 1].h > e_ins; ++j) - eh[j].h = eh[j - 1].h - e_ins; - // adjust $w if it is too large - k = m * m; // 字符矩阵 - for (i = 0, max = 0; i < k; ++i) // get the max score - max = max > mat[i] ? max : mat[i]; - max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); // 最大可插入的长度? - max_ins = max_ins > 1 ? max_ins : 1; - w = w < max_ins ? w : max_ins; - max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.); - max_del = max_del > 1 ? max_del : 1; - w = w < max_del ? w : max_del; // TODO: is this necessary? 上述几行代码都是为了看看能否缩小窗口,减小计算 - // DP loop - max = h0, max_i = max_j = -1; - max_ie = -1, gscore = -1; - max_off = 0; - beg = 0, end = qlen; -#ifdef DEBUG_OUT - fprintf(score_f_arr[0], "%-4d", h0); - fprintf(ins_ext_f_arr[0], "%-4d", h0); - fprintf(del_ext_f_arr[0], "%-4d", h0); - for (j = 0; LIKELY(j < end); ++j) // 遍历query字符序列 - { - fprintf(score_f_arr[0], "%-4d", MAX(h0 - o_ins - (j + 1) * e_ins, 0)); - fprintf(ins_ext_f_arr[0], "%-4d", MAX(h0 - o_ins - (j + 1) * e_ins, 0)); - fprintf(del_ext_f_arr[0], "%-4d", 0); - } - fprintf(score_f_arr[0], "\n"); - fprintf(ins_ext_f_arr[0], "\n"); - fprintf(del_ext_f_arr[0], "\n"); -#endif - for (i = 0; LIKELY(i < tlen); ++i) // 对target逐个字符进行遍历 - { - int t, f = 0, h1, m = 0, mj = -1; - // 对于target第i个字符,query中每个字符的分值,只有匹配和不匹配 - int8_t *q = &qp[target[i] * qlen]; - // apply the band and the constraint (if provided) - if (beg < i - w) // 检查开始点是否可以缩小一些 - beg = i - w; - if (end > i + w + 1) // 检查终点是否可以缩小,使得整体的遍历范围缩小 - end = i + w + 1; - if (end > qlen) // 终点不超过query长度 - end = qlen; - beg = 0; - end = qlen; - // compute the first column - if (beg == 0) - { - h1 = h0 - (o_del + e_del * (i + 1)); // 只消耗了target序列,query从第一个字符开始匹配第i个target字符 - if (h1 < 0) - h1 = 0; - } - else - h1 = 0; -#ifdef DEBUG_OUT - fprintf(ins_ext_f_arr[0], "%-4d", 0); - fprintf(del_ext_f_arr[0], "%-4d", MAX(h0 - o_del - (i + 1) * e_del, 0)); -#endif - for (j = beg; LIKELY(j < end); ++j) // 遍历query字符序列 - { -#ifdef DEBUG_OUT - fprintf(score_f_arr[0], "%-4d", h1); - fprintf(ins_ext_f_arr[0], "%-4d", f); - fprintf(del_ext_f_arr[0], "%-4d", eh[j].e); -#endif - // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) - // Similar to SSE2-SW, cells are computed in the following order: - // H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} - // E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape // E表示delete,只消耗target - // F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape // F表示insert,只消耗query,target的row id固定,query的col index一直增加 - eh_t *p = &eh[j]; - int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) // 获取上一轮h值和e值 - p->h = h1; // set H(i,j-1) for the next row // h1是上一轮计算的结果 - M = M ? M + q[j] : 0; // separating H and M to disallow a cigar like "100M3I3D20M" // M大于0则当前两个字符进行匹配,无论是否相等,将分值加到M上,此时M可能变为负数 - h = M > e ? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0 // e和f保证是非负的,所以h肯定非负,即使M可能是负数,因为h取e,f和M的最大值 - h = h > f ? h : f; - h1 = h; // save H(i,j) to h1 for the next column // 用h1来保存当前表格(i,j)对应的分值,用来下次计算 - mj = m > h ? mj : j; // record the position where max score is achieved // 记录取得最大值时query的字符位置 - m = m > h ? m : h; // m is stored at eh[mj+1] 因为eh[mj+1]->h表示的是H(i, mj),及上一轮记录的h - t = M - oe_del; // 用来计算delete,假设当前字符(i,j)匹配(无论match还是mismatch),target下一个字符串被空消耗(delete)的分值F(i+1, j) - t = t > 0 ? t : 0; - e -= e_del; // 假设当前query字符 - e = e > t ? e : t; // computed E(i+1,j) // t表示(i,j)强行匹配,(i+1, j)是delete的分数,此前e表示(i+1,j)继续delete的分数 - p->e = e; // save E(i+1,j) for the next row - t = M - oe_ins; - t = t > 0 ? t : 0; - f -= e_ins; - f = f > t ? f : t; // computed F(i,j+1) - } -#ifdef DEBUG_OUT - fprintf(score_f_arr[0], "%-4d", h1); - fprintf(score_f_arr[0], "\n"); - fprintf(ins_ext_f_arr[0], "\n"); - fprintf(del_ext_f_arr[0], "\n"); -#endif - eh[end].h = h1; // end是query序列之外的位置 - eh[end].e = 0; - if (j == qlen) // 此轮遍历到了query的最后一个字符 - { - max_ie = gscore > h1 ? max_ie : i; // max_ie表示取得全局最大分值时,target字符串的位置 - gscore = gscore > h1 ? gscore : h1; - } - // if (m == 0) // 遍历完query之后,当前轮次的最大分值为0,则跳出循环 - // break; - if (m > max) // 当前轮最大分值大于之前的最大分值 - { - max = m, max_i = i, max_j = mj; // 更新取得最大值的target和query的位置 - max_off = max_off > abs(mj - i) ? max_off : abs(mj - i); // 取得最大分值时,query和target对应字符串坐标的差值 - } - else if (0) //(zdrop > 0) // 当前轮匹配之后取得的最大分值没有大于之前的最大值,而且zdrop值大于0 - { - if (i - max_i > mj - max_j) - { - if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) // 之前最大分值 -(从取得最大值的点出发,当前的delete总长度对应的分值 + 当前轮取得的最大值) > zdrop - break; - } - else - { - if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) // 同上,不过这次是insert,可能是说明有很多mismatch - break; - } - } - // update beg and end for the next round - // for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j) - // ; - // beg = j; - // for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j) - // ; - // end = j + 2 < qlen ? j + 2 : qlen; // 剪枝没考虑f,即insert - // beg = 0, end = qlen; // uncomment this line for debugging - // fprintf(stderr, "\n"); - // fprintf(stderr, "%d\n", end); - } - // free(eh); - // free(qp); - thread_mem_release(tmem, qlen * m + (qlen + 1) * 8); - if (_qle) - *_qle = max_j + 1; - if (_tle) - *_tle = max_i + 1; - if (_gtle) - *_gtle = max_ie + 1; - if (_gscore) - *_gscore = gscore; - if (_max_off) - *_max_off = max_off; - return max; -} diff --git a/main.c b/main.c index 866ae44..e7f1fa7 100644 --- a/main.c +++ b/main.c @@ -14,7 +14,7 @@ #include #include "sys/time.h" #include "thread_mem.h" -#include "ksw_ext.h" +#include "bsw.h" #include "utils.h" #include "common.h" @@ -38,47 +38,8 @@ int64_t get_mseconds() int64_t time_sw[KERNEL_NUM] = {0}; #endif -#ifdef DEBUG_RETURN_VALUE -#define OUTPUT_RETVAL_1(kernel_num) \ - fprintf(retval_f_arr[kernel_num], "%d\t%d\t%d\t%d\t", score[kernel_num], info_arr[i][0], info_arr[i][1], info_arr[i][2]); \ - for (j = cur_query_pos - info_arr[i][0]; j < cur_query_pos; ++j) \ - { \ - fprintf(retval_f_arr[kernel_num], "%c", t_2bit2char[(uint8_t)query_arr[j]]); \ - } \ - fprintf(retval_f_arr[kernel_num], "\t"); \ - for (j = cur_target_pos - info_arr[i][1]; j < cur_target_pos; ++j) \ - { \ - fprintf(retval_f_arr[kernel_num], "%c", t_2bit2char[(uint8_t)target_arr[j]]); \ - } \ - fprintf(retval_f_arr[kernel_num], "\n"); -#define OUTPUT_RETVAL_SCORE \ - fprintf(retval_f_arr[kernel_num], "%d\t%d\t%d\t%d\t%d\t%d\n", score[kernel_num], qle, tle, gtle, gscore, max_off[0]) -// fprintf(retval_f_arr[kernel_num], " %d %d\n", cur_query_pos, info_arr[i][0]); -// fprintf(retval_f_arr[kernel_num], "%d\t%d\t%d\t%d\n", score[kernel_num], info_arr[i][0], info_arr[i][1], info_arr[i][2]) - -#define OUTPUT_RETVAL_target(kernel_num) \ - for (j = cur_target_pos - info_arr[i][1]; j < cur_target_pos; ++j) \ - { \ - fprintf(retval_f_arr[kernel_num], "%c", t_2bit2char[(uint8_t)target_arr[j]]); \ - } \ - fprintf(retval_f_arr[kernel_num], "\n"); - -#define OUTPUT_RETVAL_query(kernel_num) \ - for (j = cur_query_pos - info_arr[i][0]; j < cur_query_pos; ++j) \ - { \ - fprintf(retval_f_arr[kernel_num], "%c", t_2bit2char[(uint8_t)query_arr[j]]); \ - } \ - fprintf(retval_f_arr[kernel_num], "\n"); - -#define OUTPUT_RETVAL_INFO(kernel_num) \ - fprintf(retval_f_arr[kernel_num], "%-8d%-8d%-8d\n", info_arr[i][0], info_arr[i][1], info_arr[i][2]); -#define OUTPUT_RETVAL(kernel_num) OUTPUT_RETVAL_target(kernel_num) -#else -#define OUTPUT_RETVAL(kernel_num) -#endif - -#define _PERFORMANCE_TEST_NORMAL(kernel_num, func) \ +#define _PERFORMANCE_TEST(kernel_num, func) \ cur_query_pos = 0; \ cur_target_pos = 0; \ for (i = 0; i < block_line_num; ++i) \ @@ -95,52 +56,16 @@ int64_t time_sw[KERNEL_NUM] = {0}; score_total[kernel_num] += score[kernel_num]; \ cur_query_pos += info_arr[i][0]; \ cur_target_pos += info_arr[i][1]; \ - OUTPUT_RETVAL(0); \ } -#define _PERFORMANCE_TEST_AVX2(kernel_num, func) \ - cur_query_pos = 0; \ - cur_target_pos = 0; \ - for (i = 0; i < block_line_num; ++i) \ - { \ - score[kernel_num] = func( \ - &tmem[kernel_num], \ - info_arr[i][0], \ - (uint8_t *)query_arr + cur_query_pos, \ - info_arr[i][1], \ - (uint8_t *)target_arr + cur_target_pos, \ - 0, 6, 1, 6, 1, \ - 1, 4, \ - 100, 5, \ - info_arr[i][2], \ - &qle, &tle, >le, &gscore, &max_off[0]); \ - score_total[kernel_num] += score[kernel_num]; \ - cur_query_pos += info_arr[i][0]; \ - cur_target_pos += info_arr[i][1]; \ - OUTPUT_RETVAL(kernel_num); \ - } -#define _PERFORMANCE_TEST_AVX2_T1(kernel_num, func) \ - cur_query_pos = 0; \ - cur_target_pos = 0; \ - for (i = 0; i < block_line_num; ++i) \ - { \ - cur_query_pos += info_arr[i][0]; \ - cur_target_pos += info_arr[i][1]; \ - OUTPUT_RETVAL(kernel_num); \ - } + #ifdef SHOW_PERF -#define PERFORMANCE_TEST_NORMAL(kernel_num, func) \ - start_time = get_mseconds(); \ - _PERFORMANCE_TEST_NORMAL(kernel_num, func); \ - time_sw[kernel_num] += get_mseconds() - start_time - -#define PERFORMANCE_TEST_AVX2(kernel_num, func) \ - start_time = get_mseconds(); \ - _PERFORMANCE_TEST_AVX2_T1(kernel_num, func); \ +#define PERFORMANCE_TEST(kernel_num, func) \ + start_time = get_mseconds(); \ + _PERFORMANCE_TEST(kernel_num, func); \ time_sw[kernel_num] += get_mseconds() - start_time #else -#define PERFORMANCE_TEST_NORMAL(kernel_num, func) _PERFORMANCE_TEST_NORMAL(kernel_num, func) -#define PERFORMANCE_TEST_AVX2(kernel_num, func) _PERFORMANCE_TEST_AVX2(kernel_num, func) +#define PERFORMANCE_TEST(kernel_num, func) _PERFORMANCE_TEST(kernel_num, func) #endif // 读取一行序列数据 @@ -245,6 +170,7 @@ int main(int argc, char *argv[]) j += 3; } + int64_t all_qlen = 0; while (!feof(target_f)) { block_line_num = 0; // 记录每次读取的行数 @@ -284,6 +210,7 @@ int main(int argc, char *argv[]) } // 读info + cur_read_size = 0; for (i = 0; i < block_line_num; ++i) { @@ -295,54 +222,52 @@ int main(int argc, char *argv[]) cur_read_size += line_size; // fprintf(stderr, "%-8d%-8d%-8d\n", info_arr[i][0], info_arr[i][1], info_arr[i][2]); // fprintf(stderr, "%s\n", read_buf); + all_qlen += info_arr[i][0]; } +#ifdef DEBUG_RETURN_VALUE cur_read_size = 0; for (i = 0; i < block_line_num; ++i) { + fprintf(stdout, "%d\n", i); for (j = cur_read_size; j < cur_read_size + info_arr[i][1]; ++j) { + fprintf(stdout, "j: %d\n", j); fprintf(retval_f_arr[0], "%c", t_2bit2char[(uint8_t)target_arr[j]]); } fprintf(retval_f_arr[0], "\n"); cur_read_size += info_arr[i][1]; } +#endif // for (i = 0; i < block_line_num; ++i) //{ // fprintf(retval_f_arr[0], "%d\n", info_arr[i][1]); //} // 性能测试 - +#if 1 // normal sw - // PERFORMANCE_TEST_NORMAL(0, ksw_extend_normal); + PERFORMANCE_TEST(0, normal); + + // normal pruning + PERFORMANCE_TEST(1, normal_pruning); // avx2 - // PERFORMANCE_TEST_AVX2(1, ksw_extend_avx2); - // avx2 heuristics - // PERFORMANCE_TEST_AVX2(2, ksw_extend_avx2_heuristics); - // avx2 mem aligned - // PERFORMANCE_TEST_AVX2(3, ksw_extend_avx2_aligned); - - // avx2 u8 - // PERFORMANCE_TEST_AVX2(4, ksw_extend_avx2_u8); - // avx2 u8 heuristics - // PERFORMANCE_TEST_AVX2(5, ksw_extend_avx2_u8_heuristics); - // avx2 u8 mem aligned - // PERFORMANCE_TEST_AVX2(6, ksw_extend_avx2_u8_aligned); + PERFORMANCE_TEST(2, avx2_u8); + // avx2 pruning + PERFORMANCE_TEST(3, avx2_u8_pruning); +#endif } fprintf(stderr, "%ld\n", total_line_num); + fprintf(stderr, "all_qlen: %ld\n", all_qlen); #ifdef SHOW_PERF - char *kernel_names[7] = { + char *kernel_names[4] = { "normal", - "avx2", - "avx2_heuristics", - "avx2_aligned", + "normal_pruning", "avx2_u8", - "avx2_u8_heuristics", - "avx2_u8_aligned"}; + "avx2_u8_pruning"}; for (i = 0; i < KERNEL_NUM; ++i) { diff --git a/normal.c b/normal.c new file mode 100644 index 0000000..f548089 --- /dev/null +++ b/normal.c @@ -0,0 +1,162 @@ +#include +#include +#include +#include +#include "thread_mem.h" +#include "common.h" + +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect((x), 1) +#define UNLIKELY(x) __builtin_expect((x), 0) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif + +typedef struct +{ + int32_t h, e; +} eh_t; + +int normal(thread_mem_t *tmem, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) +{ + // return h0; + eh_t *eh; // score array + int8_t *qp; // query profile + int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; + assert(h0 > 0); + qp = malloc(qlen * m); + eh = calloc(qlen + 1, 8); + //qp = thread_mem_request(tmem, qlen * m); + //eh = thread_mem_request_and_clean(tmem, (qlen + 1) * 8); + // generate the query profile + //fprintf(stdout, "%d\n", h0); + + for (k = i = 0; k < m; ++k) + { + const int8_t *p = &mat[k * m]; + for (j = 0; j < qlen; ++j) + qp[i++] = p[query[j]]; // 对于qp数组,第0到qlen个元素,表示和A比对的分值,第qlen到2*qlen,表示和C比对的分值,以此类推 + } + // fill the first row + // 初始化第一行分值 + eh[0].h = h0; + eh[1].h = h0 > oe_ins ? h0 - oe_ins : 0; + for (j = 2; j <= qlen && eh[j - 1].h > e_ins; ++j) + eh[j].h = eh[j - 1].h - e_ins; + // adjust $w if it is too large + k = m * m; // 字符矩阵 + for (i = 0, max = 0; i < k; ++i) // get the max score + max = max > mat[i] ? max : mat[i]; + max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); // 最大可插入的长度? + max_ins = max_ins > 1 ? max_ins : 1; + w = w < max_ins ? w : max_ins; + max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.); + max_del = max_del > 1 ? max_del : 1; + w = w < max_del ? w : max_del; // TODO: is this necessary? 上述几行代码都是为了看看能否缩小窗口,减小计算 + // DP loop + max = h0, max_i = max_j = -1; + max_ie = -1, gscore = -1; + max_off = 0; + beg = 0, end = qlen; + for (i = 0; LIKELY(i < tlen); ++i) // 对target逐个字符进行遍历 + { + int t, f = 0, h1, m = 0, mj = -1; + // 对于target第i个字符,query中每个字符的分值,只有匹配和不匹配 + int8_t *q = &qp[target[i] * qlen]; + // apply the band and the constraint (if provided) + if (beg < i - w) // 检查开始点是否可以缩小一些 + beg = i - w; + if (end > i + w + 1) // 检查终点是否可以缩小,使得整体的遍历范围缩小 + end = i + w + 1; + if (end > qlen) // 终点不超过query长度 + end = qlen; + // compute the first column + if (beg == 0) + { + h1 = h0 - (o_del + e_del * (i + 1)); // 只消耗了target序列,query从第一个字符开始匹配第i个target字符 + if (h1 < 0) + h1 = 0; + } + else + h1 = 0; + + for (j = beg; LIKELY(j < end); ++j) // 遍历query字符序列 + { + // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) + // Similar to SSE2-SW, cells are computed in the following order: + // H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} + // E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape // E表示delete,只消耗target + // F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape // F表示insert,只消耗query,target的row id固定,query的col index一直增加 + eh_t *p = &eh[j]; + int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) // 获取上一轮h值和e值 + p->h = h1; // set H(i,j-1) for the next row // h1是上一轮计算的结果 + M = M ? M + q[j] : 0; // separating H and M to disallow a cigar like "100M3I3D20M" // M大于0则当前两个字符进行匹配,无论是否相等,将分值加到M上,此时M可能变为负数 + h = M > e ? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0 // e和f保证是非负的,所以h肯定非负,即使M可能是负数,因为h取e,f和M的最大值 + h = h > f ? h : f; + h1 = h; // save H(i,j) to h1 for the next column // 用h1来保存当前表格(i,j)对应的分值,用来下次计算 + mj = m > h ? mj : j; // record the position where max score is achieved // 记录取得最大值时query的字符位置 + m = m > h ? m : h; // m is stored at eh[mj+1] 因为eh[mj+1]->h表示的是H(i, mj),及上一轮记录的h + t = M - oe_del; // 用来计算delete,假设当前字符(i,j)匹配(无论match还是mismatch),target下一个字符串被空消耗(delete)的分值F(i+1, j) + t = t > 0 ? t : 0; + e -= e_del; // 假设当前query字符 + e = e > t ? e : t; // computed E(i+1,j) // t表示(i,j)强行匹配,(i+1, j)是delete的分数,此前e表示(i+1,j)继续delete的分数 + p->e = e; // save E(i+1,j) for the next row + t = M - oe_ins; + t = t > 0 ? t : 0; + f -= e_ins; + f = f > t ? f : t; // computed F(i,j+1) + } + eh[end].h = h1; // end是query序列之外的位置 + eh[end].e = 0; + if (j == qlen) // 此轮遍历到了query的最后一个字符 + { + max_ie = gscore > h1 ? max_ie : i; // max_ie表示取得全局最大分值时,target字符串的位置 + gscore = gscore > h1 ? gscore : h1; + } + //if (m == 0) // 遍历完query之后,当前轮次的最大分值为0,则跳出循环 + // break; + if (m > max) // 当前轮最大分值大于之前的最大分值 + { + max = m, max_i = i, max_j = mj; // 更新取得最大值的target和query的位置 + max_off = max_off > abs(mj - i) ? max_off : abs(mj - i); // 取得最大分值时,query和target对应字符串坐标的差值 + } + else if (zdrop > 0) // 当前轮匹配之后取得的最大分值没有大于之前的最大值,而且zdrop值大于0 + { +#if 0 + if (i - max_i > mj - max_j) + { + if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) // 之前最大分值 -(从取得最大值的点出发,当前的delete总长度对应的分值 + 当前轮取得的最大值) > zdrop + break; + } + else + { + if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) // 同上,不过这次是insert,可能是说明有很多mismatch + break; + } +#endif + } + // update beg and end for the next round + //for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j) + // ; + //beg = j; + //for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j) + // ; + //end = j + 2 < qlen ? j + 2 : qlen; // 剪枝没考虑f,即insert + beg = 0, end = qlen; // uncomment this line for debugging + } + free(eh); + free(qp); + // thread_mem_release(tmem, qlen * m + (qlen + 1) * 8); + if (_qle) + *_qle = max_j + 1; + if (_tle) + *_tle = max_i + 1; + if (_gtle) + *_gtle = max_ie + 1; + if (_gscore) + *_gscore = gscore; + if (_max_off) + *_max_off = max_off; + return max; +} diff --git a/normal_pruning.c b/normal_pruning.c new file mode 100644 index 0000000..d57b59a --- /dev/null +++ b/normal_pruning.c @@ -0,0 +1,160 @@ +#include +#include +#include +#include +#include "thread_mem.h" +#include "common.h" + +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect((x), 1) +#define UNLIKELY(x) __builtin_expect((x), 0) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif + +typedef struct +{ + int32_t h, e; +} eh_t; + +int normal_pruning(thread_mem_t *tmem, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) +{ + // return h0; + eh_t *eh; // score array + int8_t *qp; // query profile + int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; + assert(h0 > 0); + // qp = malloc(qlen * m); + // eh = calloc(qlen + 1, 8); + qp = thread_mem_request(tmem, qlen * m); + eh = thread_mem_request_and_clean(tmem, (qlen + 1) * 8); + // generate the query profile + for (k = i = 0; k < m; ++k) + { + const int8_t *p = &mat[k * m]; + for (j = 0; j < qlen; ++j) + qp[i++] = p[query[j]]; // 对于qp数组,第0到qlen个元素,表示和A比对的分值,第qlen到2*qlen,表示和C比对的分值,以此类推 + } + // fill the first row + // 初始化第一行分值 + eh[0].h = h0; + eh[1].h = h0 > oe_ins ? h0 - oe_ins : 0; + for (j = 2; j <= qlen && eh[j - 1].h > e_ins; ++j) + eh[j].h = eh[j - 1].h - e_ins; + // adjust $w if it is too large + k = m * m; // 字符矩阵 + for (i = 0, max = 0; i < k; ++i) // get the max score + max = max > mat[i] ? max : mat[i]; + max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); // 最大可插入的长度? + max_ins = max_ins > 1 ? max_ins : 1; + w = w < max_ins ? w : max_ins; + max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.); + max_del = max_del > 1 ? max_del : 1; + w = w < max_del ? w : max_del; // TODO: is this necessary? 上述几行代码都是为了看看能否缩小窗口,减小计算 + // DP loop + max = h0, max_i = max_j = -1; + max_ie = -1, gscore = -1; + max_off = 0; + beg = 0, end = qlen; + for (i = 0; LIKELY(i < tlen); ++i) // 对target逐个字符进行遍历 + { + int t, f = 0, h1, m = 0, mj = -1; + // 对于target第i个字符,query中每个字符的分值,只有匹配和不匹配 + int8_t *q = &qp[target[i] * qlen]; + // apply the band and the constraint (if provided) + if (beg < i - w) // 检查开始点是否可以缩小一些 + beg = i - w; + if (end > i + w + 1) // 检查终点是否可以缩小,使得整体的遍历范围缩小 + end = i + w + 1; + if (end > qlen) // 终点不超过query长度 + end = qlen; + // compute the first column + if (beg == 0) + { + h1 = h0 - (o_del + e_del * (i + 1)); // 只消耗了target序列,query从第一个字符开始匹配第i个target字符 + if (h1 < 0) + h1 = 0; + } + else + h1 = 0; + + for (j = beg; LIKELY(j < end); ++j) // 遍历query字符序列 + { + // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) + // Similar to SSE2-SW, cells are computed in the following order: + // H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} + // E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape // E表示delete,只消耗target + // F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape // F表示insert,只消耗query,target的row id固定,query的col index一直增加 + eh_t *p = &eh[j]; + int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) // 获取上一轮h值和e值 + p->h = h1; // set H(i,j-1) for the next row // h1是上一轮计算的结果 + M = M ? M + q[j] : 0; // separating H and M to disallow a cigar like "100M3I3D20M" // M大于0则当前两个字符进行匹配,无论是否相等,将分值加到M上,此时M可能变为负数 + h = M > e ? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0 // e和f保证是非负的,所以h肯定非负,即使M可能是负数,因为h取e,f和M的最大值 + h = h > f ? h : f; + h1 = h; // save H(i,j) to h1 for the next column // 用h1来保存当前表格(i,j)对应的分值,用来下次计算 + mj = m > h ? mj : j; // record the position where max score is achieved // 记录取得最大值时query的字符位置 + m = m > h ? m : h; // m is stored at eh[mj+1] 因为eh[mj+1]->h表示的是H(i, mj),及上一轮记录的h + t = M - oe_del; // 用来计算delete,假设当前字符(i,j)匹配(无论match还是mismatch),target下一个字符串被空消耗(delete)的分值F(i+1, j) + t = t > 0 ? t : 0; + e -= e_del; // 假设当前query字符 + e = e > t ? e : t; // computed E(i+1,j) // t表示(i,j)强行匹配,(i+1, j)是delete的分数,此前e表示(i+1,j)继续delete的分数 + p->e = e; // save E(i+1,j) for the next row + t = M - oe_ins; + t = t > 0 ? t : 0; + f -= e_ins; + f = f > t ? f : t; // computed F(i,j+1) + } + eh[end].h = h1; // end是query序列之外的位置 + eh[end].e = 0; + if (j == qlen) // 此轮遍历到了query的最后一个字符 + { + max_ie = gscore > h1 ? max_ie : i; // max_ie表示取得全局最大分值时,target字符串的位置 + gscore = gscore > h1 ? gscore : h1; + } + if (m == 0) // 遍历完query之后,当前轮次的最大分值为0,则跳出循环 + break; + if (m > max) // 当前轮最大分值大于之前的最大分值 + { + max = m, max_i = i, max_j = mj; // 更新取得最大值的target和query的位置 + max_off = max_off > abs(mj - i) ? max_off : abs(mj - i); // 取得最大分值时,query和target对应字符串坐标的差值 + } + else if (zdrop > 0) // 当前轮匹配之后取得的最大分值没有大于之前的最大值,而且zdrop值大于0 + { +#if 0 + if (i - max_i > mj - max_j) + { + if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) // 之前最大分值 -(从取得最大值的点出发,当前的delete总长度对应的分值 + 当前轮取得的最大值) > zdrop + break; + } + else + { + if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) // 同上,不过这次是insert,可能是说明有很多mismatch + break; + } +#endif + } + // update beg and end for the next round + for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j) + ; + beg = j; + for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j) + ; + end = j + 2 < qlen ? j + 2 : qlen; // 剪枝没考虑f,即insert + // beg = 0, end = qlen; // uncomment this line for debugging + } + // free(eh); + // free(qp); + thread_mem_release(tmem, qlen * m + (qlen + 1) * 8); + if (_qle) + *_qle = max_j + 1; + if (_tle) + *_tle = max_i + 1; + if (_gtle) + *_gtle = max_ie + 1; + if (_gscore) + *_gscore = gscore; + if (_max_off) + *_max_off = max_off; + return max; +} diff --git a/run.sh b/run.sh new file mode 100755 index 0000000..2759ebe --- /dev/null +++ b/run.sh @@ -0,0 +1,5 @@ +#!/bin/bash +#/home/zzh/work/sw_perf/sw_perf /home/zzh/work/sw_perf/ext_out/q_0.fa /home/zzh/work/sw_perf/ext_out/t_0.fa /home/zzh/work/sw_perf/ext_out/i_0.txt +#/home/zzh/work/sw_perf/sw_perf /home/zzh/work/sw_perf/ext_out/q_1.fa /home/zzh/work/sw_perf/ext_out/t_1.fa /home/zzh/work/sw_perf/ext_out/i_1.txt +/home/zzh/work/sw_perf/sw_perf /home/zzh/work/sw_perf/ext_out/q_2.fa /home/zzh/work/sw_perf/ext_out/t_2.fa /home/zzh/work/sw_perf/ext_out/i_2.txt +#/home/zzh/work/sw_perf/sw_perf /home/zzh/work/sw_perf/ext_out/q_3.fa /home/zzh/work/sw_perf/ext_out/t_3.fa /home/zzh/work/sw_perf/ext_out/i_3.txt diff --git a/run_all.sh b/run_all.sh deleted file mode 100755 index 312b1ae..0000000 --- a/run_all.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -/home/zzh/work/sw_perf/sw_perf /home/zzh/data/sw/query.fa /home/zzh/data/sw/target.fa /home/zzh/data/sw/info.txt \ No newline at end of file diff --git a/run_debug.sh b/run_debug.sh deleted file mode 100755 index 7e78b2f..0000000 --- a/run_debug.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -#/home/zzh/work/sw_perf/sw_perf /home/zzh/work/sw_perf/input/q.fa /home/zzh/work/sw_perf/input/t.fa /home/zzh/work/sw_perf/input/i.txt -/home/zzh/work/sw_perf/sw_perf /home/zzh/work/sw_perf/input/q_d.fa /home/zzh/work/sw_perf/input/t_d.fa /home/zzh/work/sw_perf/input/i_d.txt - diff --git a/run_l.sh b/run_l.sh deleted file mode 100755 index 02285a8..0000000 --- a/run_l.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -/home/zzh/work/sw_perf/sw_perf /home/zzh/data/sw/q_l.fa /home/zzh/data/sw/t_l.fa /home/zzh/data/sw/i_l.txt \ No newline at end of file diff --git a/run_m.sh b/run_m.sh deleted file mode 100755 index b936788..0000000 --- a/run_m.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -/home/zzh/work/sw_perf/sw_perf /home/zzh/data/sw/q_m.fa /home/zzh/data/sw/t_m.fa /home/zzh/data/sw/i_m.txt \ No newline at end of file diff --git a/run_s.sh b/run_s.sh deleted file mode 100755 index d31fc52..0000000 --- a/run_s.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -/home/zzh/work/sw_perf/sw_perf /home/zzh/data/sw/q_s.fa /home/zzh/data/sw/t_s.fa /home/zzh/data/sw/i_s.txt \ No newline at end of file