diff --git a/.vscode/settings.json b/.vscode/settings.json index b03a764..875fce9 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,6 @@ { "files.associations": { - "functional": "c" + "functional": "c", + "random": "c" } } \ No newline at end of file diff --git a/bsw_avx2.c b/bsw_avx2.c index 3c76c0e..44fad1e 100644 --- a/bsw_avx2.c +++ b/bsw_avx2.c @@ -20,6 +20,7 @@ #define MIN(x, y) ((x) < (y) ? (x) : (y)) #define SIMD_WIDTH 16 +/* 去掉多余计算的值 */ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { {0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, {0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, @@ -38,28 +39,28 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0}, {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}}; -// const int permute_mask = _MM_SHUFFLE(0, 1, 2, 3); -#define permute_mask _MM_SHUFFLE(0, 1, 2, 3) +// #define permute_mask _MM_SHUFFLE(0, 1, 2, 3) +#define permute_mask 27 // 初始化变量 -#define SIMD_INIT \ - int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \ - __m256i zero_vec; \ - __m256i max_vec; \ - __m256i oe_del_vec; \ - __m256i oe_ins_vec; \ - __m256i e_del_vec; \ - __m256i e_ins_vec; \ - __m256i h_vec_mask[SIMD_WIDTH]; \ - zero_vec = _mm256_setzero_si256(); \ - oe_del_vec = _mm256_set1_epi16(-oe_del); \ - oe_ins_vec = _mm256_set1_epi16(-oe_ins); \ - e_del_vec = _mm256_set1_epi16(-e_del); \ - e_ins_vec = _mm256_set1_epi16(-e_ins); \ - __m256i match_sc_vec = _mm256_set1_epi16(a); \ - __m256i mis_sc_vec = _mm256_set1_epi16(-b); \ - __m256i amb_sc_vec = _mm256_set1_epi16(-1); \ - __m256i amb_vec = _mm256_set1_epi16(4); \ - for (i = 0; i < SIMD_WIDTH; ++i) \ +#define SIMD_INIT \ + int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \ + __m256i zero_vec; \ + __m256i max_vec, last_max_vec = _mm256_set1_epi16(h0); \ + __m256i oe_del_vec; \ + __m256i oe_ins_vec; \ + __m256i e_del_vec; \ + __m256i e_ins_vec; \ + __m256i h_vec_mask[SIMD_WIDTH]; \ + zero_vec = _mm256_setzero_si256(); \ + oe_del_vec = _mm256_set1_epi16(-oe_del); \ + oe_ins_vec = _mm256_set1_epi16(-oe_ins); \ + e_del_vec = _mm256_set1_epi16(-e_del); \ + e_ins_vec = _mm256_set1_epi16(-e_ins); \ + __m256i match_sc_vec = _mm256_set1_epi16(a); \ + __m256i mis_sc_vec = _mm256_set1_epi16(-b); \ + __m256i amb_sc_vec = _mm256_set1_epi16(-1); \ + __m256i amb_vec = _mm256_set1_epi16(4); \ + for (i = 0; i < SIMD_WIDTH; ++i) \ h_vec_mask[i] = _mm256_loadu_si256((__m256i *)(&h_vec_int_mask[i])); /* @@ -79,19 +80,21 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { __m256i ts_vec = _mm256_loadu_si256((__m256i *)(&ref[i])); // 比对ref和seq的序列,计算罚分 -#define SIMD_CMP_SEQ \ - ts_vec = _mm256_permute4x64_epi64(ts_vec, permute_mask); \ - ts_vec = _mm256_shufflelo_epi16(ts_vec, permute_mask); \ - ts_vec = _mm256_shufflehi_epi16(ts_vec, permute_mask); \ - __m256i match_mask_vec = _mm256_cmpeq_epi16(qs_vec, ts_vec); \ - __m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); \ - __m256i score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec); \ - score_vec = _mm256_or_si256(score_vec, mis_score_vec); \ - __m256i q_amb_mask_vec = _mm256_cmpeq_epi16(qs_vec, amb_vec); \ - __m256i t_amb_mask_vec = _mm256_cmpeq_epi16(ts_vec, amb_vec); \ - __m256i amb_mask_vec = _mm256_or_si256(q_amb_mask_vec, t_amb_mask_vec); \ - score_vec = _mm256_andnot_si256(amb_mask_vec, score_vec); \ - __m256i amb_score_vec = _mm256_and_si256(amb_mask_vec, amb_sc_vec); \ +#define SIMD_CMP_SEQ \ + /* 将待比对的target序列逆序排列 */ \ + ts_vec = _mm256_permute4x64_epi64(ts_vec, permute_mask); \ + ts_vec = _mm256_shufflelo_epi16(ts_vec, permute_mask); \ + ts_vec = _mm256_shufflehi_epi16(ts_vec, permute_mask); \ + __m256i match_mask_vec = _mm256_cmpeq_epi16(qs_vec, ts_vec); /* 比对query和target字符序列 */ \ + __m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); /* 未匹配上的位置赋值mismatch分数 */ \ + __m256i score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec); /* 匹配上的位置赋值match分数 */ \ + score_vec = _mm256_or_si256(score_vec, mis_score_vec); \ + /* 计算模棱两可的字符(N)的位置的分数 */ \ + __m256i q_amb_mask_vec = _mm256_cmpeq_epi16(qs_vec, amb_vec); \ + __m256i t_amb_mask_vec = _mm256_cmpeq_epi16(ts_vec, amb_vec); \ + __m256i amb_mask_vec = _mm256_or_si256(q_amb_mask_vec, t_amb_mask_vec); \ + score_vec = _mm256_andnot_si256(amb_mask_vec, score_vec); \ + __m256i amb_score_vec = _mm256_and_si256(amb_mask_vec, amb_sc_vec); \ score_vec = _mm256_or_si256(score_vec, amb_score_vec); // 向量化计算h, e, f, m @@ -113,8 +116,18 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { hn_vec = _mm256_max_epi16(hn_vec, zero_vec); // 存储向量化结果 +// #define SIMD_STORE + +// __m256i cur_max_vec = _mm256_max_epu8(max_vec, hn_vec); \ +// __m256i vcmp = _mm256_cmpgt_epi8(cur_max_vec, max_vec); \ +// uint32_t mask = _mm256_movemask_epi8(vcmp); \ +// if (mask > 0) \ +// { \ +// simd_i = i; \ +// simd_j = j; \ +// } #define SIMD_STORE \ - max_vec = _mm256_max_epi16(max_vec, hn_vec); \ + max_vec = _mm256_max_epu8(max_vec, hn_vec); \ _mm256_storeu_si256((__m256i *)&eA2[j], en_vec); \ _mm256_storeu_si256((__m256i *)&fA2[j], fn_vec); \ _mm256_storeu_si256((__m256i *)&mA2[j], mn_vec); \ @@ -128,28 +141,43 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { hn_vec = _mm256_and_si256(hn_vec, h_vec_mask[end - j]); // 找最大值和位置 -#define SIMD_FIND_MAX \ - max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \ - max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \ - max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \ - max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \ - max_vec = _mm256_max_epu16(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \ - int16_t *maxVal = (int16_t *)&max_vec; \ - m = maxVal[0]; \ - if (m > 0) \ - { \ - for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \ - { \ - __m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \ - __m256i vcmp = _mm256_cmpeq_epi16(h2_vec, max_vec); \ - uint32_t mask = _mm256_movemask_epi8(vcmp); \ - if (mask > 0) \ - { \ - int pos = SIMD_WIDTH - 1 - ((__builtin_clz(mask)) >> 1); \ - mj = j - 1 + pos; \ - mi = i - 1 - pos; \ - } \ - } \ +// last_max_vec = max_vec; +#define SIMD_FIND_MAX \ + __m256i cmp_max = _mm256_cmpgt_epi16(max_vec, last_max_vec); \ + uint32_t cmp_result = _mm256_movemask_epi8(cmp_max); \ + if (cmp_result > 0) \ + { \ + max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \ + max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \ + max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \ + max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \ + max_vec = _mm256_max_epu16(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \ + int16_t *maxVal = (int16_t *)&max_vec; \ + m = maxVal[0]; \ + for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \ + { \ + __m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \ + __m256i vcmp = _mm256_cmpeq_epi16(h2_vec, max_vec); \ + uint32_t mask = _mm256_movemask_epi8(vcmp); \ + if (mask > 0) \ + { \ + int pos = SIMD_WIDTH - 1 - ((__builtin_clz(mask)) >> 1); \ + mj = j - 1 + pos; \ + mi = i - 1 - pos; \ + for (; mj + 1 < qlen && mi + 1 < tlen; mj++, mi++) \ + { \ + if (seq[mj + 1] == ref[mi + 1 + SIMD_WIDTH]) \ + { \ + m += a; \ + } \ + else \ + { \ + break; \ + } \ + } \ + } \ + } \ + last_max_vec = _mm256_set1_epi16(m); \ } // 每轮迭代后,交换数组 @@ -202,6 +230,7 @@ int bsw_avx2(int qlen, // query length 待匹配段碱基的query int col_size = qlen + 2 + SIMD_WIDTH; int val_mem_size = (col_size * 9 * 2 + 31) >> 5 << 5; // 32字节的整数倍 int mem_size = (seq_size + ref_size) * 2 + val_mem_size; + int simd_i = -1, simd_j = -1; SIMD_INIT; // 初始化simd用的数据 @@ -365,8 +394,8 @@ int bsw_avx2(int qlen, // query length 待匹配段碱基的query max_ie = gscore > hA2[qlen] ? max_ie : iStart; gscore = gscore > hA2[qlen] ? gscore : hA2[qlen]; } - // if (m == 0 && m_last == 0) - // break; // 一定要注意,斜对角遍历和按列遍历的不同点 + // if (m == 0 && m_last == 0) + // break; // 一定要注意,斜对角遍历和按列遍历的不同点 if (m > max) { max = m, max_i = mi, max_j = mj; @@ -387,30 +416,124 @@ int bsw_avx2(int qlen, // query length 待匹配段碱基的query } // 调整计算的边界 - /* for (j = beg; LIKELY(j <= end); ++j) + + // fprintf(stderr, "beg: %d, end: %d ", beg, end); + + /* for (j = beg; j <= end; j += SIMD_WIDTH) { - int has_val = hA1[j - 1] | hA2[j]; - if (has_val) + __m256i h1 = _mm256_loadu_si256((__m256i *)(&hA1[j - 1])); + __m256i h2 = _mm256_loadu_si256((__m256i *)(&hA2[j])); + __m256i orvec = _mm256_or_si256(h1, h2); + __m256i vcmp = _mm256_cmpgt_epi16(orvec, zero_vec); + uint32_t mask = _mm256_movemask_epi8(vcmp); + if (mask > 0) + { + // vcmp = _mm256_permute4x64_epi64(vcmp, permute_mask); + // vcmp = _mm256_shufflelo_epi16(vcmp, permute_mask); + // vcmp = _mm256_shufflehi_epi16(vcmp, permute_mask); + // mask = _mm256_movemask_epi8(vcmp); + //// int pos = SIMD_WIDTH - 1 - ((__builtin_clz(mask)) >> 1); + // int pos = ((__builtin_clz(mask)) >> 1); + // beg = j + pos; + int pos = __builtin_ctz(mask) >> 1; + beg = j + pos; + // if (beg > end) + // beg = end; + // beg = j + pos; + // beg = 0; break; + } } - beg = j; - for (j = end + 1; LIKELY(j >= beg); --j) - { - int has_val = hA1[j - 1] | hA2[j]; - if (has_val) - break; - else - hA0[j - 1] = 0; - } - end = j + 1 <= qlen ? j + 1 : qlen; */ - beg = 0; - end = qlen; // uncomment this line for debugging + // beg = 0; + // fprintf(stderr, "new beg: %d\n", beg); + // int pos = 0; + // for (j = beg; j <= end; j += SIMD_WIDTH) + //{ + // __m256i h1 = _mm256_loadu_si256((__m256i *)(&hA1[j - 1])); + // __m256i h2 = _mm256_loadu_si256((__m256i *)(&hA2[j])); + // __m256i orvec = _mm256_or_si256(h1, h2); + // int *val = (int *)&orvec; + // for (i = 0; i < SIMD_WIDTH; ++i) + // if (val[i]) + // { + // pos = SIMD_WIDTH - 1 - i; + // break; + // } + //} + // beg = j; + for (j = beg; LIKELY(j <= end); ++j) + { + int has_val = hA1[j - 1] | hA2[j]; + if (has_val) + { + break; + } + } + beg = j; + + hA2[end + 1] = 0; + for (j = end + 1; LIKELY(j >= beg); --j) + { + int has_val = hA1[j - 1] | hA2[j]; + if (has_val) + { + break; + } + // else + // hA0[j - 1] = 0; + } + end = j + 1 <= qlen ? j + 1 : qlen; + + /* for (j = end + 1; j >= beg; j -= SIMD_WIDTH) // 没有考虑beg附近,且长度小于SIMD_WIDTH的数据 + { + __m256i h1 = _mm256_loadu_si256((__m256i *)(&hA1[j - 1])); + __m256i h2 = _mm256_loadu_si256((__m256i *)(&hA2[j])); + __m256i orvec = _mm256_or_si256(h1, h2); + __m256i vcmp = _mm256_cmpgt_epi16(orvec, zero_vec); + uint32_t mask = _mm256_movemask_epi8(vcmp); + if (mask > 0) + { + int pos = __builtin_clz(mask) >> 1; + const int new_end = j + SIMD_WIDTH - pos; + end = new_end <= qlen ? new_end : qlen; + break; + } + else + { + _mm256_storeu_si256((__m256i *)&hA0[j - 1], zero_vec); + } + } + */ + // beg = 0; + // end = qlen; // uncomment this line for debugging m_last = m; // swap m, h, e, f SWAP_DATA_POINTER; } + // __m256i origin_max_vec = max_vec; + // max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); + // max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); + // max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); + // max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); + // max_vec = _mm256_max_epu16(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); + // int16_t *maxVal = (int16_t *)&max_vec; + // max = maxVal[0]; + // if (max > 0) + // { + // __m256i vcmp = _mm256_cmpeq_epi16(origin_max_vec, max_vec); + // uint32_t mask = _mm256_movemask_epi8(vcmp); + // if (mask > 0) + // { + // int pos = SIMD_WIDTH - 1 - ((__builtin_clz(mask)) >> 1); + // int mj = simd_j - 1 + pos; + // int mi = simd_i - 1 - pos; + // max_i = mi, max_j = mj; + // max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi); + // } + // } + free(mem); if (_qle) *_qle = max_j + 1; diff --git a/ksw_avx2_u8.c b/ksw_avx2_u8.c index 0b2b7cb..964c028 100644 --- a/ksw_avx2_u8.c +++ b/ksw_avx2_u8.c @@ -58,12 +58,13 @@ static const uint8_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = { static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; // const int permute_mask = _MM_SHUFFLE(0, 1, 2, 3); -#define permute_mask _MM_SHUFFLE(0, 1, 2, 3) +// #define permute_mask _MM_SHUFFLE(0, 1, 2, 3) +#define permute_mask 27 // 初始化变量 #define SIMD_INIT \ int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \ __m256i zero_vec; \ - __m256i max_vec; \ + __m256i max_vec, last_max_vec = _mm256_set1_epi8(h0); \ __m256i oe_del_vec; \ __m256i oe_ins_vec; \ __m256i e_del_vec; \ @@ -149,33 +150,51 @@ static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, mn_vec = _mm256_and_si256(mn_vec, h_vec_mask[end - j]); \ hn_vec = _mm256_and_si256(hn_vec, h_vec_mask[end - j]); +// cmp_max = _mm256_xor_si256(last_max_vec, cmp_max); +// last_max_vec = _mm256_set1_epi8(m); // 找最大值和位置 -#define SIMD_FIND_MAX \ - uint8_t *maxVal = (uint8_t *)&max_vec; \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 1)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 3)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 5)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 7)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \ - max_vec = _mm256_max_epu8(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \ - m = maxVal[0]; \ - if (m > 0) \ - { \ - for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \ - { \ - __m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \ - __m256i vcmp = _mm256_cmpeq_epi8(h2_vec, max_vec); \ - uint32_t mask = _mm256_movemask_epi8(vcmp); \ - if (mask > 0) \ - { \ - int pos = SIMD_WIDTH - 1 - __builtin_clz(mask); \ - mj = j - 1 + pos; \ - mi = i - 1 - pos; \ - } \ - } \ +#define SIMD_FIND_MAX \ + __m256i cmp_max = _mm256_max_epu8(max_vec, last_max_vec); \ + cmp_max = _mm256_xor_si256(cmp_max, last_max_vec); \ + cmp_max = _mm256_cmpeq_epi8(cmp_max, zero_vec); \ + uint32_t cmp_result = _mm256_movemask_epi8(cmp_max); \ + if (cmp_result != 4294967296) \ + { \ + uint8_t *maxVal = (uint8_t *)&max_vec; \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 1)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 3)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 5)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 7)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \ + max_vec = _mm256_max_epu8(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \ + m = maxVal[0]; \ + for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \ + { \ + __m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \ + __m256i vcmp = _mm256_cmpeq_epi8(h2_vec, max_vec); \ + uint32_t mask = _mm256_movemask_epi8(vcmp); \ + if (mask > 0) \ + { \ + int pos = SIMD_WIDTH - 1 - __builtin_clz(mask); \ + mj = j - 1 + pos; \ + mi = i - 1 - pos; \ + for (; mj + 1 < qlen && mi + 1 < tlen; mj++, mi++) \ + { \ + if (seq[mj + 1] == ref[mi + 1 + SIMD_WIDTH]) \ + { \ + m += a; \ + } \ + else \ + { \ + break; \ + } \ + } \ + } \ + } \ + last_max_vec = _mm256_set1_epi8(m); \ } // 每轮迭代后,交换数组 @@ -217,13 +236,14 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que int *_gscore, // query的端到端匹配得分 int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值 { + return h0; #ifdef SHOW_PERF - extern int64_t time_bsw_init; - extern int64_t time_bsw_main_loop; - extern int64_t time_bsw_find_max; - extern int64_t time_bsw_adjust_bound; - extern int64_t time_compare; - int64_t start_time = get_mseconds(); +// extern int64_t time_bsw_init; +// extern int64_t time_bsw_main_loop; +// extern int64_t time_bsw_find_max; +// extern int64_t time_bsw_adjust_bound; +// extern int64_t time_compare; +// int64_t start_time = get_mseconds(); #endif uint8_t *mA, *hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H,其他的保存上个H E F M uint8_t *seq, *ref; @@ -316,13 +336,13 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que int m_last = 0; int iend; #ifdef SHOW_PERF - time_bsw_init += get_mseconds() - start_time; +// time_bsw_init += get_mseconds() - start_time; #endif for (D = 1; LIKELY(D < Dloop); ++D) { #ifdef SHOW_PERF - start_time = get_mseconds(); +// start_time = get_mseconds(); #endif // 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况 if (D > tlen) @@ -395,17 +415,17 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que SIMD_STORE; } #ifdef SHOW_PERF - time_bsw_main_loop += get_mseconds() - start_time; +// time_bsw_main_loop += get_mseconds() - start_time; #endif #ifdef SHOW_PERF - start_time = get_mseconds(); +// start_time = get_mseconds(); #endif SIMD_FIND_MAX; #ifdef SHOW_PERF - time_bsw_find_max += get_mseconds() - start_time; +// time_bsw_find_max += get_mseconds() - start_time; #endif #ifdef SHOW_PERF - start_time = get_mseconds(); +// start_time = get_mseconds(); #endif // 注意最后跳出循环j的值 j = end + 1; @@ -444,13 +464,14 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que break; } beg = j; + hA2[end + 1] = 0; for (j = end + 1; LIKELY(j >= beg); --j) { int has_val = hA1[j - 1] | hA2[j]; if (has_val) break; - else - hA0[j - 1] = 0; + // else + // hA0[j - 1] = 0; } end = j + 1 <= qlen ? j + 1 : qlen; @@ -461,9 +482,9 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que // swap m, h, e, f SWAP_DATA_POINTER; #ifdef SHOW_PERF - time_bsw_adjust_bound += get_mseconds() - start_time; - start_time = get_mseconds(); - time_compare += get_mseconds() - start_time; +// time_bsw_adjust_bound += get_mseconds() - start_time; +// start_time = get_mseconds(); +// time_compare += get_mseconds() - start_time; #endif } diff --git a/ksw_normal.c b/ksw_normal.c index 282a977..52a2a0c 100644 --- a/ksw_normal.c +++ b/ksw_normal.c @@ -18,6 +18,7 @@ typedef struct int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) { + // return h0; eh_t *eh; // score array int8_t *qp; // query profile int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; @@ -106,14 +107,14 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target, max_ie = gscore > h1 ? max_ie : i; // max_ie表示取得全局最大分值时,target字符串的位置 gscore = gscore > h1 ? gscore : h1; } - // if (m == 0) // 遍历完query之后,当前轮次的最大分值为0,则跳出循环 - // break; + if (m == 0) // 遍历完query之后,当前轮次的最大分值为0,则跳出循环 + break; if (m > max) // 当前轮最大分值大于之前的最大分值 { max = m, max_i = i, max_j = mj; // 更新取得最大值的target和query的位置 max_off = max_off > abs(mj - i) ? max_off : abs(mj - i); // 取得最大分值时,query和target对应字符串坐标的差值 } - else if (0) // zdrop > 0) // 当前轮匹配之后取得的最大分值没有大于之前的最大值,而且zdrop值大于0 + else if (zdrop > 0) // 当前轮匹配之后取得的最大分值没有大于之前的最大值,而且zdrop值大于0 { if (i - max_i > mj - max_j) { @@ -127,15 +128,15 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target, } } // update beg and end for the next round - // for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j) - // ; - // beg = j; - // for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j) - // ; - // end = j + 2 < qlen ? j + 2 : qlen; // 剪枝没考虑f,即insert - beg = 0, end = qlen; // uncomment this line for debugging - // fprintf(stderr, "\n"); - // fprintf(stderr, "%d\n", end); + for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j) + ; + beg = j; + for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j) + ; + end = j + 2 < qlen ? j + 2 : qlen; // 剪枝没考虑f,即insert + // beg = 0, end = qlen; // uncomment this line for debugging + // fprintf(stderr, "\n"); + // fprintf(stderr, "%d\n", end); } free(eh); free(qp); diff --git a/main.c b/main.c index db9e44c..80c9a74 100644 --- a/main.c +++ b/main.c @@ -127,12 +127,12 @@ int main(int argc, char *argv[]) // const char *qf_path = "/home/zzh/data/sw/q_s.fa"; // const char *tf_path = "/home/zzh/data/sw/t_s.fa"; // const char *if_path = "/home/zzh/data/sw/i_s.txt"; - const char *qf_path = "/home/zzh/data/sw/q_m.fa"; - const char *tf_path = "/home/zzh/data/sw/t_m.fa"; - const char *if_path = "/home/zzh/data/sw/i_m.txt"; - // const char *qf_path = "/home/zzh/data/sw/q_l.fa"; - // const char *tf_path = "/home/zzh/data/sw/t_l.fa"; - // const char *if_path = "/home/zzh/data/sw/i_l.txt"; + // const char *qf_path = "/home/zzh/data/sw/q_m.fa"; + // const char *tf_path = "/home/zzh/data/sw/t_m.fa"; + // const char *if_path = "/home/zzh/data/sw/i_m.txt"; + const char *qf_path = "/home/zzh/data/sw/q_l.fa"; + const char *tf_path = "/home/zzh/data/sw/t_l.fa"; + const char *if_path = "/home/zzh/data/sw/i_l.txt"; // const char *qf_path = "/public/home/zzh/data/sw/query.fa"; // const char *tf_path = "/public/home/zzh/data/sw/target.fa"; // const char *if_path = "/public/home/zzh/data/sw/info.txt"; @@ -386,10 +386,10 @@ int main(int argc, char *argv[]) // fprintf(stderr, "time_bsw_main_loop: %f s\n", (time_bsw_main_loop) / DIVIDE_BY); // fprintf(stderr, "time_bsw_find_max: %f s\n", (time_bsw_find_max) / DIVIDE_BY); // fprintf(stderr, "time_bsw_adjust_bound: %f s\n", (time_bsw_adjust_bound) / DIVIDE_BY); - fprintf(stderr, "time_bsw_main_loop: %f s\n", (time_bsw_main_loop - time_compare) / DIVIDE_BY); - fprintf(stderr, "time_bsw_find_max: %f s\n", (time_bsw_find_max - time_compare) / DIVIDE_BY); - fprintf(stderr, "time_bsw_adjust_bound: %f s\n", (time_bsw_adjust_bound - time_compare) / DIVIDE_BY); - fprintf(stderr, "time_compare: %f s\n", time_compare / DIVIDE_BY); + // fprintf(stderr, "time_bsw_main_loop: %f s\n", (time_bsw_main_loop - time_compare) / DIVIDE_BY); + // fprintf(stderr, "time_bsw_find_max: %f s\n", (time_bsw_find_max - time_compare) / DIVIDE_BY); + // fprintf(stderr, "time_bsw_adjust_bound: %f s\n", (time_bsw_adjust_bound - time_compare) / DIVIDE_BY); + // fprintf(stderr, "time_compare: %f s\n", time_compare / DIVIDE_BY); #endif if (query_f != 0)