加了对find_max剪枝的优化
This commit is contained in:
parent
c9e4d9645c
commit
775297a813
|
|
@ -1,5 +1,6 @@
|
|||
{
|
||||
"files.associations": {
|
||||
"functional": "c"
|
||||
"functional": "c",
|
||||
"random": "c"
|
||||
}
|
||||
}
|
||||
271
bsw_avx2.c
271
bsw_avx2.c
|
|
@ -20,6 +20,7 @@
|
|||
#define MIN(x, y) ((x) < (y) ? (x) : (y))
|
||||
#define SIMD_WIDTH 16
|
||||
|
||||
/* 去掉多余计算的值 */
|
||||
static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
|
||||
{0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
{0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
|
|
@ -38,28 +39,28 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
|
|||
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0},
|
||||
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}};
|
||||
|
||||
// const int permute_mask = _MM_SHUFFLE(0, 1, 2, 3);
|
||||
#define permute_mask _MM_SHUFFLE(0, 1, 2, 3)
|
||||
// #define permute_mask _MM_SHUFFLE(0, 1, 2, 3)
|
||||
#define permute_mask 27
|
||||
// 初始化变量
|
||||
#define SIMD_INIT \
|
||||
int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \
|
||||
__m256i zero_vec; \
|
||||
__m256i max_vec; \
|
||||
__m256i oe_del_vec; \
|
||||
__m256i oe_ins_vec; \
|
||||
__m256i e_del_vec; \
|
||||
__m256i e_ins_vec; \
|
||||
__m256i h_vec_mask[SIMD_WIDTH]; \
|
||||
zero_vec = _mm256_setzero_si256(); \
|
||||
oe_del_vec = _mm256_set1_epi16(-oe_del); \
|
||||
oe_ins_vec = _mm256_set1_epi16(-oe_ins); \
|
||||
e_del_vec = _mm256_set1_epi16(-e_del); \
|
||||
e_ins_vec = _mm256_set1_epi16(-e_ins); \
|
||||
__m256i match_sc_vec = _mm256_set1_epi16(a); \
|
||||
__m256i mis_sc_vec = _mm256_set1_epi16(-b); \
|
||||
__m256i amb_sc_vec = _mm256_set1_epi16(-1); \
|
||||
__m256i amb_vec = _mm256_set1_epi16(4); \
|
||||
for (i = 0; i < SIMD_WIDTH; ++i) \
|
||||
#define SIMD_INIT \
|
||||
int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \
|
||||
__m256i zero_vec; \
|
||||
__m256i max_vec, last_max_vec = _mm256_set1_epi16(h0); \
|
||||
__m256i oe_del_vec; \
|
||||
__m256i oe_ins_vec; \
|
||||
__m256i e_del_vec; \
|
||||
__m256i e_ins_vec; \
|
||||
__m256i h_vec_mask[SIMD_WIDTH]; \
|
||||
zero_vec = _mm256_setzero_si256(); \
|
||||
oe_del_vec = _mm256_set1_epi16(-oe_del); \
|
||||
oe_ins_vec = _mm256_set1_epi16(-oe_ins); \
|
||||
e_del_vec = _mm256_set1_epi16(-e_del); \
|
||||
e_ins_vec = _mm256_set1_epi16(-e_ins); \
|
||||
__m256i match_sc_vec = _mm256_set1_epi16(a); \
|
||||
__m256i mis_sc_vec = _mm256_set1_epi16(-b); \
|
||||
__m256i amb_sc_vec = _mm256_set1_epi16(-1); \
|
||||
__m256i amb_vec = _mm256_set1_epi16(4); \
|
||||
for (i = 0; i < SIMD_WIDTH; ++i) \
|
||||
h_vec_mask[i] = _mm256_loadu_si256((__m256i *)(&h_vec_int_mask[i]));
|
||||
|
||||
/*
|
||||
|
|
@ -79,19 +80,21 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
|
|||
__m256i ts_vec = _mm256_loadu_si256((__m256i *)(&ref[i]));
|
||||
|
||||
// 比对ref和seq的序列,计算罚分
|
||||
#define SIMD_CMP_SEQ \
|
||||
ts_vec = _mm256_permute4x64_epi64(ts_vec, permute_mask); \
|
||||
ts_vec = _mm256_shufflelo_epi16(ts_vec, permute_mask); \
|
||||
ts_vec = _mm256_shufflehi_epi16(ts_vec, permute_mask); \
|
||||
__m256i match_mask_vec = _mm256_cmpeq_epi16(qs_vec, ts_vec); \
|
||||
__m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); \
|
||||
__m256i score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec); \
|
||||
score_vec = _mm256_or_si256(score_vec, mis_score_vec); \
|
||||
__m256i q_amb_mask_vec = _mm256_cmpeq_epi16(qs_vec, amb_vec); \
|
||||
__m256i t_amb_mask_vec = _mm256_cmpeq_epi16(ts_vec, amb_vec); \
|
||||
__m256i amb_mask_vec = _mm256_or_si256(q_amb_mask_vec, t_amb_mask_vec); \
|
||||
score_vec = _mm256_andnot_si256(amb_mask_vec, score_vec); \
|
||||
__m256i amb_score_vec = _mm256_and_si256(amb_mask_vec, amb_sc_vec); \
|
||||
#define SIMD_CMP_SEQ \
|
||||
/* 将待比对的target序列逆序排列 */ \
|
||||
ts_vec = _mm256_permute4x64_epi64(ts_vec, permute_mask); \
|
||||
ts_vec = _mm256_shufflelo_epi16(ts_vec, permute_mask); \
|
||||
ts_vec = _mm256_shufflehi_epi16(ts_vec, permute_mask); \
|
||||
__m256i match_mask_vec = _mm256_cmpeq_epi16(qs_vec, ts_vec); /* 比对query和target字符序列 */ \
|
||||
__m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); /* 未匹配上的位置赋值mismatch分数 */ \
|
||||
__m256i score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec); /* 匹配上的位置赋值match分数 */ \
|
||||
score_vec = _mm256_or_si256(score_vec, mis_score_vec); \
|
||||
/* 计算模棱两可的字符(N)的位置的分数 */ \
|
||||
__m256i q_amb_mask_vec = _mm256_cmpeq_epi16(qs_vec, amb_vec); \
|
||||
__m256i t_amb_mask_vec = _mm256_cmpeq_epi16(ts_vec, amb_vec); \
|
||||
__m256i amb_mask_vec = _mm256_or_si256(q_amb_mask_vec, t_amb_mask_vec); \
|
||||
score_vec = _mm256_andnot_si256(amb_mask_vec, score_vec); \
|
||||
__m256i amb_score_vec = _mm256_and_si256(amb_mask_vec, amb_sc_vec); \
|
||||
score_vec = _mm256_or_si256(score_vec, amb_score_vec);
|
||||
|
||||
// 向量化计算h, e, f, m
|
||||
|
|
@ -113,8 +116,18 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
|
|||
hn_vec = _mm256_max_epi16(hn_vec, zero_vec);
|
||||
|
||||
// 存储向量化结果
|
||||
// #define SIMD_STORE
|
||||
|
||||
// __m256i cur_max_vec = _mm256_max_epu8(max_vec, hn_vec); \
|
||||
// __m256i vcmp = _mm256_cmpgt_epi8(cur_max_vec, max_vec); \
|
||||
// uint32_t mask = _mm256_movemask_epi8(vcmp); \
|
||||
// if (mask > 0) \
|
||||
// { \
|
||||
// simd_i = i; \
|
||||
// simd_j = j; \
|
||||
// }
|
||||
#define SIMD_STORE \
|
||||
max_vec = _mm256_max_epi16(max_vec, hn_vec); \
|
||||
max_vec = _mm256_max_epu8(max_vec, hn_vec); \
|
||||
_mm256_storeu_si256((__m256i *)&eA2[j], en_vec); \
|
||||
_mm256_storeu_si256((__m256i *)&fA2[j], fn_vec); \
|
||||
_mm256_storeu_si256((__m256i *)&mA2[j], mn_vec); \
|
||||
|
|
@ -128,28 +141,43 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
|
|||
hn_vec = _mm256_and_si256(hn_vec, h_vec_mask[end - j]);
|
||||
|
||||
// 找最大值和位置
|
||||
#define SIMD_FIND_MAX \
|
||||
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \
|
||||
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \
|
||||
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \
|
||||
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \
|
||||
max_vec = _mm256_max_epu16(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \
|
||||
int16_t *maxVal = (int16_t *)&max_vec; \
|
||||
m = maxVal[0]; \
|
||||
if (m > 0) \
|
||||
{ \
|
||||
for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \
|
||||
{ \
|
||||
__m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \
|
||||
__m256i vcmp = _mm256_cmpeq_epi16(h2_vec, max_vec); \
|
||||
uint32_t mask = _mm256_movemask_epi8(vcmp); \
|
||||
if (mask > 0) \
|
||||
{ \
|
||||
int pos = SIMD_WIDTH - 1 - ((__builtin_clz(mask)) >> 1); \
|
||||
mj = j - 1 + pos; \
|
||||
mi = i - 1 - pos; \
|
||||
} \
|
||||
} \
|
||||
// last_max_vec = max_vec;
|
||||
#define SIMD_FIND_MAX \
|
||||
__m256i cmp_max = _mm256_cmpgt_epi16(max_vec, last_max_vec); \
|
||||
uint32_t cmp_result = _mm256_movemask_epi8(cmp_max); \
|
||||
if (cmp_result > 0) \
|
||||
{ \
|
||||
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \
|
||||
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \
|
||||
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \
|
||||
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \
|
||||
max_vec = _mm256_max_epu16(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \
|
||||
int16_t *maxVal = (int16_t *)&max_vec; \
|
||||
m = maxVal[0]; \
|
||||
for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \
|
||||
{ \
|
||||
__m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \
|
||||
__m256i vcmp = _mm256_cmpeq_epi16(h2_vec, max_vec); \
|
||||
uint32_t mask = _mm256_movemask_epi8(vcmp); \
|
||||
if (mask > 0) \
|
||||
{ \
|
||||
int pos = SIMD_WIDTH - 1 - ((__builtin_clz(mask)) >> 1); \
|
||||
mj = j - 1 + pos; \
|
||||
mi = i - 1 - pos; \
|
||||
for (; mj + 1 < qlen && mi + 1 < tlen; mj++, mi++) \
|
||||
{ \
|
||||
if (seq[mj + 1] == ref[mi + 1 + SIMD_WIDTH]) \
|
||||
{ \
|
||||
m += a; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
break; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
last_max_vec = _mm256_set1_epi16(m); \
|
||||
}
|
||||
|
||||
// 每轮迭代后,交换数组
|
||||
|
|
@ -202,6 +230,7 @@ int bsw_avx2(int qlen, // query length 待匹配段碱基的query
|
|||
int col_size = qlen + 2 + SIMD_WIDTH;
|
||||
int val_mem_size = (col_size * 9 * 2 + 31) >> 5 << 5; // 32字节的整数倍
|
||||
int mem_size = (seq_size + ref_size) * 2 + val_mem_size;
|
||||
int simd_i = -1, simd_j = -1;
|
||||
|
||||
SIMD_INIT; // 初始化simd用的数据
|
||||
|
||||
|
|
@ -365,8 +394,8 @@ int bsw_avx2(int qlen, // query length 待匹配段碱基的query
|
|||
max_ie = gscore > hA2[qlen] ? max_ie : iStart;
|
||||
gscore = gscore > hA2[qlen] ? gscore : hA2[qlen];
|
||||
}
|
||||
// if (m == 0 && m_last == 0)
|
||||
// break; // 一定要注意,斜对角遍历和按列遍历的不同点
|
||||
// if (m == 0 && m_last == 0)
|
||||
// break; // 一定要注意,斜对角遍历和按列遍历的不同点
|
||||
if (m > max)
|
||||
{
|
||||
max = m, max_i = mi, max_j = mj;
|
||||
|
|
@ -387,30 +416,124 @@ int bsw_avx2(int qlen, // query length 待匹配段碱基的query
|
|||
}
|
||||
|
||||
// 调整计算的边界
|
||||
/* for (j = beg; LIKELY(j <= end); ++j)
|
||||
|
||||
// fprintf(stderr, "beg: %d, end: %d ", beg, end);
|
||||
|
||||
/* for (j = beg; j <= end; j += SIMD_WIDTH)
|
||||
{
|
||||
int has_val = hA1[j - 1] | hA2[j];
|
||||
if (has_val)
|
||||
__m256i h1 = _mm256_loadu_si256((__m256i *)(&hA1[j - 1]));
|
||||
__m256i h2 = _mm256_loadu_si256((__m256i *)(&hA2[j]));
|
||||
__m256i orvec = _mm256_or_si256(h1, h2);
|
||||
__m256i vcmp = _mm256_cmpgt_epi16(orvec, zero_vec);
|
||||
uint32_t mask = _mm256_movemask_epi8(vcmp);
|
||||
if (mask > 0)
|
||||
{
|
||||
// vcmp = _mm256_permute4x64_epi64(vcmp, permute_mask);
|
||||
// vcmp = _mm256_shufflelo_epi16(vcmp, permute_mask);
|
||||
// vcmp = _mm256_shufflehi_epi16(vcmp, permute_mask);
|
||||
// mask = _mm256_movemask_epi8(vcmp);
|
||||
//// int pos = SIMD_WIDTH - 1 - ((__builtin_clz(mask)) >> 1);
|
||||
// int pos = ((__builtin_clz(mask)) >> 1);
|
||||
// beg = j + pos;
|
||||
int pos = __builtin_ctz(mask) >> 1;
|
||||
beg = j + pos;
|
||||
// if (beg > end)
|
||||
// beg = end;
|
||||
// beg = j + pos;
|
||||
// beg = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
beg = j;
|
||||
for (j = end + 1; LIKELY(j >= beg); --j)
|
||||
{
|
||||
int has_val = hA1[j - 1] | hA2[j];
|
||||
if (has_val)
|
||||
break;
|
||||
else
|
||||
hA0[j - 1] = 0;
|
||||
}
|
||||
end = j + 1 <= qlen ? j + 1 : qlen;
|
||||
*/
|
||||
beg = 0;
|
||||
end = qlen; // uncomment this line for debugging
|
||||
// beg = 0;
|
||||
// fprintf(stderr, "new beg: %d\n", beg);
|
||||
// int pos = 0;
|
||||
// for (j = beg; j <= end; j += SIMD_WIDTH)
|
||||
//{
|
||||
// __m256i h1 = _mm256_loadu_si256((__m256i *)(&hA1[j - 1]));
|
||||
// __m256i h2 = _mm256_loadu_si256((__m256i *)(&hA2[j]));
|
||||
// __m256i orvec = _mm256_or_si256(h1, h2);
|
||||
// int *val = (int *)&orvec;
|
||||
// for (i = 0; i < SIMD_WIDTH; ++i)
|
||||
// if (val[i])
|
||||
// {
|
||||
// pos = SIMD_WIDTH - 1 - i;
|
||||
// break;
|
||||
// }
|
||||
//}
|
||||
// beg = j;
|
||||
for (j = beg; LIKELY(j <= end); ++j)
|
||||
{
|
||||
int has_val = hA1[j - 1] | hA2[j];
|
||||
if (has_val)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
beg = j;
|
||||
|
||||
hA2[end + 1] = 0;
|
||||
for (j = end + 1; LIKELY(j >= beg); --j)
|
||||
{
|
||||
int has_val = hA1[j - 1] | hA2[j];
|
||||
if (has_val)
|
||||
{
|
||||
break;
|
||||
}
|
||||
// else
|
||||
// hA0[j - 1] = 0;
|
||||
}
|
||||
end = j + 1 <= qlen ? j + 1 : qlen;
|
||||
|
||||
/* for (j = end + 1; j >= beg; j -= SIMD_WIDTH) // 没有考虑beg附近,且长度小于SIMD_WIDTH的数据
|
||||
{
|
||||
__m256i h1 = _mm256_loadu_si256((__m256i *)(&hA1[j - 1]));
|
||||
__m256i h2 = _mm256_loadu_si256((__m256i *)(&hA2[j]));
|
||||
__m256i orvec = _mm256_or_si256(h1, h2);
|
||||
__m256i vcmp = _mm256_cmpgt_epi16(orvec, zero_vec);
|
||||
uint32_t mask = _mm256_movemask_epi8(vcmp);
|
||||
if (mask > 0)
|
||||
{
|
||||
int pos = __builtin_clz(mask) >> 1;
|
||||
const int new_end = j + SIMD_WIDTH - pos;
|
||||
end = new_end <= qlen ? new_end : qlen;
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
_mm256_storeu_si256((__m256i *)&hA0[j - 1], zero_vec);
|
||||
}
|
||||
}
|
||||
*/
|
||||
// beg = 0;
|
||||
// end = qlen; // uncomment this line for debugging
|
||||
m_last = m;
|
||||
// swap m, h, e, f
|
||||
SWAP_DATA_POINTER;
|
||||
}
|
||||
|
||||
// __m256i origin_max_vec = max_vec;
|
||||
// max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2));
|
||||
// max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4));
|
||||
// max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6));
|
||||
// max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8));
|
||||
// max_vec = _mm256_max_epu16(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01));
|
||||
// int16_t *maxVal = (int16_t *)&max_vec;
|
||||
// max = maxVal[0];
|
||||
// if (max > 0)
|
||||
// {
|
||||
// __m256i vcmp = _mm256_cmpeq_epi16(origin_max_vec, max_vec);
|
||||
// uint32_t mask = _mm256_movemask_epi8(vcmp);
|
||||
// if (mask > 0)
|
||||
// {
|
||||
// int pos = SIMD_WIDTH - 1 - ((__builtin_clz(mask)) >> 1);
|
||||
// int mj = simd_j - 1 + pos;
|
||||
// int mi = simd_i - 1 - pos;
|
||||
// max_i = mi, max_j = mj;
|
||||
// max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi);
|
||||
// }
|
||||
// }
|
||||
|
||||
free(mem);
|
||||
if (_qle)
|
||||
*_qle = max_j + 1;
|
||||
|
|
|
|||
111
ksw_avx2_u8.c
111
ksw_avx2_u8.c
|
|
@ -58,12 +58,13 @@ static const uint8_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
|
|||
static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
|
||||
|
||||
// const int permute_mask = _MM_SHUFFLE(0, 1, 2, 3);
|
||||
#define permute_mask _MM_SHUFFLE(0, 1, 2, 3)
|
||||
// #define permute_mask _MM_SHUFFLE(0, 1, 2, 3)
|
||||
#define permute_mask 27
|
||||
// 初始化变量
|
||||
#define SIMD_INIT \
|
||||
int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \
|
||||
__m256i zero_vec; \
|
||||
__m256i max_vec; \
|
||||
__m256i max_vec, last_max_vec = _mm256_set1_epi8(h0); \
|
||||
__m256i oe_del_vec; \
|
||||
__m256i oe_ins_vec; \
|
||||
__m256i e_del_vec; \
|
||||
|
|
@ -149,33 +150,51 @@ static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
|
|||
mn_vec = _mm256_and_si256(mn_vec, h_vec_mask[end - j]); \
|
||||
hn_vec = _mm256_and_si256(hn_vec, h_vec_mask[end - j]);
|
||||
|
||||
// cmp_max = _mm256_xor_si256(last_max_vec, cmp_max);
|
||||
// last_max_vec = _mm256_set1_epi8(m);
|
||||
// 找最大值和位置
|
||||
#define SIMD_FIND_MAX \
|
||||
uint8_t *maxVal = (uint8_t *)&max_vec; \
|
||||
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 1)); \
|
||||
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \
|
||||
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 3)); \
|
||||
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \
|
||||
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 5)); \
|
||||
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \
|
||||
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 7)); \
|
||||
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \
|
||||
max_vec = _mm256_max_epu8(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \
|
||||
m = maxVal[0]; \
|
||||
if (m > 0) \
|
||||
{ \
|
||||
for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \
|
||||
{ \
|
||||
__m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \
|
||||
__m256i vcmp = _mm256_cmpeq_epi8(h2_vec, max_vec); \
|
||||
uint32_t mask = _mm256_movemask_epi8(vcmp); \
|
||||
if (mask > 0) \
|
||||
{ \
|
||||
int pos = SIMD_WIDTH - 1 - __builtin_clz(mask); \
|
||||
mj = j - 1 + pos; \
|
||||
mi = i - 1 - pos; \
|
||||
} \
|
||||
} \
|
||||
#define SIMD_FIND_MAX \
|
||||
__m256i cmp_max = _mm256_max_epu8(max_vec, last_max_vec); \
|
||||
cmp_max = _mm256_xor_si256(cmp_max, last_max_vec); \
|
||||
cmp_max = _mm256_cmpeq_epi8(cmp_max, zero_vec); \
|
||||
uint32_t cmp_result = _mm256_movemask_epi8(cmp_max); \
|
||||
if (cmp_result != 4294967296) \
|
||||
{ \
|
||||
uint8_t *maxVal = (uint8_t *)&max_vec; \
|
||||
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 1)); \
|
||||
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \
|
||||
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 3)); \
|
||||
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \
|
||||
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 5)); \
|
||||
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \
|
||||
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 7)); \
|
||||
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \
|
||||
max_vec = _mm256_max_epu8(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \
|
||||
m = maxVal[0]; \
|
||||
for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \
|
||||
{ \
|
||||
__m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \
|
||||
__m256i vcmp = _mm256_cmpeq_epi8(h2_vec, max_vec); \
|
||||
uint32_t mask = _mm256_movemask_epi8(vcmp); \
|
||||
if (mask > 0) \
|
||||
{ \
|
||||
int pos = SIMD_WIDTH - 1 - __builtin_clz(mask); \
|
||||
mj = j - 1 + pos; \
|
||||
mi = i - 1 - pos; \
|
||||
for (; mj + 1 < qlen && mi + 1 < tlen; mj++, mi++) \
|
||||
{ \
|
||||
if (seq[mj + 1] == ref[mi + 1 + SIMD_WIDTH]) \
|
||||
{ \
|
||||
m += a; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
break; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
last_max_vec = _mm256_set1_epi8(m); \
|
||||
}
|
||||
|
||||
// 每轮迭代后,交换数组
|
||||
|
|
@ -217,13 +236,14 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que
|
|||
int *_gscore, // query的端到端匹配得分
|
||||
int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值
|
||||
{
|
||||
return h0;
|
||||
#ifdef SHOW_PERF
|
||||
extern int64_t time_bsw_init;
|
||||
extern int64_t time_bsw_main_loop;
|
||||
extern int64_t time_bsw_find_max;
|
||||
extern int64_t time_bsw_adjust_bound;
|
||||
extern int64_t time_compare;
|
||||
int64_t start_time = get_mseconds();
|
||||
// extern int64_t time_bsw_init;
|
||||
// extern int64_t time_bsw_main_loop;
|
||||
// extern int64_t time_bsw_find_max;
|
||||
// extern int64_t time_bsw_adjust_bound;
|
||||
// extern int64_t time_compare;
|
||||
// int64_t start_time = get_mseconds();
|
||||
#endif
|
||||
uint8_t *mA, *hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H,其他的保存上个H E F M
|
||||
uint8_t *seq, *ref;
|
||||
|
|
@ -316,13 +336,13 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que
|
|||
int m_last = 0;
|
||||
int iend;
|
||||
#ifdef SHOW_PERF
|
||||
time_bsw_init += get_mseconds() - start_time;
|
||||
// time_bsw_init += get_mseconds() - start_time;
|
||||
#endif
|
||||
|
||||
for (D = 1; LIKELY(D < Dloop); ++D)
|
||||
{
|
||||
#ifdef SHOW_PERF
|
||||
start_time = get_mseconds();
|
||||
// start_time = get_mseconds();
|
||||
#endif
|
||||
// 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况
|
||||
if (D > tlen)
|
||||
|
|
@ -395,17 +415,17 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que
|
|||
SIMD_STORE;
|
||||
}
|
||||
#ifdef SHOW_PERF
|
||||
time_bsw_main_loop += get_mseconds() - start_time;
|
||||
// time_bsw_main_loop += get_mseconds() - start_time;
|
||||
#endif
|
||||
#ifdef SHOW_PERF
|
||||
start_time = get_mseconds();
|
||||
// start_time = get_mseconds();
|
||||
#endif
|
||||
SIMD_FIND_MAX;
|
||||
#ifdef SHOW_PERF
|
||||
time_bsw_find_max += get_mseconds() - start_time;
|
||||
// time_bsw_find_max += get_mseconds() - start_time;
|
||||
#endif
|
||||
#ifdef SHOW_PERF
|
||||
start_time = get_mseconds();
|
||||
// start_time = get_mseconds();
|
||||
#endif
|
||||
// 注意最后跳出循环j的值
|
||||
j = end + 1;
|
||||
|
|
@ -444,13 +464,14 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que
|
|||
break;
|
||||
}
|
||||
beg = j;
|
||||
hA2[end + 1] = 0;
|
||||
for (j = end + 1; LIKELY(j >= beg); --j)
|
||||
{
|
||||
int has_val = hA1[j - 1] | hA2[j];
|
||||
if (has_val)
|
||||
break;
|
||||
else
|
||||
hA0[j - 1] = 0;
|
||||
// else
|
||||
// hA0[j - 1] = 0;
|
||||
}
|
||||
end = j + 1 <= qlen ? j + 1 : qlen;
|
||||
|
||||
|
|
@ -461,9 +482,9 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que
|
|||
// swap m, h, e, f
|
||||
SWAP_DATA_POINTER;
|
||||
#ifdef SHOW_PERF
|
||||
time_bsw_adjust_bound += get_mseconds() - start_time;
|
||||
start_time = get_mseconds();
|
||||
time_compare += get_mseconds() - start_time;
|
||||
// time_bsw_adjust_bound += get_mseconds() - start_time;
|
||||
// start_time = get_mseconds();
|
||||
// time_compare += get_mseconds() - start_time;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
|||
25
ksw_normal.c
25
ksw_normal.c
|
|
@ -18,6 +18,7 @@ typedef struct
|
|||
|
||||
int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off)
|
||||
{
|
||||
// return h0;
|
||||
eh_t *eh; // score array
|
||||
int8_t *qp; // query profile
|
||||
int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
|
||||
|
|
@ -106,14 +107,14 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
|
|||
max_ie = gscore > h1 ? max_ie : i; // max_ie表示取得全局最大分值时,target字符串的位置
|
||||
gscore = gscore > h1 ? gscore : h1;
|
||||
}
|
||||
// if (m == 0) // 遍历完query之后,当前轮次的最大分值为0,则跳出循环
|
||||
// break;
|
||||
if (m == 0) // 遍历完query之后,当前轮次的最大分值为0,则跳出循环
|
||||
break;
|
||||
if (m > max) // 当前轮最大分值大于之前的最大分值
|
||||
{
|
||||
max = m, max_i = i, max_j = mj; // 更新取得最大值的target和query的位置
|
||||
max_off = max_off > abs(mj - i) ? max_off : abs(mj - i); // 取得最大分值时,query和target对应字符串坐标的差值
|
||||
}
|
||||
else if (0) // zdrop > 0) // 当前轮匹配之后取得的最大分值没有大于之前的最大值,而且zdrop值大于0
|
||||
else if (zdrop > 0) // 当前轮匹配之后取得的最大分值没有大于之前的最大值,而且zdrop值大于0
|
||||
{
|
||||
if (i - max_i > mj - max_j)
|
||||
{
|
||||
|
|
@ -127,15 +128,15 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
|
|||
}
|
||||
}
|
||||
// update beg and end for the next round
|
||||
// for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j)
|
||||
// ;
|
||||
// beg = j;
|
||||
// for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j)
|
||||
// ;
|
||||
// end = j + 2 < qlen ? j + 2 : qlen; // 剪枝没考虑f,即insert
|
||||
beg = 0, end = qlen; // uncomment this line for debugging
|
||||
// fprintf(stderr, "\n");
|
||||
// fprintf(stderr, "%d\n", end);
|
||||
for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j)
|
||||
;
|
||||
beg = j;
|
||||
for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j)
|
||||
;
|
||||
end = j + 2 < qlen ? j + 2 : qlen; // 剪枝没考虑f,即insert
|
||||
// beg = 0, end = qlen; // uncomment this line for debugging
|
||||
// fprintf(stderr, "\n");
|
||||
// fprintf(stderr, "%d\n", end);
|
||||
}
|
||||
free(eh);
|
||||
free(qp);
|
||||
|
|
|
|||
20
main.c
20
main.c
|
|
@ -127,12 +127,12 @@ int main(int argc, char *argv[])
|
|||
// const char *qf_path = "/home/zzh/data/sw/q_s.fa";
|
||||
// const char *tf_path = "/home/zzh/data/sw/t_s.fa";
|
||||
// const char *if_path = "/home/zzh/data/sw/i_s.txt";
|
||||
const char *qf_path = "/home/zzh/data/sw/q_m.fa";
|
||||
const char *tf_path = "/home/zzh/data/sw/t_m.fa";
|
||||
const char *if_path = "/home/zzh/data/sw/i_m.txt";
|
||||
// const char *qf_path = "/home/zzh/data/sw/q_l.fa";
|
||||
// const char *tf_path = "/home/zzh/data/sw/t_l.fa";
|
||||
// const char *if_path = "/home/zzh/data/sw/i_l.txt";
|
||||
// const char *qf_path = "/home/zzh/data/sw/q_m.fa";
|
||||
// const char *tf_path = "/home/zzh/data/sw/t_m.fa";
|
||||
// const char *if_path = "/home/zzh/data/sw/i_m.txt";
|
||||
const char *qf_path = "/home/zzh/data/sw/q_l.fa";
|
||||
const char *tf_path = "/home/zzh/data/sw/t_l.fa";
|
||||
const char *if_path = "/home/zzh/data/sw/i_l.txt";
|
||||
// const char *qf_path = "/public/home/zzh/data/sw/query.fa";
|
||||
// const char *tf_path = "/public/home/zzh/data/sw/target.fa";
|
||||
// const char *if_path = "/public/home/zzh/data/sw/info.txt";
|
||||
|
|
@ -386,10 +386,10 @@ int main(int argc, char *argv[])
|
|||
// fprintf(stderr, "time_bsw_main_loop: %f s\n", (time_bsw_main_loop) / DIVIDE_BY);
|
||||
// fprintf(stderr, "time_bsw_find_max: %f s\n", (time_bsw_find_max) / DIVIDE_BY);
|
||||
// fprintf(stderr, "time_bsw_adjust_bound: %f s\n", (time_bsw_adjust_bound) / DIVIDE_BY);
|
||||
fprintf(stderr, "time_bsw_main_loop: %f s\n", (time_bsw_main_loop - time_compare) / DIVIDE_BY);
|
||||
fprintf(stderr, "time_bsw_find_max: %f s\n", (time_bsw_find_max - time_compare) / DIVIDE_BY);
|
||||
fprintf(stderr, "time_bsw_adjust_bound: %f s\n", (time_bsw_adjust_bound - time_compare) / DIVIDE_BY);
|
||||
fprintf(stderr, "time_compare: %f s\n", time_compare / DIVIDE_BY);
|
||||
// fprintf(stderr, "time_bsw_main_loop: %f s\n", (time_bsw_main_loop - time_compare) / DIVIDE_BY);
|
||||
// fprintf(stderr, "time_bsw_find_max: %f s\n", (time_bsw_find_max - time_compare) / DIVIDE_BY);
|
||||
// fprintf(stderr, "time_bsw_adjust_bound: %f s\n", (time_bsw_adjust_bound - time_compare) / DIVIDE_BY);
|
||||
// fprintf(stderr, "time_compare: %f s\n", time_compare / DIVIDE_BY);
|
||||
#endif
|
||||
|
||||
if (query_f != 0)
|
||||
|
|
|
|||
Loading…
Reference in New Issue