加了对find_max剪枝的优化

This commit is contained in:
zzh 2023-08-20 02:16:25 +08:00
parent c9e4d9645c
commit 775297a813
5 changed files with 288 additions and 142 deletions

View File

@ -1,5 +1,6 @@
{
"files.associations": {
"functional": "c"
"functional": "c",
"random": "c"
}
}

View File

@ -20,6 +20,7 @@
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define SIMD_WIDTH 16
/* 去掉多余计算的值 */
static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
{0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
@ -38,28 +39,28 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}};
// const int permute_mask = _MM_SHUFFLE(0, 1, 2, 3);
#define permute_mask _MM_SHUFFLE(0, 1, 2, 3)
// #define permute_mask _MM_SHUFFLE(0, 1, 2, 3)
#define permute_mask 27
// 初始化变量
#define SIMD_INIT \
int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \
__m256i zero_vec; \
__m256i max_vec; \
__m256i oe_del_vec; \
__m256i oe_ins_vec; \
__m256i e_del_vec; \
__m256i e_ins_vec; \
__m256i h_vec_mask[SIMD_WIDTH]; \
zero_vec = _mm256_setzero_si256(); \
oe_del_vec = _mm256_set1_epi16(-oe_del); \
oe_ins_vec = _mm256_set1_epi16(-oe_ins); \
e_del_vec = _mm256_set1_epi16(-e_del); \
e_ins_vec = _mm256_set1_epi16(-e_ins); \
__m256i match_sc_vec = _mm256_set1_epi16(a); \
__m256i mis_sc_vec = _mm256_set1_epi16(-b); \
__m256i amb_sc_vec = _mm256_set1_epi16(-1); \
__m256i amb_vec = _mm256_set1_epi16(4); \
for (i = 0; i < SIMD_WIDTH; ++i) \
#define SIMD_INIT \
int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \
__m256i zero_vec; \
__m256i max_vec, last_max_vec = _mm256_set1_epi16(h0); \
__m256i oe_del_vec; \
__m256i oe_ins_vec; \
__m256i e_del_vec; \
__m256i e_ins_vec; \
__m256i h_vec_mask[SIMD_WIDTH]; \
zero_vec = _mm256_setzero_si256(); \
oe_del_vec = _mm256_set1_epi16(-oe_del); \
oe_ins_vec = _mm256_set1_epi16(-oe_ins); \
e_del_vec = _mm256_set1_epi16(-e_del); \
e_ins_vec = _mm256_set1_epi16(-e_ins); \
__m256i match_sc_vec = _mm256_set1_epi16(a); \
__m256i mis_sc_vec = _mm256_set1_epi16(-b); \
__m256i amb_sc_vec = _mm256_set1_epi16(-1); \
__m256i amb_vec = _mm256_set1_epi16(4); \
for (i = 0; i < SIMD_WIDTH; ++i) \
h_vec_mask[i] = _mm256_loadu_si256((__m256i *)(&h_vec_int_mask[i]));
/*
@ -79,19 +80,21 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
__m256i ts_vec = _mm256_loadu_si256((__m256i *)(&ref[i]));
// 比对ref和seq的序列计算罚分
#define SIMD_CMP_SEQ \
ts_vec = _mm256_permute4x64_epi64(ts_vec, permute_mask); \
ts_vec = _mm256_shufflelo_epi16(ts_vec, permute_mask); \
ts_vec = _mm256_shufflehi_epi16(ts_vec, permute_mask); \
__m256i match_mask_vec = _mm256_cmpeq_epi16(qs_vec, ts_vec); \
__m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); \
__m256i score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec); \
score_vec = _mm256_or_si256(score_vec, mis_score_vec); \
__m256i q_amb_mask_vec = _mm256_cmpeq_epi16(qs_vec, amb_vec); \
__m256i t_amb_mask_vec = _mm256_cmpeq_epi16(ts_vec, amb_vec); \
__m256i amb_mask_vec = _mm256_or_si256(q_amb_mask_vec, t_amb_mask_vec); \
score_vec = _mm256_andnot_si256(amb_mask_vec, score_vec); \
__m256i amb_score_vec = _mm256_and_si256(amb_mask_vec, amb_sc_vec); \
#define SIMD_CMP_SEQ \
/* 将待比对的target序列逆序排列 */ \
ts_vec = _mm256_permute4x64_epi64(ts_vec, permute_mask); \
ts_vec = _mm256_shufflelo_epi16(ts_vec, permute_mask); \
ts_vec = _mm256_shufflehi_epi16(ts_vec, permute_mask); \
__m256i match_mask_vec = _mm256_cmpeq_epi16(qs_vec, ts_vec); /* 比对query和target字符序列 */ \
__m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); /* 未匹配上的位置赋值mismatch分数 */ \
__m256i score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec); /* 匹配上的位置赋值match分数 */ \
score_vec = _mm256_or_si256(score_vec, mis_score_vec); \
/* 计算模棱两可的字符N)的位置的分数 */ \
__m256i q_amb_mask_vec = _mm256_cmpeq_epi16(qs_vec, amb_vec); \
__m256i t_amb_mask_vec = _mm256_cmpeq_epi16(ts_vec, amb_vec); \
__m256i amb_mask_vec = _mm256_or_si256(q_amb_mask_vec, t_amb_mask_vec); \
score_vec = _mm256_andnot_si256(amb_mask_vec, score_vec); \
__m256i amb_score_vec = _mm256_and_si256(amb_mask_vec, amb_sc_vec); \
score_vec = _mm256_or_si256(score_vec, amb_score_vec);
// 向量化计算h, e, f, m
@ -113,8 +116,18 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
hn_vec = _mm256_max_epi16(hn_vec, zero_vec);
// 存储向量化结果
// #define SIMD_STORE
// __m256i cur_max_vec = _mm256_max_epu8(max_vec, hn_vec); \
// __m256i vcmp = _mm256_cmpgt_epi8(cur_max_vec, max_vec); \
// uint32_t mask = _mm256_movemask_epi8(vcmp); \
// if (mask > 0) \
// { \
// simd_i = i; \
// simd_j = j; \
// }
#define SIMD_STORE \
max_vec = _mm256_max_epi16(max_vec, hn_vec); \
max_vec = _mm256_max_epu8(max_vec, hn_vec); \
_mm256_storeu_si256((__m256i *)&eA2[j], en_vec); \
_mm256_storeu_si256((__m256i *)&fA2[j], fn_vec); \
_mm256_storeu_si256((__m256i *)&mA2[j], mn_vec); \
@ -128,28 +141,43 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
hn_vec = _mm256_and_si256(hn_vec, h_vec_mask[end - j]);
// 找最大值和位置
#define SIMD_FIND_MAX \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \
int16_t *maxVal = (int16_t *)&max_vec; \
m = maxVal[0]; \
if (m > 0) \
{ \
for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \
{ \
__m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \
__m256i vcmp = _mm256_cmpeq_epi16(h2_vec, max_vec); \
uint32_t mask = _mm256_movemask_epi8(vcmp); \
if (mask > 0) \
{ \
int pos = SIMD_WIDTH - 1 - ((__builtin_clz(mask)) >> 1); \
mj = j - 1 + pos; \
mi = i - 1 - pos; \
} \
} \
// last_max_vec = max_vec;
#define SIMD_FIND_MAX \
__m256i cmp_max = _mm256_cmpgt_epi16(max_vec, last_max_vec); \
uint32_t cmp_result = _mm256_movemask_epi8(cmp_max); \
if (cmp_result > 0) \
{ \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \
int16_t *maxVal = (int16_t *)&max_vec; \
m = maxVal[0]; \
for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \
{ \
__m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \
__m256i vcmp = _mm256_cmpeq_epi16(h2_vec, max_vec); \
uint32_t mask = _mm256_movemask_epi8(vcmp); \
if (mask > 0) \
{ \
int pos = SIMD_WIDTH - 1 - ((__builtin_clz(mask)) >> 1); \
mj = j - 1 + pos; \
mi = i - 1 - pos; \
for (; mj + 1 < qlen && mi + 1 < tlen; mj++, mi++) \
{ \
if (seq[mj + 1] == ref[mi + 1 + SIMD_WIDTH]) \
{ \
m += a; \
} \
else \
{ \
break; \
} \
} \
} \
} \
last_max_vec = _mm256_set1_epi16(m); \
}
// 每轮迭代后,交换数组
@ -202,6 +230,7 @@ int bsw_avx2(int qlen, // query length 待匹配段碱基的query
int col_size = qlen + 2 + SIMD_WIDTH;
int val_mem_size = (col_size * 9 * 2 + 31) >> 5 << 5; // 32字节的整数倍
int mem_size = (seq_size + ref_size) * 2 + val_mem_size;
int simd_i = -1, simd_j = -1;
SIMD_INIT; // 初始化simd用的数据
@ -365,8 +394,8 @@ int bsw_avx2(int qlen, // query length 待匹配段碱基的query
max_ie = gscore > hA2[qlen] ? max_ie : iStart;
gscore = gscore > hA2[qlen] ? gscore : hA2[qlen];
}
// if (m == 0 && m_last == 0)
// break; // 一定要注意,斜对角遍历和按列遍历的不同点
// if (m == 0 && m_last == 0)
// break; // 一定要注意,斜对角遍历和按列遍历的不同点
if (m > max)
{
max = m, max_i = mi, max_j = mj;
@ -387,30 +416,124 @@ int bsw_avx2(int qlen, // query length 待匹配段碱基的query
}
// 调整计算的边界
/* for (j = beg; LIKELY(j <= end); ++j)
// fprintf(stderr, "beg: %d, end: %d ", beg, end);
/* for (j = beg; j <= end; j += SIMD_WIDTH)
{
int has_val = hA1[j - 1] | hA2[j];
if (has_val)
__m256i h1 = _mm256_loadu_si256((__m256i *)(&hA1[j - 1]));
__m256i h2 = _mm256_loadu_si256((__m256i *)(&hA2[j]));
__m256i orvec = _mm256_or_si256(h1, h2);
__m256i vcmp = _mm256_cmpgt_epi16(orvec, zero_vec);
uint32_t mask = _mm256_movemask_epi8(vcmp);
if (mask > 0)
{
// vcmp = _mm256_permute4x64_epi64(vcmp, permute_mask);
// vcmp = _mm256_shufflelo_epi16(vcmp, permute_mask);
// vcmp = _mm256_shufflehi_epi16(vcmp, permute_mask);
// mask = _mm256_movemask_epi8(vcmp);
//// int pos = SIMD_WIDTH - 1 - ((__builtin_clz(mask)) >> 1);
// int pos = ((__builtin_clz(mask)) >> 1);
// beg = j + pos;
int pos = __builtin_ctz(mask) >> 1;
beg = j + pos;
// if (beg > end)
// beg = end;
// beg = j + pos;
// beg = 0;
break;
}
}
beg = j;
for (j = end + 1; LIKELY(j >= beg); --j)
{
int has_val = hA1[j - 1] | hA2[j];
if (has_val)
break;
else
hA0[j - 1] = 0;
}
end = j + 1 <= qlen ? j + 1 : qlen;
*/
beg = 0;
end = qlen; // uncomment this line for debugging
// beg = 0;
// fprintf(stderr, "new beg: %d\n", beg);
// int pos = 0;
// for (j = beg; j <= end; j += SIMD_WIDTH)
//{
// __m256i h1 = _mm256_loadu_si256((__m256i *)(&hA1[j - 1]));
// __m256i h2 = _mm256_loadu_si256((__m256i *)(&hA2[j]));
// __m256i orvec = _mm256_or_si256(h1, h2);
// int *val = (int *)&orvec;
// for (i = 0; i < SIMD_WIDTH; ++i)
// if (val[i])
// {
// pos = SIMD_WIDTH - 1 - i;
// break;
// }
//}
// beg = j;
for (j = beg; LIKELY(j <= end); ++j)
{
int has_val = hA1[j - 1] | hA2[j];
if (has_val)
{
break;
}
}
beg = j;
hA2[end + 1] = 0;
for (j = end + 1; LIKELY(j >= beg); --j)
{
int has_val = hA1[j - 1] | hA2[j];
if (has_val)
{
break;
}
// else
// hA0[j - 1] = 0;
}
end = j + 1 <= qlen ? j + 1 : qlen;
/* for (j = end + 1; j >= beg; j -= SIMD_WIDTH) // 没有考虑beg附近且长度小于SIMD_WIDTH的数据
{
__m256i h1 = _mm256_loadu_si256((__m256i *)(&hA1[j - 1]));
__m256i h2 = _mm256_loadu_si256((__m256i *)(&hA2[j]));
__m256i orvec = _mm256_or_si256(h1, h2);
__m256i vcmp = _mm256_cmpgt_epi16(orvec, zero_vec);
uint32_t mask = _mm256_movemask_epi8(vcmp);
if (mask > 0)
{
int pos = __builtin_clz(mask) >> 1;
const int new_end = j + SIMD_WIDTH - pos;
end = new_end <= qlen ? new_end : qlen;
break;
}
else
{
_mm256_storeu_si256((__m256i *)&hA0[j - 1], zero_vec);
}
}
*/
// beg = 0;
// end = qlen; // uncomment this line for debugging
m_last = m;
// swap m, h, e, f
SWAP_DATA_POINTER;
}
// __m256i origin_max_vec = max_vec;
// max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2));
// max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4));
// max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6));
// max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8));
// max_vec = _mm256_max_epu16(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01));
// int16_t *maxVal = (int16_t *)&max_vec;
// max = maxVal[0];
// if (max > 0)
// {
// __m256i vcmp = _mm256_cmpeq_epi16(origin_max_vec, max_vec);
// uint32_t mask = _mm256_movemask_epi8(vcmp);
// if (mask > 0)
// {
// int pos = SIMD_WIDTH - 1 - ((__builtin_clz(mask)) >> 1);
// int mj = simd_j - 1 + pos;
// int mi = simd_i - 1 - pos;
// max_i = mi, max_j = mj;
// max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi);
// }
// }
free(mem);
if (_qle)
*_qle = max_j + 1;

View File

@ -58,12 +58,13 @@ static const uint8_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
// const int permute_mask = _MM_SHUFFLE(0, 1, 2, 3);
#define permute_mask _MM_SHUFFLE(0, 1, 2, 3)
// #define permute_mask _MM_SHUFFLE(0, 1, 2, 3)
#define permute_mask 27
// 初始化变量
#define SIMD_INIT \
int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \
__m256i zero_vec; \
__m256i max_vec; \
__m256i max_vec, last_max_vec = _mm256_set1_epi8(h0); \
__m256i oe_del_vec; \
__m256i oe_ins_vec; \
__m256i e_del_vec; \
@ -149,33 +150,51 @@ static const uint8_t reverse_mask[SIMD_WIDTH] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
mn_vec = _mm256_and_si256(mn_vec, h_vec_mask[end - j]); \
hn_vec = _mm256_and_si256(hn_vec, h_vec_mask[end - j]);
// cmp_max = _mm256_xor_si256(last_max_vec, cmp_max);
// last_max_vec = _mm256_set1_epi8(m);
// 找最大值和位置
#define SIMD_FIND_MAX \
uint8_t *maxVal = (uint8_t *)&max_vec; \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 1)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 3)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 5)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 7)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \
m = maxVal[0]; \
if (m > 0) \
{ \
for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \
{ \
__m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \
__m256i vcmp = _mm256_cmpeq_epi8(h2_vec, max_vec); \
uint32_t mask = _mm256_movemask_epi8(vcmp); \
if (mask > 0) \
{ \
int pos = SIMD_WIDTH - 1 - __builtin_clz(mask); \
mj = j - 1 + pos; \
mi = i - 1 - pos; \
} \
} \
#define SIMD_FIND_MAX \
__m256i cmp_max = _mm256_max_epu8(max_vec, last_max_vec); \
cmp_max = _mm256_xor_si256(cmp_max, last_max_vec); \
cmp_max = _mm256_cmpeq_epi8(cmp_max, zero_vec); \
uint32_t cmp_result = _mm256_movemask_epi8(cmp_max); \
if (cmp_result != 4294967296) \
{ \
uint8_t *maxVal = (uint8_t *)&max_vec; \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 1)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 3)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 5)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 7)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \
m = maxVal[0]; \
for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \
{ \
__m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \
__m256i vcmp = _mm256_cmpeq_epi8(h2_vec, max_vec); \
uint32_t mask = _mm256_movemask_epi8(vcmp); \
if (mask > 0) \
{ \
int pos = SIMD_WIDTH - 1 - __builtin_clz(mask); \
mj = j - 1 + pos; \
mi = i - 1 - pos; \
for (; mj + 1 < qlen && mi + 1 < tlen; mj++, mi++) \
{ \
if (seq[mj + 1] == ref[mi + 1 + SIMD_WIDTH]) \
{ \
m += a; \
} \
else \
{ \
break; \
} \
} \
} \
} \
last_max_vec = _mm256_set1_epi8(m); \
}
// 每轮迭代后,交换数组
@ -217,13 +236,14 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que
int *_gscore, // query的端到端匹配得分
int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值
{
return h0;
#ifdef SHOW_PERF
extern int64_t time_bsw_init;
extern int64_t time_bsw_main_loop;
extern int64_t time_bsw_find_max;
extern int64_t time_bsw_adjust_bound;
extern int64_t time_compare;
int64_t start_time = get_mseconds();
// extern int64_t time_bsw_init;
// extern int64_t time_bsw_main_loop;
// extern int64_t time_bsw_find_max;
// extern int64_t time_bsw_adjust_bound;
// extern int64_t time_compare;
// int64_t start_time = get_mseconds();
#endif
uint8_t *mA, *hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0保存上上个col的H其他的保存上个H E F M
uint8_t *seq, *ref;
@ -316,13 +336,13 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que
int m_last = 0;
int iend;
#ifdef SHOW_PERF
time_bsw_init += get_mseconds() - start_time;
// time_bsw_init += get_mseconds() - start_time;
#endif
for (D = 1; LIKELY(D < Dloop); ++D)
{
#ifdef SHOW_PERF
start_time = get_mseconds();
// start_time = get_mseconds();
#endif
// 边界条件一定要注意! tlen 大于,等于,小于 qlen时的情况
if (D > tlen)
@ -395,17 +415,17 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que
SIMD_STORE;
}
#ifdef SHOW_PERF
time_bsw_main_loop += get_mseconds() - start_time;
// time_bsw_main_loop += get_mseconds() - start_time;
#endif
#ifdef SHOW_PERF
start_time = get_mseconds();
// start_time = get_mseconds();
#endif
SIMD_FIND_MAX;
#ifdef SHOW_PERF
time_bsw_find_max += get_mseconds() - start_time;
// time_bsw_find_max += get_mseconds() - start_time;
#endif
#ifdef SHOW_PERF
start_time = get_mseconds();
// start_time = get_mseconds();
#endif
// 注意最后跳出循环j的值
j = end + 1;
@ -444,13 +464,14 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que
break;
}
beg = j;
hA2[end + 1] = 0;
for (j = end + 1; LIKELY(j >= beg); --j)
{
int has_val = hA1[j - 1] | hA2[j];
if (has_val)
break;
else
hA0[j - 1] = 0;
// else
// hA0[j - 1] = 0;
}
end = j + 1 <= qlen ? j + 1 : qlen;
@ -461,9 +482,9 @@ int ksw_avx2_u8(int qlen, // query length 待匹配段碱基的que
// swap m, h, e, f
SWAP_DATA_POINTER;
#ifdef SHOW_PERF
time_bsw_adjust_bound += get_mseconds() - start_time;
start_time = get_mseconds();
time_compare += get_mseconds() - start_time;
// time_bsw_adjust_bound += get_mseconds() - start_time;
// start_time = get_mseconds();
// time_compare += get_mseconds() - start_time;
#endif
}

View File

@ -18,6 +18,7 @@ typedef struct
int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off)
{
// return h0;
eh_t *eh; // score array
int8_t *qp; // query profile
int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
@ -106,14 +107,14 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
max_ie = gscore > h1 ? max_ie : i; // max_ie表示取得全局最大分值时target字符串的位置
gscore = gscore > h1 ? gscore : h1;
}
// if (m == 0) // 遍历完query之后当前轮次的最大分值为0则跳出循环
// break;
if (m == 0) // 遍历完query之后当前轮次的最大分值为0则跳出循环
break;
if (m > max) // 当前轮最大分值大于之前的最大分值
{
max = m, max_i = i, max_j = mj; // 更新取得最大值的target和query的位置
max_off = max_off > abs(mj - i) ? max_off : abs(mj - i); // 取得最大分值时query和target对应字符串坐标的差值
}
else if (0) // zdrop > 0) // 当前轮匹配之后取得的最大分值没有大于之前的最大值而且zdrop值大于0
else if (zdrop > 0) // 当前轮匹配之后取得的最大分值没有大于之前的最大值而且zdrop值大于0
{
if (i - max_i > mj - max_j)
{
@ -127,15 +128,15 @@ int ksw_normal(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
}
}
// update beg and end for the next round
// for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j)
// ;
// beg = j;
// for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j)
// ;
// end = j + 2 < qlen ? j + 2 : qlen; // 剪枝没考虑f即insert
beg = 0, end = qlen; // uncomment this line for debugging
// fprintf(stderr, "\n");
// fprintf(stderr, "%d\n", end);
for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j)
;
beg = j;
for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j)
;
end = j + 2 < qlen ? j + 2 : qlen; // 剪枝没考虑f即insert
// beg = 0, end = qlen; // uncomment this line for debugging
// fprintf(stderr, "\n");
// fprintf(stderr, "%d\n", end);
}
free(eh);
free(qp);

20
main.c
View File

@ -127,12 +127,12 @@ int main(int argc, char *argv[])
// const char *qf_path = "/home/zzh/data/sw/q_s.fa";
// const char *tf_path = "/home/zzh/data/sw/t_s.fa";
// const char *if_path = "/home/zzh/data/sw/i_s.txt";
const char *qf_path = "/home/zzh/data/sw/q_m.fa";
const char *tf_path = "/home/zzh/data/sw/t_m.fa";
const char *if_path = "/home/zzh/data/sw/i_m.txt";
// const char *qf_path = "/home/zzh/data/sw/q_l.fa";
// const char *tf_path = "/home/zzh/data/sw/t_l.fa";
// const char *if_path = "/home/zzh/data/sw/i_l.txt";
// const char *qf_path = "/home/zzh/data/sw/q_m.fa";
// const char *tf_path = "/home/zzh/data/sw/t_m.fa";
// const char *if_path = "/home/zzh/data/sw/i_m.txt";
const char *qf_path = "/home/zzh/data/sw/q_l.fa";
const char *tf_path = "/home/zzh/data/sw/t_l.fa";
const char *if_path = "/home/zzh/data/sw/i_l.txt";
// const char *qf_path = "/public/home/zzh/data/sw/query.fa";
// const char *tf_path = "/public/home/zzh/data/sw/target.fa";
// const char *if_path = "/public/home/zzh/data/sw/info.txt";
@ -386,10 +386,10 @@ int main(int argc, char *argv[])
// fprintf(stderr, "time_bsw_main_loop: %f s\n", (time_bsw_main_loop) / DIVIDE_BY);
// fprintf(stderr, "time_bsw_find_max: %f s\n", (time_bsw_find_max) / DIVIDE_BY);
// fprintf(stderr, "time_bsw_adjust_bound: %f s\n", (time_bsw_adjust_bound) / DIVIDE_BY);
fprintf(stderr, "time_bsw_main_loop: %f s\n", (time_bsw_main_loop - time_compare) / DIVIDE_BY);
fprintf(stderr, "time_bsw_find_max: %f s\n", (time_bsw_find_max - time_compare) / DIVIDE_BY);
fprintf(stderr, "time_bsw_adjust_bound: %f s\n", (time_bsw_adjust_bound - time_compare) / DIVIDE_BY);
fprintf(stderr, "time_compare: %f s\n", time_compare / DIVIDE_BY);
// fprintf(stderr, "time_bsw_main_loop: %f s\n", (time_bsw_main_loop - time_compare) / DIVIDE_BY);
// fprintf(stderr, "time_bsw_find_max: %f s\n", (time_bsw_find_max - time_compare) / DIVIDE_BY);
// fprintf(stderr, "time_bsw_adjust_bound: %f s\n", (time_bsw_adjust_bound - time_compare) / DIVIDE_BY);
// fprintf(stderr, "time_compare: %f s\n", time_compare / DIVIDE_BY);
#endif
if (query_f != 0)