sw_perf/ksw_ext_avx2_aligned.c

444 lines
21 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <stdio.h>
#include <immintrin.h>
#include <emmintrin.h>
#include "thread_mem.h"
#include "common.h"
#ifdef __GNUC__
#define LIKELY(x) __builtin_expect((x), 1)
#define UNLIKELY(x) __builtin_expect((x), 0)
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif
#undef MAX
#undef MIN
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define SIMD_WIDTH 16
#define BASE_BYTES 2
#define SCORE_BYTES 2
#define BOUNDARY_SCORE_NUM 2
#define TMP_SCORE_ARRAY_NUM 9
#define MEM_ALIGN_BYTES 32
#define ALIGN_SHIFT_BITS 5
#define SIMD_BYTES 32
#define AMBIGUOUS_BASE_CODE 4
#define AMBIGUOUS_BASE_SCORE -1
// 32字节对齐256位
#define align_mem(x) (((x) + 31) >> 5 << 5)
#define align_number(x) align_mem(x)
/* 去掉多余计算的值 */
static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
{0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}};
// #define permute_mask _MM_SHUFFLE(0, 1, 2, 3)
#define permute_mask 27
// 初始化变量
#define SIMD_INIT \
int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \
__m256i zero_vec; \
__m256i max_vec, last_max_vec = _mm256_set1_epi16(init_score); \
__m256i oe_del_vec; \
__m256i oe_ins_vec; \
__m256i e_del_vec; \
__m256i e_ins_vec; \
__m256i h_vec_mask[SIMD_WIDTH]; \
zero_vec = _mm256_setzero_si256(); \
oe_del_vec = _mm256_set1_epi16(-oe_del); \
oe_ins_vec = _mm256_set1_epi16(-oe_ins); \
e_del_vec = _mm256_set1_epi16(-e_del); \
e_ins_vec = _mm256_set1_epi16(-e_ins); \
__m256i match_sc_vec = _mm256_set1_epi16(base_match_score); \
__m256i mis_sc_vec = _mm256_set1_epi16(-base_mis_score); \
__m256i amb_sc_vec = _mm256_set1_epi16(AMBIGUOUS_BASE_SCORE); \
__m256i amb_vec = _mm256_set1_epi16(AMBIGUOUS_BASE_CODE); \
for (i = 0; i < SIMD_WIDTH; ++i) \
h_vec_mask[i] = _mm256_loadu_si256((__m256i *)(&h_vec_int_mask[i]));
/*
* e 表示当前ref的碱基被删除
* f 表示当前seq的碱基插入
* m 表示当前碱基匹配(可以相等,也可以不想等)
* h 表示最大值
*/
// load向量化数据
#define SIMD_LOAD \
__m256i m1 = _mm256_loadu_si256((__m256i *)(&mA1[j])); \
__m256i e1 = _mm256_loadu_si256((__m256i *)(&eA1[j])); \
__m256i m1j1 = _mm256_loadu_si256((__m256i *)(&mA1[j - 1])); \
__m256i f1j1 = _mm256_loadu_si256((__m256i *)(&fA1[j - 1])); \
__m256i h0j1 = _mm256_loadu_si256((__m256i *)(&hA0[j - 1])); \
__m256i qs_vec = _mm256_loadu_si256((__m256i *)(&seq[j])); \
__m256i ts_vec = _mm256_loadu_si256((__m256i *)(&ref[i]));
// 比对ref和seq的序列计算罚分
#define SIMD_CMP_SEQ \
/* 将待比对的target序列逆序排列 */ \
ts_vec = _mm256_permute4x64_epi64(ts_vec, permute_mask); \
ts_vec = _mm256_shufflelo_epi16(ts_vec, permute_mask); \
ts_vec = _mm256_shufflehi_epi16(ts_vec, permute_mask); \
__m256i match_mask_vec = _mm256_cmpeq_epi16(qs_vec, ts_vec); /* 比对query和target字符序列 */ \
__m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); /* 未匹配上的位置赋值mismatch分数 */ \
__m256i score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec); /* 匹配上的位置赋值match分数 */ \
score_vec = _mm256_or_si256(score_vec, mis_score_vec); \
/* 计算模棱两可的字符N)的位置的分数 */ \
__m256i q_amb_mask_vec = _mm256_cmpeq_epi16(qs_vec, amb_vec); \
__m256i t_amb_mask_vec = _mm256_cmpeq_epi16(ts_vec, amb_vec); \
__m256i amb_mask_vec = _mm256_or_si256(q_amb_mask_vec, t_amb_mask_vec); \
score_vec = _mm256_andnot_si256(amb_mask_vec, score_vec); \
__m256i amb_score_vec = _mm256_and_si256(amb_mask_vec, amb_sc_vec); \
score_vec = _mm256_or_si256(score_vec, amb_score_vec);
// 向量化计算h, e, f, m
#define SIMD_COMPUTE \
__m256i en_vec0 = _mm256_add_epi16(m1, oe_del_vec); \
__m256i en_vec1 = _mm256_add_epi16(e1, e_del_vec); \
__m256i en_vec = _mm256_max_epi16(en_vec0, en_vec1); \
__m256i fn_vec0 = _mm256_add_epi16(m1j1, oe_ins_vec); \
__m256i fn_vec1 = _mm256_add_epi16(f1j1, e_ins_vec); \
__m256i fn_vec = _mm256_max_epi16(fn_vec0, fn_vec1); \
__m256i mn_vec0 = _mm256_add_epi16(h0j1, score_vec); \
__m256i mn_mask = _mm256_cmpgt_epi16(h0j1, zero_vec); \
__m256i mn_vec = _mm256_and_si256(mn_vec0, mn_mask); \
__m256i hn_vec0 = _mm256_max_epi16(en_vec, fn_vec); \
__m256i hn_vec = _mm256_max_epi16(hn_vec0, mn_vec); \
en_vec = _mm256_max_epi16(en_vec, zero_vec); \
fn_vec = _mm256_max_epi16(fn_vec, zero_vec); \
mn_vec = _mm256_max_epi16(mn_vec, zero_vec); \
hn_vec = _mm256_max_epi16(hn_vec, zero_vec);
#define SIMD_STORE \
max_vec = _mm256_max_epu8(max_vec, hn_vec); \
_mm256_storeu_si256((__m256i *)&eA2[j], en_vec); \
_mm256_storeu_si256((__m256i *)&fA2[j], fn_vec); \
_mm256_storeu_si256((__m256i *)&mA2[j], mn_vec); \
_mm256_storeu_si256((__m256i *)&hA2[j], hn_vec);
// 去除多余的部分
#define SIMD_REMOVE_EXTRA \
en_vec = _mm256_and_si256(en_vec, h_vec_mask[end - j]); \
fn_vec = _mm256_and_si256(fn_vec, h_vec_mask[end - j]); \
mn_vec = _mm256_and_si256(mn_vec, h_vec_mask[end - j]); \
hn_vec = _mm256_and_si256(hn_vec, h_vec_mask[end - j]);
// 找最大值和位置
#define SIMD_FIND_MAX \
__m256i cmp_max = _mm256_cmpgt_epi16(max_vec, last_max_vec); \
uint32_t cmp_result = _mm256_movemask_epi8(cmp_max); \
if (cmp_result > 0) \
{ \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \
int16_t *maxVal = (int16_t *)&max_vec; \
m = maxVal[0]; \
for (j = aligned_read_start_pos, i = aligned_ref_end_pos; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) \
{ \
__m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \
__m256i vcmp = _mm256_cmpeq_epi16(h2_vec, max_vec); \
uint32_t mask = _mm256_movemask_epi8(vcmp); \
if (mask > 0) \
{ \
int pos = SIMD_WIDTH - 1 - ((__builtin_clz(mask)) >> 1); \
mj = j - 1 + pos; \
mi = i - 1 - pos; \
for (; mj + 1 < qlen && mi + 1 < tlen; mj++, mi++) \
{ \
if (seq[mj + 2] == ref[mi + 1 + SIMD_WIDTH]) \
{ \
m += base_match_score; \
} \
else \
{ \
break; \
} \
} \
} \
} \
last_max_vec = _mm256_set1_epi16(m); \
}
// 每轮迭代后,交换数组
#define SWAP_DATA_POINTER \
int16_t *tmp = hA0; \
hA0 = hA1; \
hA1 = hA2; \
hA2 = tmp; \
tmp = eA1; \
eA1 = eA2; \
eA2 = tmp; \
tmp = fA1; \
fA1 = fA2; \
fA2 = tmp; \
tmp = mA1; \
mA1 = mA2; \
mA2 = tmp;
int ksw_extend_avx2_aligned(thread_mem_t *tmem,
int qlen, // query length 待匹配段碱基的query长度
const uint8_t *query, // read碱基序列
int tlen, // target length reference的长度
const uint8_t *target, // reference序列
int extend_left, // 是不是向左扩展
int o_del, // deletion 错配开始的惩罚系数
int e_del, // deletion extension的惩罚系数
int o_ins, // insertion 错配开始的惩罚系数
int e_ins, // insertion extension的惩罚系数SIMD_BTYES
int base_match_score, // 碱基match时的分数
int base_mis_score, // 碱基mismatch时的惩罚分数正数
int window_size, // 提前剪枝系数w =100 匹配位置和beg的最大距离
int end_bonus, // 如果query比对到了最后一个字符额外奖励分值
int init_score, // 该seed的初始得分完全匹配query的碱基数
int *_qle, // 匹配得到全局最大得分的碱基在query的位置
int *_tle, // 匹配得到全局最大得分的碱基在reference的位置
int *_gtle, // query全部匹配上的target的长度
int *_gscore, // query的端到端匹配得分
int *_max_off) // 取得最大得分时在query和reference上位置差的 最大值
{
int16_t *mA1, *mA2,
*hA0, *hA1, *hA2,
*eA1, *eA2,
*fA1, *fA2; // hA0保存上上个col的H其他的保存上个H E F M
int16_t *seq, *ref;
uint8_t *mem_addr;
int read_size = align_number(qlen * BASE_BYTES + MEM_ALIGN_BYTES);
int ref_size = align_number((tlen + SIMD_WIDTH) * BASE_BYTES);
int back_diagnal_num = tlen + qlen; // 循环跳出条件 D从1开始遍历
int score_array_size = align_number((qlen + BOUNDARY_SCORE_NUM) * SCORE_BYTES);
int score_element_num = score_array_size / SCORE_BYTES;
int score_mem_size = score_array_size * TMP_SCORE_ARRAY_NUM;
int request_mem_size = read_size + ref_size + score_mem_size + MEM_ALIGN_BYTES * 3; // 左侧内存地址对齐 + 数据向左偏移一个元素 + 末尾SIMD补齐
int i, ibeg, D, j, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
int span, beg1, end1; // 边界条件计算
int aligned_read_start_pos, aligned_ref_end_pos;
int iend;
SIMD_INIT; // 初始化simd用的数据
assert(init_score > 0);
// allocate memory
mem_addr = thread_mem_request(tmem, request_mem_size);
mem_addr = (void *)align_mem((uint64_t)mem_addr);
ref = (int16_t *)&mem_addr[0];
seq = (int16_t *)(mem_addr + ref_size + SIMD_BYTES - BASE_BYTES);
if (extend_left)
{
for (i = 0; i < qlen; ++i)
seq[i + 1] = query[qlen - 1 - i];
for (i = 0; i < tlen; ++i)
ref[i + SIMD_WIDTH - 1] = target[tlen - 1 - i];
}
else
{
for (i = 0; i < qlen; ++i)
seq[i + 1] = query[i];
for (i = 0; i < tlen; ++i)
ref[i + SIMD_WIDTH - 1] = target[i];
}
mem_addr += read_size + ref_size + (SIMD_BYTES - SCORE_BYTES);
for (i = 0; i < score_mem_size; i += SIMD_BYTES)
{
_mm256_storeu_si256((__m256i *)&mem_addr[i], zero_vec);
}
hA0 = (int16_t *)&mem_addr[0];
hA1 = &hA0[score_element_num];
hA2 = &hA1[score_element_num];
mA1 = &hA2[score_element_num];
mA2 = &mA1[score_element_num];
eA1 = &mA2[score_element_num];
eA2 = &eA1[score_element_num];
fA1 = &eA2[score_element_num];
fA2 = &fA1[score_element_num];
// adjust $window_size if it is too large
// get the max score
max = base_match_score;
max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.);
max_ins = max_ins > 1 ? max_ins : 1;
window_size = window_size < max_ins ? window_size : max_ins;
max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.);
max_del = max_del > 1 ? max_del : 1;
window_size = window_size < max_del ? window_size : max_del; // TODO: is this necessary?
if (tlen < qlen)
window_size = MIN(tlen - 1, window_size);
// DP loop
max = init_score, max_i = max_j = -1;
max_ie = -1, gscore = -1;
;
max_off = 0;
beg = 1;
end = qlen;
// init init_score
hA0[0] = init_score; // 左上角
fA1[1] = MAX(0, init_score - (o_ins + e_ins));
eA2[0] = init_score;
hA1[1] = fA1[1];
if (qlen == 0 || tlen == 0)
back_diagnal_num = 0; // 防止意外情况
if (window_size >= qlen)
{
max_ie = 0;
gscore = 0;
}
for (D = 1; LIKELY(D < back_diagnal_num); ++D)
{
if (D < tlen)
beg1 = 1;
else
beg1 = D - tlen + 1;
if (D < qlen)
end1 = D; // 闭区间
else
end1 = qlen;
beg1 = MAX(D - window_size, beg1);
end1 = MIN(D + window_size, end1);
beg = MAX(beg1, beg);
end = MIN(end1, end);
if (beg > end)
break;
// beg = beg1;
// end = end1;
iend = D - beg; // ref开始计算的位置倒序
span = end - beg;
ibeg = iend - span; // 0开始的ref索引位置
// 每一轮需要记录的数据
int m = 0, mj = -1, mi = -1;
max_vec = zero_vec;
// 处理左边界
if (beg == 1)
{
hA0[0] = eA2[0];
mA1[0] = 0;
eA1[0] = MAX(0, init_score - (o_del + e_del * (iend + 1)));
}
// aligned_read_start_pos = (beg >> ALIGN_SHIFT_BITS << ALIGN_SHIFT_BITS) + 1;
// aligned_ref_end_pos = iend + (beg - aligned_read_start_pos);
aligned_read_start_pos = beg;
aligned_ref_end_pos = iend;
// fprintf(stderr, "%d\t%d\n", beg, aligned_read_start_pos);
for (j = aligned_read_start_pos, i = aligned_ref_end_pos; j <= end + 1 - SIMD_WIDTH; j += SIMD_WIDTH, i -= SIMD_WIDTH)
{
// 取数据
SIMD_LOAD;
// 比对seq计算罚分
SIMD_CMP_SEQ;
// 计算
SIMD_COMPUTE;
// 存储结果
SIMD_STORE;
}
// 剩下的计算单元
if (j <= end)
{
// 取数据
SIMD_LOAD;
// 比对seq计算罚分
SIMD_CMP_SEQ;
// 计算
SIMD_COMPUTE;
// 去除多余计算的部分
SIMD_REMOVE_EXTRA;
// 存储结果
SIMD_STORE;
}
// 处理上边界
if (ibeg == 0)
{
fA2[end + 1] = MAX(0, init_score - (o_ins + e_ins * (end + 1)));
hA2[end + 1] = fA2[end + 1];
mA2[end + 1] = 0;
}
SIMD_FIND_MAX;
// 注意最后跳出循环j的值
j = end + 1;
if (j == qlen + 1) // 遍历到了query最后一个碱基此时next_max_arr[qlen]为全局匹配的最大分值
{
max_ie = gscore > hA2[qlen] ? max_ie : ibeg;
gscore = gscore > hA2[qlen] ? gscore : hA2[qlen];
}
if (m > max)
{
max = m, max_i = mi, max_j = mj;
max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi);
}
// 调整计算的边界
for (j = beg; LIKELY(j <= end); ++j)
{
int has_val = hA1[j - 1] | hA2[j];
if (has_val)
{
break;
}
}
beg = j;
for (j = end + 1; LIKELY(j >= beg); --j)
{
int has_val = hA1[j - 1] | hA2[j];
if (has_val)
{
break;
}
}
end = j + 1 <= qlen ? j + 1 : qlen;
// swap m, h, e, f
SWAP_DATA_POINTER;
}
thread_mem_release(tmem, request_mem_size);
if (_qle)
*_qle = max_j + 1;
if (_tle)
*_tle = max_i + 1;
if (_gtle)
*_gtle = max_ie + 1;
if (_gscore)
*_gscore = gscore;
if (_max_off)
*_max_off = max_off;
return max;
}