2025-09-18 15:09:48 +08:00
|
|
|
|
|
|
|
|
|
|
#include <stdint.h>
|
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
|
|
|
|
|
|
|
#include "align.h"
|
|
|
|
|
|
#include "utils.h"
|
|
|
|
|
|
|
|
|
|
|
|
kswr_t align_avx2_u8(byte_mem_t *bmem, kswq_avx2_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins,
|
|
|
|
|
|
int _e_ins, int xtra) // the first gap costs -(_o+_e)
|
|
|
|
|
|
{
|
|
|
|
|
|
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
|
|
|
|
|
|
uint64_t *b;
|
|
|
|
|
|
__m256i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax;
|
|
|
|
|
|
kswr_t r;
|
|
|
|
|
|
|
|
|
|
|
|
#define __max_32(ret, xx) \
|
|
|
|
|
|
do { \
|
|
|
|
|
|
uint8_t *maxVal = (uint8_t *)&(xx); \
|
|
|
|
|
|
(xx) = _mm256_max_epu8((xx), _mm256_srli_si256((xx), 8)); \
|
|
|
|
|
|
(xx) = _mm256_max_epu8((xx), _mm256_srli_si256((xx), 4)); \
|
|
|
|
|
|
(xx) = _mm256_max_epu8((xx), _mm256_srli_si256((xx), 2)); \
|
|
|
|
|
|
(xx) = _mm256_max_epu8((xx), _mm256_srli_si256((xx), 1)); \
|
|
|
|
|
|
(ret) = MAX(maxVal[0], maxVal[16]); \
|
|
|
|
|
|
} while (0)
|
|
|
|
|
|
|
|
|
|
|
|
// Given entries with arbitrary values, return whether they are all 0x00
|
|
|
|
|
|
#define allzero_32(xx) (_mm256_movemask_epi8(_mm256_cmpeq_epi8((xx), zero)) == 0xffffffff)
|
|
|
|
|
|
|
|
|
|
|
|
#define avx2_slli_wrap(xx, imm) \
|
|
|
|
|
|
do { \
|
|
|
|
|
|
uint8_t *arr = (uint8_t *)&(xx); \
|
|
|
|
|
|
uint8_t val = arr[15]; \
|
|
|
|
|
|
(xx) = _mm256_slli_si256((xx), (imm)); \
|
|
|
|
|
|
arr[16] = val; \
|
|
|
|
|
|
} while (0)
|
|
|
|
|
|
|
|
|
|
|
|
// initialization
|
|
|
|
|
|
r = g_defr;
|
|
|
|
|
|
minsc = (xtra & KSW_XSUBO) ? xtra & 0xffff : 0x10000;
|
|
|
|
|
|
endsc = (xtra & KSW_XSTOP) ? xtra & 0xffff : 0x10000;
|
|
|
|
|
|
m_b = n_b = 0;
|
|
|
|
|
|
b = 0;
|
|
|
|
|
|
zero = _mm256_set1_epi32(0);
|
|
|
|
|
|
oe_del = _mm256_set1_epi8(_o_del + _e_del);
|
|
|
|
|
|
e_del = _mm256_set1_epi8(_e_del);
|
|
|
|
|
|
oe_ins = _mm256_set1_epi8(_o_ins + _e_ins);
|
|
|
|
|
|
e_ins = _mm256_set1_epi8(_e_ins);
|
|
|
|
|
|
shift = _mm256_set1_epi8(q->shift);
|
|
|
|
|
|
H0 = q->H0;
|
|
|
|
|
|
H1 = q->H1;
|
|
|
|
|
|
E = q->E;
|
|
|
|
|
|
Hmax = q->Hmax;
|
|
|
|
|
|
slen = q->slen;
|
|
|
|
|
|
for (i = 0; i < slen; ++i) {
|
|
|
|
|
|
_mm256_store_si256(E + i, zero);
|
|
|
|
|
|
_mm256_store_si256(H0 + i, zero);
|
|
|
|
|
|
_mm256_store_si256(Hmax + i, zero);
|
|
|
|
|
|
}
|
|
|
|
|
|
// the core loop
|
|
|
|
|
|
for (i = 0; i < tlen; ++i) {
|
|
|
|
|
|
int j, k, imax;
|
|
|
|
|
|
__m256i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
|
|
|
|
|
|
h = _mm256_load_si256(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
|
|
|
|
|
|
avx2_slli_wrap(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
|
|
|
|
|
|
for (j = 0; LIKELY(j < slen); ++j) {
|
|
|
|
|
|
/* SW cells are computed in the following order:
|
|
|
|
|
|
* H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
|
|
|
|
|
|
* E(i+1,j) = max{H(i,j)-q, E(i,j)-r}
|
|
|
|
|
|
* F(i,j+1) = max{H(i,j)-q, F(i,j)-r}
|
|
|
|
|
|
*/
|
|
|
|
|
|
// compute H'(i,j); note that at the beginning, h=H'(i-1,j-1)
|
|
|
|
|
|
h = _mm256_adds_epu8(h, _mm256_load_si256(S + j));
|
|
|
|
|
|
h = _mm256_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j)
|
|
|
|
|
|
e = _mm256_load_si256(E + j); // e=E'(i,j)
|
|
|
|
|
|
h = _mm256_max_epu8(h, e);
|
|
|
|
|
|
h = _mm256_max_epu8(h, f); // h=H'(i,j)
|
|
|
|
|
|
max = _mm256_max_epu8(max, h); // set max
|
|
|
|
|
|
_mm256_store_si256(H1 + j, h); // save to H'(i,j)
|
|
|
|
|
|
// now compute E'(i+1,j)
|
|
|
|
|
|
e = _mm256_subs_epu8(e, e_del); // e=E'(i,j) - e_del
|
|
|
|
|
|
t = _mm256_subs_epu8(h, oe_del); // h=H'(i,j) - o_del - e_del
|
|
|
|
|
|
e = _mm256_max_epu8(e, t); // e=E'(i+1,j)
|
|
|
|
|
|
_mm256_store_si256(E + j, e); // save to E'(i+1,j)
|
|
|
|
|
|
// now compute F'(i,j+1)
|
|
|
|
|
|
f = _mm256_subs_epu8(f, e_ins);
|
|
|
|
|
|
t = _mm256_subs_epu8(h, oe_ins); // h=H'(i,j) - o_ins - e_ins
|
|
|
|
|
|
f = _mm256_max_epu8(f, t);
|
|
|
|
|
|
// get H'(i-1,j) and prepare for the next j
|
|
|
|
|
|
h = _mm256_load_si256(H0 + j); // h=H'(i-1,j)
|
|
|
|
|
|
}
|
|
|
|
|
|
// NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion
|
|
|
|
|
|
for (k = 0; LIKELY(k < 32); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max
|
|
|
|
|
|
avx2_slli_wrap(f, 1);
|
|
|
|
|
|
for (j = 0; LIKELY(j < slen); ++j) {
|
|
|
|
|
|
h = _mm256_load_si256(H1 + j);
|
|
|
|
|
|
h = _mm256_max_epu8(h, f); // h=H'(i,j)
|
|
|
|
|
|
_mm256_store_si256(H1 + j, h);
|
|
|
|
|
|
h = _mm256_subs_epu8(h, oe_ins);
|
|
|
|
|
|
f = _mm256_subs_epu8(f, e_ins);
|
|
|
|
|
|
if (UNLIKELY(allzero_32(_mm256_subs_epu8(f, h))))
|
|
|
|
|
|
goto end_loop32;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
end_loop32:
|
|
|
|
|
|
__max_32(imax, max); // imax is the maximum number in max
|
|
|
|
|
|
if (imax >= minsc) { // write the b array; this condition adds branching unfornately
|
|
|
|
|
|
if (n_b == 0 ||
|
|
|
|
|
|
(int32_t)b[n_b - 1] + 1 !=
|
|
|
|
|
|
i) { // then append 新的max用新的数值记录,用来记录次优分值,不能与最优分值相邻(否则就是连续匹配的)
|
|
|
|
|
|
if (n_b == m_b) {
|
|
|
|
|
|
m_b = m_b ? m_b << 1 : 8;
|
|
|
|
|
|
b = (uint64_t *)realloc(b, 8 * m_b);
|
|
|
|
|
|
}
|
|
|
|
|
|
b[n_b++] = (uint64_t)imax << 32 | i;
|
|
|
|
|
|
} else if ((int)(b[n_b - 1] >> 32) < imax)
|
|
|
|
|
|
b[n_b - 1] = (uint64_t)imax << 32 | i; // modify the last
|
|
|
|
|
|
}
|
|
|
|
|
|
if (imax > gmax) {
|
|
|
|
|
|
gmax = imax;
|
|
|
|
|
|
te = i; // te is the end position on the target
|
|
|
|
|
|
for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
|
|
|
|
|
|
_mm256_store_si256(Hmax + j, _mm256_load_si256(H1 + j));
|
|
|
|
|
|
if (gmax + q->shift >= 255 || gmax >= endsc)
|
|
|
|
|
|
break; // 最大分值溢出了,结束运算
|
|
|
|
|
|
}
|
|
|
|
|
|
S = H1;
|
|
|
|
|
|
H1 = H0;
|
|
|
|
|
|
H0 = S; // swap H0 and H1
|
|
|
|
|
|
}
|
|
|
|
|
|
// fprintf(gf[0], "%d\n", i);
|
|
|
|
|
|
r.score = gmax + q->shift < 255 ? gmax : 255;
|
|
|
|
|
|
r.te = te;
|
|
|
|
|
|
if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score
|
|
|
|
|
|
int max = -1, tmp, low, high, qlen = slen * 32;
|
|
|
|
|
|
uint8_t *t = (uint8_t *)Hmax;
|
|
|
|
|
|
for (i = 0; i < qlen; ++i, ++t)
|
|
|
|
|
|
if ((int)*t > max)
|
|
|
|
|
|
max = *t, r.qe = i / 32 + i % 32 * slen;
|
|
|
|
|
|
else if ((int)*t == max && (tmp = i / 32 + i % 32 * slen) < r.qe)
|
|
|
|
|
|
r.qe = tmp;
|
|
|
|
|
|
if (b) {
|
|
|
|
|
|
i = (r.score + q->max - 1) / q->max;
|
|
|
|
|
|
low = te - i;
|
|
|
|
|
|
high = te + i;
|
|
|
|
|
|
for (i = 0; i < n_b; ++i) {
|
|
|
|
|
|
int e = (int32_t)b[i];
|
|
|
|
|
|
if ((e < low || e > high) && (int)(b[i] >> 32) > r.score2)
|
|
|
|
|
|
r.score2 = b[i] >> 32, r.te2 = e;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
free(b);
|
|
|
|
|
|
return r;
|
|
|
|
|
|
}
|