#include #include #include "align.h" #include "utils.h" kswr_t align_avx2_i16(byte_mem_t *bmem, kswq_avx2_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) { int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; uint64_t *b; __m256i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax; kswr_t r; #define __max_16(ret, xx) \ do { \ int16_t *maxVal = (int16_t *)&(xx); \ (xx) = _mm256_max_epi16((xx), _mm256_srli_si256((xx), 8)); \ (xx) = _mm256_max_epi16((xx), _mm256_srli_si256((xx), 4)); \ (xx) = _mm256_max_epi16((xx), _mm256_srli_si256((xx), 2)); \ (ret) = MAX(maxVal[0], maxVal[8]); \ } while (0) // Given entries all either 0x0000 or 0xffff, return whether they are all 0x0000 #define allzero_16(xx) (!_mm256_movemask_epi8((xx))) #define avx2_slli_i16(xx, imm) \ do { \ int16_t *arr = (int16_t *)&(xx); \ int16_t val = arr[7]; \ (xx) = _mm256_slli_si256((xx), (imm)); \ arr[8] = val; \ } while (0) // initialization r = g_defr; minsc = (xtra & KSW_XSUBO) ? xtra & 0xffff : 0x10000; endsc = (xtra & KSW_XSTOP) ? xtra & 0xffff : 0x10000; m_b = n_b = 0; b = 0; zero = _mm256_set1_epi32(0); oe_del = _mm256_set1_epi16(_o_del + _e_del); e_del = _mm256_set1_epi16(_e_del); oe_ins = _mm256_set1_epi16(_o_ins + _e_ins); e_ins = _mm256_set1_epi16(_e_ins); H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; slen = q->slen; for (i = 0; i < slen; ++i) { _mm256_store_si256(E + i, zero); _mm256_store_si256(H0 + i, zero); _mm256_store_si256(Hmax + i, zero); } // the core loop for (i = 0; i < tlen; ++i) { int j, k, imax; __m256i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector h = _mm256_load_si256(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example avx2_slli_i16(h, 2); for (j = 0; LIKELY(j < slen); ++j) { h = _mm256_adds_epi16(h, _mm256_load_si256(S++)); e = _mm256_load_si256(E + j); h = _mm256_max_epi16(h, e); h = _mm256_max_epi16(h, f); max = _mm256_max_epi16(max, h); _mm256_store_si256(H1 + j, h); e = _mm256_subs_epu16(e, e_del); t = _mm256_subs_epu16(h, oe_del); e = _mm256_max_epi16(e, t); _mm256_store_si256(E + j, e); f = _mm256_subs_epu16(f, e_ins); t = _mm256_subs_epu16(h, oe_ins); f = _mm256_max_epi16(f, t); h = _mm256_load_si256(H0 + j); } for (k = 0; LIKELY(k < 16); ++k) { avx2_slli_i16(f, 2); for (j = 0; LIKELY(j < slen); ++j) { h = _mm256_load_si256(H1 + j); h = _mm256_max_epi16(h, f); _mm256_store_si256(H1 + j, h); h = _mm256_subs_epu16(h, oe_ins); f = _mm256_subs_epu16(f, e_ins); if (UNLIKELY(allzero_16(_mm256_cmpgt_epi16(f, h)))) goto end_loop16; } } end_loop16: __max_16(imax, max); if (imax >= minsc) { if (n_b == 0 || (int32_t)b[n_b - 1] + 1 != i) { if (n_b == m_b) { m_b = m_b ? m_b << 1 : 8; b = (uint64_t *)realloc(b, 8 * m_b); } b[n_b++] = (uint64_t)imax << 32 | i; } else if ((int)(b[n_b - 1] >> 32) < imax) b[n_b - 1] = (uint64_t)imax << 32 | i; // modify the last } if (imax > gmax) { gmax = imax; te = i; for (j = 0; LIKELY(j < slen); ++j) _mm256_store_si256(Hmax + j, _mm256_load_si256(H1 + j)); if (gmax >= endsc) break; } S = H1; H1 = H0; H0 = S; } r.score = gmax; r.te = te; { int max = -1, tmp, low, high, qlen = slen * 16; uint16_t *t = (uint16_t *)Hmax; for (i = 0, r.qe = -1; i < qlen; ++i, ++t) if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen; else if ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe) r.qe = tmp; if (b) { i = (r.score + q->max - 1) / q->max; low = te - i; high = te + i; for (i = 0; i < n_b; ++i) { int e = (int32_t)b[i]; if ((e < low || e > high) && (int)(b[i] >> 32) > r.score2) r.score2 = b[i] >> 32, r.te2 = e; } } } free(b); return r; }