135 lines
4.3 KiB
C
135 lines
4.3 KiB
C
/*********************************************************************************************
|
|
Description: sw align functions in bwa-mem
|
|
|
|
Copyright : All right reserved by NCIC.ICT
|
|
|
|
Author : Zhang Zhonghai
|
|
Date : 2024/04/11
|
|
***********************************************************************************************/
|
|
#include <stdlib.h>
|
|
#include <stdint.h>
|
|
#include "utils.h"
|
|
#include "align.h"
|
|
|
|
kswr_t align_sse_i16(byte_mem_t *bmem, kswq_sse_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e)
|
|
{
|
|
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
|
|
uint64_t *b;
|
|
__m128i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax;
|
|
kswr_t r;
|
|
|
|
#if defined __SSE2__
|
|
#define __max_8(ret, xx) do { \
|
|
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
|
|
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
|
|
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
|
|
(ret) = _mm_extract_epi16((xx), 0); \
|
|
} while (0)
|
|
|
|
// Given entries all either 0x0000 or 0xffff, return whether they are all 0x0000
|
|
#define allzero_0f_8(xx) (!_mm_movemask_epi8((xx)))
|
|
|
|
#elif defined __ARM_NEON
|
|
#define __max_8(ret, xx) (ret) = vmaxvq_s16(vreinterpretq_s16_u8((xx)))
|
|
#define allzero_0f_8(xx) (vmaxvq_u16(vreinterpretq_u16_u8((xx))) == 0)
|
|
|
|
#else
|
|
#define __max_8(ret, xx) (ret) = m128i_max_s16((xx))
|
|
#define allzero_0f_8(xx) (m128i_allzero((xx)))
|
|
#endif
|
|
|
|
// initialization
|
|
r = g_defr;
|
|
minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
|
|
endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
|
|
m_b = n_b = 0; b = 0;
|
|
zero = _mm_set1_epi32(0);
|
|
oe_del = _mm_set1_epi16(_o_del + _e_del);
|
|
e_del = _mm_set1_epi16(_e_del);
|
|
oe_ins = _mm_set1_epi16(_o_ins + _e_ins);
|
|
e_ins = _mm_set1_epi16(_e_ins);
|
|
H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
|
|
slen = q->slen;
|
|
for (i = 0; i < slen; ++i) {
|
|
_mm_store_si128(E + i, zero);
|
|
_mm_store_si128(H0 + i, zero);
|
|
_mm_store_si128(Hmax + i, zero);
|
|
}
|
|
// the core loop
|
|
for (i = 0; i < tlen; ++i) {
|
|
int j, k, imax;
|
|
__m128i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
|
|
h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
|
|
h = _mm_slli_si128(h, 2);
|
|
for (j = 0; LIKELY(j < slen); ++j) {
|
|
h = _mm_adds_epi16(h, _mm_load_si128(S++));
|
|
e = _mm_load_si128(E + j);
|
|
h = _mm_max_epi16(h, e);
|
|
h = _mm_max_epi16(h, f);
|
|
max = _mm_max_epi16(max, h);
|
|
_mm_store_si128(H1 + j, h);
|
|
e = _mm_subs_epu16(e, e_del);
|
|
t = _mm_subs_epu16(h, oe_del);
|
|
e = _mm_max_epi16(e, t);
|
|
_mm_store_si128(E + j, e);
|
|
f = _mm_subs_epu16(f, e_ins);
|
|
t = _mm_subs_epu16(h, oe_ins);
|
|
f = _mm_max_epi16(f, t);
|
|
h = _mm_load_si128(H0 + j);
|
|
}
|
|
for (k = 0; LIKELY(k < 16); ++k) {
|
|
f = _mm_slli_si128(f, 2);
|
|
for (j = 0; LIKELY(j < slen); ++j) {
|
|
h = _mm_load_si128(H1 + j);
|
|
h = _mm_max_epi16(h, f);
|
|
_mm_store_si128(H1 + j, h);
|
|
h = _mm_subs_epu16(h, oe_ins);
|
|
f = _mm_subs_epu16(f, e_ins);
|
|
if(UNLIKELY(allzero_0f_8(_mm_cmpgt_epi16(f, h)))) goto end_loop8;
|
|
}
|
|
}
|
|
end_loop8:
|
|
__max_8(imax, max);
|
|
//fprintf(stderr, "%d\n", imax);
|
|
if (imax >= minsc) {
|
|
//fprintf(stderr, "%d\n", imax);
|
|
if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) {
|
|
if (n_b == m_b) {
|
|
m_b = m_b? m_b<<1 : 8;
|
|
b = (uint64_t*)realloc(b, 8 * m_b);
|
|
// b = (uint64_t *)byte_mem_request(bmem, 8 * m_b);
|
|
}
|
|
b[n_b++] = (uint64_t)imax<<32 | i;
|
|
} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
|
|
}
|
|
if (imax > gmax) {
|
|
gmax = imax; te = i;
|
|
for (j = 0; LIKELY(j < slen); ++j)
|
|
_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
|
|
if (gmax >= endsc) break;
|
|
}
|
|
S = H1; H1 = H0; H0 = S;
|
|
|
|
}
|
|
r.score = gmax; r.te = te;
|
|
{
|
|
int max = -1, tmp, low, high, qlen = slen * 8;
|
|
uint16_t *t = (uint16_t*)Hmax;
|
|
for (i = 0, r.qe = -1; i < qlen; ++i, ++t)
|
|
if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen;
|
|
else if ((int)*t == max && (tmp = i / 8 + i % 8 * slen) < r.qe) r.qe = tmp;
|
|
if (b) {
|
|
i = (r.score + q->max - 1) / q->max;
|
|
low = te - i; high = te + i;
|
|
for (i = 0; i < n_b; ++i) {
|
|
int e = (int32_t)b[i];
|
|
if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
|
|
r.score2 = b[i]>>32, r.te2 = e;
|
|
}
|
|
}
|
|
}
|
|
free(b);
|
|
// byte_mem_clear(bmem);
|
|
return r;
|
|
}
|