On ARM, rewrite SSE2 SIMD calls using Neon intrinsics

Many Intel intrinsics have a corresponding Neon equivalent. Other cases are more interesting:

* Neon's vmaxvq directly selects the maximum entry in a vector, so it can be used to implement both the __max_16/__max_8 macros and the _mm_movemask_epi8 early loop exit. Introduce additional helper macros alongside __max_16/__max_8 so that the early loop exit can similarly be implemented differently on the two platforms.

* Full-width shifts can be done via vextq. This is defined close to the ksw_u8()/ksw_i16() functions (rather than in neon_sse.h) as it implicitly uses one of their local variables.

* ksw_i16() uses saturating *signed* 16-bit operations apart from _mm_subs_epu16; presumably the data is effectively still signed but we wish to keep it non-negative. The ARM intrinsics are more careful about type checking, so this requires an extra U16() helper macro.
This commit is contained in:
parent
b5f4bdae91
commit
b64ccddda7
2
Makefile
2
Makefile
|
|
@ -78,7 +78,7 @@ fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h
|
||||||
is.o: malloc_wrap.h
|
is.o: malloc_wrap.h
|
||||||
kopen.o: malloc_wrap.h
|
kopen.o: malloc_wrap.h
|
||||||
kstring.o: kstring.h malloc_wrap.h
|
kstring.o: kstring.h malloc_wrap.h
|
||||||
ksw.o: ksw.h malloc_wrap.h
|
ksw.o: ksw.h neon_sse.h malloc_wrap.h
|
||||||
main.o: kstring.h malloc_wrap.h utils.h
|
main.o: kstring.h malloc_wrap.h utils.h
|
||||||
malloc_wrap.o: malloc_wrap.h
|
malloc_wrap.o: malloc_wrap.h
|
||||||
maxk.o: bwa.h bntseq.h bwt.h bwamem.h kseq.h malloc_wrap.h
|
maxk.o: bwa.h bntseq.h bwt.h bwamem.h kseq.h malloc_wrap.h
|
||||||
|
|
|
||||||
34
ksw.c
34
ksw.c
|
|
@ -26,7 +26,11 @@
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
#if defined __x86_64__
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
|
#elif defined __ARM_NEON
|
||||||
|
#include "neon_sse.h"
|
||||||
|
#endif
|
||||||
#include "ksw.h"
|
#include "ksw.h"
|
||||||
|
|
||||||
#ifdef USE_MALLOC_WRAPPERS
|
#ifdef USE_MALLOC_WRAPPERS
|
||||||
|
|
@ -108,6 +112,11 @@ kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t
|
||||||
return q;
|
return q;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined __ARM_NEON
|
||||||
|
// This macro implicitly uses each function's `zero` local variable
|
||||||
|
#define _mm_slli_si128(a, n) (vextq_u8(zero, (a), 16 - (n)))
|
||||||
|
#endif
|
||||||
|
|
||||||
kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e)
|
kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e)
|
||||||
{
|
{
|
||||||
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
|
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
|
||||||
|
|
@ -115,6 +124,7 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del
|
||||||
__m128i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax;
|
__m128i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax;
|
||||||
kswr_t r;
|
kswr_t r;
|
||||||
|
|
||||||
|
#if defined __x86_64__
|
||||||
#define __max_16(ret, xx) do { \
|
#define __max_16(ret, xx) do { \
|
||||||
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
|
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
|
||||||
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \
|
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \
|
||||||
|
|
@ -123,6 +133,14 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del
|
||||||
(ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \
|
(ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
// Given entries with arbitrary values, return whether they are all 0x00
|
||||||
|
#define allzero_16(xx) (_mm_movemask_epi8(_mm_cmpeq_epi8((xx), zero)) == 0xffff)
|
||||||
|
|
||||||
|
#elif defined __ARM_NEON
|
||||||
|
#define __max_16(ret, xx) (ret) = vmaxvq_u8((xx))
|
||||||
|
#define allzero_16(xx) (vmaxvq_u8((xx)) == 0)
|
||||||
|
#endif
|
||||||
|
|
||||||
// initialization
|
// initialization
|
||||||
r = g_defr;
|
r = g_defr;
|
||||||
minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
|
minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
|
||||||
|
|
@ -143,7 +161,7 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del
|
||||||
}
|
}
|
||||||
// the core loop
|
// the core loop
|
||||||
for (i = 0; i < tlen; ++i) {
|
for (i = 0; i < tlen; ++i) {
|
||||||
int j, k, cmp, imax;
|
int j, k, imax;
|
||||||
__m128i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
|
__m128i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
|
||||||
h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
|
h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
|
||||||
h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
|
h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
|
||||||
|
|
@ -182,8 +200,7 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del
|
||||||
_mm_store_si128(H1 + j, h);
|
_mm_store_si128(H1 + j, h);
|
||||||
h = _mm_subs_epu8(h, oe_ins);
|
h = _mm_subs_epu8(h, oe_ins);
|
||||||
f = _mm_subs_epu8(f, e_ins);
|
f = _mm_subs_epu8(f, e_ins);
|
||||||
cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero));
|
if (UNLIKELY(allzero_16(_mm_subs_epu8(f, h)))) goto end_loop16;
|
||||||
if (UNLIKELY(cmp == 0xffff)) goto end_loop16;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
end_loop16:
|
end_loop16:
|
||||||
|
|
@ -236,6 +253,7 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_de
|
||||||
__m128i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax;
|
__m128i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax;
|
||||||
kswr_t r;
|
kswr_t r;
|
||||||
|
|
||||||
|
#if defined __x86_64__
|
||||||
#define __max_8(ret, xx) do { \
|
#define __max_8(ret, xx) do { \
|
||||||
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
|
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
|
||||||
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
|
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
|
||||||
|
|
@ -243,6 +261,14 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_de
|
||||||
(ret) = _mm_extract_epi16((xx), 0); \
|
(ret) = _mm_extract_epi16((xx), 0); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
// Given entries all either 0x0000 or 0xffff, return whether they are all 0x0000
|
||||||
|
#define allzero_0f_8(xx) (!_mm_movemask_epi8((xx)))
|
||||||
|
|
||||||
|
#elif defined __ARM_NEON
|
||||||
|
#define __max_8(ret, xx) (ret) = vmaxvq_s16(vreinterpretq_s16_u8((xx)))
|
||||||
|
#define allzero_0f_8(xx) (vmaxvq_u16(vreinterpretq_u16_u8((xx))) == 0)
|
||||||
|
#endif
|
||||||
|
|
||||||
// initialization
|
// initialization
|
||||||
r = g_defr;
|
r = g_defr;
|
||||||
minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
|
minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
|
||||||
|
|
@ -290,7 +316,7 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_de
|
||||||
_mm_store_si128(H1 + j, h);
|
_mm_store_si128(H1 + j, h);
|
||||||
h = _mm_subs_epu16(h, oe_ins);
|
h = _mm_subs_epu16(h, oe_ins);
|
||||||
f = _mm_subs_epu16(f, e_ins);
|
f = _mm_subs_epu16(f, e_ins);
|
||||||
if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8;
|
if(UNLIKELY(allzero_0f_8(_mm_cmpgt_epi16(f, h)))) goto end_loop8;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
end_loop8:
|
end_loop8:
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,33 @@
|
||||||
|
#ifndef NEON_SSE_H
|
||||||
|
#define NEON_SSE_H
|
||||||
|
|
||||||
|
#include <arm_neon.h>
|
||||||
|
|
||||||
|
typedef uint8x16_t __m128i;
|
||||||
|
|
||||||
|
// SSE2 _mm_load_si128 equivalent: load 16 bytes from ptr via Neon vld1q_u8.
static inline __m128i _mm_load_si128(const __m128i *ptr) { return vld1q_u8((const uint8_t *) ptr); }
|
||||||
|
// SSE2 _mm_set1_epi32 equivalent: broadcast n into all four 32-bit lanes.
static inline __m128i _mm_set1_epi32(int n) { return vreinterpretq_u8_s32(vdupq_n_s32(n)); }
|
||||||
|
// SSE2 _mm_store_si128 equivalent: store the 16-byte vector a to ptr.
static inline void _mm_store_si128(__m128i *ptr, __m128i a) { vst1q_u8((uint8_t *) ptr, a); }
|
||||||
|
|
||||||
|
// Saturating unsigned 8-bit add (SSE2 _mm_adds_epu8 -> Neon vqaddq_u8).
static inline __m128i _mm_adds_epu8(__m128i a, __m128i b) { return vqaddq_u8(a, b); }
|
||||||
|
// Lane-wise unsigned 8-bit maximum (SSE2 _mm_max_epu8 -> Neon vmaxq_u8).
static inline __m128i _mm_max_epu8(__m128i a, __m128i b) { return vmaxq_u8(a, b); }
|
||||||
|
// SSE2 _mm_set1_epi8 equivalent: broadcast n into all sixteen 8-bit lanes.
static inline __m128i _mm_set1_epi8(int8_t n) { return vreinterpretq_u8_s8(vdupq_n_s8(n)); }
|
||||||
|
// Saturating unsigned 8-bit subtract (SSE2 _mm_subs_epu8 -> Neon vqsubq_u8); results clamp at 0.
static inline __m128i _mm_subs_epu8(__m128i a, __m128i b) { return vqsubq_u8(a, b); }
|
||||||
|
|
||||||
|
#define M128I(a) vreinterpretq_u8_s16((a))
|
||||||
|
#define UM128I(a) vreinterpretq_u8_u16((a))
|
||||||
|
#define S16(a) vreinterpretq_s16_u8((a))
|
||||||
|
#define U16(a) vreinterpretq_u16_u8((a))
|
||||||
|
|
||||||
|
// Saturating signed 16-bit add; S16()/M128I() bit-cast between the uint8x16_t carrier type and s16 lanes.
static inline __m128i _mm_adds_epi16(__m128i a, __m128i b) { return M128I(vqaddq_s16(S16(a), S16(b))); }
|
||||||
|
// Signed 16-bit compare-greater-than; each lane becomes 0xffff (true) or 0x0000 (false), as in SSE2.
static inline __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) { return UM128I(vcgtq_s16(S16(a), S16(b))); }
|
||||||
|
// Lane-wise signed 16-bit maximum (SSE2 _mm_max_epi16 -> Neon vmaxq_s16).
static inline __m128i _mm_max_epi16(__m128i a, __m128i b) { return M128I(vmaxq_s16(S16(a), S16(b))); }
|
||||||
|
// SSE2 _mm_set1_epi16 equivalent: broadcast n into all eight 16-bit lanes.
static inline __m128i _mm_set1_epi16(int16_t n) { return vreinterpretq_u8_s16(vdupq_n_s16(n)); }
|
||||||
|
// Saturating *unsigned* 16-bit subtract; the U16() cast is needed because Neon type-checks lane signedness.
static inline __m128i _mm_subs_epu16(__m128i a, __m128i b) { return UM128I(vqsubq_u16(U16(a), U16(b))); }
|
||||||
|
|
||||||
|
#undef M128I
|
||||||
|
#undef UM128I
|
||||||
|
#undef S16
|
||||||
|
#undef U16
|
||||||
|
|
||||||
|
#endif
|
||||||
Loading…
Reference in New Issue