Pulled Mohammad's changes for creating variable sized arrays
Merge branch 'master' of /home/mghodrat/PairHMM/shared-repository into intel_pairhmm Conflicts: PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.cc
This commit is contained in:
commit
a14a11c0cf
|
|
@ -1,10 +1,10 @@
|
|||
#include "template.h"
|
||||
|
||||
#undef SIMD_TYPE
|
||||
#undef SIMD_TYPE_SSE
|
||||
#undef SIMD_ENGINE
|
||||
#undef SIMD_ENGINE_SSE
|
||||
|
||||
#define SIMD_TYPE avx
|
||||
#define SIMD_TYPE_AVX
|
||||
#define SIMD_ENGINE avx
|
||||
#define SIMD_ENGINE_AVX
|
||||
|
||||
#include "define-float.h"
|
||||
#include "shift_template.c"
|
||||
|
|
|
|||
|
|
@ -10,10 +10,10 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL)
|
|||
|
||||
Context<NUMBER> ctx;
|
||||
|
||||
NUMBER M[MROWS][MCOLS];
|
||||
NUMBER X[MROWS][MCOLS];
|
||||
NUMBER Y[MROWS][MCOLS];
|
||||
NUMBER p[MROWS][6];
|
||||
NUMBER M[ROWS][COLS];
|
||||
NUMBER X[ROWS][COLS];
|
||||
NUMBER Y[ROWS][COLS];
|
||||
NUMBER p[ROWS][6];
|
||||
|
||||
p[0][MM] = ctx._(0.0);
|
||||
p[0][GapM] = ctx._(0.0);
|
||||
|
|
|
|||
|
|
@ -1,53 +1,51 @@
|
|||
#include <iostream>
|
||||
|
||||
#ifdef PRECISION
|
||||
#undef PRECISION
|
||||
#undef MAIN_TYPE
|
||||
#undef MAIN_TYPE_SIZE
|
||||
#undef UNION_TYPE
|
||||
#undef IF_128
|
||||
#undef IF_MAIN_TYPE
|
||||
#undef SHIFT_CONST1
|
||||
#undef SHIFT_CONST2
|
||||
#undef SHIFT_CONST3
|
||||
#undef _128_TYPE
|
||||
#undef _256_TYPE
|
||||
#undef AVX_LENGTH
|
||||
#undef MAVX_COUNT
|
||||
#undef HAP_TYPE
|
||||
#undef MASK_TYPE
|
||||
#undef MASK_ALL_ONES
|
||||
#undef PRECISION
|
||||
#undef MAIN_TYPE
|
||||
#undef MAIN_TYPE_SIZE
|
||||
#undef UNION_TYPE
|
||||
#undef IF_128
|
||||
#undef IF_MAIN_TYPE
|
||||
#undef SHIFT_CONST1
|
||||
#undef SHIFT_CONST2
|
||||
#undef SHIFT_CONST3
|
||||
#undef _128_TYPE
|
||||
#undef SIMD_TYPE
|
||||
#undef AVX_LENGTH
|
||||
#undef HAP_TYPE
|
||||
#undef MASK_TYPE
|
||||
#undef MASK_ALL_ONES
|
||||
|
||||
#undef SET_VEC_ZERO(__vec)
|
||||
#undef VEC_OR(__v1, __v2)
|
||||
#undef VEC_ADD(__v1, __v2)
|
||||
#undef VEC_SUB(__v1, __v2)
|
||||
#undef VEC_MUL(__v1, __v2)
|
||||
#undef VEC_DIV(__v1, __v2)
|
||||
#undef VEC_BLEND(__v1, __v2, __mask)
|
||||
#undef VEC_BLENDV(__v1, __v2, __maskV)
|
||||
#undef VEC_CAST_256_128(__v1)
|
||||
#undef VEC_EXTRACT_128(__v1, __im)
|
||||
#undef VEC_EXTRACT_UNIT(__v1, __im)
|
||||
#undef VEC_SET1_VAL128(__val)
|
||||
#undef VEC_MOVE(__v1, __val)
|
||||
#undef VEC_CAST_128_256(__v1)
|
||||
#undef VEC_INSERT_VAL(__v1, __val, __pos)
|
||||
#undef VEC_CVT_128_256(__v1)
|
||||
#undef VEC_SET1_VAL(__val)
|
||||
#undef VEC_POPCVT_CHAR(__ch)
|
||||
#undef VEC_LDPOPCVT_CHAR(__addr)
|
||||
#undef VEC_CMP_EQ(__v1, __v2)
|
||||
#undef VEC_SET_LSE(__val)
|
||||
#undef SHIFT_HAP(__v1, __val)
|
||||
#undef print256b(__v1)
|
||||
#undef MASK_VEC
|
||||
#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
|
||||
#undef VEC_SHIFT_LEFT_1BIT(__vs)
|
||||
#undef MASK_ALL_ONES
|
||||
#undef COMPARE_VECS(__v1, __v2)
|
||||
#undef _256_INT_TYPE
|
||||
#undef BITMASK_VEC
|
||||
#undef SET_VEC_ZERO(__vec)
|
||||
#undef VEC_OR(__v1, __v2)
|
||||
#undef VEC_ADD(__v1, __v2)
|
||||
#undef VEC_SUB(__v1, __v2)
|
||||
#undef VEC_MUL(__v1, __v2)
|
||||
#undef VEC_DIV(__v1, __v2)
|
||||
#undef VEC_BLEND(__v1, __v2, __mask)
|
||||
#undef VEC_BLENDV(__v1, __v2, __maskV)
|
||||
#undef VEC_CAST_256_128(__v1)
|
||||
#undef VEC_EXTRACT_128(__v1, __im)
|
||||
#undef VEC_EXTRACT_UNIT(__v1, __im)
|
||||
#undef VEC_SET1_VAL128(__val)
|
||||
#undef VEC_MOVE(__v1, __val)
|
||||
#undef VEC_CAST_128_256(__v1)
|
||||
#undef VEC_INSERT_VAL(__v1, __val, __pos)
|
||||
#undef VEC_CVT_128_256(__v1)
|
||||
#undef VEC_SET1_VAL(__val)
|
||||
#undef VEC_POPCVT_CHAR(__ch)
|
||||
#undef VEC_LDPOPCVT_CHAR(__addr)
|
||||
#undef VEC_CMP_EQ(__v1, __v2)
|
||||
#undef VEC_SET_LSE(__val)
|
||||
#undef SHIFT_HAP(__v1, __val)
|
||||
#undef MASK_VEC
|
||||
#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
|
||||
#undef VEC_SHIFT_LEFT_1BIT(__vs)
|
||||
#undef MASK_ALL_ONES
|
||||
#undef COMPARE_VECS(__v1, __v2)
|
||||
#undef _256_INT_TYPE
|
||||
#undef BITMASK_VEC
|
||||
#endif
|
||||
|
||||
#define PRECISION d
|
||||
|
|
@ -60,128 +58,122 @@
|
|||
#define SHIFT_CONST2 1
|
||||
#define SHIFT_CONST3 8
|
||||
#define _128_TYPE __m128d
|
||||
#define _256_TYPE __m256d
|
||||
#define SIMD_TYPE __m256d
|
||||
#define _256_INT_TYPE __m256i
|
||||
#define AVX_LENGTH 4
|
||||
#define MAVX_COUNT (MROWS+7)/AVX_LENGTH
|
||||
#define HAP_TYPE __m128i
|
||||
#define MASK_TYPE uint64_t
|
||||
#define MASK_ALL_ONES 0xFFFFFFFFFFFFFFFF
|
||||
#define MASK_VEC MaskVec_D
|
||||
|
||||
#define SET_VEC_ZERO(__vec) \
|
||||
__vec= _mm256_setzero_pd()
|
||||
__vec= _mm256_setzero_pd()
|
||||
|
||||
#define VEC_OR(__v1, __v2) \
|
||||
_mm256_or_pd(__v1, __v2)
|
||||
_mm256_or_pd(__v1, __v2)
|
||||
|
||||
#define VEC_ADD(__v1, __v2) \
|
||||
_mm256_add_pd(__v1, __v2)
|
||||
_mm256_add_pd(__v1, __v2)
|
||||
|
||||
#define VEC_SUB(__v1, __v2) \
|
||||
_mm256_sub_pd(__v1, __v2)
|
||||
_mm256_sub_pd(__v1, __v2)
|
||||
|
||||
#define VEC_MUL(__v1, __v2) \
|
||||
_mm256_mul_pd(__v1, __v2)
|
||||
_mm256_mul_pd(__v1, __v2)
|
||||
|
||||
#define VEC_DIV(__v1, __v2) \
|
||||
_mm256_div_pd(__v1, __v2)
|
||||
#define VEC_DIV(__v1, __v2) \
|
||||
_mm256_div_pd(__v1, __v2)
|
||||
|
||||
#define VEC_BLEND(__v1, __v2, __mask) \
|
||||
_mm256_blend_pd(__v1, __v2, __mask)
|
||||
_mm256_blend_pd(__v1, __v2, __mask)
|
||||
|
||||
#define VEC_BLENDV(__v1, __v2, __maskV) \
|
||||
_mm256_blendv_pd(__v1, __v2, __maskV)
|
||||
_mm256_blendv_pd(__v1, __v2, __maskV)
|
||||
|
||||
#define VEC_CAST_256_128(__v1) \
|
||||
_mm256_castpd256_pd128 (__v1)
|
||||
#define VEC_CAST_256_128(__v1) \
|
||||
_mm256_castpd256_pd128 (__v1)
|
||||
|
||||
#define VEC_EXTRACT_128(__v1, __im) \
|
||||
_mm256_extractf128_pd (__v1, __im)
|
||||
#define VEC_EXTRACT_128(__v1, __im) \
|
||||
_mm256_extractf128_pd (__v1, __im)
|
||||
|
||||
#define VEC_EXTRACT_UNIT(__v1, __im) \
|
||||
_mm_extract_epi64(__v1, __im)
|
||||
#define VEC_EXTRACT_UNIT(__v1, __im) \
|
||||
_mm_extract_epi64(__v1, __im)
|
||||
|
||||
#define VEC_SET1_VAL128(__val) \
|
||||
_mm_set1_pd(__val)
|
||||
#define VEC_SET1_VAL128(__val) \
|
||||
_mm_set1_pd(__val)
|
||||
|
||||
#define VEC_MOVE(__v1, __val) \
|
||||
_mm_move_sd(__v1, __val)
|
||||
#define VEC_MOVE(__v1, __val) \
|
||||
_mm_move_sd(__v1, __val)
|
||||
|
||||
#define VEC_CAST_128_256(__v1) \
|
||||
_mm256_castpd128_pd256(__v1)
|
||||
#define VEC_CAST_128_256(__v1) \
|
||||
_mm256_castpd128_pd256(__v1)
|
||||
|
||||
#define VEC_INSERT_VAL(__v1, __val, __pos) \
|
||||
_mm256_insertf128_pd(__v1, __val, __pos)
|
||||
#define VEC_INSERT_VAL(__v1, __val, __pos) \
|
||||
_mm256_insertf128_pd(__v1, __val, __pos)
|
||||
|
||||
#define VEC_CVT_128_256(__v1) \
|
||||
_mm256_cvtepi32_pd(__v1)
|
||||
#define VEC_CVT_128_256(__v1) \
|
||||
_mm256_cvtepi32_pd(__v1)
|
||||
|
||||
#define VEC_SET1_VAL(__val) \
|
||||
_mm256_set1_pd(__val)
|
||||
#define VEC_SET1_VAL(__val) \
|
||||
_mm256_set1_pd(__val)
|
||||
|
||||
#define VEC_POPCVT_CHAR(__ch) \
|
||||
_mm256_cvtepi32_pd(_mm_set1_epi32(__ch))
|
||||
#define VEC_POPCVT_CHAR(__ch) \
|
||||
_mm256_cvtepi32_pd(_mm_set1_epi32(__ch))
|
||||
|
||||
#define VEC_LDPOPCVT_CHAR(__addr) \
|
||||
_mm256_cvtepi32_pd(_mm_load_si128((__m128i const *)__addr))
|
||||
_mm256_cvtepi32_pd(_mm_load_si128((__m128i const *)__addr))
|
||||
|
||||
#define VEC_CMP_EQ(__v1, __v2) \
|
||||
_mm256_cmp_pd(__v1, __v2, _CMP_EQ_OQ)
|
||||
#define VEC_CMP_EQ(__v1, __v2) \
|
||||
_mm256_cmp_pd(__v1, __v2, _CMP_EQ_OQ)
|
||||
|
||||
#define VEC_SET_LSE(__val) \
|
||||
_mm256_set_pd(zero, zero, zero, __val);
|
||||
_mm256_set_pd(zero, zero, zero, __val);
|
||||
|
||||
#define SHIFT_HAP(__v1, __val) \
|
||||
__v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0)
|
||||
#define SHIFT_HAP(__v1, __val) \
|
||||
__v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0)
|
||||
|
||||
#define print256b(__v1) \
|
||||
print256bDP(__v1)
|
||||
#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \
|
||||
__vdst = _mm256_castpd128_pd256(__vsLow) ; \
|
||||
__vdst = _mm256_insertf128_pd(__vdst, __vsHigh, 1) ;
|
||||
|
||||
#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \
|
||||
__vdst = _mm256_castpd128_pd256(__vsLow) ; \
|
||||
__vdst = _mm256_insertf128_pd(__vdst, __vsHigh, 1) ;
|
||||
|
||||
#define VEC_SHIFT_LEFT_1BIT(__vs) \
|
||||
__vs = _mm_slli_epi64(__vs, 1)
|
||||
#define VEC_SHIFT_LEFT_1BIT(__vs) \
|
||||
__vs = _mm_slli_epi64(__vs, 1)
|
||||
|
||||
|
||||
#define COMPARE_VECS(__v1, __v2, __first, __last) { \
|
||||
double* ptr1 = (double*) (&__v1) ; \
|
||||
double* ptr2 = (double*) (&__v2) ; \
|
||||
for (int ei=__first; ei <= __last; ++ei) { \
|
||||
if (ptr1[ei] != ptr2[ei]) { \
|
||||
std::cout << "Double Mismatch at " << ei << ": " \
|
||||
<< ptr1[ei] << " vs. " << ptr2[ei] << std::endl ; \
|
||||
exit(0) ; \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
#define COMPARE_VECS(__v1, __v2, __first, __last) { \
|
||||
double* ptr1 = (double*) (&__v1) ; \
|
||||
double* ptr2 = (double*) (&__v2) ; \
|
||||
for (int ei=__first; ei <= __last; ++ei) { \
|
||||
if (ptr1[ei] != ptr2[ei]) { \
|
||||
std::cout << "Double Mismatch at " << ei << ": " \
|
||||
<< ptr1[ei] << " vs. " << ptr2[ei] << std::endl ; \
|
||||
exit(0) ; \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
class BitMaskVec_double {
|
||||
|
||||
MASK_VEC low_, high_ ;
|
||||
_256_TYPE combined_ ;
|
||||
MASK_VEC low_, high_ ;
|
||||
SIMD_TYPE combined_ ;
|
||||
|
||||
public:
|
||||
|
||||
inline MASK_TYPE& getLowEntry(int index) {
|
||||
return low_.masks[index] ;
|
||||
}
|
||||
inline MASK_TYPE& getHighEntry(int index) {
|
||||
return high_.masks[index] ;
|
||||
}
|
||||
|
||||
inline const _256_TYPE& getCombinedMask() {
|
||||
VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ;
|
||||
public:
|
||||
inline MASK_TYPE& getLowEntry(int index) {
|
||||
return low_.masks[index] ;
|
||||
}
|
||||
inline MASK_TYPE& getHighEntry(int index) {
|
||||
return high_.masks[index] ;
|
||||
}
|
||||
|
||||
return combined_ ;
|
||||
}
|
||||
|
||||
inline void shift_left_1bit() {
|
||||
VEC_SHIFT_LEFT_1BIT(low_.vec) ;
|
||||
VEC_SHIFT_LEFT_1BIT(high_.vec) ;
|
||||
}
|
||||
inline const SIMD_TYPE& getCombinedMask() {
|
||||
VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ;
|
||||
return combined_ ;
|
||||
}
|
||||
|
||||
inline void shift_left_1bit() {
|
||||
VEC_SHIFT_LEFT_1BIT(low_.vec) ;
|
||||
VEC_SHIFT_LEFT_1BIT(high_.vec) ;
|
||||
}
|
||||
|
||||
} ;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,53 +1,51 @@
|
|||
#include <iostream>
|
||||
|
||||
#ifdef PRECISION
|
||||
#undef PRECISION
|
||||
#undef MAIN_TYPE
|
||||
#undef MAIN_TYPE_SIZE
|
||||
#undef UNION_TYPE
|
||||
#undef IF_128
|
||||
#undef IF_MAIN_TYPE
|
||||
#undef SHIFT_CONST1
|
||||
#undef SHIFT_CONST2
|
||||
#undef SHIFT_CONST3
|
||||
#undef _128_TYPE
|
||||
#undef _256_TYPE
|
||||
#undef AVX_LENGTH
|
||||
#undef MAVX_COUNT
|
||||
#undef HAP_TYPE
|
||||
#undef MASK_TYPE
|
||||
#undef MASK_ALL_ONES
|
||||
#undef PRECISION
|
||||
#undef MAIN_TYPE
|
||||
#undef MAIN_TYPE_SIZE
|
||||
#undef UNION_TYPE
|
||||
#undef IF_128
|
||||
#undef IF_MAIN_TYPE
|
||||
#undef SHIFT_CONST1
|
||||
#undef SHIFT_CONST2
|
||||
#undef SHIFT_CONST3
|
||||
#undef _128_TYPE
|
||||
#undef SIMD_TYPE
|
||||
#undef AVX_LENGTH
|
||||
#undef HAP_TYPE
|
||||
#undef MASK_TYPE
|
||||
#undef MASK_ALL_ONES
|
||||
|
||||
#undef SET_VEC_ZERO(__vec)
|
||||
#undef VEC_OR(__v1, __v2)
|
||||
#undef VEC_ADD(__v1, __v2)
|
||||
#undef VEC_SUB(__v1, __v2)
|
||||
#undef VEC_MUL(__v1, __v2)
|
||||
#undef VEC_DIV(__v1, __v2)
|
||||
#undef VEC_BLEND(__v1, __v2, __mask)
|
||||
#undef VEC_BLENDV(__v1, __v2, __maskV)
|
||||
#undef VEC_CAST_256_128(__v1)
|
||||
#undef VEC_EXTRACT_128(__v1, __im)
|
||||
#undef VEC_EXTRACT_UNIT(__v1, __im)
|
||||
#undef VEC_SET1_VAL128(__val)
|
||||
#undef VEC_MOVE(__v1, __val)
|
||||
#undef VEC_CAST_128_256(__v1)
|
||||
#undef VEC_INSERT_VAL(__v1, __val, __pos)
|
||||
#undef VEC_CVT_128_256(__v1)
|
||||
#undef VEC_SET1_VAL(__val)
|
||||
#undef VEC_POPCVT_CHAR(__ch)
|
||||
#undef VEC_LDPOPCVT_CHAR(__addr)
|
||||
#undef VEC_CMP_EQ(__v1, __v2)
|
||||
#undef VEC_SET_LSE(__val)
|
||||
#undef SHIFT_HAP(__v1, __val)
|
||||
#undef print256b(__v1)
|
||||
#undef MASK_VEC
|
||||
#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
|
||||
#undef VEC_SHIFT_LEFT_1BIT(__vs)
|
||||
#undef MASK_ALL_ONES
|
||||
#undef COMPARE_VECS(__v1, __v2)
|
||||
#undef _256_INT_TYPE
|
||||
#undef BITMASK_VEC
|
||||
#undef SET_VEC_ZERO(__vec)
|
||||
#undef VEC_OR(__v1, __v2)
|
||||
#undef VEC_ADD(__v1, __v2)
|
||||
#undef VEC_SUB(__v1, __v2)
|
||||
#undef VEC_MUL(__v1, __v2)
|
||||
#undef VEC_DIV(__v1, __v2)
|
||||
#undef VEC_BLEND(__v1, __v2, __mask)
|
||||
#undef VEC_BLENDV(__v1, __v2, __maskV)
|
||||
#undef VEC_CAST_256_128(__v1)
|
||||
#undef VEC_EXTRACT_128(__v1, __im)
|
||||
#undef VEC_EXTRACT_UNIT(__v1, __im)
|
||||
#undef VEC_SET1_VAL128(__val)
|
||||
#undef VEC_MOVE(__v1, __val)
|
||||
#undef VEC_CAST_128_256(__v1)
|
||||
#undef VEC_INSERT_VAL(__v1, __val, __pos)
|
||||
#undef VEC_CVT_128_256(__v1)
|
||||
#undef VEC_SET1_VAL(__val)
|
||||
#undef VEC_POPCVT_CHAR(__ch)
|
||||
#undef VEC_LDPOPCVT_CHAR(__addr)
|
||||
#undef VEC_CMP_EQ(__v1, __v2)
|
||||
#undef VEC_SET_LSE(__val)
|
||||
#undef SHIFT_HAP(__v1, __val)
|
||||
#undef MASK_VEC
|
||||
#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
|
||||
#undef VEC_SHIFT_LEFT_1BIT(__vs)
|
||||
#undef MASK_ALL_ONES
|
||||
#undef COMPARE_VECS(__v1, __v2)
|
||||
#undef _256_INT_TYPE
|
||||
#undef BITMASK_VEC
|
||||
#endif
|
||||
|
||||
#define PRECISION s
|
||||
|
|
@ -61,127 +59,122 @@
|
|||
#define SHIFT_CONST2 3
|
||||
#define SHIFT_CONST3 4
|
||||
#define _128_TYPE __m128
|
||||
#define _256_TYPE __m256
|
||||
#define SIMD_TYPE __m256
|
||||
#define _256_INT_TYPE __m256i
|
||||
#define AVX_LENGTH 8
|
||||
#define MAVX_COUNT (MROWS+7)/AVX_LENGTH
|
||||
#define HAP_TYPE UNION_TYPE
|
||||
#define MASK_TYPE uint32_t
|
||||
#define MASK_ALL_ONES 0xFFFFFFFF
|
||||
#define MASK_VEC MaskVec_F
|
||||
|
||||
#define SET_VEC_ZERO(__vec) \
|
||||
__vec= _mm256_setzero_ps()
|
||||
__vec= _mm256_setzero_ps()
|
||||
|
||||
#define VEC_OR(__v1, __v2) \
|
||||
_mm256_or_ps(__v1, __v2)
|
||||
_mm256_or_ps(__v1, __v2)
|
||||
|
||||
#define VEC_ADD(__v1, __v2) \
|
||||
_mm256_add_ps(__v1, __v2)
|
||||
_mm256_add_ps(__v1, __v2)
|
||||
|
||||
#define VEC_SUB(__v1, __v2) \
|
||||
_mm256_sub_ps(__v1, __v2)
|
||||
_mm256_sub_ps(__v1, __v2)
|
||||
|
||||
#define VEC_MUL(__v1, __v2) \
|
||||
_mm256_mul_ps(__v1, __v2)
|
||||
_mm256_mul_ps(__v1, __v2)
|
||||
|
||||
#define VEC_DIV(__v1, __v2) \
|
||||
_mm256_div_ps(__v1, __v2)
|
||||
#define VEC_DIV(__v1, __v2) \
|
||||
_mm256_div_ps(__v1, __v2)
|
||||
|
||||
#define VEC_BLEND(__v1, __v2, __mask) \
|
||||
_mm256_blend_ps(__v1, __v2, __mask)
|
||||
_mm256_blend_ps(__v1, __v2, __mask)
|
||||
|
||||
#define VEC_BLENDV(__v1, __v2, __maskV) \
|
||||
_mm256_blendv_ps(__v1, __v2, __maskV)
|
||||
_mm256_blendv_ps(__v1, __v2, __maskV)
|
||||
|
||||
#define VEC_CAST_256_128(__v1) \
|
||||
_mm256_castps256_ps128 (__v1)
|
||||
#define VEC_CAST_256_128(__v1) \
|
||||
_mm256_castps256_ps128 (__v1)
|
||||
|
||||
#define VEC_EXTRACT_128(__v1, __im) \
|
||||
_mm256_extractf128_ps (__v1, __im)
|
||||
_mm256_extractf128_ps (__v1, __im)
|
||||
|
||||
#define VEC_EXTRACT_UNIT(__v1, __im) \
|
||||
_mm_extract_epi32(__v1, __im)
|
||||
#define VEC_EXTRACT_UNIT(__v1, __im) \
|
||||
_mm_extract_epi32(__v1, __im)
|
||||
|
||||
#define VEC_SET1_VAL128(__val) \
|
||||
_mm_set1_ps(__val)
|
||||
#define VEC_SET1_VAL128(__val) \
|
||||
_mm_set1_ps(__val)
|
||||
|
||||
#define VEC_MOVE(__v1, __val) \
|
||||
_mm_move_ss(__v1, __val)
|
||||
#define VEC_MOVE(__v1, __val) \
|
||||
_mm_move_ss(__v1, __val)
|
||||
|
||||
#define VEC_CAST_128_256(__v1) \
|
||||
_mm256_castps128_ps256(__v1)
|
||||
_mm256_castps128_ps256(__v1)
|
||||
|
||||
#define VEC_INSERT_VAL(__v1, __val, __pos) \
|
||||
_mm256_insertf128_ps(__v1, __val, __pos)
|
||||
_mm256_insertf128_ps(__v1, __val, __pos)
|
||||
|
||||
#define VEC_CVT_128_256(__v1) \
|
||||
_mm256_cvtepi32_ps(__v1.i)
|
||||
_mm256_cvtepi32_ps(__v1.i)
|
||||
|
||||
#define VEC_SET1_VAL(__val) \
|
||||
_mm256_set1_ps(__val)
|
||||
#define VEC_SET1_VAL(__val) \
|
||||
_mm256_set1_ps(__val)
|
||||
|
||||
#define VEC_POPCVT_CHAR(__ch) \
|
||||
_mm256_cvtepi32_ps(_mm256_set1_epi32(__ch))
|
||||
_mm256_cvtepi32_ps(_mm256_set1_epi32(__ch))
|
||||
|
||||
#define VEC_LDPOPCVT_CHAR(__addr) \
|
||||
_mm256_cvtepi32_ps(_mm256_loadu_si256((__m256i const *)__addr))
|
||||
#define VEC_LDPOPCVT_CHAR(__addr) \
|
||||
_mm256_cvtepi32_ps(_mm256_loadu_si256((__m256i const *)__addr))
|
||||
|
||||
#define VEC_CMP_EQ(__v1, __v2) \
|
||||
_mm256_cmp_ps(__v1, __v2, _CMP_EQ_OQ)
|
||||
_mm256_cmp_ps(__v1, __v2, _CMP_EQ_OQ)
|
||||
|
||||
#define VEC_SET_LSE(__val) \
|
||||
_mm256_set_ps(zero, zero, zero, zero, zero, zero, zero, __val);
|
||||
#define VEC_SET_LSE(__val) \
|
||||
_mm256_set_ps(zero, zero, zero, zero, zero, zero, zero, __val);
|
||||
|
||||
#define SHIFT_HAP(__v1, __val) \
|
||||
_vector_shift_lastavxs(__v1, __val.f);
|
||||
#define SHIFT_HAP(__v1, __val) \
|
||||
_vector_shift_lastavxs(__v1, __val.f);
|
||||
|
||||
#define print256b(__v1) \
|
||||
print256bFP(__v1)
|
||||
|
||||
#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \
|
||||
__vdst = _mm256_castps128_ps256(__vsLow) ; \
|
||||
__vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ;
|
||||
#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \
|
||||
__vdst = _mm256_castps128_ps256(__vsLow) ; \
|
||||
__vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ;
|
||||
|
||||
#define VEC_SHIFT_LEFT_1BIT(__vs) \
|
||||
__vs = _mm_slli_epi32(__vs, 1)
|
||||
#define VEC_SHIFT_LEFT_1BIT(__vs) \
|
||||
__vs = _mm_slli_epi32(__vs, 1)
|
||||
|
||||
#define COMPARE_VECS(__v1, __v2, __first, __last) { \
|
||||
float* ptr1 = (float*) (&__v1) ; \
|
||||
float* ptr2 = (float*) (&__v2) ; \
|
||||
for (int ei=__first; ei <= __last; ++ei) { \
|
||||
if (ptr1[ei] != ptr2[ei]) { \
|
||||
std::cout << "Float Mismatch at " << ei << ": " \
|
||||
<< ptr1[ei] << " vs. " << ptr2[ei] << std::endl ; \
|
||||
exit(0) ; \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
#define COMPARE_VECS(__v1, __v2, __first, __last) { \
|
||||
float* ptr1 = (float*) (&__v1) ; \
|
||||
float* ptr2 = (float*) (&__v2) ; \
|
||||
for (int ei=__first; ei <= __last; ++ei) { \
|
||||
if (ptr1[ei] != ptr2[ei]) { \
|
||||
std::cout << "Float Mismatch at " << ei << ": " \
|
||||
<< ptr1[ei] << " vs. " << ptr2[ei] << std::endl ; \
|
||||
exit(0) ; \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
class BitMaskVec_float {
|
||||
|
||||
MASK_VEC low_, high_ ;
|
||||
_256_TYPE combined_ ;
|
||||
MASK_VEC low_, high_ ;
|
||||
SIMD_TYPE combined_ ;
|
||||
|
||||
public:
|
||||
|
||||
inline MASK_TYPE& getLowEntry(int index) {
|
||||
return low_.masks[index] ;
|
||||
}
|
||||
inline MASK_TYPE& getHighEntry(int index) {
|
||||
return high_.masks[index] ;
|
||||
}
|
||||
|
||||
inline const _256_TYPE& getCombinedMask() {
|
||||
VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ;
|
||||
public:
|
||||
|
||||
return combined_ ;
|
||||
}
|
||||
|
||||
inline void shift_left_1bit() {
|
||||
VEC_SHIFT_LEFT_1BIT(low_.vec) ;
|
||||
VEC_SHIFT_LEFT_1BIT(high_.vec) ;
|
||||
}
|
||||
inline MASK_TYPE& getLowEntry(int index) {
|
||||
return low_.masks[index] ;
|
||||
}
|
||||
inline MASK_TYPE& getHighEntry(int index) {
|
||||
return high_.masks[index] ;
|
||||
}
|
||||
|
||||
inline const SIMD_TYPE& getCombinedMask() {
|
||||
VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ;
|
||||
return combined_ ;
|
||||
}
|
||||
|
||||
inline void shift_left_1bit() {
|
||||
VEC_SHIFT_LEFT_1BIT(low_.vec) ;
|
||||
VEC_SHIFT_LEFT_1BIT(high_.vec) ;
|
||||
}
|
||||
|
||||
} ;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,53 +1,51 @@
|
|||
#ifdef PRECISION
|
||||
#undef PRECISION
|
||||
#undef MAIN_TYPE
|
||||
#undef MAIN_TYPE_SIZE
|
||||
#undef UNION_TYPE
|
||||
#undef IF_128
|
||||
#undef IF_MAIN_TYPE
|
||||
#undef SHIFT_CONST1
|
||||
#undef SHIFT_CONST2
|
||||
#undef SHIFT_CONST3
|
||||
#undef _128_TYPE
|
||||
#undef _256_TYPE
|
||||
#undef AVX_LENGTH
|
||||
#undef MAVX_COUNT
|
||||
#undef HAP_TYPE
|
||||
#undef MASK_TYPE
|
||||
#undef MASK_ALL_ONES
|
||||
#undef PRECISION
|
||||
#undef MAIN_TYPE
|
||||
#undef MAIN_TYPE_SIZE
|
||||
#undef UNION_TYPE
|
||||
#undef IF_128
|
||||
#undef IF_MAIN_TYPE
|
||||
#undef SHIFT_CONST1
|
||||
#undef SHIFT_CONST2
|
||||
#undef SHIFT_CONST3
|
||||
#undef _128_TYPE
|
||||
#undef SIMD_TYPE
|
||||
#undef AVX_LENGTH
|
||||
#undef HAP_TYPE
|
||||
#undef MASK_TYPE
|
||||
#undef MASK_ALL_ONES
|
||||
|
||||
#undef VEC_EXTRACT_UNIT(__v1, __im)
|
||||
#undef VEC_INSERT_UNIT(__v1,__ins,__im)
|
||||
#undef SET_VEC_ZERO(__vec)
|
||||
#undef VEC_OR(__v1, __v2)
|
||||
#undef VEC_ADD(__v1, __v2)
|
||||
#undef VEC_SUB(__v1, __v2)
|
||||
#undef VEC_MUL(__v1, __v2)
|
||||
#undef VEC_DIV(__v1, __v2)
|
||||
#undef VEC_BLEND(__v1, __v2, __mask)
|
||||
#undef VEC_BLENDV(__v1, __v2, __maskV)
|
||||
#undef VEC_CAST_256_128(__v1)
|
||||
#undef VEC_EXTRACT_128(__v1, __im)
|
||||
#undef VEC_EXTRACT_UNIT(__v1, __im)
|
||||
#undef VEC_SET1_VAL128(__val)
|
||||
#undef VEC_MOVE(__v1, __val)
|
||||
#undef VEC_CAST_128_256(__v1)
|
||||
#undef VEC_INSERT_VAL(__v1, __val, __pos)
|
||||
#undef VEC_CVT_128_256(__v1)
|
||||
#undef VEC_SET1_VAL(__val)
|
||||
#undef VEC_POPCVT_CHAR(__ch)
|
||||
#undef VEC_LDPOPCVT_CHAR(__addr)
|
||||
#undef VEC_CMP_EQ(__v1, __v2)
|
||||
#undef VEC_SET_LSE(__val)
|
||||
#undef SHIFT_HAP(__v1, __val)
|
||||
#undef print256b(__v1)
|
||||
#undef MASK_VEC
|
||||
#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
|
||||
#undef VEC_SHIFT_LEFT_1BIT(__vs)
|
||||
#undef MASK_ALL_ONES
|
||||
#undef COMPARE_VECS(__v1, __v2)
|
||||
#undef _256_INT_TYPE
|
||||
#undef BITMASK_VEC
|
||||
#undef VEC_EXTRACT_UNIT(__v1, __im)
|
||||
#undef VEC_INSERT_UNIT(__v1,__ins,__im)
|
||||
#undef SET_VEC_ZERO(__vec)
|
||||
#undef VEC_OR(__v1, __v2)
|
||||
#undef VEC_ADD(__v1, __v2)
|
||||
#undef VEC_SUB(__v1, __v2)
|
||||
#undef VEC_MUL(__v1, __v2)
|
||||
#undef VEC_DIV(__v1, __v2)
|
||||
#undef VEC_BLEND(__v1, __v2, __mask)
|
||||
#undef VEC_BLENDV(__v1, __v2, __maskV)
|
||||
#undef VEC_CAST_256_128(__v1)
|
||||
#undef VEC_EXTRACT_128(__v1, __im)
|
||||
#undef VEC_EXTRACT_UNIT(__v1, __im)
|
||||
#undef VEC_SET1_VAL128(__val)
|
||||
#undef VEC_MOVE(__v1, __val)
|
||||
#undef VEC_CAST_128_256(__v1)
|
||||
#undef VEC_INSERT_VAL(__v1, __val, __pos)
|
||||
#undef VEC_CVT_128_256(__v1)
|
||||
#undef VEC_SET1_VAL(__val)
|
||||
#undef VEC_POPCVT_CHAR(__ch)
|
||||
#undef VEC_LDPOPCVT_CHAR(__addr)
|
||||
#undef VEC_CMP_EQ(__v1, __v2)
|
||||
#undef VEC_SET_LSE(__val)
|
||||
#undef SHIFT_HAP(__v1, __val)
|
||||
#undef MASK_VEC
|
||||
#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
|
||||
#undef VEC_SHIFT_LEFT_1BIT(__vs)
|
||||
#undef MASK_ALL_ONES
|
||||
#undef COMPARE_VECS(__v1, __v2)
|
||||
#undef _256_INT_TYPE
|
||||
#undef BITMASK_VEC
|
||||
#endif
|
||||
|
||||
#define SSE
|
||||
|
|
@ -62,90 +60,87 @@
|
|||
#define SHIFT_CONST2 8
|
||||
#define SHIFT_CONST3 0
|
||||
#define _128_TYPE __m128d
|
||||
#define _256_TYPE __m128d
|
||||
#define SIMD_TYPE __m128d
|
||||
#define _256_INT_TYPE __m128i
|
||||
#define AVX_LENGTH 2
|
||||
#define MAVX_COUNT (MROWS+3)/AVX_LENGTH
|
||||
#define HAP_TYPE __m128i
|
||||
#define MASK_TYPE uint64_t
|
||||
#define MASK_ALL_ONES 0xFFFFFFFFFFFFFFFFL
|
||||
#define MASK_VEC MaskVec_D
|
||||
|
||||
#define VEC_EXTRACT_UNIT(__v1, __im) \
|
||||
_mm_extract_epi64(__v1, __im)
|
||||
_mm_extract_epi64(__v1, __im)
|
||||
|
||||
#define VEC_INSERT_UNIT(__v1,__ins,__im) \
|
||||
_mm_insert_epi64(__v1,__ins,__im)
|
||||
#define VEC_INSERT_UNIT(__v1,__ins,__im) \
|
||||
_mm_insert_epi64(__v1,__ins,__im)
|
||||
|
||||
#define VEC_OR(__v1, __v2) \
|
||||
_mm_or_pd(__v1, __v2)
|
||||
_mm_or_pd(__v1, __v2)
|
||||
|
||||
#define VEC_ADD(__v1, __v2) \
|
||||
_mm_add_pd(__v1, __v2)
|
||||
_mm_add_pd(__v1, __v2)
|
||||
|
||||
#define VEC_SUB(__v1, __v2) \
|
||||
_mm_sub_pd(__v1, __v2)
|
||||
_mm_sub_pd(__v1, __v2)
|
||||
|
||||
#define VEC_MUL(__v1, __v2) \
|
||||
_mm_mul_pd(__v1, __v2)
|
||||
_mm_mul_pd(__v1, __v2)
|
||||
|
||||
#define VEC_DIV(__v1, __v2) \
|
||||
_mm_div_pd(__v1, __v2)
|
||||
_mm_div_pd(__v1, __v2)
|
||||
|
||||
#define VEC_CMP_EQ(__v1, __v2) \
|
||||
_mm_cmpeq_pd(__v1, __v2)
|
||||
_mm_cmpeq_pd(__v1, __v2)
|
||||
|
||||
#define VEC_BLEND(__v1, __v2, __mask) \
|
||||
_mm_blend_pd(__v1, __v2, __mask)
|
||||
_mm_blend_pd(__v1, __v2, __mask)
|
||||
|
||||
#define VEC_BLENDV(__v1, __v2, __maskV) \
|
||||
_mm_blendv_pd(__v1, __v2, __maskV)
|
||||
_mm_blendv_pd(__v1, __v2, __maskV)
|
||||
|
||||
#define SHIFT_HAP(__v1, __val) \
|
||||
__v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0)
|
||||
__v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0)
|
||||
|
||||
#define VEC_CVT_128_256(__v1) \
|
||||
_mm_cvtepi32_pd(__v1)
|
||||
_mm_cvtepi32_pd(__v1)
|
||||
|
||||
#define VEC_SET1_VAL(__val) \
|
||||
_mm_set1_pd(__val)
|
||||
|
||||
#define VEC_SET1_VAL(__val) \
|
||||
_mm_set1_pd(__val)
|
||||
|
||||
#define VEC_POPCVT_CHAR(__ch) \
|
||||
_mm_cvtepi32_pd(_mm_set1_epi32(__ch))
|
||||
_mm_cvtepi32_pd(_mm_set1_epi32(__ch))
|
||||
|
||||
#define VEC_SET_LSE(__val) \
|
||||
_mm_set_pd(zero, __val);
|
||||
_mm_set_pd(zero, __val);
|
||||
|
||||
#define VEC_LDPOPCVT_CHAR(__addr) \
|
||||
_mm_cvtepi32_pd(_mm_loadu_si128((__m128i const *)__addr))
|
||||
_mm_cvtepi32_pd(_mm_loadu_si128((__m128i const *)__addr))
|
||||
|
||||
#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \
|
||||
__vdst = _mm_castsi128_pd(_mm_set_epi64(__vsHigh, __vsLow))
|
||||
__vdst = _mm_castsi128_pd(_mm_set_epi64(__vsHigh, __vsLow))
|
||||
|
||||
#define VEC_SHIFT_LEFT_1BIT(__vs) \
|
||||
__vs = _mm_slli_epi64(__vs, 1)
|
||||
__vs = _mm_slli_epi64(__vs, 1)
|
||||
|
||||
|
||||
class BitMaskVec_sse_double {
|
||||
|
||||
MASK_VEC combined_ ;
|
||||
MASK_VEC combined_ ;
|
||||
public:
|
||||
inline MASK_TYPE& getLowEntry(int index) {
|
||||
return combined_.masks[index] ;
|
||||
}
|
||||
inline MASK_TYPE& getHighEntry(int index) {
|
||||
return combined_.masks[AVX_LENGTH/2+index] ;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline MASK_TYPE& getLowEntry(int index) {
|
||||
return combined_.masks[index] ;
|
||||
}
|
||||
inline MASK_TYPE& getHighEntry(int index) {
|
||||
return combined_.masks[AVX_LENGTH/2+index] ;
|
||||
}
|
||||
|
||||
inline const _256_TYPE& getCombinedMask() {
|
||||
return combined_.vecf ;
|
||||
}
|
||||
|
||||
inline void shift_left_1bit() {
|
||||
VEC_SHIFT_LEFT_1BIT(combined_.vec) ;
|
||||
}
|
||||
inline const SIMD_TYPE& getCombinedMask() {
|
||||
return combined_.vecf ;
|
||||
}
|
||||
|
||||
inline void shift_left_1bit() {
|
||||
VEC_SHIFT_LEFT_1BIT(combined_.vec) ;
|
||||
}
|
||||
|
||||
} ;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,53 +1,51 @@
|
|||
#ifdef PRECISION
|
||||
#undef PRECISION
|
||||
#undef MAIN_TYPE
|
||||
#undef MAIN_TYPE_SIZE
|
||||
#undef UNION_TYPE
|
||||
#undef IF_128
|
||||
#undef IF_MAIN_TYPE
|
||||
#undef SHIFT_CONST1
|
||||
#undef SHIFT_CONST2
|
||||
#undef SHIFT_CONST3
|
||||
#undef _128_TYPE
|
||||
#undef _256_TYPE
|
||||
#undef AVX_LENGTH
|
||||
#undef MAVX_COUNT
|
||||
#undef HAP_TYPE
|
||||
#undef MASK_TYPE
|
||||
#undef MASK_ALL_ONES
|
||||
#undef PRECISION
|
||||
#undef MAIN_TYPE
|
||||
#undef MAIN_TYPE_SIZE
|
||||
#undef UNION_TYPE
|
||||
#undef IF_128
|
||||
#undef IF_MAIN_TYPE
|
||||
#undef SHIFT_CONST1
|
||||
#undef SHIFT_CONST2
|
||||
#undef SHIFT_CONST3
|
||||
#undef _128_TYPE
|
||||
#undef SIMD_TYPE
|
||||
#undef AVX_LENGTH
|
||||
#undef HAP_TYPE
|
||||
#undef MASK_TYPE
|
||||
#undef MASK_ALL_ONES
|
||||
|
||||
#undef VEC_EXTRACT_UNIT(__v1, __im)
|
||||
#undef VEC_INSERT_UNIT(__v1,__ins,__im)
|
||||
#undef SET_VEC_ZERO(__vec)
|
||||
#undef VEC_OR(__v1, __v2)
|
||||
#undef VEC_ADD(__v1, __v2)
|
||||
#undef VEC_SUB(__v1, __v2)
|
||||
#undef VEC_MUL(__v1, __v2)
|
||||
#undef VEC_DIV(__v1, __v2)
|
||||
#undef VEC_BLEND(__v1, __v2, __mask)
|
||||
#undef VEC_BLENDV(__v1, __v2, __maskV)
|
||||
#undef VEC_CAST_256_128(__v1)
|
||||
#undef VEC_EXTRACT_128(__v1, __im)
|
||||
#undef VEC_EXTRACT_UNIT(__v1, __im)
|
||||
#undef VEC_SET1_VAL128(__val)
|
||||
#undef VEC_MOVE(__v1, __val)
|
||||
#undef VEC_CAST_128_256(__v1)
|
||||
#undef VEC_INSERT_VAL(__v1, __val, __pos)
|
||||
#undef VEC_CVT_128_256(__v1)
|
||||
#undef VEC_SET1_VAL(__val)
|
||||
#undef VEC_POPCVT_CHAR(__ch)
|
||||
#undef VEC_LDPOPCVT_CHAR(__addr)
|
||||
#undef VEC_CMP_EQ(__v1, __v2)
|
||||
#undef VEC_SET_LSE(__val)
|
||||
#undef SHIFT_HAP(__v1, __val)
|
||||
#undef print256b(__v1)
|
||||
#undef MASK_VEC
|
||||
#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
|
||||
#undef VEC_SHIFT_LEFT_1BIT(__vs)
|
||||
#undef MASK_ALL_ONES
|
||||
#undef COMPARE_VECS(__v1, __v2)
|
||||
#undef _256_INT_TYPE
|
||||
#undef BITMASK_VEC
|
||||
#undef VEC_EXTRACT_UNIT(__v1, __im)
|
||||
#undef VEC_INSERT_UNIT(__v1,__ins,__im)
|
||||
#undef SET_VEC_ZERO(__vec)
|
||||
#undef VEC_OR(__v1, __v2)
|
||||
#undef VEC_ADD(__v1, __v2)
|
||||
#undef VEC_SUB(__v1, __v2)
|
||||
#undef VEC_MUL(__v1, __v2)
|
||||
#undef VEC_DIV(__v1, __v2)
|
||||
#undef VEC_BLEND(__v1, __v2, __mask)
|
||||
#undef VEC_BLENDV(__v1, __v2, __maskV)
|
||||
#undef VEC_CAST_256_128(__v1)
|
||||
#undef VEC_EXTRACT_128(__v1, __im)
|
||||
#undef VEC_EXTRACT_UNIT(__v1, __im)
|
||||
#undef VEC_SET1_VAL128(__val)
|
||||
#undef VEC_MOVE(__v1, __val)
|
||||
#undef VEC_CAST_128_256(__v1)
|
||||
#undef VEC_INSERT_VAL(__v1, __val, __pos)
|
||||
#undef VEC_CVT_128_256(__v1)
|
||||
#undef VEC_SET1_VAL(__val)
|
||||
#undef VEC_POPCVT_CHAR(__ch)
|
||||
#undef VEC_LDPOPCVT_CHAR(__addr)
|
||||
#undef VEC_CMP_EQ(__v1, __v2)
|
||||
#undef VEC_SET_LSE(__val)
|
||||
#undef SHIFT_HAP(__v1, __val)
|
||||
#undef MASK_VEC
|
||||
#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
|
||||
#undef VEC_SHIFT_LEFT_1BIT(__vs)
|
||||
#undef MASK_ALL_ONES
|
||||
#undef COMPARE_VECS(__v1, __v2)
|
||||
#undef _256_INT_TYPE
|
||||
#undef BITMASK_VEC
|
||||
#endif
|
||||
|
||||
#define SSE
|
||||
|
|
@ -62,89 +60,88 @@
|
|||
#define SHIFT_CONST2 4
|
||||
#define SHIFT_CONST3 0
|
||||
#define _128_TYPE __m128
|
||||
#define _256_TYPE __m128
|
||||
#define SIMD_TYPE __m128
|
||||
#define _256_INT_TYPE __m128i
|
||||
#define AVX_LENGTH 4
|
||||
#define MAVX_COUNT (MROWS+3)/AVX_LENGTH
|
||||
//#define MAVX_COUNT (MROWS+3)/AVX_LENGTH
|
||||
#define HAP_TYPE UNION_TYPE
|
||||
#define MASK_TYPE uint32_t
|
||||
#define MASK_ALL_ONES 0xFFFFFFFF
|
||||
#define MASK_VEC MaskVec_F
|
||||
|
||||
#define VEC_EXTRACT_UNIT(__v1, __im) \
|
||||
_mm_extract_epi32(__v1, __im)
|
||||
_mm_extract_epi32(__v1, __im)
|
||||
|
||||
#define VEC_INSERT_UNIT(__v1,__ins,__im) \
|
||||
_mm_insert_epi32(__v1,__ins,__im)
|
||||
#define VEC_INSERT_UNIT(__v1,__ins,__im) \
|
||||
_mm_insert_epi32(__v1,__ins,__im)
|
||||
|
||||
#define VEC_OR(__v1, __v2) \
|
||||
_mm_or_ps(__v1, __v2)
|
||||
_mm_or_ps(__v1, __v2)
|
||||
|
||||
#define VEC_ADD(__v1, __v2) \
|
||||
_mm_add_ps(__v1, __v2)
|
||||
_mm_add_ps(__v1, __v2)
|
||||
|
||||
#define VEC_SUB(__v1, __v2) \
|
||||
_mm_sub_ps(__v1, __v2)
|
||||
_mm_sub_ps(__v1, __v2)
|
||||
|
||||
#define VEC_MUL(__v1, __v2) \
|
||||
_mm_mul_ps(__v1, __v2)
|
||||
_mm_mul_ps(__v1, __v2)
|
||||
|
||||
#define VEC_DIV(__v1, __v2) \
|
||||
_mm_div_ps(__v1, __v2)
|
||||
_mm_div_ps(__v1, __v2)
|
||||
|
||||
#define VEC_CMP_EQ(__v1, __v2) \
|
||||
_mm_cmpeq_ps(__v1, __v2)
|
||||
_mm_cmpeq_ps(__v1, __v2)
|
||||
|
||||
#define VEC_BLEND(__v1, __v2, __mask) \
|
||||
_mm_blend_ps(__v1, __v2, __mask)
|
||||
_mm_blend_ps(__v1, __v2, __mask)
|
||||
|
||||
#define VEC_BLENDV(__v1, __v2, __maskV) \
|
||||
_mm_blendv_ps(__v1, __v2, __maskV)
|
||||
_mm_blendv_ps(__v1, __v2, __maskV)
|
||||
|
||||
#define SHIFT_HAP(__v1, __val) \
|
||||
_vector_shift_lastsses(__v1, __val.f)
|
||||
_vector_shift_lastsses(__v1, __val.f)
|
||||
|
||||
#define VEC_CVT_128_256(__v1) \
|
||||
_mm_cvtepi32_ps(__v1.i)
|
||||
_mm_cvtepi32_ps(__v1.i)
|
||||
|
||||
#define VEC_SET1_VAL(__val) \
|
||||
_mm_set1_ps(__val)
|
||||
|
||||
#define VEC_SET1_VAL(__val) \
|
||||
_mm_set1_ps(__val)
|
||||
|
||||
#define VEC_POPCVT_CHAR(__ch) \
|
||||
_mm_cvtepi32_ps(_mm_set1_epi32(__ch))
|
||||
_mm_cvtepi32_ps(_mm_set1_epi32(__ch))
|
||||
|
||||
#define VEC_SET_LSE(__val) \
|
||||
_mm_set_ps(zero, zero, zero, __val);
|
||||
_mm_set_ps(zero, zero, zero, __val);
|
||||
|
||||
#define VEC_LDPOPCVT_CHAR(__addr) \
|
||||
_mm_cvtepi32_ps(_mm_loadu_si128((__m128i const *)__addr))
|
||||
_mm_cvtepi32_ps(_mm_loadu_si128((__m128i const *)__addr))
|
||||
|
||||
#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \
|
||||
__vdst = _mm_cvtpi32x2_ps(__vsLow, __vsHigh)
|
||||
__vdst = _mm_cvtpi32x2_ps(__vsLow, __vsHigh)
|
||||
|
||||
#define VEC_SHIFT_LEFT_1BIT(__vs) \
|
||||
__vs = _mm_slli_epi32(__vs, 1)
|
||||
__vs = _mm_slli_epi32(__vs, 1)
|
||||
|
||||
class BitMaskVec_sse_float {
|
||||
|
||||
MASK_VEC combined_ ;
|
||||
MASK_VEC combined_ ;
|
||||
|
||||
public:
|
||||
|
||||
inline MASK_TYPE& getLowEntry(int index) {
|
||||
return combined_.masks[index] ;
|
||||
}
|
||||
inline MASK_TYPE& getHighEntry(int index) {
|
||||
return combined_.masks[AVX_LENGTH/2+index] ;
|
||||
}
|
||||
|
||||
inline const _256_TYPE& getCombinedMask() {
|
||||
return combined_.vecf ;
|
||||
}
|
||||
|
||||
inline void shift_left_1bit() {
|
||||
VEC_SHIFT_LEFT_1BIT(combined_.vec) ;
|
||||
}
|
||||
public:
|
||||
inline MASK_TYPE& getLowEntry(int index) {
|
||||
return combined_.masks[index] ;
|
||||
}
|
||||
inline MASK_TYPE& getHighEntry(int index) {
|
||||
return combined_.masks[AVX_LENGTH/2+index] ;
|
||||
}
|
||||
|
||||
inline const SIMD_TYPE& getCombinedMask() {
|
||||
return combined_.vecf ;
|
||||
}
|
||||
|
||||
inline void shift_left_1bit() {
|
||||
VEC_SHIFT_LEFT_1BIT(combined_.vec) ;
|
||||
}
|
||||
|
||||
} ;
|
||||
|
||||
|
|
|
|||
|
|
@ -68,7 +68,7 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless
|
|||
haplotypeBasesLength = env->GetArrayLength(haplotypeBasesGlobalRef);
|
||||
#ifdef ENABLE_ASSERTIONS
|
||||
assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI");
|
||||
assert(haplotypeBasesLength < MCOLS);
|
||||
//assert(haplotypeBasesLength < MCOLS);
|
||||
#endif
|
||||
#ifdef DEBUG0_1
|
||||
cout << "JNI haplotype length "<<haplotypeBasesLength<<"\n";
|
||||
|
|
@ -145,7 +145,7 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless
|
|||
assert(insertionGOPArray && "insertionGOP array not initialized in JNI");
|
||||
assert(deletionGOPArray && "deletionGOP array not initialized in JNI");
|
||||
assert(overallGCPArray && "overallGCP array not initialized in JNI");
|
||||
assert(readLength < MROWS);
|
||||
//assert(readLength < MROWS);
|
||||
assert(readLength == env->GetArrayLength(readQuals));
|
||||
assert(readLength == env->GetArrayLength(insertionGOP));
|
||||
assert(readLength == env->GetArrayLength(deletionGOP));
|
||||
|
|
|
|||
|
|
@ -4,435 +4,338 @@
|
|||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
//#define DEBUG
|
||||
#define MUSTAFA
|
||||
#define KARTHIK
|
||||
|
||||
/*
|
||||
template <class T>
|
||||
string getBinaryStr (T val, int numBitsToWrite) {
|
||||
|
||||
ostringstream oss ;
|
||||
uint64_t mask = ((T) 0x1) << (numBitsToWrite-1) ;
|
||||
for (int i=numBitsToWrite-1; i >= 0; --i) {
|
||||
oss << ((val & mask) >> i) ;
|
||||
mask >>= 1 ;
|
||||
}
|
||||
return oss.str() ;
|
||||
}
|
||||
*/
|
||||
#ifdef MUSTAFA
|
||||
void CONCAT(CONCAT(precompute_masks_,SIMD_ENGINE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) {
|
||||
|
||||
const int maskBitCnt = MAIN_TYPE_SIZE ;
|
||||
|
||||
void GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) {
|
||||
|
||||
const int maskBitCnt = MAIN_TYPE_SIZE ;
|
||||
|
||||
for (int vi=0; vi < numMaskVecs; ++vi) {
|
||||
for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) {
|
||||
maskArr[vi][rs] = 0 ;
|
||||
for (int vi=0; vi < numMaskVecs; ++vi) {
|
||||
for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) {
|
||||
maskArr[vi][rs] = 0 ;
|
||||
}
|
||||
maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ;
|
||||
}
|
||||
maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ;
|
||||
}
|
||||
|
||||
for (int col=1; col < COLS; ++col) {
|
||||
int mIndex = (col-1) / maskBitCnt ;
|
||||
int mOffset = (col-1) % maskBitCnt ;
|
||||
MASK_TYPE bitMask = ((MASK_TYPE)0x1) << (maskBitCnt-1-mOffset) ;
|
||||
|
||||
char hapChar = ConvertChar::get(tc.hap[col-1]);
|
||||
for (int col=1; col < COLS; ++col) {
|
||||
int mIndex = (col-1) / maskBitCnt ;
|
||||
int mOffset = (col-1) % maskBitCnt ;
|
||||
MASK_TYPE bitMask = ((MASK_TYPE)0x1) << (maskBitCnt-1-mOffset) ;
|
||||
|
||||
if (hapChar == AMBIG_CHAR) {
|
||||
for (int ci=0; ci < NUM_DISTINCT_CHARS; ++ci)
|
||||
maskArr[mIndex][ci] |= bitMask ;
|
||||
}
|
||||
char hapChar = ConvertChar::get(tc.hap[col-1]);
|
||||
|
||||
maskArr[mIndex][hapChar] |= bitMask ;
|
||||
// bit corresponding to col 1 will be the MSB of the mask 0
|
||||
// bit corresponding to col 2 will be the MSB-1 of the mask 0
|
||||
// ...
|
||||
// bit corresponding to col 32 will be the LSB of the mask 0
|
||||
// bit corresponding to col 33 will be the MSB of the mask 1
|
||||
// ...
|
||||
}
|
||||
if (hapChar == AMBIG_CHAR) {
|
||||
for (int ci=0; ci < NUM_DISTINCT_CHARS; ++ci)
|
||||
maskArr[mIndex][ci] |= bitMask ;
|
||||
}
|
||||
|
||||
maskArr[mIndex][hapChar] |= bitMask ;
|
||||
// bit corresponding to col 1 will be the MSB of the mask 0
|
||||
// bit corresponding to col 2 will be the MSB-1 of the mask 0
|
||||
// ...
|
||||
// bit corresponding to col 32 will be the LSB of the mask 0
|
||||
// bit corresponding to col 33 will be the MSB of the mask 1
|
||||
// ...
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess) {
|
||||
void CONCAT(CONCAT(init_masks_for_row_,SIMD_ENGINE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess) {
|
||||
|
||||
for (int ri=0; ri < numRowsToProcess; ++ri) {
|
||||
rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ;
|
||||
}
|
||||
for (int ri=0; ri < numRowsToProcess; ++ri) {
|
||||
rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ;
|
||||
}
|
||||
|
||||
for (int ei=0; ei < AVX_LENGTH; ++ei) {
|
||||
lastMaskShiftOut[ei] = 0 ;
|
||||
}
|
||||
for (int ei=0; ei < AVX_LENGTH; ++ei) {
|
||||
lastMaskShiftOut[ei] = 0 ;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \
|
||||
MASK_TYPE __bitMask = (((MASK_TYPE)0x1) << __shiftBy) - 1 ; \
|
||||
MASK_TYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \
|
||||
__dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ; \
|
||||
__lastShiftOut = __nextShiftOut ; \
|
||||
MASK_TYPE __bitMask = (((MASK_TYPE)0x1) << __shiftBy) - 1 ; \
|
||||
MASK_TYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \
|
||||
__dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ; \
|
||||
__lastShiftOut = __nextShiftOut ; \
|
||||
}
|
||||
|
||||
|
||||
void GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_, SIMD_TYPE), PRECISION)(int maskIndex, BITMASK_VEC& bitMaskVec, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, int maskBitCnt) {
|
||||
void CONCAT(CONCAT(update_masks_for_cols_,SIMD_ENGINE), PRECISION)(int maskIndex, BITMASK_VEC& bitMaskVec, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, int maskBitCnt) {
|
||||
|
||||
for (int ei=0; ei < AVX_LENGTH/2; ++ei) {
|
||||
SET_MASK_WORD(bitMaskVec.getLowEntry(ei), maskArr[maskIndex][rsArr[ei]],
|
||||
lastMaskShiftOut[ei], ei, maskBitCnt) ;
|
||||
|
||||
int ei2 = ei + AVX_LENGTH/2 ; // the second entry index
|
||||
SET_MASK_WORD(bitMaskVec.getHighEntry(ei), maskArr[maskIndex][rsArr[ei2]],
|
||||
lastMaskShiftOut[ei2], ei2, maskBitCnt) ;
|
||||
}
|
||||
for (int ei=0; ei < AVX_LENGTH/2; ++ei) {
|
||||
SET_MASK_WORD(bitMaskVec.getLowEntry(ei), maskArr[maskIndex][rsArr[ei]],
|
||||
lastMaskShiftOut[ei], ei, maskBitCnt) ;
|
||||
|
||||
int ei2 = ei + AVX_LENGTH/2 ; // the second entry index
|
||||
SET_MASK_WORD(bitMaskVec.getHighEntry(ei), maskArr[maskIndex][rsArr[ei2]],
|
||||
lastMaskShiftOut[ei2], ei2, maskBitCnt) ;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
//void GEN_INTRINSIC(computeDistVec, PRECISION) (BITMASK_VEC& bitMaskVec, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen, const _256_TYPE& distmSel, int firstRowIndex, int lastRowIndex) {
|
||||
inline void CONCAT(CONCAT(computeDistVec,SIMD_ENGINE), PRECISION) (BITMASK_VEC& bitMaskVec, SIMD_TYPE& distm, SIMD_TYPE& _1_distm, SIMD_TYPE& distmChosen) {
|
||||
|
||||
inline void GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec, SIMD_TYPE), PRECISION) (BITMASK_VEC& bitMaskVec, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen) {
|
||||
//#define computeDistVec() {
|
||||
distmChosen = VEC_BLENDV(distm, _1_distm, bitMaskVec.getCombinedMask()) ;
|
||||
|
||||
#ifdef DEBUGG
|
||||
long long *temp1 = (long long *)(&maskV);
|
||||
double *temp2 = (double *)(&distm);
|
||||
double *temp3 = (double *)(&_1_distm);
|
||||
printf("***\n%lx\n%lx\n%f\n%f\n%f\n%f\n***\n", temp1[0], temp1[1], temp2[0], temp2[1], temp3[0], temp3[1]);
|
||||
#endif
|
||||
bitMaskVec.shift_left_1bit() ;
|
||||
}
|
||||
|
||||
distmChosen = VEC_BLENDV(distm, _1_distm, bitMaskVec.getCombinedMask()) ;
|
||||
|
||||
/*COMPARE_VECS(distmChosen, distmSel, firstRowIndex, lastRowIndex) ;*/
|
||||
|
||||
bitMaskVec.shift_left_1bit() ;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
template<class NUMBER>
|
||||
struct HmmData {
|
||||
int ROWS ;
|
||||
int COLS ;
|
||||
|
||||
NUMBER shiftOutM[MROWS+MCOLS+AVX_LENGTH], shiftOutX[MROWS+MCOLS+AVX_LENGTH], shiftOutY[MROWS+MCOLS+AVX_LENGTH] ;
|
||||
Context<NUMBER> ctx ;
|
||||
testcase* tc ;
|
||||
_256_TYPE p_MM[MAVX_COUNT], p_GAPM[MAVX_COUNT], p_MX[MAVX_COUNT], p_XX[MAVX_COUNT], p_MY[MAVX_COUNT], p_YY[MAVX_COUNT], distm1D[MAVX_COUNT] ;
|
||||
_256_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY ;
|
||||
|
||||
UNION_TYPE M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y ;
|
||||
UNION_TYPE rs , rsN ;
|
||||
_256_TYPE distmSel;
|
||||
_256_TYPE distm, _1_distm;
|
||||
|
||||
} ;
|
||||
*/
|
||||
#endif // MUSTAFA
|
||||
|
||||
template<class NUMBER> void GEN_INTRINSIC(GEN_INTRINSIC(initializeVectors, SIMD_TYPE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context<NUMBER> ctx, testcase *tc, _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D)
|
||||
template<class NUMBER> void CONCAT(CONCAT(initializeVectors,SIMD_ENGINE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context<NUMBER> ctx, testcase *tc, SIMD_TYPE *p_MM, SIMD_TYPE *p_GAPM, SIMD_TYPE *p_MX, SIMD_TYPE *p_XX, SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, SIMD_TYPE *distm1D)
|
||||
{
|
||||
NUMBER zero = ctx._(0.0);
|
||||
NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
|
||||
for (int s=0;s<ROWS+COLS+AVX_LENGTH;s++)
|
||||
NUMBER zero = ctx._(0.0);
|
||||
NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
|
||||
for (int s=0;s<ROWS+COLS+AVX_LENGTH;s++)
|
||||
{
|
||||
shiftOutM[s] = zero;
|
||||
shiftOutX[s] = zero;
|
||||
shiftOutY[s] = init_Y;
|
||||
}
|
||||
|
||||
NUMBER *ptr_p_MM = (NUMBER *)p_MM;
|
||||
NUMBER *ptr_p_XX = (NUMBER *)p_XX;
|
||||
NUMBER *ptr_p_YY = (NUMBER *)p_YY;
|
||||
NUMBER *ptr_p_MX = (NUMBER *)p_MX;
|
||||
NUMBER *ptr_p_MY = (NUMBER *)p_MY;
|
||||
NUMBER *ptr_p_GAPM = (NUMBER *)p_GAPM;
|
||||
|
||||
*ptr_p_MM = ctx._(0.0);
|
||||
*ptr_p_XX = ctx._(0.0);
|
||||
*ptr_p_YY = ctx._(0.0);
|
||||
*ptr_p_MX = ctx._(0.0);
|
||||
*ptr_p_MY = ctx._(0.0);
|
||||
*ptr_p_GAPM = ctx._(0.0);
|
||||
|
||||
for (int r = 1; r < ROWS; r++)
|
||||
{
|
||||
int _i = tc->i[r-1] & 127;
|
||||
int _d = tc->d[r-1] & 127;
|
||||
int _c = tc->c[r-1] & 127;
|
||||
|
||||
*(ptr_p_MM+r-1) = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127];
|
||||
*(ptr_p_GAPM+r-1) = ctx._(1.0) - ctx.ph2pr[_c];
|
||||
*(ptr_p_MX+r-1) = ctx.ph2pr[_i];
|
||||
*(ptr_p_XX+r-1) = ctx.ph2pr[_c];
|
||||
*(ptr_p_MY+r-1) = ctx.ph2pr[_d];
|
||||
*(ptr_p_YY+r-1) = ctx.ph2pr[_c];
|
||||
}
|
||||
|
||||
NUMBER *ptr_distm1D = (NUMBER *)distm1D;
|
||||
for (int r = 1; r < ROWS; r++)
|
||||
{
|
||||
int _q = tc->q[r-1] & 127;
|
||||
ptr_distm1D[r-1] = ctx.ph2pr[_q];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<class NUMBER> inline void CONCAT(CONCAT(stripeINITIALIZATION,SIMD_ENGINE), PRECISION)(
|
||||
int stripeIdx, Context<NUMBER> ctx, testcase *tc, SIMD_TYPE &pGAPM, SIMD_TYPE &pMM, SIMD_TYPE &pMX, SIMD_TYPE &pXX, SIMD_TYPE &pMY, SIMD_TYPE &pYY,
|
||||
SIMD_TYPE &rs, UNION_TYPE &rsN, SIMD_TYPE &distm, SIMD_TYPE &_1_distm, SIMD_TYPE *distm1D, SIMD_TYPE N_packed256, SIMD_TYPE *p_MM , SIMD_TYPE *p_GAPM ,
|
||||
SIMD_TYPE *p_MX, SIMD_TYPE *p_XX , SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1,
|
||||
UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM)
|
||||
{
|
||||
int i = stripeIdx;
|
||||
pGAPM = p_GAPM[i];
|
||||
pMM = p_MM[i];
|
||||
pMX = p_MX[i];
|
||||
pXX = p_XX[i];
|
||||
pMY = p_MY[i];
|
||||
pYY = p_YY[i];
|
||||
|
||||
NUMBER zero = ctx._(0.0);
|
||||
NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
|
||||
UNION_TYPE packed1; packed1.d = VEC_SET1_VAL(1.0);
|
||||
UNION_TYPE packed3; packed3.d = VEC_SET1_VAL(3.0);
|
||||
|
||||
distm = distm1D[i];
|
||||
_1_distm = VEC_SUB(packed1.d, distm);
|
||||
|
||||
distm = VEC_DIV(distm, packed3.d);
|
||||
|
||||
/* initialize M_t_2, M_t_1, X_t_2, X_t_1, Y_t_2, Y_t_1 */
|
||||
M_t_2.d = VEC_SET1_VAL(zero);
|
||||
X_t_2.d = VEC_SET1_VAL(zero);
|
||||
|
||||
if (i==0) {
|
||||
M_t_1.d = VEC_SET1_VAL(zero);
|
||||
X_t_1.d = VEC_SET1_VAL(zero);
|
||||
Y_t_2.d = VEC_SET_LSE(init_Y);
|
||||
Y_t_1.d = VEC_SET1_VAL(zero);
|
||||
}
|
||||
else {
|
||||
X_t_1.d = VEC_SET_LSE(shiftOutX[AVX_LENGTH]);
|
||||
M_t_1.d = VEC_SET_LSE(shiftOutM[AVX_LENGTH]);
|
||||
Y_t_2.d = VEC_SET1_VAL(zero);
|
||||
Y_t_1.d = VEC_SET1_VAL(zero);
|
||||
}
|
||||
M_t_1_y = M_t_1;
|
||||
}
|
||||
|
||||
inline SIMD_TYPE CONCAT(CONCAT(computeDISTM,SIMD_ENGINE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, SIMD_TYPE rs, UNION_TYPE rsN, SIMD_TYPE N_packed256,
|
||||
SIMD_TYPE distm, SIMD_TYPE _1_distm)
|
||||
{
|
||||
UNION_TYPE hapN, rshap;
|
||||
SIMD_TYPE cond;
|
||||
IF_32 shiftInHap;
|
||||
|
||||
int *hap_ptr = tc->ihap;
|
||||
|
||||
shiftInHap.i = (d<COLS) ? hap_ptr[d-1] : hap_ptr[COLS-1];
|
||||
|
||||
/* shift hap */
|
||||
SHIFT_HAP(hap, shiftInHap);
|
||||
SIMD_TYPE hapF = VEC_CVT_128_256(hap);
|
||||
|
||||
rshap.d = VEC_CMP_EQ(rs, hapF);
|
||||
hapN.d = VEC_CMP_EQ(N_packed256, hapF);
|
||||
|
||||
/* OR rsN, rshap, hapN */
|
||||
cond = VEC_OR(rsN.d, rshap.d);
|
||||
cond = VEC_OR(cond, hapN.d);
|
||||
|
||||
/* distm1D = (cond) ? 1-distm1D : distm1D; */
|
||||
SIMD_TYPE distmSel = VEC_BLENDV(distm, _1_distm, cond);
|
||||
|
||||
return distmSel;
|
||||
}
|
||||
|
||||
|
||||
inline void CONCAT(CONCAT(computeMXY,SIMD_ENGINE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y,
|
||||
UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1,
|
||||
SIMD_TYPE pMM, SIMD_TYPE pGAPM, SIMD_TYPE pMX, SIMD_TYPE pXX, SIMD_TYPE pMY, SIMD_TYPE pYY, SIMD_TYPE distmSel)
|
||||
{
|
||||
/* Compute M_t <= distm * (p_MM*M_t_2 + p_GAPM*X_t_2 + p_GAPM*Y_t_2) */
|
||||
M_t.d = VEC_MUL(VEC_ADD(VEC_ADD(VEC_MUL(M_t_2.d, pMM), VEC_MUL(X_t_2.d, pGAPM)), VEC_MUL(Y_t_2.d, pGAPM)), distmSel);
|
||||
|
||||
M_t_y = M_t;
|
||||
|
||||
/* Compute X_t */
|
||||
X_t.d = VEC_ADD(VEC_MUL(M_t_1.d, pMX) , VEC_MUL(X_t_1.d, pXX));
|
||||
|
||||
/* Compute Y_t */
|
||||
Y_t.d = VEC_ADD(VEC_MUL(M_t_1_y.d, pMY) , VEC_MUL(Y_t_1.d, pYY));
|
||||
}
|
||||
|
||||
template<class NUMBER> NUMBER CONCAT(CONCAT(compute_full_prob_,SIMD_ENGINE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL)
|
||||
{
|
||||
int ROWS = tc->rslen + 1;
|
||||
int COLS = tc->haplen + 1;
|
||||
int MAVX_COUNT = (ROWS+AVX_LENGTH-1)/AVX_LENGTH;
|
||||
|
||||
SIMD_TYPE p_MM [MAVX_COUNT], p_GAPM [MAVX_COUNT], p_MX [MAVX_COUNT];
|
||||
SIMD_TYPE p_XX [MAVX_COUNT], p_MY [MAVX_COUNT], p_YY [MAVX_COUNT];
|
||||
SIMD_TYPE distm1D[MAVX_COUNT];
|
||||
NUMBER shiftOutM[ROWS+COLS+AVX_LENGTH], shiftOutX[ROWS+COLS+AVX_LENGTH], shiftOutY[ROWS+COLS+AVX_LENGTH];
|
||||
UNION_TYPE M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y;
|
||||
SIMD_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY;
|
||||
|
||||
struct timeval start, end;
|
||||
NUMBER result_avx2;
|
||||
Context<NUMBER> ctx;
|
||||
UNION_TYPE rs , rsN;
|
||||
HAP_TYPE hap;
|
||||
SIMD_TYPE distmSel, distmChosen ;
|
||||
SIMD_TYPE distm, _1_distm;
|
||||
|
||||
int r, c;
|
||||
NUMBER zero = ctx._(0.0);
|
||||
UNION_TYPE packed1; packed1.d = VEC_SET1_VAL(1.0);
|
||||
SIMD_TYPE N_packed256 = VEC_POPCVT_CHAR('N');
|
||||
NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
|
||||
int remainingRows = (ROWS-1) % AVX_LENGTH;
|
||||
int stripe_cnt = ((ROWS-1) / AVX_LENGTH) + (remainingRows!=0);
|
||||
|
||||
const int maskBitCnt = MAIN_TYPE_SIZE ;
|
||||
const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function
|
||||
|
||||
MASK_TYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ;
|
||||
CONCAT(CONCAT(precompute_masks_,SIMD_ENGINE), PRECISION)(*tc, COLS, numMaskVecs, maskArr) ;
|
||||
|
||||
char rsArr[AVX_LENGTH] ;
|
||||
MASK_TYPE lastMaskShiftOut[AVX_LENGTH] ;
|
||||
CONCAT(CONCAT(initializeVectors,SIMD_ENGINE), PRECISION)<NUMBER>(ROWS, COLS, shiftOutM, shiftOutX, shiftOutY,
|
||||
ctx, tc, p_MM, p_GAPM, p_MX, p_XX, p_MY, p_YY, distm1D);
|
||||
|
||||
for (int i=0;i<stripe_cnt-1;i++)
|
||||
{
|
||||
//STRIPE_INITIALIZATION
|
||||
CONCAT(CONCAT(stripeINITIALIZATION,SIMD_ENGINE), PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM ,
|
||||
p_MX, p_XX , p_MY, p_YY, M_t_2, X_t_2, M_t_1, X_t_1, Y_t_2, Y_t_1, M_t_1_y, shiftOutX, shiftOutM);
|
||||
CONCAT(CONCAT(init_masks_for_row_,SIMD_ENGINE), PRECISION)(*tc, rsArr, lastMaskShiftOut, i*AVX_LENGTH+1, AVX_LENGTH) ;
|
||||
// Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
|
||||
|
||||
BITMASK_VEC bitMaskVec ;
|
||||
|
||||
for (int begin_d=1;begin_d<COLS+AVX_LENGTH;begin_d+=MAIN_TYPE_SIZE)
|
||||
{
|
||||
shiftOutM[s] = zero;
|
||||
shiftOutX[s] = zero;
|
||||
shiftOutY[s] = init_Y;
|
||||
int numMaskBitsToProcess = std::min(MAIN_TYPE_SIZE, COLS+AVX_LENGTH-begin_d) ;
|
||||
CONCAT(CONCAT(update_masks_for_cols_,SIMD_ENGINE), PRECISION)((begin_d-1)/MAIN_TYPE_SIZE, bitMaskVec, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;
|
||||
|
||||
for (int mbi=0; mbi < numMaskBitsToProcess; ++mbi) {
|
||||
CONCAT(CONCAT(computeDistVec,SIMD_ENGINE), PRECISION) (bitMaskVec, distm, _1_distm, distmChosen) ;
|
||||
int ShiftIdx = begin_d + mbi + AVX_LENGTH;
|
||||
|
||||
CONCAT(CONCAT(computeMXY,SIMD_ENGINE), PRECISION)(M_t, X_t, Y_t, M_t_y, M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, M_t_1_y, Y_t_1,
|
||||
pMM, pGAPM, pMX, pXX, pMY, pYY, distmChosen);
|
||||
|
||||
CONCAT(CONCAT(_vector_shift,SIMD_ENGINE), PRECISION)(M_t, shiftOutM[ShiftIdx], shiftOutM[begin_d+mbi]);
|
||||
|
||||
CONCAT(CONCAT(_vector_shift,SIMD_ENGINE), PRECISION)(X_t, shiftOutX[ShiftIdx], shiftOutX[begin_d+mbi]);
|
||||
|
||||
CONCAT(CONCAT(_vector_shift,SIMD_ENGINE), PRECISION)(Y_t_1, shiftOutY[ShiftIdx], shiftOutY[begin_d+mbi]);
|
||||
|
||||
M_t_2 = M_t_1; M_t_1 = M_t; X_t_2 = X_t_1; X_t_1 = X_t;
|
||||
Y_t_2 = Y_t_1; Y_t_1 = Y_t; M_t_1_y = M_t_y;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
NUMBER *ptr_p_MM = (NUMBER *)p_MM;
|
||||
NUMBER *ptr_p_XX = (NUMBER *)p_XX;
|
||||
NUMBER *ptr_p_YY = (NUMBER *)p_YY;
|
||||
NUMBER *ptr_p_MX = (NUMBER *)p_MX;
|
||||
NUMBER *ptr_p_MY = (NUMBER *)p_MY;
|
||||
NUMBER *ptr_p_GAPM = (NUMBER *)p_GAPM;
|
||||
int i = stripe_cnt-1;
|
||||
{
|
||||
//STRIPE_INITIALIZATION
|
||||
CONCAT(CONCAT(stripeINITIALIZATION,SIMD_ENGINE), PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM ,
|
||||
p_MX, p_XX , p_MY, p_YY, M_t_2, X_t_2, M_t_1, X_t_1, Y_t_2, Y_t_1, M_t_1_y, shiftOutX, shiftOutM);
|
||||
|
||||
*ptr_p_MM = ctx._(0.0);
|
||||
*ptr_p_XX = ctx._(0.0);
|
||||
*ptr_p_YY = ctx._(0.0);
|
||||
*ptr_p_MX = ctx._(0.0);
|
||||
*ptr_p_MY = ctx._(0.0);
|
||||
*ptr_p_GAPM = ctx._(0.0);
|
||||
if (remainingRows==0) remainingRows=AVX_LENGTH;
|
||||
CONCAT(CONCAT(init_masks_for_row_,SIMD_ENGINE), PRECISION)(*tc, rsArr, lastMaskShiftOut, i*AVX_LENGTH+1, remainingRows) ;
|
||||
|
||||
SIMD_TYPE sumM, sumX;
|
||||
sumM = VEC_SET1_VAL(zero);
|
||||
sumX = VEC_SET1_VAL(zero);
|
||||
|
||||
for (int r = 1; r < ROWS; r++)
|
||||
// Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
|
||||
BITMASK_VEC bitMaskVec ;
|
||||
|
||||
for (int begin_d=1;begin_d<COLS+remainingRows-1;begin_d+=MAIN_TYPE_SIZE)
|
||||
{
|
||||
int _i = tc->i[r-1] & 127;
|
||||
int _d = tc->d[r-1] & 127;
|
||||
int _c = tc->c[r-1] & 127;
|
||||
int numMaskBitsToProcess = std::min(MAIN_TYPE_SIZE, COLS+remainingRows-1-begin_d) ;
|
||||
CONCAT(CONCAT(update_masks_for_cols_,SIMD_ENGINE),PRECISION)((begin_d-1)/MAIN_TYPE_SIZE, bitMaskVec, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;
|
||||
|
||||
*(ptr_p_MM+r-1) = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127];
|
||||
*(ptr_p_GAPM+r-1) = ctx._(1.0) - ctx.ph2pr[_c];
|
||||
*(ptr_p_MX+r-1) = ctx.ph2pr[_i];
|
||||
*(ptr_p_XX+r-1) = ctx.ph2pr[_c];
|
||||
#ifdef KARTHIK
|
||||
*(ptr_p_MY+r-1) = ctx.ph2pr[_d];
|
||||
*(ptr_p_YY+r-1) = ctx.ph2pr[_c];
|
||||
#else
|
||||
*(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
|
||||
*(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
|
||||
#endif
|
||||
|
||||
for (int mbi=0; mbi < numMaskBitsToProcess; ++mbi) {
|
||||
|
||||
CONCAT(CONCAT(computeDistVec,SIMD_ENGINE), PRECISION) (bitMaskVec, distm, _1_distm, distmChosen) ;
|
||||
int ShiftIdx = begin_d + mbi +AVX_LENGTH;
|
||||
|
||||
CONCAT(CONCAT(computeMXY,SIMD_ENGINE), PRECISION)(M_t, X_t, Y_t, M_t_y, M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, M_t_1_y, Y_t_1,
|
||||
pMM, pGAPM, pMX, pXX, pMY, pYY, distmChosen);
|
||||
|
||||
sumM = VEC_ADD(sumM, M_t.d);
|
||||
CONCAT(CONCAT(_vector_shift_last,SIMD_ENGINE), PRECISION)(M_t, shiftOutM[ShiftIdx]);
|
||||
|
||||
sumX = VEC_ADD(sumX, X_t.d);
|
||||
CONCAT(CONCAT(_vector_shift_last,SIMD_ENGINE), PRECISION)(X_t, shiftOutX[ShiftIdx]);
|
||||
|
||||
CONCAT(CONCAT(_vector_shift_last,SIMD_ENGINE), PRECISION)(Y_t_1, shiftOutY[ShiftIdx]);
|
||||
|
||||
M_t_2 = M_t_1; M_t_1 = M_t; X_t_2 = X_t_1; X_t_1 = X_t;
|
||||
Y_t_2 = Y_t_1; Y_t_1 = Y_t; M_t_1_y = M_t_y;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
NUMBER *ptr_distm1D = (NUMBER *)distm1D;
|
||||
for (int r = 1; r < ROWS; r++)
|
||||
{
|
||||
int _q = tc->q[r-1] & 127;
|
||||
ptr_distm1D[r-1] = ctx.ph2pr[_q];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<class NUMBER> inline void GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION, SIMD_TYPE), PRECISION)(
|
||||
int stripIdx, Context<NUMBER> ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY,
|
||||
_256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm, _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM ,
|
||||
_256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1,
|
||||
UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM)
|
||||
{
|
||||
int i = stripIdx;
|
||||
pGAPM = p_GAPM[i];
|
||||
pMM = p_MM[i];
|
||||
pMX = p_MX[i];
|
||||
pXX = p_XX[i];
|
||||
pMY = p_MY[i];
|
||||
pYY = p_YY[i];
|
||||
|
||||
NUMBER zero = ctx._(0.0);
|
||||
NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
|
||||
UNION_TYPE packed1; packed1.d = VEC_SET1_VAL(1.0);
|
||||
UNION_TYPE packed3; packed3.d = VEC_SET1_VAL(3.0);
|
||||
/* compare rs and N */
|
||||
#ifndef MUSTAFA
|
||||
rs = VEC_LDPOPCVT_CHAR((tc->irs+i*AVX_LENGTH));
|
||||
rsN.d = VEC_CMP_EQ(N_packed256, rs);
|
||||
#endif
|
||||
distm = distm1D[i];
|
||||
_1_distm = VEC_SUB(packed1.d, distm);
|
||||
|
||||
#ifdef KARTHIK
|
||||
distm = VEC_DIV(distm, packed3.d);
|
||||
#endif
|
||||
/* initialize M_t_2, M_t_1, X_t_2, X_t_1, Y_t_2, Y_t_1 */
|
||||
M_t_2.d = VEC_SET1_VAL(zero);
|
||||
X_t_2.d = VEC_SET1_VAL(zero);
|
||||
|
||||
if (i==0) {
|
||||
M_t_1.d = VEC_SET1_VAL(zero);
|
||||
X_t_1.d = VEC_SET1_VAL(zero);
|
||||
Y_t_2.d = VEC_SET_LSE(init_Y);
|
||||
Y_t_1.d = VEC_SET1_VAL(zero);
|
||||
}
|
||||
else {
|
||||
X_t_1.d = VEC_SET_LSE(shiftOutX[AVX_LENGTH]);
|
||||
M_t_1.d = VEC_SET_LSE(shiftOutM[AVX_LENGTH]);
|
||||
Y_t_2.d = VEC_SET1_VAL(zero);
|
||||
Y_t_1.d = VEC_SET1_VAL(zero);
|
||||
}
|
||||
M_t_1_y = M_t_1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
inline _256_TYPE GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM, SIMD_TYPE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256,
|
||||
_256_TYPE distm, _256_TYPE _1_distm)
|
||||
{
|
||||
UNION_TYPE hapN, rshap;
|
||||
_256_TYPE cond;
|
||||
IF_32 shiftInHap;
|
||||
|
||||
int *hap_ptr = tc->ihap;
|
||||
|
||||
shiftInHap.i = (d<COLS) ? hap_ptr[d-1] : hap_ptr[COLS-1];
|
||||
|
||||
/* shift hap */
|
||||
SHIFT_HAP(hap, shiftInHap);
|
||||
_256_TYPE hapF = VEC_CVT_128_256(hap);
|
||||
|
||||
rshap.d = VEC_CMP_EQ(rs, hapF);
|
||||
hapN.d = VEC_CMP_EQ(N_packed256, hapF);
|
||||
|
||||
/* OR rsN, rshap, hapN */
|
||||
cond = VEC_OR(rsN.d, rshap.d);
|
||||
cond = VEC_OR(cond, hapN.d);
|
||||
|
||||
/* distm1D = (cond) ? 1-distm1D : distm1D; */
|
||||
_256_TYPE distmSel = VEC_BLENDV(distm, _1_distm, cond);
|
||||
|
||||
return distmSel;
|
||||
}
|
||||
|
||||
|
||||
inline void GEN_INTRINSIC(GEN_INTRINSIC(computeMXY, SIMD_TYPE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y,
|
||||
UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1,
|
||||
_256_TYPE pMM, _256_TYPE pGAPM, _256_TYPE pMX, _256_TYPE pXX, _256_TYPE pMY, _256_TYPE pYY, _256_TYPE distmSel)
|
||||
{
|
||||
/* Compute M_t <= distm * (p_MM*M_t_2 + p_GAPM*X_t_2 + p_GAPM*Y_t_2) */
|
||||
M_t.d = VEC_MUL(VEC_ADD(VEC_ADD(VEC_MUL(M_t_2.d, pMM), VEC_MUL(X_t_2.d, pGAPM)), VEC_MUL(Y_t_2.d, pGAPM)), distmSel);
|
||||
|
||||
#ifdef DEBUG
|
||||
double *temp1 = (double *)(&pGAPM);
|
||||
double *temp2 = (double *)(&pMM);
|
||||
double *temp3 = (double *)(&distmSel);
|
||||
printf("%f\n%f\n%f\n%f\n%f\n%f\n", temp1[0], temp1[1], temp2[0], temp2[1], temp3[0], temp3[1]);
|
||||
//printf("%f\n%f\n%f\n%f\n", X_t_2.f[0], X_t_2.f[1], Y_t_2.f[0], Y_t_2.f[1]);
|
||||
printf("%f\n%f\n----------------------------------------------------------------------------\n", M_t.f[0], M_t.f[1]);
|
||||
#endif
|
||||
M_t_y = M_t;
|
||||
|
||||
/* Compute X_t */
|
||||
X_t.d = VEC_ADD(VEC_MUL(M_t_1.d, pMX) , VEC_MUL(X_t_1.d, pXX));
|
||||
|
||||
/* Compute Y_t */
|
||||
Y_t.d = VEC_ADD(VEC_MUL(M_t_1_y.d, pMY) , VEC_MUL(Y_t_1.d, pYY));
|
||||
}
|
||||
|
||||
template<class NUMBER> NUMBER GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_,SIMD_TYPE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL)
|
||||
{
|
||||
_256_TYPE p_MM [MAVX_COUNT], p_GAPM [MAVX_COUNT], p_MX [MAVX_COUNT];
|
||||
_256_TYPE p_XX [MAVX_COUNT], p_MY [MAVX_COUNT], p_YY [MAVX_COUNT];
|
||||
_256_TYPE distm1D[MAVX_COUNT];
|
||||
NUMBER shiftOutM[MROWS+MCOLS+AVX_LENGTH], shiftOutX[MROWS+MCOLS+AVX_LENGTH], shiftOutY[MROWS+MCOLS+AVX_LENGTH];
|
||||
UNION_TYPE M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y;
|
||||
_256_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY;
|
||||
|
||||
struct timeval start, end;
|
||||
NUMBER result_avx2;
|
||||
Context<NUMBER> ctx;
|
||||
UNION_TYPE rs , rsN;
|
||||
HAP_TYPE hap;
|
||||
_256_TYPE distmSel, distmChosen ;
|
||||
_256_TYPE distm, _1_distm;
|
||||
|
||||
int r, c;
|
||||
int ROWS = tc->rslen + 1;
|
||||
int COLS = tc->haplen + 1;
|
||||
int AVX_COUNT = (ROWS+7)/8;
|
||||
NUMBER zero = ctx._(0.0);
|
||||
UNION_TYPE packed1; packed1.d = VEC_SET1_VAL(1.0);
|
||||
_256_TYPE N_packed256 = VEC_POPCVT_CHAR('N');
|
||||
NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
|
||||
int remainingRows = (ROWS-1) % AVX_LENGTH;
|
||||
int strip_cnt = ((ROWS-1) / AVX_LENGTH) + (remainingRows!=0);
|
||||
|
||||
#ifdef MUSTAFA
|
||||
const int maskBitCnt = MAIN_TYPE_SIZE ;
|
||||
const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function
|
||||
|
||||
MASK_TYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ;
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(*tc, COLS, numMaskVecs, maskArr) ;
|
||||
|
||||
char rsArr[AVX_LENGTH] ;
|
||||
MASK_TYPE lastMaskShiftOut[AVX_LENGTH] ;
|
||||
#endif
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(initializeVectors,SIMD_TYPE), PRECISION)<NUMBER>(ROWS, COLS, shiftOutM, shiftOutX, shiftOutY,
|
||||
ctx, tc, p_MM, p_GAPM, p_MX, p_XX, p_MY, p_YY, distm1D);
|
||||
|
||||
//for (int __ii=0; __ii < 10; ++__ii)
|
||||
for (int i=0;i<strip_cnt-1;i++)
|
||||
{
|
||||
//STRIP_INITIALIZATION
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION,SIMD_TYPE), PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM ,
|
||||
p_MX, p_XX , p_MY, p_YY, M_t_2, X_t_2, M_t_1, X_t_1, Y_t_2, Y_t_1, M_t_1_y, shiftOutX, shiftOutM);
|
||||
#ifdef MUSTAFA
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(*tc, rsArr, lastMaskShiftOut, i*AVX_LENGTH+1, AVX_LENGTH) ;
|
||||
#endif
|
||||
// Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
|
||||
|
||||
BITMASK_VEC bitMaskVec ;
|
||||
|
||||
for (int begin_d=1;begin_d<COLS+AVX_LENGTH;begin_d+=MAIN_TYPE_SIZE)
|
||||
{
|
||||
int numMaskBitsToProcess = std::min(MAIN_TYPE_SIZE, COLS+AVX_LENGTH-begin_d) ;
|
||||
#ifdef MUSTAFA
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_,SIMD_TYPE), PRECISION)((begin_d-1)/MAIN_TYPE_SIZE, bitMaskVec, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;
|
||||
#endif
|
||||
|
||||
// if (d % MAIN_TYPE_SIZE == 1)
|
||||
|
||||
for (int mbi=0; mbi < numMaskBitsToProcess; ++mbi) {
|
||||
#ifdef MUSTAFA
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec,SIMD_TYPE), PRECISION) (bitMaskVec, distm, _1_distm, distmChosen) ;
|
||||
#else
|
||||
distmChosen = GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM,SIMD_TYPE), PRECISION)(begin_d+mbi, COLS, tc, hap, rs.d, rsN, N_packed256, distm, _1_distm);
|
||||
#endif
|
||||
int ShiftIdx = begin_d + mbi + AVX_LENGTH;
|
||||
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(computeMXY,SIMD_TYPE), PRECISION)(M_t, X_t, Y_t, M_t_y, M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, M_t_1_y, Y_t_1,
|
||||
pMM, pGAPM, pMX, pXX, pMY, pYY, distmChosen);
|
||||
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift, SIMD_TYPE), PRECISION)(M_t, shiftOutM[ShiftIdx], shiftOutM[begin_d+mbi]);
|
||||
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift, SIMD_TYPE), PRECISION)(X_t, shiftOutX[ShiftIdx], shiftOutX[begin_d+mbi]);
|
||||
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift, SIMD_TYPE), PRECISION)(Y_t_1, shiftOutY[ShiftIdx], shiftOutY[begin_d+mbi]);
|
||||
|
||||
M_t_2 = M_t_1; M_t_1 = M_t; X_t_2 = X_t_1; X_t_1 = X_t;
|
||||
Y_t_2 = Y_t_1; Y_t_1 = Y_t; M_t_1_y = M_t_y;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int i = strip_cnt-1;
|
||||
{
|
||||
//STRIP_INITIALIZATION
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION,SIMD_TYPE), PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM ,
|
||||
p_MX, p_XX , p_MY, p_YY, M_t_2, X_t_2, M_t_1, X_t_1, Y_t_2, Y_t_1, M_t_1_y, shiftOutX, shiftOutM);
|
||||
|
||||
if (remainingRows==0) remainingRows=AVX_LENGTH;
|
||||
#ifdef MUSTAFA
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(*tc, rsArr, lastMaskShiftOut, i*AVX_LENGTH+1, remainingRows) ;
|
||||
#endif
|
||||
_256_TYPE sumM, sumX;
|
||||
sumM = VEC_SET1_VAL(zero);
|
||||
sumX = VEC_SET1_VAL(zero);
|
||||
|
||||
// Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
|
||||
BITMASK_VEC bitMaskVec ;
|
||||
|
||||
for (int begin_d=1;begin_d<COLS+remainingRows-1;begin_d+=MAIN_TYPE_SIZE)
|
||||
{
|
||||
|
||||
int numMaskBitsToProcess = std::min(MAIN_TYPE_SIZE, COLS+remainingRows-1-begin_d) ;
|
||||
#ifdef MUSTAFA
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_, SIMD_TYPE),PRECISION)((begin_d-1)/MAIN_TYPE_SIZE, bitMaskVec, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;
|
||||
#endif
|
||||
|
||||
for (int mbi=0; mbi < numMaskBitsToProcess; ++mbi) {
|
||||
|
||||
#ifdef MUSTAFA
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec, SIMD_TYPE), PRECISION) (bitMaskVec, distm, _1_distm, distmChosen) ;
|
||||
#else
|
||||
distmChosen = GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM,SIMD_TYPE), PRECISION)(begin_d+mbi, COLS, tc, hap, rs.d, rsN, N_packed256, distm, _1_distm);
|
||||
#endif
|
||||
int ShiftIdx = begin_d + mbi +AVX_LENGTH;
|
||||
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(computeMXY, SIMD_TYPE), PRECISION)(M_t, X_t, Y_t, M_t_y, M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, M_t_1_y, Y_t_1,
|
||||
pMM, pGAPM, pMX, pXX, pMY, pYY, distmChosen);
|
||||
|
||||
sumM = VEC_ADD(sumM, M_t.d);
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION)(M_t, shiftOutM[ShiftIdx]);
|
||||
|
||||
sumX = VEC_ADD(sumX, X_t.d);
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION)(X_t, shiftOutX[ShiftIdx]);
|
||||
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION)(Y_t_1, shiftOutY[ShiftIdx]);
|
||||
|
||||
M_t_2 = M_t_1; M_t_1 = M_t; X_t_2 = X_t_1; X_t_1 = X_t;
|
||||
Y_t_2 = Y_t_1; Y_t_1 = Y_t; M_t_1_y = M_t_y;
|
||||
|
||||
}
|
||||
}
|
||||
UNION_TYPE sumMX;
|
||||
sumMX.d = VEC_ADD(sumM, sumX);
|
||||
result_avx2 = sumMX.f[remainingRows-1];
|
||||
}
|
||||
//printf("result_avx2: %f\n", result_avx2);
|
||||
return result_avx2;
|
||||
UNION_TYPE sumMX;
|
||||
sumMX.d = VEC_ADD(sumM, sumX);
|
||||
result_avx2 = sumMX.f[remainingRows-1];
|
||||
}
|
||||
return result_avx2;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -1,10 +1,9 @@
|
|||
#include "headers.h"
|
||||
|
||||
#include "template.h"
|
||||
#include "vector_defs.h"
|
||||
|
||||
#define SIMD_TYPE avx
|
||||
#define SIMD_TYPE_AVX
|
||||
#define SIMD_ENGINE avx
|
||||
#define SIMD_ENGINE_AVX
|
||||
|
||||
|
||||
#define BATCH_SIZE 10000
|
||||
|
|
@ -13,127 +12,77 @@
|
|||
double getCurrClk();
|
||||
int thread_level_parallelism_enabled = false ;
|
||||
|
||||
void print128b_F(__m128 x)
|
||||
{
|
||||
float *p = (float *)(&x);
|
||||
for (int i=3;i>=0;i--)
|
||||
printf("%f ", *(p+i));
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
void print128b_D(__m128d x)
|
||||
{
|
||||
double *p = (double *)(&x);
|
||||
for (int i=1;i>=0;i--)
|
||||
printf("%f ", *(p+i));
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
/*
|
||||
IF_128f x;
|
||||
x.f = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
|
||||
IF_32 shiftIn, shiftOut;
|
||||
shiftIn.f = 5.0f;
|
||||
print128b_F(x.f);
|
||||
GEN_INTRINSIC(_vector_shift, s)(x, shiftIn, shiftOut);
|
||||
print128b_F(x.f);
|
||||
testcase* tc = new testcase[BATCH_SIZE];
|
||||
float result[BATCH_SIZE], result_avxf;
|
||||
double result_avxd;
|
||||
double lastClk = 0.0 ;
|
||||
double aggregateTimeRead = 0.0;
|
||||
double aggregateTimeCompute = 0.0;
|
||||
double aggregateTimeWrite = 0.0;
|
||||
|
||||
IF_128d y;
|
||||
y.f = _mm_set_pd(10.0, 11.0);
|
||||
IF_64 shiftInd, shiftOutd;
|
||||
shiftInd.f = 12.0;
|
||||
print128b_D(y.f);
|
||||
GEN_INTRINSIC(_vector_shift, d)(y, shiftInd, shiftOutd);
|
||||
print128b_D(y.f);
|
||||
// Need to call it once to initialize the static array
|
||||
ConvertChar::init() ;
|
||||
|
||||
exit(0);
|
||||
*/
|
||||
// char* ompEnvVar = getenv("OMP_NUM_THREADS") ;
|
||||
// if (ompEnvVar != NULL && ompEnvVar != "" && ompEnvVar != "1" ) {
|
||||
// thread_level_parallelism_enabled = true ;
|
||||
// }
|
||||
|
||||
testcase* tc = new testcase[BATCH_SIZE];
|
||||
float result[BATCH_SIZE], result_avxf;
|
||||
double result_avxd;
|
||||
//struct timeval start, end;
|
||||
double lastClk = 0.0 ;
|
||||
double aggregateTimeRead = 0.0;
|
||||
double aggregateTimeCompute = 0.0;
|
||||
double aggregateTimeWrite = 0.0;
|
||||
bool noMoreData = false;
|
||||
int count =0;
|
||||
while (!noMoreData)
|
||||
{
|
||||
int read_count = BATCH_SIZE;
|
||||
|
||||
// Need to call it once to initialize the static array
|
||||
ConvertChar::init() ;
|
||||
|
||||
lastClk = getCurrClk() ;
|
||||
for (int b=0;b<BATCH_SIZE;b++)
|
||||
if (read_testcase(&tc[b])==-1)
|
||||
{
|
||||
read_count = b;
|
||||
noMoreData = true;
|
||||
break;
|
||||
}
|
||||
aggregateTimeRead += (getCurrClk() - lastClk) ;
|
||||
lastClk = getCurrClk() ;
|
||||
|
||||
// char* ompEnvVar = getenv("OMP_NUM_THREADS") ;
|
||||
// if (ompEnvVar != NULL && ompEnvVar != "" && ompEnvVar != "1" ) {
|
||||
// thread_level_parallelism_enabled = true ;
|
||||
// }
|
||||
|
||||
bool noMoreData = false;
|
||||
int count =0;
|
||||
while (!noMoreData)
|
||||
//#pragma omp parallel for schedule(dynamic) if(thread_level_parallelism_enabled)
|
||||
for (int b=0;b<read_count;b++)
|
||||
{
|
||||
int read_count = BATCH_SIZE;
|
||||
|
||||
lastClk = getCurrClk() ;
|
||||
for (int b=0;b<BATCH_SIZE;b++)
|
||||
if (read_testcase(&tc[b])==-1)
|
||||
{
|
||||
read_count = b;
|
||||
noMoreData = true;
|
||||
break;
|
||||
}
|
||||
//gettimeofday(&end, NULL);
|
||||
aggregateTimeRead += (getCurrClk() - lastClk) ;
|
||||
//((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
|
||||
result_avxf = CONCAT(CONCAT(compute_full_prob_,SIMD_ENGINE), s)<float>(&tc[b]);
|
||||
|
||||
//gettimeofday(&start, NULL);
|
||||
lastClk = getCurrClk() ;
|
||||
|
||||
//#pragma omp parallel for schedule(dynamic) if(thread_level_parallelism_enabled)
|
||||
for (int b=0;b<read_count;b++)
|
||||
{
|
||||
result_avxf = GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_, SIMD_TYPE), s)<float>(&tc[b]);
|
||||
|
||||
#ifdef RUN_HYBRID
|
||||
#define MIN_ACCEPTED 1e-28f
|
||||
if (result_avxf < MIN_ACCEPTED) {
|
||||
//printf("**************** RUNNING DOUBLE ******************\n");
|
||||
count++;
|
||||
result_avxd = GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_, SIMD_TYPE), d)<double>(&tc[b]);
|
||||
result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f));
|
||||
}
|
||||
else
|
||||
result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
|
||||
#endif
|
||||
|
||||
#ifndef RUN_HYBRID
|
||||
result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
|
||||
#endif
|
||||
|
||||
}
|
||||
//gettimeofday(&end, NULL);
|
||||
aggregateTimeCompute += (getCurrClk() - lastClk) ;
|
||||
//((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
|
||||
|
||||
//gettimeofday(&start, NULL);
|
||||
lastClk = getCurrClk() ;
|
||||
//for (int b=0;b<read_count;b++)
|
||||
//printf("%E\n", result[b]);
|
||||
//gettimeofday(&end, NULL);
|
||||
aggregateTimeWrite += (getCurrClk() - lastClk) ;
|
||||
//((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
|
||||
#ifdef RUN_HYBRID
|
||||
#define MIN_ACCEPTED 1e-28f
|
||||
if (result_avxf < MIN_ACCEPTED) {
|
||||
count++;
|
||||
result_avxd = CONCAT(CONCAT(compute_full_prob_,SIMD_ENGINE), d)<double>(&tc[b]);
|
||||
result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f));
|
||||
}
|
||||
else
|
||||
result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
|
||||
#endif
|
||||
|
||||
#ifndef RUN_HYBRID
|
||||
result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
|
||||
#endif
|
||||
}
|
||||
aggregateTimeCompute += (getCurrClk() - lastClk) ;
|
||||
lastClk = getCurrClk() ;
|
||||
for (int b=0;b<read_count;b++)
|
||||
printf("%E\n", result[b]);
|
||||
aggregateTimeWrite += (getCurrClk() - lastClk) ;
|
||||
}
|
||||
|
||||
delete tc;
|
||||
printf("AVX Read Time: %.2f\n", aggregateTimeRead);
|
||||
printf("AVX Compute Time: %.2f\n", aggregateTimeCompute);
|
||||
printf("AVX Write Time: %.2f\n", aggregateTimeWrite);
|
||||
printf("AVX Total Time: %.2f\n", aggregateTimeRead + aggregateTimeCompute + aggregateTimeWrite);
|
||||
printf("# Double called: %d\n", count);
|
||||
delete tc;
|
||||
printf("AVX Read Time: %.2f\n", aggregateTimeRead);
|
||||
printf("AVX Compute Time: %.2f\n", aggregateTimeCompute);
|
||||
printf("AVX Write Time: %.2f\n", aggregateTimeWrite);
|
||||
printf("AVX Total Time: %.2f\n", aggregateTimeRead + aggregateTimeCompute + aggregateTimeWrite);
|
||||
printf("# Double called: %d\n", count);
|
||||
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,88 +1,86 @@
|
|||
#ifdef PRECISION
|
||||
#ifdef PRECISION
|
||||
|
||||
#ifdef SIMD_TYPE_AVX
|
||||
#ifdef SIMD_ENGINE_AVX
|
||||
|
||||
inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift,SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut)
|
||||
inline void CONCAT(CONCAT(_vector_shift,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut)
|
||||
{
|
||||
|
||||
IF_128 xlow , xhigh;
|
||||
/* cast x to xlow */
|
||||
xlow.f = VEC_CAST_256_128(x.d);
|
||||
/* extract x,1 to xhigh */
|
||||
xhigh.f = VEC_EXTRACT_128(x.d, 1);
|
||||
/* extract xlow[3] */
|
||||
IF_128 shiftOutL128;
|
||||
shiftOutL128.i = _mm_srli_si128(xlow.i, SHIFT_CONST1);
|
||||
/* extract xhigh[3] */
|
||||
IF_MAIN_TYPE shiftOutH;
|
||||
shiftOutH.i = VEC_EXTRACT_UNIT(xhigh.i, SHIFT_CONST2);
|
||||
shiftOut = shiftOutH.f;
|
||||
/* shift xlow */
|
||||
xlow.i = _mm_slli_si128 (xlow.i, SHIFT_CONST3);
|
||||
/* shift xhigh */
|
||||
xhigh.i = _mm_slli_si128 (xhigh.i, SHIFT_CONST3);
|
||||
/*movss shiftIn to xlow[0] */
|
||||
_128_TYPE shiftIn128 = VEC_SET1_VAL128(shiftIn);
|
||||
xlow.f = VEC_MOVE(xlow.f , shiftIn128);
|
||||
/*movss xlow[3] to xhigh[0] */
|
||||
xhigh.f = VEC_MOVE(xhigh.f, shiftOutL128.f);
|
||||
/* cast xlow to x */
|
||||
x.d = VEC_CAST_128_256(xlow.f);
|
||||
/* insert xhigh to x,1 */
|
||||
x.d = VEC_INSERT_VAL(x.d, xhigh.f, 1);
|
||||
IF_128 xlow , xhigh;
|
||||
/* cast x to xlow */
|
||||
xlow.f = VEC_CAST_256_128(x.d);
|
||||
/* extract x,1 to xhigh */
|
||||
xhigh.f = VEC_EXTRACT_128(x.d, 1);
|
||||
/* extract xlow[3] */
|
||||
IF_128 shiftOutL128;
|
||||
shiftOutL128.i = _mm_srli_si128(xlow.i, SHIFT_CONST1);
|
||||
/* extract xhigh[3] */
|
||||
IF_MAIN_TYPE shiftOutH;
|
||||
shiftOutH.i = VEC_EXTRACT_UNIT(xhigh.i, SHIFT_CONST2);
|
||||
shiftOut = shiftOutH.f;
|
||||
/* shift xlow */
|
||||
xlow.i = _mm_slli_si128 (xlow.i, SHIFT_CONST3);
|
||||
/* shift xhigh */
|
||||
xhigh.i = _mm_slli_si128 (xhigh.i, SHIFT_CONST3);
|
||||
/*movss shiftIn to xlow[0] */
|
||||
_128_TYPE shiftIn128 = VEC_SET1_VAL128(shiftIn);
|
||||
xlow.f = VEC_MOVE(xlow.f , shiftIn128);
|
||||
/*movss xlow[3] to xhigh[0] */
|
||||
xhigh.f = VEC_MOVE(xhigh.f, shiftOutL128.f);
|
||||
/* cast xlow to x */
|
||||
x.d = VEC_CAST_128_256(xlow.f);
|
||||
/* insert xhigh to x,1 */
|
||||
x.d = VEC_INSERT_VAL(x.d, xhigh.f, 1);
|
||||
}
|
||||
|
||||
|
||||
inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn)
|
||||
inline void CONCAT(CONCAT(_vector_shift_last,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn)
|
||||
{
|
||||
|
||||
IF_128 xlow , xhigh;
|
||||
/* cast x to xlow */
|
||||
xlow.f = VEC_CAST_256_128(x.d);
|
||||
/* extract x,1 to xhigh */
|
||||
xhigh.f = VEC_EXTRACT_128(x.d, 1);
|
||||
/* extract xlow[3] */
|
||||
IF_128 shiftOutL128;
|
||||
shiftOutL128.i = _mm_srli_si128(xlow.i, SHIFT_CONST1);
|
||||
/* shift xlow */
|
||||
xlow.i = _mm_slli_si128 (xlow.i, SHIFT_CONST3);
|
||||
/* shift xhigh */
|
||||
xhigh.i = _mm_slli_si128 (xhigh.i, SHIFT_CONST3);
|
||||
/*movss shiftIn to xlow[0] */
|
||||
_128_TYPE shiftIn128 = VEC_SET1_VAL128(shiftIn);
|
||||
xlow.f = VEC_MOVE(xlow.f , shiftIn128);
|
||||
/*movss xlow[3] to xhigh[0] */
|
||||
xhigh.f = VEC_MOVE(xhigh.f, shiftOutL128.f);
|
||||
/* cast xlow to x */
|
||||
x.d = VEC_CAST_128_256(xlow.f);
|
||||
/* insert xhigh to x,1 */
|
||||
x.d = VEC_INSERT_VAL(x.d, xhigh.f, 1);
|
||||
IF_128 xlow , xhigh;
|
||||
/* cast x to xlow */
|
||||
xlow.f = VEC_CAST_256_128(x.d);
|
||||
/* extract x,1 to xhigh */
|
||||
xhigh.f = VEC_EXTRACT_128(x.d, 1);
|
||||
/* extract xlow[3] */
|
||||
IF_128 shiftOutL128;
|
||||
shiftOutL128.i = _mm_srli_si128(xlow.i, SHIFT_CONST1);
|
||||
/* shift xlow */
|
||||
xlow.i = _mm_slli_si128 (xlow.i, SHIFT_CONST3);
|
||||
/* shift xhigh */
|
||||
xhigh.i = _mm_slli_si128 (xhigh.i, SHIFT_CONST3);
|
||||
/*movss shiftIn to xlow[0] */
|
||||
_128_TYPE shiftIn128 = VEC_SET1_VAL128(shiftIn);
|
||||
xlow.f = VEC_MOVE(xlow.f , shiftIn128);
|
||||
/*movss xlow[3] to xhigh[0] */
|
||||
xhigh.f = VEC_MOVE(xhigh.f, shiftOutL128.f);
|
||||
/* cast xlow to x */
|
||||
x.d = VEC_CAST_128_256(xlow.f);
|
||||
/* insert xhigh to x,1 */
|
||||
x.d = VEC_INSERT_VAL(x.d, xhigh.f, 1);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef SIMD_TYPE_SSE
|
||||
#ifdef SIMD_ENGINE_SSE
|
||||
|
||||
inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift, SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut)
|
||||
inline void CONCAT(CONCAT(_vector_shift,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut)
|
||||
{
|
||||
IF_MAIN_TYPE tempIn, tempOut;
|
||||
tempIn.f = shiftIn;
|
||||
/* extratc H */
|
||||
tempOut.i = VEC_EXTRACT_UNIT(x.i, SHIFT_CONST1);
|
||||
shiftOut = tempOut.f;
|
||||
/* shift */
|
||||
x.i = _mm_slli_si128(x.i, SHIFT_CONST2);
|
||||
/* insert L */
|
||||
x.i = VEC_INSERT_UNIT(x.i , tempIn.i, SHIFT_CONST3);
|
||||
IF_MAIN_TYPE tempIn, tempOut;
|
||||
tempIn.f = shiftIn;
|
||||
/* extratc H */
|
||||
tempOut.i = VEC_EXTRACT_UNIT(x.i, SHIFT_CONST1);
|
||||
shiftOut = tempOut.f;
|
||||
/* shift */
|
||||
x.i = _mm_slli_si128(x.i, SHIFT_CONST2);
|
||||
/* insert L */
|
||||
x.i = VEC_INSERT_UNIT(x.i , tempIn.i, SHIFT_CONST3);
|
||||
}
|
||||
|
||||
inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn)
|
||||
inline void CONCAT(CONCAT(_vector_shift_last,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn)
|
||||
{
|
||||
IF_MAIN_TYPE temp; temp.f = shiftIn;
|
||||
/* shift */
|
||||
x.i = _mm_slli_si128(x.i, SHIFT_CONST2);
|
||||
/* insert L */
|
||||
x.i = VEC_INSERT_UNIT(x.i , temp.i, SHIFT_CONST3);
|
||||
IF_MAIN_TYPE temp; temp.f = shiftIn;
|
||||
/* shift */
|
||||
x.i = _mm_slli_si128(x.i, SHIFT_CONST2);
|
||||
/* insert L */
|
||||
x.i = VEC_INSERT_UNIT(x.i , temp.i, SHIFT_CONST3);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "template.h"
|
||||
|
||||
#undef SIMD_TYPE
|
||||
#undef SIMD_TYPE_AVX
|
||||
#undef SIMD_ENGINE
|
||||
#undef SIMD_ENGINE_AVX
|
||||
|
||||
#define SIMD_TYPE sse
|
||||
#define SIMD_TYPE_SSE
|
||||
#define SIMD_ENGINE sse
|
||||
#define SIMD_ENGINE_SSE
|
||||
|
||||
#include "define-sse-float.h"
|
||||
#include "shift_template.c"
|
||||
|
|
|
|||
|
|
@ -10,11 +10,11 @@
|
|||
#define MY 4
|
||||
#define YY 5
|
||||
|
||||
#define MROWS 500
|
||||
#define MCOLS 1000
|
||||
//#define MROWS 500
|
||||
//#define MCOLS 1000
|
||||
|
||||
#define CAT(X,Y) X####Y
|
||||
#define GEN_INTRINSIC(X,Y) CAT(X,Y)
|
||||
#define CONCAT(X,Y) CAT(X,Y)
|
||||
|
||||
#define ALIGNED __attribute__((aligned(32)))
|
||||
|
||||
|
|
|
|||
|
|
@ -96,7 +96,7 @@ int read_testcase(testcase *tc, FILE* ifp)
|
|||
|
||||
tc->haplen = strlen(tc->hap);
|
||||
tc->rslen = strlen(tc->rs);
|
||||
assert(tc->rslen < MROWS);
|
||||
//assert(tc->rslen < MROWS);
|
||||
tc->ihap = (int *) malloc(tc->haplen*sizeof(int));
|
||||
tc->irs = (int *) malloc(tc->rslen*sizeof(int));
|
||||
|
||||
|
|
@ -216,7 +216,7 @@ int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat)
|
|||
//cout << "Lengths "<<tc->haplen <<" "<<tc->rslen<<"\n";
|
||||
memcpy(tc->rs, tokens[1].c_str(),tokens[1].size());
|
||||
assert(tokens.size() == 2 + 4*(tc->rslen));
|
||||
assert(tc->rslen < MROWS);
|
||||
//assert(tc->rslen < MROWS);
|
||||
for(unsigned j=0;j<tc->rslen;++j)
|
||||
tc->q[j] = (char)convToInt(tokens[2+0*tc->rslen+j]);
|
||||
for(unsigned j=0;j<tc->rslen;++j)
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
#undef SIMD_TYPE
|
||||
#undef SIMD_TYPE_AVX
|
||||
#undef SIMD_TYPE_SSE
|
||||
#undef SIMD_ENGINE
|
||||
#undef SIMD_ENGINE_AVX
|
||||
#undef SIMD_ENGINE_SSE
|
||||
|
||||
#define SIMD_TYPE avx
|
||||
#define SIMD_TYPE_AVX
|
||||
#define SIMD_ENGINE avx
|
||||
#define SIMD_ENGINE_AVX
|
||||
|
||||
#include "define-float.h"
|
||||
#include "vector_function_prototypes.h"
|
||||
|
|
@ -11,11 +11,11 @@
|
|||
#include "define-double.h"
|
||||
#include "vector_function_prototypes.h"
|
||||
|
||||
#undef SIMD_TYPE
|
||||
#undef SIMD_TYPE_AVX
|
||||
#undef SIMD_ENGINE
|
||||
#undef SIMD_ENGINE_AVX
|
||||
|
||||
#define SIMD_TYPE sse
|
||||
#define SIMD_TYPE_SSE
|
||||
#define SIMD_ENGINE sse
|
||||
#define SIMD_ENGINE_SSE
|
||||
|
||||
|
||||
#include "define-sse-float.h"
|
||||
|
|
@ -24,7 +24,7 @@
|
|||
#include "define-sse-double.h"
|
||||
#include "vector_function_prototypes.h"
|
||||
|
||||
#undef SIMD_TYPE
|
||||
#undef SIMD_TYPE_AVX
|
||||
#undef SIMD_TYPE_SSE
|
||||
#undef SIMD_ENGINE
|
||||
#undef SIMD_ENGINE_AVX
|
||||
#undef SIMD_ENGINE_SSE
|
||||
|
||||
|
|
|
|||
|
|
@ -1,19 +1,19 @@
|
|||
inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift,SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut);
|
||||
inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last,SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn);
|
||||
inline void GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]);
|
||||
inline void GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess);
|
||||
inline void GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_,SIMD_TYPE), PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt);
|
||||
inline void GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec,SIMD_TYPE), PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen);
|
||||
template<class NUMBER> inline void GEN_INTRINSIC(GEN_INTRINSIC(initializeVectors,SIMD_TYPE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context<NUMBER> ctx, testcase *tc, _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D);
|
||||
template<class NUMBER> inline void GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION,SIMD_TYPE), PRECISION)(
|
||||
int stripIdx, Context<NUMBER> ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY,
|
||||
_256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm, _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM ,
|
||||
_256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1,
|
||||
inline void CONCAT(CONCAT(_vector_shift,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut);
|
||||
inline void CONCAT(CONCAT(_vector_shift_last,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn);
|
||||
inline void CONCAT(CONCAT(precompute_masks_,SIMD_ENGINE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]);
|
||||
inline void CONCAT(CONCAT(init_masks_for_row_,SIMD_ENGINE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess);
|
||||
inline void CONCAT(CONCAT(update_masks_for_cols_,SIMD_ENGINE), PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt);
|
||||
inline void CONCAT(CONCAT(computeDistVec,SIMD_ENGINE), PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, SIMD_TYPE& distm, SIMD_TYPE& _1_distm, SIMD_TYPE& distmChosen);
|
||||
template<class NUMBER> inline void CONCAT(CONCAT(initializeVectors,SIMD_ENGINE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context<NUMBER> ctx, testcase *tc, SIMD_TYPE *p_MM, SIMD_TYPE *p_GAPM, SIMD_TYPE *p_MX, SIMD_TYPE *p_XX, SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, SIMD_TYPE *distm1D);
|
||||
template<class NUMBER> inline void CONCAT(CONCAT(stripINITIALIZATION,SIMD_ENGINE), PRECISION)(
|
||||
int stripIdx, Context<NUMBER> ctx, testcase *tc, SIMD_TYPE &pGAPM, SIMD_TYPE &pMM, SIMD_TYPE &pMX, SIMD_TYPE &pXX, SIMD_TYPE &pMY, SIMD_TYPE &pYY,
|
||||
SIMD_TYPE &rs, UNION_TYPE &rsN, SIMD_TYPE &distm, SIMD_TYPE &_1_distm, SIMD_TYPE *distm1D, SIMD_TYPE N_packed256, SIMD_TYPE *p_MM , SIMD_TYPE *p_GAPM ,
|
||||
SIMD_TYPE *p_MX, SIMD_TYPE *p_XX , SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1,
|
||||
UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM);
|
||||
inline _256_TYPE GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM,SIMD_TYPE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256,
|
||||
_256_TYPE distm, _256_TYPE _1_distm);
|
||||
inline void GEN_INTRINSIC(GEN_INTRINSIC(computeMXY,SIMD_TYPE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y,
|
||||
inline SIMD_TYPE CONCAT(CONCAT(computeDISTM,SIMD_ENGINE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, SIMD_TYPE rs, UNION_TYPE rsN, SIMD_TYPE N_packed256,
|
||||
SIMD_TYPE distm, SIMD_TYPE _1_distm);
|
||||
inline void CONCAT(CONCAT(computeMXY,SIMD_ENGINE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y,
|
||||
UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1,
|
||||
_256_TYPE pMM, _256_TYPE pGAPM, _256_TYPE pMX, _256_TYPE pXX, _256_TYPE pMY, _256_TYPE pYY, _256_TYPE distmSel);
|
||||
template<class NUMBER> NUMBER GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_,SIMD_TYPE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL);
|
||||
SIMD_TYPE pMM, SIMD_TYPE pGAPM, SIMD_TYPE pMX, SIMD_TYPE pXX, SIMD_TYPE pMY, SIMD_TYPE pYY, SIMD_TYPE distmSel);
|
||||
template<class NUMBER> NUMBER CONCAT(CONCAT(compute_full_prob_,SIMD_ENGINE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL);
|
||||
|
||||
|
|
|
|||
26
build.xml
26
build.xml
|
|
@ -270,21 +270,21 @@
|
|||
<mkdir dir="${lib.dir}"/>
|
||||
<mkdir dir="${ivy.jar.dir}"/>
|
||||
|
||||
<!-- Comment out the following lines to build the GATK without a network connection, assuming you have all of the libraries cached already -->
|
||||
<!-- Comment out the following lines to build the GATK without a network connection, assuming you have all of the libraries cached already -->
|
||||
|
||||
<!--<get src="http://repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/${ivy.jar.file}"-->
|
||||
<!--dest="${ivy.jar.dir}/${ivy.jar.file}"-->
|
||||
<!--usetimestamp="true"/>-->
|
||||
<taskdef resource="org/apache/ivy/ant/antlib.xml"
|
||||
uri="antlib:org.apache.ivy.ant"
|
||||
classpath="${ivy.jar.dir}/${ivy.jar.file}"/>
|
||||
<get src="http://repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/${ivy.jar.file}"
|
||||
dest="${ivy.jar.dir}/${ivy.jar.file}"
|
||||
usetimestamp="true"/>
|
||||
<taskdef resource="org/apache/ivy/ant/antlib.xml"
|
||||
uri="antlib:org.apache.ivy.ant"
|
||||
classpath="${ivy.jar.dir}/${ivy.jar.file}"/>
|
||||
|
||||
<!--<get src="http://repo1.maven.org/maven2/org/apache/maven/maven-ant-tasks/${maven-ant-tasks.install.version}/${maven-ant-tasks.jar.file}"-->
|
||||
<!--dest="${ivy.jar.dir}/${maven-ant-tasks.jar.file}"-->
|
||||
<!--usetimestamp="true"/>-->
|
||||
<taskdef resource="org/apache/maven/artifact/ant/antlib.xml"
|
||||
uri="antlib:antlib:org.apache.maven.artifact.ant"
|
||||
classpath="${ivy.jar.dir}/${maven-ant-tasks.jar.file}"/>
|
||||
<get src="http://repo1.maven.org/maven2/org/apache/maven/maven-ant-tasks/${maven-ant-tasks.install.version}/${maven-ant-tasks.jar.file}"
|
||||
dest="${ivy.jar.dir}/${maven-ant-tasks.jar.file}"
|
||||
usetimestamp="true"/>
|
||||
<taskdef resource="org/apache/maven/artifact/ant/antlib.xml"
|
||||
uri="antlib:antlib:org.apache.maven.artifact.ant"
|
||||
classpath="${ivy.jar.dir}/${maven-ant-tasks.jar.file}"/>
|
||||
|
||||
<!-- End network lines -->
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue