From d53e2fbe66849858ebc78191e43ecbf0eb772541 Mon Sep 17 00:00:00 2001 From: Intel Repocontact Date: Thu, 16 Jan 2014 21:55:04 -0800 Subject: [PATCH 1/2] Uncommenting download option in build.xml --- build.xml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/build.xml b/build.xml index 943b082cb..732beb568 100644 --- a/build.xml +++ b/build.xml @@ -270,21 +270,21 @@ - + - - - - + + - - - - + + From e7598dde8b1f0ab49e51a61c71bf3c1c626ea56b Mon Sep 17 00:00:00 2001 From: mghodrat Date: Sun, 26 Jan 2014 11:36:06 -0800 Subject: [PATCH 2/2] Clean up --- PairHMM_JNI/avx_function_instantiations.cc | 8 +- PairHMM_JNI/baseline.cc | 8 +- PairHMM_JNI/define-double.h | 236 +++--- PairHMM_JNI/define-float.h | 229 +++--- PairHMM_JNI/define-sse-double.h | 171 +++-- PairHMM_JNI/define-sse-float.h | 173 +++-- ...e_sting_utils_pairhmm_JNILoglessPairHMM.cc | 6 +- PairHMM_JNI/pairhmm-template-kernel.cc | 689 ++++++++---------- PairHMM_JNI/pairhmm-template-main.cc | 169 ++--- PairHMM_JNI/shift_template.c | 136 ++-- PairHMM_JNI/sse_function_instantiations.cc | 8 +- PairHMM_JNI/template.h | 6 +- PairHMM_JNI/utils.cc | 4 +- PairHMM_JNI/vector_defs.h | 24 +- PairHMM_JNI/vector_function_prototypes.h | 32 +- 15 files changed, 863 insertions(+), 1036 deletions(-) diff --git a/PairHMM_JNI/avx_function_instantiations.cc b/PairHMM_JNI/avx_function_instantiations.cc index 4118fc5cf..8f0de827d 100644 --- a/PairHMM_JNI/avx_function_instantiations.cc +++ b/PairHMM_JNI/avx_function_instantiations.cc @@ -1,10 +1,10 @@ #include "template.h" -#undef SIMD_TYPE -#undef SIMD_TYPE_SSE +#undef SIMD_ENGINE +#undef SIMD_ENGINE_SSE -#define SIMD_TYPE avx -#define SIMD_TYPE_AVX +#define SIMD_ENGINE avx +#define SIMD_ENGINE_AVX #include "define-float.h" #include "shift_template.c" diff --git a/PairHMM_JNI/baseline.cc b/PairHMM_JNI/baseline.cc index b953c4436..2f80acdb0 100644 --- a/PairHMM_JNI/baseline.cc +++ b/PairHMM_JNI/baseline.cc @@ -10,10 +10,10 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL) Context ctx; - NUMBER M[MROWS][MCOLS]; - NUMBER X[MROWS][MCOLS]; - NUMBER Y[MROWS][MCOLS]; - NUMBER p[MROWS][6]; + NUMBER M[ROWS][COLS]; + NUMBER X[ROWS][COLS]; + NUMBER Y[ROWS][COLS]; + NUMBER p[ROWS][6]; p[0][MM] = ctx._(0.0); p[0][GapM] = ctx._(0.0); diff --git a/PairHMM_JNI/define-double.h b/PairHMM_JNI/define-double.h index 502b919fe..83589a13d 100644 --- a/PairHMM_JNI/define-double.h +++ b/PairHMM_JNI/define-double.h @@ -1,53 +1,51 @@ #include #ifdef PRECISION - #undef PRECISION - #undef MAIN_TYPE - #undef MAIN_TYPE_SIZE - #undef UNION_TYPE - #undef IF_128 - #undef IF_MAIN_TYPE - #undef SHIFT_CONST1 - #undef SHIFT_CONST2 - #undef SHIFT_CONST3 - #undef _128_TYPE - #undef _256_TYPE - #undef AVX_LENGTH - #undef MAVX_COUNT - #undef HAP_TYPE - #undef MASK_TYPE - #undef MASK_ALL_ONES +#undef PRECISION +#undef MAIN_TYPE +#undef MAIN_TYPE_SIZE +#undef UNION_TYPE +#undef IF_128 +#undef IF_MAIN_TYPE +#undef SHIFT_CONST1 +#undef SHIFT_CONST2 +#undef SHIFT_CONST3 +#undef _128_TYPE +#undef SIMD_TYPE +#undef AVX_LENGTH +#undef HAP_TYPE +#undef MASK_TYPE +#undef MASK_ALL_ONES - #undef SET_VEC_ZERO(__vec) - #undef VEC_OR(__v1, __v2) - #undef VEC_ADD(__v1, __v2) - #undef VEC_SUB(__v1, __v2) - #undef VEC_MUL(__v1, __v2) - #undef VEC_DIV(__v1, __v2) - #undef VEC_BLEND(__v1, __v2, __mask) - #undef VEC_BLENDV(__v1, __v2, __maskV) - #undef VEC_CAST_256_128(__v1) - #undef VEC_EXTRACT_128(__v1, __im) - #undef VEC_EXTRACT_UNIT(__v1, __im) - #undef VEC_SET1_VAL128(__val) - #undef VEC_MOVE(__v1, __val) - #undef VEC_CAST_128_256(__v1) - #undef VEC_INSERT_VAL(__v1, __val, __pos) - #undef VEC_CVT_128_256(__v1) - #undef VEC_SET1_VAL(__val) - #undef VEC_POPCVT_CHAR(__ch) - #undef VEC_LDPOPCVT_CHAR(__addr) - #undef VEC_CMP_EQ(__v1, __v2) - #undef VEC_SET_LSE(__val) - #undef SHIFT_HAP(__v1, __val) - #undef print256b(__v1) - #undef MASK_VEC - #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) - #undef VEC_SHIFT_LEFT_1BIT(__vs) - #undef MASK_ALL_ONES - #undef COMPARE_VECS(__v1, __v2) - #undef _256_INT_TYPE - #undef BITMASK_VEC +#undef SET_VEC_ZERO(__vec) +#undef VEC_OR(__v1, __v2) +#undef VEC_ADD(__v1, __v2) +#undef VEC_SUB(__v1, __v2) +#undef VEC_MUL(__v1, __v2) +#undef VEC_DIV(__v1, __v2) +#undef VEC_BLEND(__v1, __v2, __mask) +#undef VEC_BLENDV(__v1, __v2, __maskV) +#undef VEC_CAST_256_128(__v1) +#undef VEC_EXTRACT_128(__v1, __im) +#undef VEC_EXTRACT_UNIT(__v1, __im) +#undef VEC_SET1_VAL128(__val) +#undef VEC_MOVE(__v1, __val) +#undef VEC_CAST_128_256(__v1) +#undef VEC_INSERT_VAL(__v1, __val, __pos) +#undef VEC_CVT_128_256(__v1) +#undef VEC_SET1_VAL(__val) +#undef VEC_POPCVT_CHAR(__ch) +#undef VEC_LDPOPCVT_CHAR(__addr) +#undef VEC_CMP_EQ(__v1, __v2) +#undef VEC_SET_LSE(__val) +#undef SHIFT_HAP(__v1, __val) +#undef MASK_VEC +#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) +#undef VEC_SHIFT_LEFT_1BIT(__vs) +#undef MASK_ALL_ONES +#undef COMPARE_VECS(__v1, __v2) +#undef _256_INT_TYPE +#undef BITMASK_VEC #endif #define PRECISION d @@ -60,128 +58,122 @@ #define SHIFT_CONST2 1 #define SHIFT_CONST3 8 #define _128_TYPE __m128d -#define _256_TYPE __m256d +#define SIMD_TYPE __m256d #define _256_INT_TYPE __m256i #define AVX_LENGTH 4 -#define MAVX_COUNT (MROWS+7)/AVX_LENGTH #define HAP_TYPE __m128i #define MASK_TYPE uint64_t #define MASK_ALL_ONES 0xFFFFFFFFFFFFFFFF #define MASK_VEC MaskVec_D #define SET_VEC_ZERO(__vec) \ - __vec= _mm256_setzero_pd() + __vec= _mm256_setzero_pd() #define VEC_OR(__v1, __v2) \ - _mm256_or_pd(__v1, __v2) + _mm256_or_pd(__v1, __v2) #define VEC_ADD(__v1, __v2) \ - _mm256_add_pd(__v1, __v2) + _mm256_add_pd(__v1, __v2) #define VEC_SUB(__v1, __v2) \ - _mm256_sub_pd(__v1, __v2) + _mm256_sub_pd(__v1, __v2) #define VEC_MUL(__v1, __v2) \ - _mm256_mul_pd(__v1, __v2) + _mm256_mul_pd(__v1, __v2) -#define VEC_DIV(__v1, __v2) \ - _mm256_div_pd(__v1, __v2) +#define VEC_DIV(__v1, __v2) \ + _mm256_div_pd(__v1, __v2) #define VEC_BLEND(__v1, __v2, __mask) \ - _mm256_blend_pd(__v1, __v2, __mask) + _mm256_blend_pd(__v1, __v2, __mask) #define VEC_BLENDV(__v1, __v2, __maskV) \ - _mm256_blendv_pd(__v1, __v2, __maskV) + _mm256_blendv_pd(__v1, __v2, __maskV) -#define VEC_CAST_256_128(__v1) \ - _mm256_castpd256_pd128 (__v1) +#define VEC_CAST_256_128(__v1) \ + _mm256_castpd256_pd128 (__v1) -#define VEC_EXTRACT_128(__v1, __im) \ - _mm256_extractf128_pd (__v1, __im) +#define VEC_EXTRACT_128(__v1, __im) \ + _mm256_extractf128_pd (__v1, __im) -#define VEC_EXTRACT_UNIT(__v1, __im) \ - _mm_extract_epi64(__v1, __im) +#define VEC_EXTRACT_UNIT(__v1, __im) \ + _mm_extract_epi64(__v1, __im) -#define VEC_SET1_VAL128(__val) \ - _mm_set1_pd(__val) +#define VEC_SET1_VAL128(__val) \ + _mm_set1_pd(__val) -#define VEC_MOVE(__v1, __val) \ - _mm_move_sd(__v1, __val) +#define VEC_MOVE(__v1, __val) \ + _mm_move_sd(__v1, __val) -#define VEC_CAST_128_256(__v1) \ - _mm256_castpd128_pd256(__v1) +#define VEC_CAST_128_256(__v1) \ + _mm256_castpd128_pd256(__v1) -#define VEC_INSERT_VAL(__v1, __val, __pos) \ - _mm256_insertf128_pd(__v1, __val, __pos) +#define VEC_INSERT_VAL(__v1, __val, __pos) \ + _mm256_insertf128_pd(__v1, __val, __pos) -#define VEC_CVT_128_256(__v1) \ - _mm256_cvtepi32_pd(__v1) +#define VEC_CVT_128_256(__v1) \ + _mm256_cvtepi32_pd(__v1) -#define VEC_SET1_VAL(__val) \ - _mm256_set1_pd(__val) +#define VEC_SET1_VAL(__val) \ + _mm256_set1_pd(__val) -#define VEC_POPCVT_CHAR(__ch) \ - _mm256_cvtepi32_pd(_mm_set1_epi32(__ch)) +#define VEC_POPCVT_CHAR(__ch) \ + _mm256_cvtepi32_pd(_mm_set1_epi32(__ch)) #define VEC_LDPOPCVT_CHAR(__addr) \ - _mm256_cvtepi32_pd(_mm_load_si128((__m128i const *)__addr)) + _mm256_cvtepi32_pd(_mm_load_si128((__m128i const *)__addr)) -#define VEC_CMP_EQ(__v1, __v2) \ - _mm256_cmp_pd(__v1, __v2, _CMP_EQ_OQ) +#define VEC_CMP_EQ(__v1, __v2) \ + _mm256_cmp_pd(__v1, __v2, _CMP_EQ_OQ) #define VEC_SET_LSE(__val) \ - _mm256_set_pd(zero, zero, zero, __val); + _mm256_set_pd(zero, zero, zero, __val); -#define SHIFT_HAP(__v1, __val) \ - __v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0) +#define SHIFT_HAP(__v1, __val) \ + __v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0) -#define print256b(__v1) \ - print256bDP(__v1) +#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ + __vdst = _mm256_castpd128_pd256(__vsLow) ; \ +__vdst = _mm256_insertf128_pd(__vdst, __vsHigh, 1) ; -#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ - __vdst = _mm256_castpd128_pd256(__vsLow) ; \ - __vdst = _mm256_insertf128_pd(__vdst, __vsHigh, 1) ; - -#define VEC_SHIFT_LEFT_1BIT(__vs) \ - __vs = _mm_slli_epi64(__vs, 1) +#define VEC_SHIFT_LEFT_1BIT(__vs) \ + __vs = _mm_slli_epi64(__vs, 1) -#define COMPARE_VECS(__v1, __v2, __first, __last) { \ - double* ptr1 = (double*) (&__v1) ; \ - double* ptr2 = (double*) (&__v2) ; \ - for (int ei=__first; ei <= __last; ++ei) { \ - if (ptr1[ei] != ptr2[ei]) { \ - std::cout << "Double Mismatch at " << ei << ": " \ - << ptr1[ei] << " vs. " << ptr2[ei] << std::endl ; \ - exit(0) ; \ - } \ - } \ - } +#define COMPARE_VECS(__v1, __v2, __first, __last) { \ + double* ptr1 = (double*) (&__v1) ; \ + double* ptr2 = (double*) (&__v2) ; \ + for (int ei=__first; ei <= __last; ++ei) { \ + if (ptr1[ei] != ptr2[ei]) { \ + std::cout << "Double Mismatch at " << ei << ": " \ + << ptr1[ei] << " vs. " << ptr2[ei] << std::endl ; \ + exit(0) ; \ + } \ + } \ +} class BitMaskVec_double { - MASK_VEC low_, high_ ; - _256_TYPE combined_ ; + MASK_VEC low_, high_ ; + SIMD_TYPE combined_ ; -public: - - inline MASK_TYPE& getLowEntry(int index) { - return low_.masks[index] ; - } - inline MASK_TYPE& getHighEntry(int index) { - return high_.masks[index] ; - } - - inline const _256_TYPE& getCombinedMask() { - VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ; + public: + inline MASK_TYPE& getLowEntry(int index) { + return low_.masks[index] ; + } + inline MASK_TYPE& getHighEntry(int index) { + return high_.masks[index] ; + } - return combined_ ; - } - - inline void shift_left_1bit() { - VEC_SHIFT_LEFT_1BIT(low_.vec) ; - VEC_SHIFT_LEFT_1BIT(high_.vec) ; - } + inline const SIMD_TYPE& getCombinedMask() { + VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ; + return combined_ ; + } + + inline void shift_left_1bit() { + VEC_SHIFT_LEFT_1BIT(low_.vec) ; + VEC_SHIFT_LEFT_1BIT(high_.vec) ; + } } ; diff --git a/PairHMM_JNI/define-float.h b/PairHMM_JNI/define-float.h index 3cc57ec38..87b2b01f3 100644 --- a/PairHMM_JNI/define-float.h +++ b/PairHMM_JNI/define-float.h @@ -1,53 +1,51 @@ #include #ifdef PRECISION - #undef PRECISION - #undef MAIN_TYPE - #undef MAIN_TYPE_SIZE - #undef UNION_TYPE - #undef IF_128 - #undef IF_MAIN_TYPE - #undef SHIFT_CONST1 - #undef SHIFT_CONST2 - #undef SHIFT_CONST3 - #undef _128_TYPE - #undef _256_TYPE - #undef AVX_LENGTH - #undef MAVX_COUNT - #undef HAP_TYPE - #undef MASK_TYPE - #undef MASK_ALL_ONES +#undef PRECISION +#undef MAIN_TYPE +#undef MAIN_TYPE_SIZE +#undef UNION_TYPE +#undef IF_128 +#undef IF_MAIN_TYPE +#undef SHIFT_CONST1 +#undef SHIFT_CONST2 +#undef SHIFT_CONST3 +#undef _128_TYPE +#undef SIMD_TYPE +#undef AVX_LENGTH +#undef HAP_TYPE +#undef MASK_TYPE +#undef MASK_ALL_ONES - #undef SET_VEC_ZERO(__vec) - #undef VEC_OR(__v1, __v2) - #undef VEC_ADD(__v1, __v2) - #undef VEC_SUB(__v1, __v2) - #undef VEC_MUL(__v1, __v2) - #undef VEC_DIV(__v1, __v2) - #undef VEC_BLEND(__v1, __v2, __mask) - #undef VEC_BLENDV(__v1, __v2, __maskV) - #undef VEC_CAST_256_128(__v1) - #undef VEC_EXTRACT_128(__v1, __im) - #undef VEC_EXTRACT_UNIT(__v1, __im) - #undef VEC_SET1_VAL128(__val) - #undef VEC_MOVE(__v1, __val) - #undef VEC_CAST_128_256(__v1) - #undef VEC_INSERT_VAL(__v1, __val, __pos) - #undef VEC_CVT_128_256(__v1) - #undef VEC_SET1_VAL(__val) - #undef VEC_POPCVT_CHAR(__ch) - #undef VEC_LDPOPCVT_CHAR(__addr) - #undef VEC_CMP_EQ(__v1, __v2) - #undef VEC_SET_LSE(__val) - #undef SHIFT_HAP(__v1, __val) - #undef print256b(__v1) - #undef MASK_VEC - #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) - #undef VEC_SHIFT_LEFT_1BIT(__vs) - #undef MASK_ALL_ONES - #undef COMPARE_VECS(__v1, __v2) - #undef _256_INT_TYPE - #undef BITMASK_VEC +#undef SET_VEC_ZERO(__vec) +#undef VEC_OR(__v1, __v2) +#undef VEC_ADD(__v1, __v2) +#undef VEC_SUB(__v1, __v2) +#undef VEC_MUL(__v1, __v2) +#undef VEC_DIV(__v1, __v2) +#undef VEC_BLEND(__v1, __v2, __mask) +#undef VEC_BLENDV(__v1, __v2, __maskV) +#undef VEC_CAST_256_128(__v1) +#undef VEC_EXTRACT_128(__v1, __im) +#undef VEC_EXTRACT_UNIT(__v1, __im) +#undef VEC_SET1_VAL128(__val) +#undef VEC_MOVE(__v1, __val) +#undef VEC_CAST_128_256(__v1) +#undef VEC_INSERT_VAL(__v1, __val, __pos) +#undef VEC_CVT_128_256(__v1) +#undef VEC_SET1_VAL(__val) +#undef VEC_POPCVT_CHAR(__ch) +#undef VEC_LDPOPCVT_CHAR(__addr) +#undef VEC_CMP_EQ(__v1, __v2) +#undef VEC_SET_LSE(__val) +#undef SHIFT_HAP(__v1, __val) +#undef MASK_VEC +#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) +#undef VEC_SHIFT_LEFT_1BIT(__vs) +#undef MASK_ALL_ONES +#undef COMPARE_VECS(__v1, __v2) +#undef _256_INT_TYPE +#undef BITMASK_VEC #endif #define PRECISION s @@ -61,127 +59,122 @@ #define SHIFT_CONST2 3 #define SHIFT_CONST3 4 #define _128_TYPE __m128 -#define _256_TYPE __m256 +#define SIMD_TYPE __m256 #define _256_INT_TYPE __m256i #define AVX_LENGTH 8 -#define MAVX_COUNT (MROWS+7)/AVX_LENGTH #define HAP_TYPE UNION_TYPE #define MASK_TYPE uint32_t #define MASK_ALL_ONES 0xFFFFFFFF #define MASK_VEC MaskVec_F #define SET_VEC_ZERO(__vec) \ - __vec= _mm256_setzero_ps() + __vec= _mm256_setzero_ps() #define VEC_OR(__v1, __v2) \ - _mm256_or_ps(__v1, __v2) + _mm256_or_ps(__v1, __v2) #define VEC_ADD(__v1, __v2) \ - _mm256_add_ps(__v1, __v2) + _mm256_add_ps(__v1, __v2) #define VEC_SUB(__v1, __v2) \ - _mm256_sub_ps(__v1, __v2) + _mm256_sub_ps(__v1, __v2) #define VEC_MUL(__v1, __v2) \ - _mm256_mul_ps(__v1, __v2) + _mm256_mul_ps(__v1, __v2) -#define VEC_DIV(__v1, __v2) \ - _mm256_div_ps(__v1, __v2) +#define VEC_DIV(__v1, __v2) \ + _mm256_div_ps(__v1, __v2) #define VEC_BLEND(__v1, __v2, __mask) \ - _mm256_blend_ps(__v1, __v2, __mask) + _mm256_blend_ps(__v1, __v2, __mask) #define VEC_BLENDV(__v1, __v2, __maskV) \ - _mm256_blendv_ps(__v1, __v2, __maskV) + _mm256_blendv_ps(__v1, __v2, __maskV) -#define VEC_CAST_256_128(__v1) \ - _mm256_castps256_ps128 (__v1) +#define VEC_CAST_256_128(__v1) \ + _mm256_castps256_ps128 (__v1) #define VEC_EXTRACT_128(__v1, __im) \ - _mm256_extractf128_ps (__v1, __im) + _mm256_extractf128_ps (__v1, __im) -#define VEC_EXTRACT_UNIT(__v1, __im) \ - _mm_extract_epi32(__v1, __im) +#define VEC_EXTRACT_UNIT(__v1, __im) \ + _mm_extract_epi32(__v1, __im) -#define VEC_SET1_VAL128(__val) \ - _mm_set1_ps(__val) +#define VEC_SET1_VAL128(__val) \ + _mm_set1_ps(__val) -#define VEC_MOVE(__v1, __val) \ - _mm_move_ss(__v1, __val) +#define VEC_MOVE(__v1, __val) \ + _mm_move_ss(__v1, __val) #define VEC_CAST_128_256(__v1) \ - _mm256_castps128_ps256(__v1) + _mm256_castps128_ps256(__v1) #define VEC_INSERT_VAL(__v1, __val, __pos) \ - _mm256_insertf128_ps(__v1, __val, __pos) + _mm256_insertf128_ps(__v1, __val, __pos) #define VEC_CVT_128_256(__v1) \ - _mm256_cvtepi32_ps(__v1.i) + _mm256_cvtepi32_ps(__v1.i) -#define VEC_SET1_VAL(__val) \ - _mm256_set1_ps(__val) +#define VEC_SET1_VAL(__val) \ + _mm256_set1_ps(__val) #define VEC_POPCVT_CHAR(__ch) \ - _mm256_cvtepi32_ps(_mm256_set1_epi32(__ch)) + _mm256_cvtepi32_ps(_mm256_set1_epi32(__ch)) -#define VEC_LDPOPCVT_CHAR(__addr) \ - _mm256_cvtepi32_ps(_mm256_loadu_si256((__m256i const *)__addr)) +#define VEC_LDPOPCVT_CHAR(__addr) \ + _mm256_cvtepi32_ps(_mm256_loadu_si256((__m256i const *)__addr)) #define VEC_CMP_EQ(__v1, __v2) \ - _mm256_cmp_ps(__v1, __v2, _CMP_EQ_OQ) + _mm256_cmp_ps(__v1, __v2, _CMP_EQ_OQ) -#define VEC_SET_LSE(__val) \ - _mm256_set_ps(zero, zero, zero, zero, zero, zero, zero, __val); +#define VEC_SET_LSE(__val) \ + _mm256_set_ps(zero, zero, zero, zero, zero, zero, zero, __val); -#define SHIFT_HAP(__v1, __val) \ - _vector_shift_lastavxs(__v1, __val.f); +#define SHIFT_HAP(__v1, __val) \ + _vector_shift_lastavxs(__v1, __val.f); -#define print256b(__v1) \ - print256bFP(__v1) - -#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ - __vdst = _mm256_castps128_ps256(__vsLow) ; \ - __vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ; +#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ + __vdst = _mm256_castps128_ps256(__vsLow) ; \ +__vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ; -#define VEC_SHIFT_LEFT_1BIT(__vs) \ - __vs = _mm_slli_epi32(__vs, 1) +#define VEC_SHIFT_LEFT_1BIT(__vs) \ + __vs = _mm_slli_epi32(__vs, 1) -#define COMPARE_VECS(__v1, __v2, __first, __last) { \ - float* ptr1 = (float*) (&__v1) ; \ - float* ptr2 = (float*) (&__v2) ; \ - for (int ei=__first; ei <= __last; ++ei) { \ - if (ptr1[ei] != ptr2[ei]) { \ - std::cout << "Float Mismatch at " << ei << ": " \ - << ptr1[ei] << " vs. " << ptr2[ei] << std::endl ; \ - exit(0) ; \ - } \ - } \ - } +#define COMPARE_VECS(__v1, __v2, __first, __last) { \ + float* ptr1 = (float*) (&__v1) ; \ + float* ptr2 = (float*) (&__v2) ; \ + for (int ei=__first; ei <= __last; ++ei) { \ + if (ptr1[ei] != ptr2[ei]) { \ + std::cout << "Float Mismatch at " << ei << ": " \ + << ptr1[ei] << " vs. " << ptr2[ei] << std::endl ; \ + exit(0) ; \ + } \ + } \ +} class BitMaskVec_float { - MASK_VEC low_, high_ ; - _256_TYPE combined_ ; + MASK_VEC low_, high_ ; + SIMD_TYPE combined_ ; -public: - - inline MASK_TYPE& getLowEntry(int index) { - return low_.masks[index] ; - } - inline MASK_TYPE& getHighEntry(int index) { - return high_.masks[index] ; - } - - inline const _256_TYPE& getCombinedMask() { - VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ; + public: - return combined_ ; - } - - inline void shift_left_1bit() { - VEC_SHIFT_LEFT_1BIT(low_.vec) ; - VEC_SHIFT_LEFT_1BIT(high_.vec) ; - } + inline MASK_TYPE& getLowEntry(int index) { + return low_.masks[index] ; + } + inline MASK_TYPE& getHighEntry(int index) { + return high_.masks[index] ; + } + + inline const SIMD_TYPE& getCombinedMask() { + VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ; + return combined_ ; + } + + inline void shift_left_1bit() { + VEC_SHIFT_LEFT_1BIT(low_.vec) ; + VEC_SHIFT_LEFT_1BIT(high_.vec) ; + } } ; diff --git a/PairHMM_JNI/define-sse-double.h b/PairHMM_JNI/define-sse-double.h index a30b2e5f5..d781d55f3 100644 --- a/PairHMM_JNI/define-sse-double.h +++ b/PairHMM_JNI/define-sse-double.h @@ -1,53 +1,51 @@ #ifdef PRECISION - #undef PRECISION - #undef MAIN_TYPE - #undef MAIN_TYPE_SIZE - #undef UNION_TYPE - #undef IF_128 - #undef IF_MAIN_TYPE - #undef SHIFT_CONST1 - #undef SHIFT_CONST2 - #undef SHIFT_CONST3 - #undef _128_TYPE - #undef _256_TYPE - #undef AVX_LENGTH - #undef MAVX_COUNT - #undef HAP_TYPE - #undef MASK_TYPE - #undef MASK_ALL_ONES +#undef PRECISION +#undef MAIN_TYPE +#undef MAIN_TYPE_SIZE +#undef UNION_TYPE +#undef IF_128 +#undef IF_MAIN_TYPE +#undef SHIFT_CONST1 +#undef SHIFT_CONST2 +#undef SHIFT_CONST3 +#undef _128_TYPE +#undef SIMD_TYPE +#undef AVX_LENGTH +#undef HAP_TYPE +#undef MASK_TYPE +#undef MASK_ALL_ONES - #undef VEC_EXTRACT_UNIT(__v1, __im) - #undef VEC_INSERT_UNIT(__v1,__ins,__im) - #undef SET_VEC_ZERO(__vec) - #undef VEC_OR(__v1, __v2) - #undef VEC_ADD(__v1, __v2) - #undef VEC_SUB(__v1, __v2) - #undef VEC_MUL(__v1, __v2) - #undef VEC_DIV(__v1, __v2) - #undef VEC_BLEND(__v1, __v2, __mask) - #undef VEC_BLENDV(__v1, __v2, __maskV) - #undef VEC_CAST_256_128(__v1) - #undef VEC_EXTRACT_128(__v1, __im) - #undef VEC_EXTRACT_UNIT(__v1, __im) - #undef VEC_SET1_VAL128(__val) - #undef VEC_MOVE(__v1, __val) - #undef VEC_CAST_128_256(__v1) - #undef VEC_INSERT_VAL(__v1, __val, __pos) - #undef VEC_CVT_128_256(__v1) - #undef VEC_SET1_VAL(__val) - #undef VEC_POPCVT_CHAR(__ch) - #undef VEC_LDPOPCVT_CHAR(__addr) - #undef VEC_CMP_EQ(__v1, __v2) - #undef VEC_SET_LSE(__val) - #undef SHIFT_HAP(__v1, __val) - #undef print256b(__v1) - #undef MASK_VEC - #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) - #undef VEC_SHIFT_LEFT_1BIT(__vs) - #undef MASK_ALL_ONES - #undef COMPARE_VECS(__v1, __v2) - #undef _256_INT_TYPE - #undef BITMASK_VEC +#undef VEC_EXTRACT_UNIT(__v1, __im) +#undef VEC_INSERT_UNIT(__v1,__ins,__im) +#undef SET_VEC_ZERO(__vec) +#undef VEC_OR(__v1, __v2) +#undef VEC_ADD(__v1, __v2) +#undef VEC_SUB(__v1, __v2) +#undef VEC_MUL(__v1, __v2) +#undef VEC_DIV(__v1, __v2) +#undef VEC_BLEND(__v1, __v2, __mask) +#undef VEC_BLENDV(__v1, __v2, __maskV) +#undef VEC_CAST_256_128(__v1) +#undef VEC_EXTRACT_128(__v1, __im) +#undef VEC_EXTRACT_UNIT(__v1, __im) +#undef VEC_SET1_VAL128(__val) +#undef VEC_MOVE(__v1, __val) +#undef VEC_CAST_128_256(__v1) +#undef VEC_INSERT_VAL(__v1, __val, __pos) +#undef VEC_CVT_128_256(__v1) +#undef VEC_SET1_VAL(__val) +#undef VEC_POPCVT_CHAR(__ch) +#undef VEC_LDPOPCVT_CHAR(__addr) +#undef VEC_CMP_EQ(__v1, __v2) +#undef VEC_SET_LSE(__val) +#undef SHIFT_HAP(__v1, __val) +#undef MASK_VEC +#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) +#undef VEC_SHIFT_LEFT_1BIT(__vs) +#undef MASK_ALL_ONES +#undef COMPARE_VECS(__v1, __v2) +#undef _256_INT_TYPE +#undef BITMASK_VEC #endif #define SSE @@ -62,90 +60,87 @@ #define SHIFT_CONST2 8 #define SHIFT_CONST3 0 #define _128_TYPE __m128d -#define _256_TYPE __m128d +#define SIMD_TYPE __m128d #define _256_INT_TYPE __m128i #define AVX_LENGTH 2 -#define MAVX_COUNT (MROWS+3)/AVX_LENGTH #define HAP_TYPE __m128i #define MASK_TYPE uint64_t #define MASK_ALL_ONES 0xFFFFFFFFFFFFFFFFL #define MASK_VEC MaskVec_D #define VEC_EXTRACT_UNIT(__v1, __im) \ - _mm_extract_epi64(__v1, __im) + _mm_extract_epi64(__v1, __im) -#define VEC_INSERT_UNIT(__v1,__ins,__im) \ - _mm_insert_epi64(__v1,__ins,__im) +#define VEC_INSERT_UNIT(__v1,__ins,__im) \ + _mm_insert_epi64(__v1,__ins,__im) #define VEC_OR(__v1, __v2) \ - _mm_or_pd(__v1, __v2) + _mm_or_pd(__v1, __v2) #define VEC_ADD(__v1, __v2) \ - _mm_add_pd(__v1, __v2) + _mm_add_pd(__v1, __v2) #define VEC_SUB(__v1, __v2) \ - _mm_sub_pd(__v1, __v2) + _mm_sub_pd(__v1, __v2) #define VEC_MUL(__v1, __v2) \ - _mm_mul_pd(__v1, __v2) + _mm_mul_pd(__v1, __v2) #define VEC_DIV(__v1, __v2) \ - _mm_div_pd(__v1, __v2) + _mm_div_pd(__v1, __v2) #define VEC_CMP_EQ(__v1, __v2) \ - _mm_cmpeq_pd(__v1, __v2) + _mm_cmpeq_pd(__v1, __v2) #define VEC_BLEND(__v1, __v2, __mask) \ - _mm_blend_pd(__v1, __v2, __mask) + _mm_blend_pd(__v1, __v2, __mask) #define VEC_BLENDV(__v1, __v2, __maskV) \ - _mm_blendv_pd(__v1, __v2, __maskV) + _mm_blendv_pd(__v1, __v2, __maskV) #define SHIFT_HAP(__v1, __val) \ - __v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0) + __v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0) #define VEC_CVT_128_256(__v1) \ - _mm_cvtepi32_pd(__v1) + _mm_cvtepi32_pd(__v1) + +#define VEC_SET1_VAL(__val) \ + _mm_set1_pd(__val) -#define VEC_SET1_VAL(__val) \ - _mm_set1_pd(__val) - #define VEC_POPCVT_CHAR(__ch) \ - _mm_cvtepi32_pd(_mm_set1_epi32(__ch)) + _mm_cvtepi32_pd(_mm_set1_epi32(__ch)) #define VEC_SET_LSE(__val) \ - _mm_set_pd(zero, __val); + _mm_set_pd(zero, __val); #define VEC_LDPOPCVT_CHAR(__addr) \ - _mm_cvtepi32_pd(_mm_loadu_si128((__m128i const *)__addr)) + _mm_cvtepi32_pd(_mm_loadu_si128((__m128i const *)__addr)) #define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ - __vdst = _mm_castsi128_pd(_mm_set_epi64(__vsHigh, __vsLow)) + __vdst = _mm_castsi128_pd(_mm_set_epi64(__vsHigh, __vsLow)) #define VEC_SHIFT_LEFT_1BIT(__vs) \ - __vs = _mm_slli_epi64(__vs, 1) + __vs = _mm_slli_epi64(__vs, 1) class BitMaskVec_sse_double { - MASK_VEC combined_ ; + MASK_VEC combined_ ; + public: + inline MASK_TYPE& getLowEntry(int index) { + return combined_.masks[index] ; + } + inline MASK_TYPE& getHighEntry(int index) { + return combined_.masks[AVX_LENGTH/2+index] ; + } -public: - - inline MASK_TYPE& getLowEntry(int index) { - return combined_.masks[index] ; - } - inline MASK_TYPE& getHighEntry(int index) { - return combined_.masks[AVX_LENGTH/2+index] ; - } - - inline const _256_TYPE& getCombinedMask() { - return combined_.vecf ; - } - - inline void shift_left_1bit() { - VEC_SHIFT_LEFT_1BIT(combined_.vec) ; - } + inline const SIMD_TYPE& getCombinedMask() { + return combined_.vecf ; + } + + inline void shift_left_1bit() { + VEC_SHIFT_LEFT_1BIT(combined_.vec) ; + } } ; diff --git a/PairHMM_JNI/define-sse-float.h b/PairHMM_JNI/define-sse-float.h index 6612b28e6..7516e6dbf 100644 --- a/PairHMM_JNI/define-sse-float.h +++ b/PairHMM_JNI/define-sse-float.h @@ -1,53 +1,51 @@ #ifdef PRECISION - #undef PRECISION - #undef MAIN_TYPE - #undef MAIN_TYPE_SIZE - #undef UNION_TYPE - #undef IF_128 - #undef IF_MAIN_TYPE - #undef SHIFT_CONST1 - #undef SHIFT_CONST2 - #undef SHIFT_CONST3 - #undef _128_TYPE - #undef _256_TYPE - #undef AVX_LENGTH - #undef MAVX_COUNT - #undef HAP_TYPE - #undef MASK_TYPE - #undef MASK_ALL_ONES +#undef PRECISION +#undef MAIN_TYPE +#undef MAIN_TYPE_SIZE +#undef UNION_TYPE +#undef IF_128 +#undef IF_MAIN_TYPE +#undef SHIFT_CONST1 +#undef SHIFT_CONST2 +#undef SHIFT_CONST3 +#undef _128_TYPE +#undef SIMD_TYPE +#undef AVX_LENGTH +#undef HAP_TYPE +#undef MASK_TYPE +#undef MASK_ALL_ONES - #undef VEC_EXTRACT_UNIT(__v1, __im) - #undef VEC_INSERT_UNIT(__v1,__ins,__im) - #undef SET_VEC_ZERO(__vec) - #undef VEC_OR(__v1, __v2) - #undef VEC_ADD(__v1, __v2) - #undef VEC_SUB(__v1, __v2) - #undef VEC_MUL(__v1, __v2) - #undef VEC_DIV(__v1, __v2) - #undef VEC_BLEND(__v1, __v2, __mask) - #undef VEC_BLENDV(__v1, __v2, __maskV) - #undef VEC_CAST_256_128(__v1) - #undef VEC_EXTRACT_128(__v1, __im) - #undef VEC_EXTRACT_UNIT(__v1, __im) - #undef VEC_SET1_VAL128(__val) - #undef VEC_MOVE(__v1, __val) - #undef VEC_CAST_128_256(__v1) - #undef VEC_INSERT_VAL(__v1, __val, __pos) - #undef VEC_CVT_128_256(__v1) - #undef VEC_SET1_VAL(__val) - #undef VEC_POPCVT_CHAR(__ch) - #undef VEC_LDPOPCVT_CHAR(__addr) - #undef VEC_CMP_EQ(__v1, __v2) - #undef VEC_SET_LSE(__val) - #undef SHIFT_HAP(__v1, __val) - #undef print256b(__v1) - #undef MASK_VEC - #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) - #undef VEC_SHIFT_LEFT_1BIT(__vs) - #undef MASK_ALL_ONES - #undef COMPARE_VECS(__v1, __v2) - #undef _256_INT_TYPE - #undef BITMASK_VEC +#undef VEC_EXTRACT_UNIT(__v1, __im) +#undef VEC_INSERT_UNIT(__v1,__ins,__im) +#undef SET_VEC_ZERO(__vec) +#undef VEC_OR(__v1, __v2) +#undef VEC_ADD(__v1, __v2) +#undef VEC_SUB(__v1, __v2) +#undef VEC_MUL(__v1, __v2) +#undef VEC_DIV(__v1, __v2) +#undef VEC_BLEND(__v1, __v2, __mask) +#undef VEC_BLENDV(__v1, __v2, __maskV) +#undef VEC_CAST_256_128(__v1) +#undef VEC_EXTRACT_128(__v1, __im) +#undef VEC_EXTRACT_UNIT(__v1, __im) +#undef VEC_SET1_VAL128(__val) +#undef VEC_MOVE(__v1, __val) +#undef VEC_CAST_128_256(__v1) +#undef VEC_INSERT_VAL(__v1, __val, __pos) +#undef VEC_CVT_128_256(__v1) +#undef VEC_SET1_VAL(__val) +#undef VEC_POPCVT_CHAR(__ch) +#undef VEC_LDPOPCVT_CHAR(__addr) +#undef VEC_CMP_EQ(__v1, __v2) +#undef VEC_SET_LSE(__val) +#undef SHIFT_HAP(__v1, __val) +#undef MASK_VEC +#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) +#undef VEC_SHIFT_LEFT_1BIT(__vs) +#undef MASK_ALL_ONES +#undef COMPARE_VECS(__v1, __v2) +#undef _256_INT_TYPE +#undef BITMASK_VEC #endif #define SSE @@ -62,89 +60,88 @@ #define SHIFT_CONST2 4 #define SHIFT_CONST3 0 #define _128_TYPE __m128 -#define _256_TYPE __m128 +#define SIMD_TYPE __m128 #define _256_INT_TYPE __m128i #define AVX_LENGTH 4 -#define MAVX_COUNT (MROWS+3)/AVX_LENGTH +//#define MAVX_COUNT (MROWS+3)/AVX_LENGTH #define HAP_TYPE UNION_TYPE #define MASK_TYPE uint32_t #define MASK_ALL_ONES 0xFFFFFFFF #define MASK_VEC MaskVec_F #define VEC_EXTRACT_UNIT(__v1, __im) \ - _mm_extract_epi32(__v1, __im) + _mm_extract_epi32(__v1, __im) -#define VEC_INSERT_UNIT(__v1,__ins,__im) \ - _mm_insert_epi32(__v1,__ins,__im) +#define VEC_INSERT_UNIT(__v1,__ins,__im) \ + _mm_insert_epi32(__v1,__ins,__im) #define VEC_OR(__v1, __v2) \ - _mm_or_ps(__v1, __v2) + _mm_or_ps(__v1, __v2) #define VEC_ADD(__v1, __v2) \ - _mm_add_ps(__v1, __v2) + _mm_add_ps(__v1, __v2) #define VEC_SUB(__v1, __v2) \ - _mm_sub_ps(__v1, __v2) + _mm_sub_ps(__v1, __v2) #define VEC_MUL(__v1, __v2) \ - _mm_mul_ps(__v1, __v2) + _mm_mul_ps(__v1, __v2) #define VEC_DIV(__v1, __v2) \ - _mm_div_ps(__v1, __v2) + _mm_div_ps(__v1, __v2) #define VEC_CMP_EQ(__v1, __v2) \ - _mm_cmpeq_ps(__v1, __v2) + _mm_cmpeq_ps(__v1, __v2) #define VEC_BLEND(__v1, __v2, __mask) \ - _mm_blend_ps(__v1, __v2, __mask) + _mm_blend_ps(__v1, __v2, __mask) #define VEC_BLENDV(__v1, __v2, __maskV) \ - _mm_blendv_ps(__v1, __v2, __maskV) + _mm_blendv_ps(__v1, __v2, __maskV) #define SHIFT_HAP(__v1, __val) \ - _vector_shift_lastsses(__v1, __val.f) + _vector_shift_lastsses(__v1, __val.f) #define VEC_CVT_128_256(__v1) \ - _mm_cvtepi32_ps(__v1.i) + _mm_cvtepi32_ps(__v1.i) + +#define VEC_SET1_VAL(__val) \ + _mm_set1_ps(__val) -#define VEC_SET1_VAL(__val) \ - _mm_set1_ps(__val) - #define VEC_POPCVT_CHAR(__ch) \ - _mm_cvtepi32_ps(_mm_set1_epi32(__ch)) + _mm_cvtepi32_ps(_mm_set1_epi32(__ch)) #define VEC_SET_LSE(__val) \ - _mm_set_ps(zero, zero, zero, __val); + _mm_set_ps(zero, zero, zero, __val); #define VEC_LDPOPCVT_CHAR(__addr) \ - _mm_cvtepi32_ps(_mm_loadu_si128((__m128i const *)__addr)) + _mm_cvtepi32_ps(_mm_loadu_si128((__m128i const *)__addr)) #define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ - __vdst = _mm_cvtpi32x2_ps(__vsLow, __vsHigh) + __vdst = _mm_cvtpi32x2_ps(__vsLow, __vsHigh) #define VEC_SHIFT_LEFT_1BIT(__vs) \ - __vs = _mm_slli_epi32(__vs, 1) + __vs = _mm_slli_epi32(__vs, 1) class BitMaskVec_sse_float { - MASK_VEC combined_ ; + MASK_VEC combined_ ; -public: - - inline MASK_TYPE& getLowEntry(int index) { - return combined_.masks[index] ; - } - inline MASK_TYPE& getHighEntry(int index) { - return combined_.masks[AVX_LENGTH/2+index] ; - } - - inline const _256_TYPE& getCombinedMask() { - return combined_.vecf ; - } - - inline void shift_left_1bit() { - VEC_SHIFT_LEFT_1BIT(combined_.vec) ; - } + public: + inline MASK_TYPE& getLowEntry(int index) { + return combined_.masks[index] ; + } + inline MASK_TYPE& getHighEntry(int index) { + return combined_.masks[AVX_LENGTH/2+index] ; + } + + inline const SIMD_TYPE& getCombinedMask() { + return combined_.vecf ; + } + + inline void shift_left_1bit() { + VEC_SHIFT_LEFT_1BIT(combined_.vec) ; + } } ; diff --git a/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc index d575b8271..4f754d019 100644 --- a/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc +++ b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc @@ -79,7 +79,7 @@ Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadL assert(insertionGOPArray && "insertionGOP array not initialized in JNI"); assert(deletionGOPArray && "deletionGOP array not initialized in JNI"); assert(overallGCPArray && "OverallGCP array not initialized in JNI"); - assert(readLength < MROWS); + //assert(readLength < MROWS); #endif testcase tc; tc.rslen = readLength; @@ -167,7 +167,7 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai jbyte* haplotypeBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(haplotypeBasesGlobalRef, &is_copy); #ifdef ENABLE_ASSERTIONS assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); - assert(env->GetArrayLength(haplotypeBasesGlobalRef) < MCOLS); + //assert(env->GetArrayLength(haplotypeBasesGlobalRef) < MCOLS); #endif #ifdef DEBUG0_1 cout << "JNI haplotype length "<GetArrayLength(haplotypeBasesGlobalRef)<<"\n"; @@ -240,7 +240,7 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai assert(insertionGOPArray && "insertionGOP array not initialized in JNI"); assert(deletionGOPArray && "deletionGOP array not initialized in JNI"); assert(overallGCPArray && "overallGCP array not initialized in JNI"); - assert(readLength < MROWS); + //assert(readLength < MROWS); assert(readLength == env->GetArrayLength(readQuals)); assert(readLength == env->GetArrayLength(insertionGOP)); assert(readLength == env->GetArrayLength(deletionGOP)); diff --git a/PairHMM_JNI/pairhmm-template-kernel.cc b/PairHMM_JNI/pairhmm-template-kernel.cc index 66dc557aa..9f59d7eeb 100644 --- a/PairHMM_JNI/pairhmm-template-kernel.cc +++ b/PairHMM_JNI/pairhmm-template-kernel.cc @@ -4,435 +4,338 @@ #include #include -//#define DEBUG -#define MUSTAFA -#define KARTHIK -/* -template -string getBinaryStr (T val, int numBitsToWrite) { - - ostringstream oss ; - uint64_t mask = ((T) 0x1) << (numBitsToWrite-1) ; - for (int i=numBitsToWrite-1; i >= 0; --i) { - oss << ((val & mask) >> i) ; - mask >>= 1 ; - } - return oss.str() ; -} -*/ -#ifdef MUSTAFA +void CONCAT(CONCAT(precompute_masks_,SIMD_ENGINE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) { + const int maskBitCnt = MAIN_TYPE_SIZE ; -void GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) { - - const int maskBitCnt = MAIN_TYPE_SIZE ; - - for (int vi=0; vi < numMaskVecs; ++vi) { - for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) { - maskArr[vi][rs] = 0 ; + for (int vi=0; vi < numMaskVecs; ++vi) { + for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) { + maskArr[vi][rs] = 0 ; + } + maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ; } - maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ; - } - - for (int col=1; col < COLS; ++col) { - int mIndex = (col-1) / maskBitCnt ; - int mOffset = (col-1) % maskBitCnt ; - MASK_TYPE bitMask = ((MASK_TYPE)0x1) << (maskBitCnt-1-mOffset) ; - char hapChar = ConvertChar::get(tc.hap[col-1]); + for (int col=1; col < COLS; ++col) { + int mIndex = (col-1) / maskBitCnt ; + int mOffset = (col-1) % maskBitCnt ; + MASK_TYPE bitMask = ((MASK_TYPE)0x1) << (maskBitCnt-1-mOffset) ; - if (hapChar == AMBIG_CHAR) { - for (int ci=0; ci < NUM_DISTINCT_CHARS; ++ci) - maskArr[mIndex][ci] |= bitMask ; - } + char hapChar = ConvertChar::get(tc.hap[col-1]); - maskArr[mIndex][hapChar] |= bitMask ; - // bit corresponding to col 1 will be the MSB of the mask 0 - // bit corresponding to col 2 will be the MSB-1 of the mask 0 - // ... - // bit corresponding to col 32 will be the LSB of the mask 0 - // bit corresponding to col 33 will be the MSB of the mask 1 - // ... - } + if (hapChar == AMBIG_CHAR) { + for (int ci=0; ci < NUM_DISTINCT_CHARS; ++ci) + maskArr[mIndex][ci] |= bitMask ; + } + + maskArr[mIndex][hapChar] |= bitMask ; + // bit corresponding to col 1 will be the MSB of the mask 0 + // bit corresponding to col 2 will be the MSB-1 of the mask 0 + // ... + // bit corresponding to col 32 will be the LSB of the mask 0 + // bit corresponding to col 33 will be the MSB of the mask 1 + // ... + } } -void GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess) { +void CONCAT(CONCAT(init_masks_for_row_,SIMD_ENGINE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess) { - for (int ri=0; ri < numRowsToProcess; ++ri) { - rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ; - } + for (int ri=0; ri < numRowsToProcess; ++ri) { + rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ; + } - for (int ei=0; ei < AVX_LENGTH; ++ei) { - lastMaskShiftOut[ei] = 0 ; - } + for (int ei=0; ei < AVX_LENGTH; ++ei) { + lastMaskShiftOut[ei] = 0 ; + } } - #define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \ - MASK_TYPE __bitMask = (((MASK_TYPE)0x1) << __shiftBy) - 1 ; \ - MASK_TYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \ - __dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ; \ - __lastShiftOut = __nextShiftOut ; \ + MASK_TYPE __bitMask = (((MASK_TYPE)0x1) << __shiftBy) - 1 ; \ + MASK_TYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \ + __dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ; \ + __lastShiftOut = __nextShiftOut ; \ } -void GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_, SIMD_TYPE), PRECISION)(int maskIndex, BITMASK_VEC& bitMaskVec, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, int maskBitCnt) { +void CONCAT(CONCAT(update_masks_for_cols_,SIMD_ENGINE), PRECISION)(int maskIndex, BITMASK_VEC& bitMaskVec, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, int maskBitCnt) { - for (int ei=0; ei < AVX_LENGTH/2; ++ei) { - SET_MASK_WORD(bitMaskVec.getLowEntry(ei), maskArr[maskIndex][rsArr[ei]], - lastMaskShiftOut[ei], ei, maskBitCnt) ; - - int ei2 = ei + AVX_LENGTH/2 ; // the second entry index - SET_MASK_WORD(bitMaskVec.getHighEntry(ei), maskArr[maskIndex][rsArr[ei2]], - lastMaskShiftOut[ei2], ei2, maskBitCnt) ; - } + for (int ei=0; ei < AVX_LENGTH/2; ++ei) { + SET_MASK_WORD(bitMaskVec.getLowEntry(ei), maskArr[maskIndex][rsArr[ei]], + lastMaskShiftOut[ei], ei, maskBitCnt) ; + + int ei2 = ei + AVX_LENGTH/2 ; // the second entry index + SET_MASK_WORD(bitMaskVec.getHighEntry(ei), maskArr[maskIndex][rsArr[ei2]], + lastMaskShiftOut[ei2], ei2, maskBitCnt) ; + } } -//void GEN_INTRINSIC(computeDistVec, PRECISION) (BITMASK_VEC& bitMaskVec, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen, const _256_TYPE& distmSel, int firstRowIndex, int lastRowIndex) { +inline void CONCAT(CONCAT(computeDistVec,SIMD_ENGINE), PRECISION) (BITMASK_VEC& bitMaskVec, SIMD_TYPE& distm, SIMD_TYPE& _1_distm, SIMD_TYPE& distmChosen) { -inline void GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec, SIMD_TYPE), PRECISION) (BITMASK_VEC& bitMaskVec, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen) { - //#define computeDistVec() { + distmChosen = VEC_BLENDV(distm, _1_distm, bitMaskVec.getCombinedMask()) ; -#ifdef DEBUGG - long long *temp1 = (long long *)(&maskV); - double *temp2 = (double *)(&distm); - double *temp3 = (double *)(&_1_distm); - printf("***\n%lx\n%lx\n%f\n%f\n%f\n%f\n***\n", temp1[0], temp1[1], temp2[0], temp2[1], temp3[0], temp3[1]); -#endif + bitMaskVec.shift_left_1bit() ; +} - distmChosen = VEC_BLENDV(distm, _1_distm, bitMaskVec.getCombinedMask()) ; - - /*COMPARE_VECS(distmChosen, distmSel, firstRowIndex, lastRowIndex) ;*/ - - bitMaskVec.shift_left_1bit() ; -} - - -/* -template -struct HmmData { - int ROWS ; - int COLS ; - - NUMBER shiftOutM[MROWS+MCOLS+AVX_LENGTH], shiftOutX[MROWS+MCOLS+AVX_LENGTH], shiftOutY[MROWS+MCOLS+AVX_LENGTH] ; - Context ctx ; - testcase* tc ; - _256_TYPE p_MM[MAVX_COUNT], p_GAPM[MAVX_COUNT], p_MX[MAVX_COUNT], p_XX[MAVX_COUNT], p_MY[MAVX_COUNT], p_YY[MAVX_COUNT], distm1D[MAVX_COUNT] ; - _256_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY ; - - UNION_TYPE M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y ; - UNION_TYPE rs , rsN ; - _256_TYPE distmSel; - _256_TYPE distm, _1_distm; - -} ; -*/ -#endif // MUSTAFA - -template void GEN_INTRINSIC(GEN_INTRINSIC(initializeVectors, SIMD_TYPE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context ctx, testcase *tc, _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D) +template void CONCAT(CONCAT(initializeVectors,SIMD_ENGINE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context ctx, testcase *tc, SIMD_TYPE *p_MM, SIMD_TYPE *p_GAPM, SIMD_TYPE *p_MX, SIMD_TYPE *p_XX, SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, SIMD_TYPE *distm1D) { - NUMBER zero = ctx._(0.0); - NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); - for (int s=0;shaplen); + for (int s=0;si[r-1] & 127; + int _d = tc->d[r-1] & 127; + int _c = tc->c[r-1] & 127; + + *(ptr_p_MM+r-1) = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127]; + *(ptr_p_GAPM+r-1) = ctx._(1.0) - ctx.ph2pr[_c]; + *(ptr_p_MX+r-1) = ctx.ph2pr[_i]; + *(ptr_p_XX+r-1) = ctx.ph2pr[_c]; + *(ptr_p_MY+r-1) = ctx.ph2pr[_d]; + *(ptr_p_YY+r-1) = ctx.ph2pr[_c]; + } + + NUMBER *ptr_distm1D = (NUMBER *)distm1D; + for (int r = 1; r < ROWS; r++) + { + int _q = tc->q[r-1] & 127; + ptr_distm1D[r-1] = ctx.ph2pr[_q]; + } +} + + +template inline void CONCAT(CONCAT(stripeINITIALIZATION,SIMD_ENGINE), PRECISION)( + int stripeIdx, Context ctx, testcase *tc, SIMD_TYPE &pGAPM, SIMD_TYPE &pMM, SIMD_TYPE &pMX, SIMD_TYPE &pXX, SIMD_TYPE &pMY, SIMD_TYPE &pYY, + SIMD_TYPE &rs, UNION_TYPE &rsN, SIMD_TYPE &distm, SIMD_TYPE &_1_distm, SIMD_TYPE *distm1D, SIMD_TYPE N_packed256, SIMD_TYPE *p_MM , SIMD_TYPE *p_GAPM , + SIMD_TYPE *p_MX, SIMD_TYPE *p_XX , SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, + UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM) +{ + int i = stripeIdx; + pGAPM = p_GAPM[i]; + pMM = p_MM[i]; + pMX = p_MX[i]; + pXX = p_XX[i]; + pMY = p_MY[i]; + pYY = p_YY[i]; + + NUMBER zero = ctx._(0.0); + NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); + UNION_TYPE packed1; packed1.d = VEC_SET1_VAL(1.0); + UNION_TYPE packed3; packed3.d = VEC_SET1_VAL(3.0); + + distm = distm1D[i]; + _1_distm = VEC_SUB(packed1.d, distm); + + distm = VEC_DIV(distm, packed3.d); + + /* initialize M_t_2, M_t_1, X_t_2, X_t_1, Y_t_2, Y_t_1 */ + M_t_2.d = VEC_SET1_VAL(zero); + X_t_2.d = VEC_SET1_VAL(zero); + + if (i==0) { + M_t_1.d = VEC_SET1_VAL(zero); + X_t_1.d = VEC_SET1_VAL(zero); + Y_t_2.d = VEC_SET_LSE(init_Y); + Y_t_1.d = VEC_SET1_VAL(zero); + } + else { + X_t_1.d = VEC_SET_LSE(shiftOutX[AVX_LENGTH]); + M_t_1.d = VEC_SET_LSE(shiftOutM[AVX_LENGTH]); + Y_t_2.d = VEC_SET1_VAL(zero); + Y_t_1.d = VEC_SET1_VAL(zero); + } + M_t_1_y = M_t_1; +} + +inline SIMD_TYPE CONCAT(CONCAT(computeDISTM,SIMD_ENGINE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, SIMD_TYPE rs, UNION_TYPE rsN, SIMD_TYPE N_packed256, + SIMD_TYPE distm, SIMD_TYPE _1_distm) +{ + UNION_TYPE hapN, rshap; + SIMD_TYPE cond; + IF_32 shiftInHap; + + int *hap_ptr = tc->ihap; + + shiftInHap.i = (d NUMBER CONCAT(CONCAT(compute_full_prob_,SIMD_ENGINE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL) +{ + int ROWS = tc->rslen + 1; + int COLS = tc->haplen + 1; + int MAVX_COUNT = (ROWS+AVX_LENGTH-1)/AVX_LENGTH; + + SIMD_TYPE p_MM [MAVX_COUNT], p_GAPM [MAVX_COUNT], p_MX [MAVX_COUNT]; + SIMD_TYPE p_XX [MAVX_COUNT], p_MY [MAVX_COUNT], p_YY [MAVX_COUNT]; + SIMD_TYPE distm1D[MAVX_COUNT]; + NUMBER shiftOutM[ROWS+COLS+AVX_LENGTH], shiftOutX[ROWS+COLS+AVX_LENGTH], shiftOutY[ROWS+COLS+AVX_LENGTH]; + UNION_TYPE M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y; + SIMD_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY; + + struct timeval start, end; + NUMBER result_avx2; + Context ctx; + UNION_TYPE rs , rsN; + HAP_TYPE hap; + SIMD_TYPE distmSel, distmChosen ; + SIMD_TYPE distm, _1_distm; + + int r, c; + NUMBER zero = ctx._(0.0); + UNION_TYPE packed1; packed1.d = VEC_SET1_VAL(1.0); + SIMD_TYPE N_packed256 = VEC_POPCVT_CHAR('N'); + NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); + int remainingRows = (ROWS-1) % AVX_LENGTH; + int stripe_cnt = ((ROWS-1) / AVX_LENGTH) + (remainingRows!=0); + + const int maskBitCnt = MAIN_TYPE_SIZE ; + const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function + + MASK_TYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ; + CONCAT(CONCAT(precompute_masks_,SIMD_ENGINE), PRECISION)(*tc, COLS, numMaskVecs, maskArr) ; + + char rsArr[AVX_LENGTH] ; + MASK_TYPE lastMaskShiftOut[AVX_LENGTH] ; + CONCAT(CONCAT(initializeVectors,SIMD_ENGINE), PRECISION)(ROWS, COLS, shiftOutM, shiftOutX, shiftOutY, + ctx, tc, p_MM, p_GAPM, p_MX, p_XX, p_MY, p_YY, distm1D); + + for (int i=0;ii[r-1] & 127; - int _d = tc->d[r-1] & 127; - int _c = tc->c[r-1] & 127; + int numMaskBitsToProcess = std::min(MAIN_TYPE_SIZE, COLS+remainingRows-1-begin_d) ; + CONCAT(CONCAT(update_masks_for_cols_,SIMD_ENGINE),PRECISION)((begin_d-1)/MAIN_TYPE_SIZE, bitMaskVec, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ; - *(ptr_p_MM+r-1) = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127]; - *(ptr_p_GAPM+r-1) = ctx._(1.0) - ctx.ph2pr[_c]; - *(ptr_p_MX+r-1) = ctx.ph2pr[_i]; - *(ptr_p_XX+r-1) = ctx.ph2pr[_c]; - #ifdef KARTHIK - *(ptr_p_MY+r-1) = ctx.ph2pr[_d]; - *(ptr_p_YY+r-1) = ctx.ph2pr[_c]; - #else - *(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d]; - *(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c]; - #endif - + for (int mbi=0; mbi < numMaskBitsToProcess; ++mbi) { + + CONCAT(CONCAT(computeDistVec,SIMD_ENGINE), PRECISION) (bitMaskVec, distm, _1_distm, distmChosen) ; + int ShiftIdx = begin_d + mbi +AVX_LENGTH; + + CONCAT(CONCAT(computeMXY,SIMD_ENGINE), PRECISION)(M_t, X_t, Y_t, M_t_y, M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, M_t_1_y, Y_t_1, + pMM, pGAPM, pMX, pXX, pMY, pYY, distmChosen); + + sumM = VEC_ADD(sumM, M_t.d); + CONCAT(CONCAT(_vector_shift_last,SIMD_ENGINE), PRECISION)(M_t, shiftOutM[ShiftIdx]); + + sumX = VEC_ADD(sumX, X_t.d); + CONCAT(CONCAT(_vector_shift_last,SIMD_ENGINE), PRECISION)(X_t, shiftOutX[ShiftIdx]); + + CONCAT(CONCAT(_vector_shift_last,SIMD_ENGINE), PRECISION)(Y_t_1, shiftOutY[ShiftIdx]); + + M_t_2 = M_t_1; M_t_1 = M_t; X_t_2 = X_t_1; X_t_1 = X_t; + Y_t_2 = Y_t_1; Y_t_1 = Y_t; M_t_1_y = M_t_y; + + } } - - NUMBER *ptr_distm1D = (NUMBER *)distm1D; - for (int r = 1; r < ROWS; r++) - { - int _q = tc->q[r-1] & 127; - ptr_distm1D[r-1] = ctx.ph2pr[_q]; - } -} - - -template inline void GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION, SIMD_TYPE), PRECISION)( - int stripIdx, Context ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY, - _256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm, _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM , - _256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, - UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM) -{ - int i = stripIdx; - pGAPM = p_GAPM[i]; - pMM = p_MM[i]; - pMX = p_MX[i]; - pXX = p_XX[i]; - pMY = p_MY[i]; - pYY = p_YY[i]; - - NUMBER zero = ctx._(0.0); - NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); - UNION_TYPE packed1; packed1.d = VEC_SET1_VAL(1.0); - UNION_TYPE packed3; packed3.d = VEC_SET1_VAL(3.0); - /* compare rs and N */ -#ifndef MUSTAFA - rs = VEC_LDPOPCVT_CHAR((tc->irs+i*AVX_LENGTH)); - rsN.d = VEC_CMP_EQ(N_packed256, rs); -#endif - distm = distm1D[i]; - _1_distm = VEC_SUB(packed1.d, distm); - - #ifdef KARTHIK - distm = VEC_DIV(distm, packed3.d); - #endif - /* initialize M_t_2, M_t_1, X_t_2, X_t_1, Y_t_2, Y_t_1 */ - M_t_2.d = VEC_SET1_VAL(zero); - X_t_2.d = VEC_SET1_VAL(zero); - - if (i==0) { - M_t_1.d = VEC_SET1_VAL(zero); - X_t_1.d = VEC_SET1_VAL(zero); - Y_t_2.d = VEC_SET_LSE(init_Y); - Y_t_1.d = VEC_SET1_VAL(zero); - } - else { - X_t_1.d = VEC_SET_LSE(shiftOutX[AVX_LENGTH]); - M_t_1.d = VEC_SET_LSE(shiftOutM[AVX_LENGTH]); - Y_t_2.d = VEC_SET1_VAL(zero); - Y_t_1.d = VEC_SET1_VAL(zero); - } - M_t_1_y = M_t_1; -} - - - -inline _256_TYPE GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM, SIMD_TYPE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256, - _256_TYPE distm, _256_TYPE _1_distm) -{ - UNION_TYPE hapN, rshap; - _256_TYPE cond; - IF_32 shiftInHap; - - int *hap_ptr = tc->ihap; - - shiftInHap.i = (d NUMBER GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_,SIMD_TYPE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL) -{ - _256_TYPE p_MM [MAVX_COUNT], p_GAPM [MAVX_COUNT], p_MX [MAVX_COUNT]; - _256_TYPE p_XX [MAVX_COUNT], p_MY [MAVX_COUNT], p_YY [MAVX_COUNT]; - _256_TYPE distm1D[MAVX_COUNT]; - NUMBER shiftOutM[MROWS+MCOLS+AVX_LENGTH], shiftOutX[MROWS+MCOLS+AVX_LENGTH], shiftOutY[MROWS+MCOLS+AVX_LENGTH]; - UNION_TYPE M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y; - _256_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY; - - struct timeval start, end; - NUMBER result_avx2; - Context ctx; - UNION_TYPE rs , rsN; - HAP_TYPE hap; - _256_TYPE distmSel, distmChosen ; - _256_TYPE distm, _1_distm; - - int r, c; - int ROWS = tc->rslen + 1; - int COLS = tc->haplen + 1; - int AVX_COUNT = (ROWS+7)/8; - NUMBER zero = ctx._(0.0); - UNION_TYPE packed1; packed1.d = VEC_SET1_VAL(1.0); - _256_TYPE N_packed256 = VEC_POPCVT_CHAR('N'); - NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); - int remainingRows = (ROWS-1) % AVX_LENGTH; - int strip_cnt = ((ROWS-1) / AVX_LENGTH) + (remainingRows!=0); - -#ifdef MUSTAFA - const int maskBitCnt = MAIN_TYPE_SIZE ; - const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function - - MASK_TYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ; - GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(*tc, COLS, numMaskVecs, maskArr) ; - - char rsArr[AVX_LENGTH] ; - MASK_TYPE lastMaskShiftOut[AVX_LENGTH] ; -#endif - GEN_INTRINSIC(GEN_INTRINSIC(initializeVectors,SIMD_TYPE), PRECISION)(ROWS, COLS, shiftOutM, shiftOutX, shiftOutY, - ctx, tc, p_MM, p_GAPM, p_MX, p_XX, p_MY, p_YY, distm1D); - - //for (int __ii=0; __ii < 10; ++__ii) - for (int i=0;i=0;i--) - printf("%f ", *(p+i)); - printf("\n"); -} - -void print128b_D(__m128d x) -{ - double *p = (double *)(&x); - for (int i=1;i>=0;i--) - printf("%f ", *(p+i)); - printf("\n"); -} int main() { -/* - IF_128f x; - x.f = _mm_set_ps(1.0, 2.0, 3.0, 4.0); - IF_32 shiftIn, shiftOut; - shiftIn.f = 5.0f; - print128b_F(x.f); - GEN_INTRINSIC(_vector_shift, s)(x, shiftIn, shiftOut); - print128b_F(x.f); + testcase* tc = new testcase[BATCH_SIZE]; + float result[BATCH_SIZE], result_avxf; + double result_avxd; + double lastClk = 0.0 ; + double aggregateTimeRead = 0.0; + double aggregateTimeCompute = 0.0; + double aggregateTimeWrite = 0.0; - IF_128d y; - y.f = _mm_set_pd(10.0, 11.0); - IF_64 shiftInd, shiftOutd; - shiftInd.f = 12.0; - print128b_D(y.f); - GEN_INTRINSIC(_vector_shift, d)(y, shiftInd, shiftOutd); - print128b_D(y.f); + // Need to call it once to initialize the static array + ConvertChar::init() ; - exit(0); -*/ + // char* ompEnvVar = getenv("OMP_NUM_THREADS") ; + // if (ompEnvVar != NULL && ompEnvVar != "" && ompEnvVar != "1" ) { + // thread_level_parallelism_enabled = true ; + // } - testcase* tc = new testcase[BATCH_SIZE]; - float result[BATCH_SIZE], result_avxf; - double result_avxd; - //struct timeval start, end; - double lastClk = 0.0 ; - double aggregateTimeRead = 0.0; - double aggregateTimeCompute = 0.0; - double aggregateTimeWrite = 0.0; + bool noMoreData = false; + int count =0; + while (!noMoreData) + { + int read_count = BATCH_SIZE; - // Need to call it once to initialize the static array - ConvertChar::init() ; - + lastClk = getCurrClk() ; + for (int b=0;b(&tc[b]); - //gettimeofday(&start, NULL); - lastClk = getCurrClk() ; - -//#pragma omp parallel for schedule(dynamic) if(thread_level_parallelism_enabled) - for (int b=0;b(&tc[b]); - - #ifdef RUN_HYBRID - #define MIN_ACCEPTED 1e-28f - if (result_avxf < MIN_ACCEPTED) { - //printf("**************** RUNNING DOUBLE ******************\n"); - count++; - result_avxd = GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_, SIMD_TYPE), d)(&tc[b]); - result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f)); - } - else - result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); - #endif - - #ifndef RUN_HYBRID - result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); - #endif - - } - //gettimeofday(&end, NULL); - aggregateTimeCompute += (getCurrClk() - lastClk) ; - //((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)); - - //gettimeofday(&start, NULL); - lastClk = getCurrClk() ; - //for (int b=0;b(&tc[b]); + result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f)); + } + else + result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); +#endif +#ifndef RUN_HYBRID + result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); +#endif } + aggregateTimeCompute += (getCurrClk() - lastClk) ; + lastClk = getCurrClk() ; + for (int b=0;bhaplen = strlen(tc->hap); tc->rslen = strlen(tc->rs); - assert(tc->rslen < MROWS); + //assert(tc->rslen < MROWS); tc->ihap = (int *) malloc(tc->haplen*sizeof(int)); tc->irs = (int *) malloc(tc->rslen*sizeof(int)); @@ -206,7 +206,7 @@ int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat) //cout << "Lengths "<haplen <<" "<rslen<<"\n"; memcpy(tc->rs, tokens[1].c_str(),tokens[1].size()); assert(tokens.size() == 2 + 4*(tc->rslen)); - assert(tc->rslen < MROWS); + //assert(tc->rslen < MROWS); for(unsigned j=0;jrslen;++j) tc->q[j] = (char)convToInt(tokens[2+0*tc->rslen+j]); for(unsigned j=0;jrslen;++j) diff --git a/PairHMM_JNI/vector_defs.h b/PairHMM_JNI/vector_defs.h index 7958480f2..80b48ae99 100644 --- a/PairHMM_JNI/vector_defs.h +++ b/PairHMM_JNI/vector_defs.h @@ -1,9 +1,9 @@ -#undef SIMD_TYPE -#undef SIMD_TYPE_AVX -#undef SIMD_TYPE_SSE +#undef SIMD_ENGINE +#undef SIMD_ENGINE_AVX +#undef SIMD_ENGINE_SSE -#define SIMD_TYPE avx -#define SIMD_TYPE_AVX +#define SIMD_ENGINE avx +#define SIMD_ENGINE_AVX #include "define-float.h" #include "vector_function_prototypes.h" @@ -11,11 +11,11 @@ #include "define-double.h" #include "vector_function_prototypes.h" -#undef SIMD_TYPE -#undef SIMD_TYPE_AVX +#undef SIMD_ENGINE +#undef SIMD_ENGINE_AVX -#define SIMD_TYPE sse -#define SIMD_TYPE_SSE +#define SIMD_ENGINE sse +#define SIMD_ENGINE_SSE #include "define-sse-float.h" @@ -24,7 +24,7 @@ #include "define-sse-double.h" #include "vector_function_prototypes.h" -#undef SIMD_TYPE -#undef SIMD_TYPE_AVX -#undef SIMD_TYPE_SSE +#undef SIMD_ENGINE +#undef SIMD_ENGINE_AVX +#undef SIMD_ENGINE_SSE diff --git a/PairHMM_JNI/vector_function_prototypes.h b/PairHMM_JNI/vector_function_prototypes.h index ce9cc2abc..67a2667e1 100644 --- a/PairHMM_JNI/vector_function_prototypes.h +++ b/PairHMM_JNI/vector_function_prototypes.h @@ -1,19 +1,19 @@ -inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift,SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut); -inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last,SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn); -inline void GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]); -inline void GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess); -inline void GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_,SIMD_TYPE), PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt); -inline void GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec,SIMD_TYPE), PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen); -template inline void GEN_INTRINSIC(GEN_INTRINSIC(initializeVectors,SIMD_TYPE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context ctx, testcase *tc, _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D); -template inline void GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION,SIMD_TYPE), PRECISION)( - int stripIdx, Context ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY, - _256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm, _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM , - _256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, +inline void CONCAT(CONCAT(_vector_shift,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut); +inline void CONCAT(CONCAT(_vector_shift_last,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn); +inline void CONCAT(CONCAT(precompute_masks_,SIMD_ENGINE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]); +inline void CONCAT(CONCAT(init_masks_for_row_,SIMD_ENGINE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess); +inline void CONCAT(CONCAT(update_masks_for_cols_,SIMD_ENGINE), PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt); +inline void CONCAT(CONCAT(computeDistVec,SIMD_ENGINE), PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, SIMD_TYPE& distm, SIMD_TYPE& _1_distm, SIMD_TYPE& distmChosen); +template inline void CONCAT(CONCAT(initializeVectors,SIMD_ENGINE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context ctx, testcase *tc, SIMD_TYPE *p_MM, SIMD_TYPE *p_GAPM, SIMD_TYPE *p_MX, SIMD_TYPE *p_XX, SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, SIMD_TYPE *distm1D); +template inline void CONCAT(CONCAT(stripINITIALIZATION,SIMD_ENGINE), PRECISION)( + int stripIdx, Context ctx, testcase *tc, SIMD_TYPE &pGAPM, SIMD_TYPE &pMM, SIMD_TYPE &pMX, SIMD_TYPE &pXX, SIMD_TYPE &pMY, SIMD_TYPE &pYY, + SIMD_TYPE &rs, UNION_TYPE &rsN, SIMD_TYPE &distm, SIMD_TYPE &_1_distm, SIMD_TYPE *distm1D, SIMD_TYPE N_packed256, SIMD_TYPE *p_MM , SIMD_TYPE *p_GAPM , + SIMD_TYPE *p_MX, SIMD_TYPE *p_XX , SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM); -inline _256_TYPE GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM,SIMD_TYPE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256, - _256_TYPE distm, _256_TYPE _1_distm); -inline void GEN_INTRINSIC(GEN_INTRINSIC(computeMXY,SIMD_TYPE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, +inline SIMD_TYPE CONCAT(CONCAT(computeDISTM,SIMD_ENGINE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, SIMD_TYPE rs, UNION_TYPE rsN, SIMD_TYPE N_packed256, + SIMD_TYPE distm, SIMD_TYPE _1_distm); +inline void CONCAT(CONCAT(computeMXY,SIMD_ENGINE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1, - _256_TYPE pMM, _256_TYPE pGAPM, _256_TYPE pMX, _256_TYPE pXX, _256_TYPE pMY, _256_TYPE pYY, _256_TYPE distmSel); -template NUMBER GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_,SIMD_TYPE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL); + SIMD_TYPE pMM, SIMD_TYPE pGAPM, SIMD_TYPE pMX, SIMD_TYPE pXX, SIMD_TYPE pMY, SIMD_TYPE pYY, SIMD_TYPE distmSel); +template NUMBER CONCAT(CONCAT(compute_full_prob_,SIMD_ENGINE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL);