diff --git a/PairHMM_JNI/avx_function_instantiations.cc b/PairHMM_JNI/avx_function_instantiations.cc
index b236ddc8f..4118fc5cf 100644
--- a/PairHMM_JNI/avx_function_instantiations.cc
+++ b/PairHMM_JNI/avx_function_instantiations.cc
@@ -16,3 +16,4 @@
 
 template double compute_full_prob_avxd(testcase* tc, double* nextlog);
 template float compute_full_prob_avxs(testcase* tc, float* nextlog);
+
diff --git a/PairHMM_JNI/define-double.h b/PairHMM_JNI/define-double.h
index 79c6b323f..502b919fe 100644
--- a/PairHMM_JNI/define-double.h
+++ b/PairHMM_JNI/define-double.h
@@ -47,6 +47,7 @@
 #undef MASK_ALL_ONES
 #undef COMPARE_VECS(__v1, __v2)
 #undef _256_INT_TYPE
+#undef BITMASK_VEC
 #endif
 
 #define PRECISION d
@@ -156,3 +157,32 @@
 } \
 } \
 }
+
+class BitMaskVec_double {
+
+  MASK_VEC low_, high_ ;
+  _256_TYPE combined_ ;
+
+public:
+
+  inline MASK_TYPE& getLowEntry(int index) {
+    return low_.masks[index] ;
+  }
+  inline MASK_TYPE& getHighEntry(int index) {
+    return high_.masks[index] ;
+  }
+
+  inline const _256_TYPE& getCombinedMask() {
+    VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ;
+
+    return combined_ ;
+  }
+
+  inline void shift_left_1bit() {
+    VEC_SHIFT_LEFT_1BIT(low_.vec) ;
+    VEC_SHIFT_LEFT_1BIT(high_.vec) ;
+  }
+
+} ;
+
+#define BITMASK_VEC BitMaskVec_double
diff --git a/PairHMM_JNI/define-float.h b/PairHMM_JNI/define-float.h
index 25ebd1489..3cc57ec38 100644
--- a/PairHMM_JNI/define-float.h
+++ b/PairHMM_JNI/define-float.h
@@ -47,6 +47,7 @@
 #undef MASK_ALL_ONES
 #undef COMPARE_VECS(__v1, __v2)
 #undef _256_INT_TYPE
+#undef BITMASK_VEC
 #endif
 
 #define PRECISION s
@@ -157,3 +158,31 @@
 } \
 }
 
+class BitMaskVec_float {
+
+  MASK_VEC low_, high_ ;
+  _256_TYPE combined_ ;
+
+public:
+
+  inline MASK_TYPE& getLowEntry(int index) {
+    return low_.masks[index] ;
+  }
+  inline MASK_TYPE& getHighEntry(int index) {
+    return high_.masks[index] ;
+  }
+
+  inline const _256_TYPE& getCombinedMask() {
+    VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ;
+
+    return combined_ ;
+  }
+
+  inline void shift_left_1bit() {
+    VEC_SHIFT_LEFT_1BIT(low_.vec) ;
+    VEC_SHIFT_LEFT_1BIT(high_.vec) ;
+  }
+
+} ;
+
+#define BITMASK_VEC BitMaskVec_float
diff --git a/PairHMM_JNI/define-sse-double.h b/PairHMM_JNI/define-sse-double.h
index e48325ba9..a30b2e5f5 100644
--- a/PairHMM_JNI/define-sse-double.h
+++ b/PairHMM_JNI/define-sse-double.h
@@ -47,7 +47,7 @@
 #undef MASK_ALL_ONES
 #undef COMPARE_VECS(__v1, __v2)
 #undef _256_INT_TYPE
-
+#undef BITMASK_VEC
 #endif
 
 #define SSE
@@ -69,7 +69,7 @@
 #define HAP_TYPE __m128i
 #define MASK_TYPE uint64_t
 #define MASK_ALL_ONES 0xFFFFFFFFFFFFFFFFL
-#define MASK_VEC MaskVec_D128
+#define MASK_VEC MaskVec_D
 
 #define VEC_EXTRACT_UNIT(__v1, __im) \
   _mm_extract_epi64(__v1, __im)
@@ -123,6 +123,31 @@
   __vdst = _mm_castsi128_pd(_mm_set_epi64(__vsHigh, __vsLow))
 
 #define VEC_SHIFT_LEFT_1BIT(__vs) \
-  __vs = _mm_slli_si64(__vs, 1)
+  __vs = _mm_slli_epi64(__vs, 1)
 
+class BitMaskVec_sse_double {
+
+  MASK_VEC combined_ ;
+
+public:
+
+  inline MASK_TYPE& getLowEntry(int index) {
+    return combined_.masks[index] ;
+  }
+  inline MASK_TYPE& getHighEntry(int index) {
+    return combined_.masks[AVX_LENGTH/2+index] ;
+  }
+
+  inline const _256_TYPE& getCombinedMask() {
+    return combined_.vecf ;
+  }
+
+  inline void shift_left_1bit() {
+    VEC_SHIFT_LEFT_1BIT(combined_.vec) ;
+  }
+
+} ;
+
+#define BITMASK_VEC BitMaskVec_sse_double
+
diff --git a/PairHMM_JNI/define-sse-float.h b/PairHMM_JNI/define-sse-float.h
index f5758c74a..6612b28e6 100644
--- a/PairHMM_JNI/define-sse-float.h
+++ b/PairHMM_JNI/define-sse-float.h
@@ -47,7 +47,7 @@
 #undef MASK_ALL_ONES
 #undef COMPARE_VECS(__v1, __v2)
 #undef _256_INT_TYPE
-
+#undef BITMASK_VEC
 #endif
 
 #define SSE
@@ -69,7 +69,7 @@
 #define HAP_TYPE UNION_TYPE
 #define MASK_TYPE uint32_t
 #define MASK_ALL_ONES 0xFFFFFFFF
-#define MASK_VEC MaskVec_F128
+#define MASK_VEC MaskVec_F
 
 #define VEC_EXTRACT_UNIT(__v1, __im) \
   _mm_extract_epi32(__v1, __im)
@@ -123,5 +123,29 @@
   __vdst = _mm_cvtpi32x2_ps(__vsLow, __vsHigh)
 
 #define VEC_SHIFT_LEFT_1BIT(__vs) \
-  __vs = _mm_slli_pi32(__vs, 1)
+  __vs = _mm_slli_epi32(__vs, 1)
 
+class BitMaskVec_sse_float {
+
+  MASK_VEC combined_ ;
+
+public:
+
+  inline MASK_TYPE& getLowEntry(int index) {
+    return combined_.masks[index] ;
+  }
+  inline MASK_TYPE& getHighEntry(int index) {
+    return combined_.masks[AVX_LENGTH/2+index] ;
+  }
+
+  inline const _256_TYPE& getCombinedMask() {
+    return combined_.vecf ;
+  }
+
+  inline void shift_left_1bit() {
+    VEC_SHIFT_LEFT_1BIT(combined_.vec) ;
+  }
+
+} ;
+
+#define BITMASK_VEC BitMaskVec_sse_float
diff --git a/PairHMM_JNI/libJNILoglessPairHMM.so b/PairHMM_JNI/libJNILoglessPairHMM.so
index e18552b6b..eed821863 100755
Binary files a/PairHMM_JNI/libJNILoglessPairHMM.so and b/PairHMM_JNI/libJNILoglessPairHMM.so differ
diff --git a/PairHMM_JNI/pairhmm-template-kernel.cc b/PairHMM_JNI/pairhmm-template-kernel.cc
index a6b027ddd..66dc557aa 100644
--- a/PairHMM_JNI/pairhmm-template-kernel.cc
+++ b/PairHMM_JNI/pairhmm-template-kernel.cc
@@ -23,6 +23,7 @@ string getBinaryStr (T val, int numBitsToWrite) {
 */
 
 #ifdef MUSTAFA
+
 void GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) {
 
   const int maskBitCnt = MAIN_TYPE_SIZE ;
@@ -77,28 +78,25 @@ void GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(cons
 }
 
-void GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_, SIMD_TYPE), PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt) {
+void GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_, SIMD_TYPE), PRECISION)(int maskIndex, BITMASK_VEC& bitMaskVec, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, int maskBitCnt) {
 
   for (int ei=0; ei < AVX_LENGTH/2; ++ei) {
-    SET_MASK_WORD(currMaskVecLow.masks[ei], maskArr[maskIndex][rsArr[ei]],
+    SET_MASK_WORD(bitMaskVec.getLowEntry(ei), maskArr[maskIndex][rsArr[ei]],
                   lastMaskShiftOut[ei], ei, maskBitCnt) ;
 
     int ei2 = ei + AVX_LENGTH/2 ; // the second entry index
-    SET_MASK_WORD(currMaskVecHigh.masks[ei], maskArr[maskIndex][rsArr[ei2]],
+    SET_MASK_WORD(bitMaskVec.getHighEntry(ei), maskArr[maskIndex][rsArr[ei2]],
                   lastMaskShiftOut[ei2], ei2, maskBitCnt) ;
   }
 }
 
-//void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen, const _256_TYPE& distmSel, int firstRowIndex, int lastRowIndex) {
+//void GEN_INTRINSIC(computeDistVec, PRECISION) (BITMASK_VEC& bitMaskVec, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen, const _256_TYPE& distmSel, int firstRowIndex, int lastRowIndex) {
 
-inline void GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec, SIMD_TYPE), PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen) {
+inline void GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec, SIMD_TYPE), PRECISION) (BITMASK_VEC& bitMaskVec, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen) {
 //#define computeDistVec() {
 
-  _256_TYPE maskV ;
-  VEC_SSE_TO_AVX(currMaskVecLow.vecf, currMaskVecHigh.vecf, maskV) ;
-
 #ifdef DEBUGG
   long long *temp1 = (long long *)(&maskV);
   double *temp2 = (double *)(&distm);
@@ -106,16 +104,11 @@ inline void GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec, SIMD_TYPE), PRECISION) (
   printf("***\n%lx\n%lx\n%f\n%f\n%f\n%f\n***\n", temp1[0], temp1[1], temp2[0], temp2[1], temp3[0], temp3[1]);
 #endif
 
-  distmChosen = VEC_BLENDV(distm, _1_distm, maskV) ;
+  distmChosen = VEC_BLENDV(distm, _1_distm, bitMaskVec.getCombinedMask()) ;
 
   /*COMPARE_VECS(distmChosen, distmSel, firstRowIndex, lastRowIndex) ;*/
 
-  VEC_SHIFT_LEFT_1BIT(currMaskVecLow.vec) ;
-  VEC_SHIFT_LEFT_1BIT(currMaskVecHigh.vec) ;
-
-#ifdef SIMD_TYPE_SSE
-  _mm_empty();
-#endif
+  bitMaskVec.shift_left_1bit() ;
 }
 
@@ -349,35 +342,41 @@ template<class NUMBER> NUMBER GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_,SIM
     GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(*tc, rsArr, lastMaskShiftOut, i*AVX_LENGTH+1, AVX_LENGTH) ;
 #endif
 
     // Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
-    MASK_VEC currMaskVecLow ; // corresponding to lower half
-    MASK_VEC currMaskVecHigh ; // corresponding to upper half
-    for (int d=1;d…
@@ … @@ template<class NUMBER> NUMBER GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_,SIM
     sumX = VEC_SET1_VAL(zero);
 
     // Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
-    MASK_VEC currMaskVecLow ; // corresponding to lower half
-    MASK_VEC currMaskVecHigh ; // corresponding to upper half
+    BITMASK_VEC bitMaskVec ;
 
-    for (int d=1;d…
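
For readers who want the shape of the change without the macro indirection, the sketch below shows the pattern the new BITMASK_VEC classes implement: per-lane match bitmasks live in an integer vector, the sign bit of each lane drives a blend between distm and (1 - distm), and a 1-bit left shift exposes the next anti-diagonal's bit. This is a hypothetical standalone illustration, not code from this patch: the names MaskVec128Sketch and BitMaskVecSketch are invented, raw SSE2/SSE4.1 intrinsics stand in for the repo's GEN_INTRINSIC/MASK_VEC machinery, and only the single-vector case (as in BitMaskVec_sse_double) is modeled; the AVX classes instead keep two SSE halves and fuse them with VEC_SSE_TO_AVX because AVX1 lacks 256-bit integer shifts. Compile with, e.g., g++ -msse4.1.

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the repo's MaskVec_D union: one 128-bit register
// viewed as integers (for shifting), doubles (for blendv), or two 64-bit masks.
// The union type punning mirrors what the actual MASK_VEC unions do.
union MaskVec128Sketch {
  __m128i  vec;       // integer view, used by the 1-bit shift
  __m128d  vecf;      // floating view, consumed by _mm_blendv_pd
  uint64_t masks[2];  // scalar view, for loading per-lane match masks
};

// Hypothetical analogue of BitMaskVec_sse_double from the patch.
struct BitMaskVecSketch {
  MaskVec128Sketch combined_;

  uint64_t& entry(int i)           { return combined_.masks[i]; }
  const __m128d& getCombinedMask() { return combined_.vecf; }
  // Advance every lane's mask by one bit, moving the next diagonal's bit
  // into the sign position (the bit _mm_blendv_pd tests).
  void shift_left_1bit()           { combined_.vec = _mm_slli_epi64(combined_.vec, 1); }
};

int main() {
  BitMaskVecSketch bm;
  bm.entry(0) = 1ULL << 63; // lane 0 "matches" on the first diagonal
  bm.entry(1) = 1ULL << 62; // lane 1 "matches" one diagonal later

  __m128d distm = _mm_set1_pd(0.25); // mismatch probability
  __m128d one_m = _mm_set1_pd(0.75); // match probability (1 - distm)

  for (int d = 0; d < 2; ++d) {
    // Sign bit set in a lane -> pick one_m, else distm (what VEC_BLENDV does).
    __m128d chosen = _mm_blendv_pd(distm, one_m, bm.getCombinedMask());
    double out[2];
    _mm_storeu_pd(out, chosen);
    printf("diagonal %d: lane0=%.2f lane1=%.2f\n", d, out[0], out[1]);
    bm.shift_left_1bit();
  }
  return 0;
}

Expected output is "diagonal 0: lane0=0.75 lane1=0.25" followed by "diagonal 1: lane0=0.25 lane1=0.75", mirroring how computeDistVec consumes one mask bit per diagonal via VEC_BLENDV and then calls shift_left_1bit(); hiding the one-vector vs. two-halves layout behind getCombinedMask()/shift_left_1bit() is what lets the SSE and AVX builds share the kernel code above.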