Removed the MMX instructions from the SSE version of the code. Mask operations are now handled by a class that is defined separately for each of the SSE and AVX implementations.
This commit is contained in:
parent
7180c392af
commit
0170d4f3d5
|
|
@ -16,3 +16,4 @@
|
|||
|
||||
template double compute_full_prob_avxd<double>(testcase* tc, double* nextlog);
|
||||
template float compute_full_prob_avxs<float>(testcase* tc, float* nextlog);
|
||||
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@
|
|||
#undef MASK_ALL_ONES
|
||||
#undef COMPARE_VECS(__v1, __v2)
|
||||
#undef _256_INT_TYPE
|
||||
#undef BITMASK_VEC
|
||||
#endif
|
||||
|
||||
#define PRECISION d
|
||||
|
|
@ -156,3 +157,32 @@
|
|||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
class BitMaskVec_double {
|
||||
|
||||
MASK_VEC low_, high_ ;
|
||||
_256_TYPE combined_ ;
|
||||
|
||||
public:
|
||||
|
||||
inline MASK_TYPE& getLowEntry(int index) {
|
||||
return low_.masks[index] ;
|
||||
}
|
||||
inline MASK_TYPE& getHighEntry(int index) {
|
||||
return high_.masks[index] ;
|
||||
}
|
||||
|
||||
inline const _256_TYPE& getCombinedMask() {
|
||||
VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ;
|
||||
|
||||
return combined_ ;
|
||||
}
|
||||
|
||||
inline void shift_left_1bit() {
|
||||
VEC_SHIFT_LEFT_1BIT(low_.vec) ;
|
||||
VEC_SHIFT_LEFT_1BIT(high_.vec) ;
|
||||
}
|
||||
|
||||
} ;
|
||||
|
||||
#define BITMASK_VEC BitMaskVec_double
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@
|
|||
#undef MASK_ALL_ONES
|
||||
#undef COMPARE_VECS(__v1, __v2)
|
||||
#undef _256_INT_TYPE
|
||||
#undef BITMASK_VEC
|
||||
#endif
|
||||
|
||||
#define PRECISION s
|
||||
|
|
@ -157,3 +158,31 @@
|
|||
} \
|
||||
}
|
||||
|
||||
class BitMaskVec_float {
|
||||
|
||||
MASK_VEC low_, high_ ;
|
||||
_256_TYPE combined_ ;
|
||||
|
||||
public:
|
||||
|
||||
inline MASK_TYPE& getLowEntry(int index) {
|
||||
return low_.masks[index] ;
|
||||
}
|
||||
inline MASK_TYPE& getHighEntry(int index) {
|
||||
return high_.masks[index] ;
|
||||
}
|
||||
|
||||
inline const _256_TYPE& getCombinedMask() {
|
||||
VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ;
|
||||
|
||||
return combined_ ;
|
||||
}
|
||||
|
||||
inline void shift_left_1bit() {
|
||||
VEC_SHIFT_LEFT_1BIT(low_.vec) ;
|
||||
VEC_SHIFT_LEFT_1BIT(high_.vec) ;
|
||||
}
|
||||
|
||||
} ;
|
||||
|
||||
#define BITMASK_VEC BitMaskVec_float
|
||||
|
|
|
|||
|
|
@ -47,7 +47,7 @@
|
|||
#undef MASK_ALL_ONES
|
||||
#undef COMPARE_VECS(__v1, __v2)
|
||||
#undef _256_INT_TYPE
|
||||
|
||||
#undef BITMASK_VEC
|
||||
#endif
|
||||
|
||||
#define SSE
|
||||
|
|
@ -69,7 +69,7 @@
|
|||
#define HAP_TYPE __m128i
|
||||
#define MASK_TYPE uint64_t
|
||||
#define MASK_ALL_ONES 0xFFFFFFFFFFFFFFFFL
|
||||
#define MASK_VEC MaskVec_D128
|
||||
#define MASK_VEC MaskVec_D
|
||||
|
||||
#define VEC_EXTRACT_UNIT(__v1, __im) \
|
||||
_mm_extract_epi64(__v1, __im)
|
||||
|
|
@ -123,6 +123,31 @@
|
|||
__vdst = _mm_castsi128_pd(_mm_set_epi64(__vsHigh, __vsLow))
|
||||
|
||||
#define VEC_SHIFT_LEFT_1BIT(__vs) \
|
||||
__vs = _mm_slli_si64(__vs, 1)
|
||||
__vs = _mm_slli_epi64(__vs, 1)
|
||||
|
||||
|
||||
class BitMaskVec_sse_double {
|
||||
|
||||
MASK_VEC combined_ ;
|
||||
|
||||
public:
|
||||
|
||||
inline MASK_TYPE& getLowEntry(int index) {
|
||||
return combined_.masks[index] ;
|
||||
}
|
||||
inline MASK_TYPE& getHighEntry(int index) {
|
||||
return combined_.masks[AVX_LENGTH/2+index] ;
|
||||
}
|
||||
|
||||
inline const _256_TYPE& getCombinedMask() {
|
||||
return combined_.vecf ;
|
||||
}
|
||||
|
||||
inline void shift_left_1bit() {
|
||||
VEC_SHIFT_LEFT_1BIT(combined_.vec) ;
|
||||
}
|
||||
|
||||
} ;
|
||||
|
||||
#define BITMASK_VEC BitMaskVec_sse_double
|
||||
|
||||
|
|
|
|||
|
|
@ -47,7 +47,7 @@
|
|||
#undef MASK_ALL_ONES
|
||||
#undef COMPARE_VECS(__v1, __v2)
|
||||
#undef _256_INT_TYPE
|
||||
|
||||
#undef BITMASK_VEC
|
||||
#endif
|
||||
|
||||
#define SSE
|
||||
|
|
@ -69,7 +69,7 @@
|
|||
#define HAP_TYPE UNION_TYPE
|
||||
#define MASK_TYPE uint32_t
|
||||
#define MASK_ALL_ONES 0xFFFFFFFF
|
||||
#define MASK_VEC MaskVec_F128
|
||||
#define MASK_VEC MaskVec_F
|
||||
|
||||
#define VEC_EXTRACT_UNIT(__v1, __im) \
|
||||
_mm_extract_epi32(__v1, __im)
|
||||
|
|
@ -123,5 +123,29 @@
|
|||
__vdst = _mm_cvtpi32x2_ps(__vsLow, __vsHigh)
|
||||
|
||||
#define VEC_SHIFT_LEFT_1BIT(__vs) \
|
||||
__vs = _mm_slli_pi32(__vs, 1)
|
||||
__vs = _mm_slli_epi32(__vs, 1)
|
||||
|
||||
class BitMaskVec_sse_float {
|
||||
|
||||
MASK_VEC combined_ ;
|
||||
|
||||
public:
|
||||
|
||||
inline MASK_TYPE& getLowEntry(int index) {
|
||||
return combined_.masks[index] ;
|
||||
}
|
||||
inline MASK_TYPE& getHighEntry(int index) {
|
||||
return combined_.masks[AVX_LENGTH/2+index] ;
|
||||
}
|
||||
|
||||
inline const _256_TYPE& getCombinedMask() {
|
||||
return combined_.vecf ;
|
||||
}
|
||||
|
||||
inline void shift_left_1bit() {
|
||||
VEC_SHIFT_LEFT_1BIT(combined_.vec) ;
|
||||
}
|
||||
|
||||
} ;
|
||||
|
||||
#define BITMASK_VEC BitMaskVec_sse_float
|
||||
|
|
|
|||
|
|
@ -19,6 +19,12 @@ LoadTimeInitializer g_load_time_initializer;
|
|||
#define BATCH_SIZE 10000
|
||||
#define RUN_HYBRID
|
||||
|
||||
// Returns the current time reported by gettimeofday() as a double:
// whole seconds plus the microsecond field scaled to fractional seconds.
double getCurrClk() {
  struct timeval now ;
  gettimeofday(&now, NULL);
  const double wholeSeconds = (double)now.tv_sec;
  const double microSeconds = (double)now.tv_usec;
  return wholeSeconds + microSeconds / 1000000.0;
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
if(argc < 2)
|
||||
|
|
@ -29,6 +35,10 @@ int main(int argc, char** argv)
|
|||
bool use_old_read_testcase = false;
|
||||
if(argc >= 3 && string(argv[2]) == "1")
|
||||
use_old_read_testcase = true;
|
||||
unsigned chunk_size = 100;
|
||||
if(argc >= 4)
|
||||
chunk_size = strtol(argv[3],0,10);
|
||||
|
||||
|
||||
initialize_function_pointers();
|
||||
|
||||
|
|
@ -45,12 +55,24 @@ int main(int argc, char** argv)
|
|||
assert(ifptr.is_open());
|
||||
}
|
||||
|
||||
vector<testcase> tc_vector;
|
||||
tc_vector.clear();
|
||||
testcase tc;
|
||||
while(1)
|
||||
{
|
||||
int break_value = use_old_read_testcase ? read_testcase(&tc, fptr) : read_mod_testcase(ifptr,&tc,true);
|
||||
if(break_value < 0)
|
||||
break;
|
||||
tc_vector.push_back(tc);
|
||||
}
|
||||
vector<double> results_vec;
|
||||
results_vec.clear();
|
||||
results_vec.resize(tc_vector.size());
|
||||
double start_time = getCurrClk();
|
||||
#pragma omp parallel for schedule(dynamic,chunk_size) num_threads(12)
|
||||
for(unsigned i=0;i<tc_vector.size();++i)
|
||||
{
|
||||
testcase& tc = tc_vector[i];
|
||||
float result_avxf = g_compute_full_prob_float(&tc, 0);
|
||||
double result = 0;
|
||||
if (result_avxf < MIN_ACCEPTED) {
|
||||
|
|
@ -60,12 +82,23 @@ int main(int argc, char** argv)
|
|||
else
|
||||
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
|
||||
|
||||
results_vec[i] = result;
|
||||
}
|
||||
cout << "Time taken "<<getCurrClk()-start_time << "\n";
|
||||
for(unsigned i=0;i<tc_vector.size();++i)
|
||||
{
|
||||
testcase& tc = tc_vector[i];
|
||||
double baseline_result = compute_full_prob<double>(&tc);
|
||||
baseline_result = log10(baseline_result) - log10(ldexp(1.0, 1020.0));
|
||||
cout << std::scientific << baseline_result << " "<<result<<"\n";
|
||||
delete tc.rs;
|
||||
delete tc.hap;
|
||||
double abs_error = fabs(baseline_result-results_vec[i]);
|
||||
double rel_error = (baseline_result != 0) ? fabs(abs_error/baseline_result) : 0;
|
||||
if(abs_error > 1e-5 && rel_error > 1e-5)
|
||||
cout << std::scientific << baseline_result << " "<<results_vec[i]<<"\n";
|
||||
delete tc_vector[i].rs;
|
||||
delete tc_vector[i].hap;
|
||||
}
|
||||
results_vec.clear();
|
||||
tc_vector.clear();
|
||||
if(use_old_read_testcase)
|
||||
fclose(fptr);
|
||||
else
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ string getBinaryStr (T val, int numBitsToWrite) {
|
|||
*/
|
||||
#ifdef MUSTAFA
|
||||
|
||||
|
||||
void GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) {
|
||||
|
||||
const int maskBitCnt = MAIN_TYPE_SIZE ;
|
||||
|
|
@ -77,28 +78,25 @@ void GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(cons
|
|||
}
|
||||
|
||||
|
||||
void GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_, SIMD_TYPE), PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt) {
|
||||
void GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_, SIMD_TYPE), PRECISION)(int maskIndex, BITMASK_VEC& bitMaskVec, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, int maskBitCnt) {
|
||||
|
||||
for (int ei=0; ei < AVX_LENGTH/2; ++ei) {
|
||||
SET_MASK_WORD(currMaskVecLow.masks[ei], maskArr[maskIndex][rsArr[ei]],
|
||||
SET_MASK_WORD(bitMaskVec.getLowEntry(ei), maskArr[maskIndex][rsArr[ei]],
|
||||
lastMaskShiftOut[ei], ei, maskBitCnt) ;
|
||||
|
||||
int ei2 = ei + AVX_LENGTH/2 ; // the second entry index
|
||||
SET_MASK_WORD(currMaskVecHigh.masks[ei], maskArr[maskIndex][rsArr[ei2]],
|
||||
SET_MASK_WORD(bitMaskVec.getHighEntry(ei), maskArr[maskIndex][rsArr[ei2]],
|
||||
lastMaskShiftOut[ei2], ei2, maskBitCnt) ;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
//void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen, const _256_TYPE& distmSel, int firstRowIndex, int lastRowIndex) {
|
||||
//void GEN_INTRINSIC(computeDistVec, PRECISION) (BITMASK_VEC& bitMaskVec, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen, const _256_TYPE& distmSel, int firstRowIndex, int lastRowIndex) {
|
||||
|
||||
inline void GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec, SIMD_TYPE), PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen) {
|
||||
inline void GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec, SIMD_TYPE), PRECISION) (BITMASK_VEC& bitMaskVec, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen) {
|
||||
//#define computeDistVec() {
|
||||
|
||||
_256_TYPE maskV ;
|
||||
VEC_SSE_TO_AVX(currMaskVecLow.vecf, currMaskVecHigh.vecf, maskV) ;
|
||||
|
||||
#ifdef DEBUGG
|
||||
long long *temp1 = (long long *)(&maskV);
|
||||
double *temp2 = (double *)(&distm);
|
||||
|
|
@ -106,14 +104,11 @@ inline void GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec, SIMD_TYPE), PRECISION) (
|
|||
printf("***\n%lx\n%lx\n%f\n%f\n%f\n%f\n***\n", temp1[0], temp1[1], temp2[0], temp2[1], temp3[0], temp3[1]);
|
||||
#endif
|
||||
|
||||
distmChosen = VEC_BLENDV(distm, _1_distm, maskV) ;
|
||||
distmChosen = VEC_BLENDV(distm, _1_distm, bitMaskVec.getCombinedMask()) ;
|
||||
|
||||
/*COMPARE_VECS(distmChosen, distmSel, firstRowIndex, lastRowIndex) ;*/
|
||||
|
||||
VEC_SHIFT_LEFT_1BIT(currMaskVecLow.vec) ;
|
||||
VEC_SHIFT_LEFT_1BIT(currMaskVecHigh.vec) ;
|
||||
|
||||
_mm_empty();
|
||||
bitMaskVec.shift_left_1bit() ;
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -347,15 +342,16 @@ template<class NUMBER> NUMBER GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_,SIM
|
|||
GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(*tc, rsArr, lastMaskShiftOut, i*AVX_LENGTH+1, AVX_LENGTH) ;
|
||||
#endif
|
||||
// Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
|
||||
MASK_VEC currMaskVecLow ; // corresponding to lower half
|
||||
MASK_VEC currMaskVecHigh ; // corresponding to upper half
|
||||
|
||||
BITMASK_VEC bitMaskVec ;
|
||||
|
||||
for (int d=1;d<COLS+AVX_LENGTH;d++)
|
||||
{
|
||||
#ifdef MUSTAFA
|
||||
if (d % MAIN_TYPE_SIZE == 1)
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_,SIMD_TYPE), PRECISION)((d-1)/MAIN_TYPE_SIZE, currMaskVecLow, currMaskVecHigh, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec,SIMD_TYPE), PRECISION) (currMaskVecLow, currMaskVecHigh, distm, _1_distm, distmChosen) ;
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_,SIMD_TYPE), PRECISION)((d-1)/MAIN_TYPE_SIZE, bitMaskVec, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;
|
||||
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec,SIMD_TYPE), PRECISION) (bitMaskVec, distm, _1_distm, distmChosen) ;
|
||||
#else
|
||||
distmChosen = GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM,SIMD_TYPE), PRECISION)(d, COLS, tc, hap, rs.d, rsN, N_packed256, distm, _1_distm);
|
||||
#endif
|
||||
|
|
@ -391,15 +387,14 @@ template<class NUMBER> NUMBER GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_,SIM
|
|||
sumX = VEC_SET1_VAL(zero);
|
||||
|
||||
// Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
|
||||
MASK_VEC currMaskVecLow ; // corresponding to lower half
|
||||
MASK_VEC currMaskVecHigh ; // corresponding to upper half
|
||||
BITMASK_VEC bitMaskVec ;
|
||||
|
||||
for (int d=1;d<COLS+remainingRows-1;d++)
|
||||
{
|
||||
#ifdef MUSTAFA
|
||||
if (d % MAIN_TYPE_SIZE == 1)
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_, SIMD_TYPE),PRECISION)((d-1)/MAIN_TYPE_SIZE, currMaskVecLow, currMaskVecHigh, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec, SIMD_TYPE), PRECISION) (currMaskVecLow, currMaskVecHigh, distm, _1_distm, distmChosen) ;
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_, SIMD_TYPE),PRECISION)((d-1)/MAIN_TYPE_SIZE, bitMaskVec, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;
|
||||
GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec, SIMD_TYPE), PRECISION) (bitMaskVec, distm, _1_distm, distmChosen) ;
|
||||
#else
|
||||
distmChosen = GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM,SIMD_TYPE), PRECISION)(d, COLS, tc, hap, rs.d, rsN, N_packed256, distm, _1_distm);
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -35,8 +35,7 @@ bool is_sse42_supported()
|
|||
|
||||
void initialize_function_pointers()
|
||||
{
|
||||
//if(is_avx_supported())
|
||||
if(false)
|
||||
if(0 && is_avx_supported())
|
||||
{
|
||||
cout << "Using AVX accelerated implementation of PairHMM\n";
|
||||
g_compute_full_prob_float = compute_full_prob_avxs<float>;
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@
|
|||
#define SIMD_TYPE sse
|
||||
#define SIMD_TYPE_SSE
|
||||
|
||||
|
||||
#include "define-sse-float.h"
|
||||
#include "vector_function_prototypes.h"
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue