diff --git a/PairHMM_JNI/.gitignore b/PairHMM_JNI/.gitignore index 9722ad970..a68330124 100644 --- a/PairHMM_JNI/.gitignore +++ b/PairHMM_JNI/.gitignore @@ -8,3 +8,6 @@ pairhmm-template-main *.swp checker reformat +subdir_checkout.sh +avx/ +sse/ diff --git a/PairHMM_JNI/Makefile b/PairHMM_JNI/Makefile index 62dc3b0f6..b86958a6c 100644 --- a/PairHMM_JNI/Makefile +++ b/PairHMM_JNI/Makefile @@ -1,4 +1,4 @@ -OMPCFLAGS=-fopenmp +#OMPCFLAGS=-fopenmp #OMPLDFLAGS=-lgomp #CFLAGS=-O2 -std=c++11 -W -Wall -march=corei7-avx -Wa,-q -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas @@ -20,7 +20,7 @@ DEPDIR=.deps DF=$(DEPDIR)/$(*).d #Common across libJNI and sandbox -COMMON_SOURCES=utils.cc avx_function_instantiations.cc baseline.cc +COMMON_SOURCES=utils.cc avx_function_instantiations.cc baseline.cc sse_function_instantiations.cc #Part of libJNI LIBSOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc $(COMMON_SOURCES) SOURCES=$(LIBSOURCES) pairhmm-template-main.cc pairhmm-1-base.cc @@ -33,7 +33,7 @@ NO_VECTOR_SOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc pa #Use -xAVX for these files AVX_SOURCES=avx_function_instantiations.cc #Use -xSSE4.2 for these files -SSE_SOURCES= +SSE_SOURCES=sse_function_instantiations.cc NO_VECTOR_OBJECTS=$(NO_VECTOR_SOURCES:.cc=.o) AVX_OBJECTS=$(AVX_SOURCES:.cc=.o) diff --git a/PairHMM_JNI/avx_function_instantiations.cc b/PairHMM_JNI/avx_function_instantiations.cc index c29370561..b236ddc8f 100644 --- a/PairHMM_JNI/avx_function_instantiations.cc +++ b/PairHMM_JNI/avx_function_instantiations.cc @@ -1,5 +1,11 @@ #include "template.h" +#undef SIMD_TYPE +#undef SIMD_TYPE_SSE + +#define SIMD_TYPE avx +#define SIMD_TYPE_AVX + #include "define-float.h" #include "shift_template.c" #include "pairhmm-template-kernel.cc" diff --git a/PairHMM_JNI/avx_function_prototypes.h b/PairHMM_JNI/avx_function_prototypes.h deleted file mode 100644 index 1836a1c37..000000000 --- a/PairHMM_JNI/avx_function_prototypes.h +++ /dev/null @@ -1,19 +0,0 @@ -void GEN_INTRINSIC(_vector_shift, PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut); -void GEN_INTRINSIC(_vector_shift_last, PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn); -void GEN_INTRINSIC(precompute_masks_, PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]); -void GEN_INTRINSIC(init_masks_for_row_, PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess); -void GEN_INTRINSIC(update_masks_for_cols_, PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt); -void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen); -template void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context ctx, testcase *tc, _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D); -template void GEN_INTRINSIC(stripINITIALIZATION, PRECISION)( - int stripIdx, Context ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY, - _256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm, _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM , - _256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, - UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM); -_256_TYPE GEN_INTRINSIC(computeDISTM, PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256, - _256_TYPE distm, _256_TYPE _1_distm); -void GEN_INTRINSIC(computeMXY, PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, - UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1, - _256_TYPE pMM, _256_TYPE pGAPM, _256_TYPE pMX, _256_TYPE pXX, _256_TYPE pMY, _256_TYPE pYY, _256_TYPE distmSel); -template NUMBER GEN_INTRINSIC(compute_full_prob_avx, PRECISION) (testcase *tc, NUMBER *before_last_log = NULL); - diff --git a/PairHMM_JNI/define-double.h b/PairHMM_JNI/define-double.h index 5d31741d6..79c6b323f 100644 --- a/PairHMM_JNI/define-double.h +++ b/PairHMM_JNI/define-double.h @@ -18,34 +18,34 @@ #undef MASK_TYPE #undef MASK_ALL_ONES - #undef SET_VEC_ZERO - #undef VEC_OR - #undef VEC_ADD - #undef VEC_SUB - #undef VEC_MUL - #undef VEC_DIV - #undef VEC_BLEND - #undef VEC_BLENDV - #undef VEC_CAST_256_128 - #undef VEC_EXTRACT_128 - #undef VEC_EXTRACT_UNIT - #undef VEC_SET1_VAL128 - #undef VEC_MOVE - #undef VEC_CAST_128_256 - #undef VEC_INSERT_VAL - #undef VEC_CVT_128_256 - #undef VEC_SET1_VAL - #undef VEC_POPCVT_CHAR - #undef VEC_LDPOPCVT_CHAR - #undef VEC_CMP_EQ - #undef VEC_SET_LSE - #undef SHIFT_HAP - #undef print256b + #undef SET_VEC_ZERO(__vec) + #undef VEC_OR(__v1, __v2) + #undef VEC_ADD(__v1, __v2) + #undef VEC_SUB(__v1, __v2) + #undef VEC_MUL(__v1, __v2) + #undef VEC_DIV(__v1, __v2) + #undef VEC_BLEND(__v1, __v2, __mask) + #undef VEC_BLENDV(__v1, __v2, __maskV) + #undef VEC_CAST_256_128(__v1) + #undef VEC_EXTRACT_128(__v1, __im) + #undef VEC_EXTRACT_UNIT(__v1, __im) + #undef VEC_SET1_VAL128(__val) + #undef VEC_MOVE(__v1, __val) + #undef VEC_CAST_128_256(__v1) + #undef VEC_INSERT_VAL(__v1, __val, __pos) + #undef VEC_CVT_128_256(__v1) + #undef VEC_SET1_VAL(__val) + #undef VEC_POPCVT_CHAR(__ch) + #undef VEC_LDPOPCVT_CHAR(__addr) + #undef VEC_CMP_EQ(__v1, __v2) + #undef VEC_SET_LSE(__val) + #undef SHIFT_HAP(__v1, __val) + #undef print256b(__v1) #undef MASK_VEC - #undef VEC_SSE_TO_AVX - #undef VEC_SHIFT_LEFT_1BIT + #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) + #undef VEC_SHIFT_LEFT_1BIT(__vs) #undef MASK_ALL_ONES - #undef COMPARE_VECS + #undef COMPARE_VECS(__v1, __v2) #undef _256_INT_TYPE #endif @@ -83,7 +83,7 @@ #define VEC_MUL(__v1, __v2) \ _mm256_mul_pd(__v1, __v2) -#define VEC_DIV(__v1, __v2) \ +#define VEC_DIV(__v1, __v2) \ _mm256_div_pd(__v1, __v2) #define VEC_BLEND(__v1, __v2, __mask) \ diff --git a/PairHMM_JNI/define-float.h b/PairHMM_JNI/define-float.h index 0e2822b2d..25ebd1489 100644 --- a/PairHMM_JNI/define-float.h +++ b/PairHMM_JNI/define-float.h @@ -84,7 +84,7 @@ #define VEC_MUL(__v1, __v2) \ _mm256_mul_ps(__v1, __v2) -#define VEC_DIV(__v1, __v2) \ +#define VEC_DIV(__v1, __v2) \ _mm256_div_ps(__v1, __v2) #define VEC_BLEND(__v1, __v2, __mask) \ @@ -133,7 +133,7 @@ _mm256_set_ps(zero, zero, zero, zero, zero, zero, zero, __val); #define SHIFT_HAP(__v1, __val) \ - _vector_shift_lasts(__v1, __val.f); + _vector_shift_lastavxs(__v1, __val.f); #define print256b(__v1) \ print256bFP(__v1) diff --git a/PairHMM_JNI/define-sse-double.h b/PairHMM_JNI/define-sse-double.h new file mode 100644 index 000000000..e48325ba9 --- /dev/null +++ b/PairHMM_JNI/define-sse-double.h @@ -0,0 +1,128 @@ +#ifdef PRECISION + #undef PRECISION + #undef MAIN_TYPE + #undef MAIN_TYPE_SIZE + #undef UNION_TYPE + #undef IF_128 + #undef IF_MAIN_TYPE + #undef SHIFT_CONST1 + #undef SHIFT_CONST2 + #undef SHIFT_CONST3 + #undef _128_TYPE + #undef _256_TYPE + #undef AVX_LENGTH + #undef MAVX_COUNT + #undef HAP_TYPE + #undef MASK_TYPE + #undef MASK_ALL_ONES + + #undef VEC_EXTRACT_UNIT(__v1, __im) + #undef VEC_INSERT_UNIT(__v1,__ins,__im) + #undef SET_VEC_ZERO(__vec) + #undef VEC_OR(__v1, __v2) + #undef VEC_ADD(__v1, __v2) + #undef VEC_SUB(__v1, __v2) + #undef VEC_MUL(__v1, __v2) + #undef VEC_DIV(__v1, __v2) + #undef VEC_BLEND(__v1, __v2, __mask) + #undef VEC_BLENDV(__v1, __v2, __maskV) + #undef VEC_CAST_256_128(__v1) + #undef VEC_EXTRACT_128(__v1, __im) + #undef VEC_EXTRACT_UNIT(__v1, __im) + #undef VEC_SET1_VAL128(__val) + #undef VEC_MOVE(__v1, __val) + #undef VEC_CAST_128_256(__v1) + #undef VEC_INSERT_VAL(__v1, __val, __pos) + #undef VEC_CVT_128_256(__v1) + #undef VEC_SET1_VAL(__val) + #undef VEC_POPCVT_CHAR(__ch) + #undef VEC_LDPOPCVT_CHAR(__addr) + #undef VEC_CMP_EQ(__v1, __v2) + #undef VEC_SET_LSE(__val) + #undef SHIFT_HAP(__v1, __val) + #undef print256b(__v1) + #undef MASK_VEC + #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) + #undef VEC_SHIFT_LEFT_1BIT(__vs) + #undef MASK_ALL_ONES + #undef COMPARE_VECS(__v1, __v2) + #undef _256_INT_TYPE + +#endif + +#define SSE +#define PRECISION d + +#define MAIN_TYPE double +#define MAIN_TYPE_SIZE 64 +#define UNION_TYPE mix_D128 +#define IF_128 IF_128d +#define IF_MAIN_TYPE IF_64 +#define SHIFT_CONST1 1 +#define SHIFT_CONST2 8 +#define SHIFT_CONST3 0 +#define _128_TYPE __m128d +#define _256_TYPE __m128d +#define _256_INT_TYPE __m128i +#define AVX_LENGTH 2 +#define MAVX_COUNT (MROWS+3)/AVX_LENGTH +#define HAP_TYPE __m128i +#define MASK_TYPE uint64_t +#define MASK_ALL_ONES 0xFFFFFFFFFFFFFFFFL +#define MASK_VEC MaskVec_D128 + +#define VEC_EXTRACT_UNIT(__v1, __im) \ + _mm_extract_epi64(__v1, __im) + +#define VEC_INSERT_UNIT(__v1,__ins,__im) \ + _mm_insert_epi64(__v1,__ins,__im) + +#define VEC_OR(__v1, __v2) \ + _mm_or_pd(__v1, __v2) + +#define VEC_ADD(__v1, __v2) \ + _mm_add_pd(__v1, __v2) + +#define VEC_SUB(__v1, __v2) \ + _mm_sub_pd(__v1, __v2) + +#define VEC_MUL(__v1, __v2) \ + _mm_mul_pd(__v1, __v2) + +#define VEC_DIV(__v1, __v2) \ + _mm_div_pd(__v1, __v2) + +#define VEC_CMP_EQ(__v1, __v2) \ + _mm_cmpeq_pd(__v1, __v2) + +#define VEC_BLEND(__v1, __v2, __mask) \ + _mm_blend_pd(__v1, __v2, __mask) + +#define VEC_BLENDV(__v1, __v2, __maskV) \ + _mm_blendv_pd(__v1, __v2, __maskV) + +#define SHIFT_HAP(__v1, __val) \ + __v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0) + +#define VEC_CVT_128_256(__v1) \ + _mm_cvtepi32_pd(__v1) + +#define VEC_SET1_VAL(__val) \ + _mm_set1_pd(__val) + +#define VEC_POPCVT_CHAR(__ch) \ + _mm_cvtepi32_pd(_mm_set1_epi32(__ch)) + +#define VEC_SET_LSE(__val) \ + _mm_set_pd(zero, __val); + +#define VEC_LDPOPCVT_CHAR(__addr) \ + _mm_cvtepi32_pd(_mm_loadu_si128((__m128i const *)__addr)) + +#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ + __vdst = _mm_castsi128_pd(_mm_set_epi64(__vsHigh, __vsLow)) + +#define VEC_SHIFT_LEFT_1BIT(__vs) \ + __vs = _mm_slli_si64(__vs, 1) + + diff --git a/PairHMM_JNI/define-sse-float.h b/PairHMM_JNI/define-sse-float.h new file mode 100644 index 000000000..f5758c74a --- /dev/null +++ b/PairHMM_JNI/define-sse-float.h @@ -0,0 +1,127 @@ +#ifdef PRECISION + #undef PRECISION + #undef MAIN_TYPE + #undef MAIN_TYPE_SIZE + #undef UNION_TYPE + #undef IF_128 + #undef IF_MAIN_TYPE + #undef SHIFT_CONST1 + #undef SHIFT_CONST2 + #undef SHIFT_CONST3 + #undef _128_TYPE + #undef _256_TYPE + #undef AVX_LENGTH + #undef MAVX_COUNT + #undef HAP_TYPE + #undef MASK_TYPE + #undef MASK_ALL_ONES + + #undef VEC_EXTRACT_UNIT(__v1, __im) + #undef VEC_INSERT_UNIT(__v1,__ins,__im) + #undef SET_VEC_ZERO(__vec) + #undef VEC_OR(__v1, __v2) + #undef VEC_ADD(__v1, __v2) + #undef VEC_SUB(__v1, __v2) + #undef VEC_MUL(__v1, __v2) + #undef VEC_DIV(__v1, __v2) + #undef VEC_BLEND(__v1, __v2, __mask) + #undef VEC_BLENDV(__v1, __v2, __maskV) + #undef VEC_CAST_256_128(__v1) + #undef VEC_EXTRACT_128(__v1, __im) + #undef VEC_EXTRACT_UNIT(__v1, __im) + #undef VEC_SET1_VAL128(__val) + #undef VEC_MOVE(__v1, __val) + #undef VEC_CAST_128_256(__v1) + #undef VEC_INSERT_VAL(__v1, __val, __pos) + #undef VEC_CVT_128_256(__v1) + #undef VEC_SET1_VAL(__val) + #undef VEC_POPCVT_CHAR(__ch) + #undef VEC_LDPOPCVT_CHAR(__addr) + #undef VEC_CMP_EQ(__v1, __v2) + #undef VEC_SET_LSE(__val) + #undef SHIFT_HAP(__v1, __val) + #undef print256b(__v1) + #undef MASK_VEC + #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) + #undef VEC_SHIFT_LEFT_1BIT(__vs) + #undef MASK_ALL_ONES + #undef COMPARE_VECS(__v1, __v2) + #undef _256_INT_TYPE + +#endif + +#define SSE +#define PRECISION s + +#define MAIN_TYPE float +#define MAIN_TYPE_SIZE 32 +#define UNION_TYPE mix_F128 +#define IF_128 IF_128f +#define IF_MAIN_TYPE IF_32 +#define SHIFT_CONST1 3 +#define SHIFT_CONST2 4 +#define SHIFT_CONST3 0 +#define _128_TYPE __m128 +#define _256_TYPE __m128 +#define _256_INT_TYPE __m128i +#define AVX_LENGTH 4 +#define MAVX_COUNT (MROWS+3)/AVX_LENGTH +#define HAP_TYPE UNION_TYPE +#define MASK_TYPE uint32_t +#define MASK_ALL_ONES 0xFFFFFFFF +#define MASK_VEC MaskVec_F128 + +#define VEC_EXTRACT_UNIT(__v1, __im) \ + _mm_extract_epi32(__v1, __im) + +#define VEC_INSERT_UNIT(__v1,__ins,__im) \ + _mm_insert_epi32(__v1,__ins,__im) + +#define VEC_OR(__v1, __v2) \ + _mm_or_ps(__v1, __v2) + +#define VEC_ADD(__v1, __v2) \ + _mm_add_ps(__v1, __v2) + +#define VEC_SUB(__v1, __v2) \ + _mm_sub_ps(__v1, __v2) + +#define VEC_MUL(__v1, __v2) \ + _mm_mul_ps(__v1, __v2) + +#define VEC_DIV(__v1, __v2) \ + _mm_div_ps(__v1, __v2) + +#define VEC_CMP_EQ(__v1, __v2) \ + _mm_cmpeq_ps(__v1, __v2) + +#define VEC_BLEND(__v1, __v2, __mask) \ + _mm_blend_ps(__v1, __v2, __mask) + +#define VEC_BLENDV(__v1, __v2, __maskV) \ + _mm_blendv_ps(__v1, __v2, __maskV) + +#define SHIFT_HAP(__v1, __val) \ + _vector_shift_lastsses(__v1, __val.f) + +#define VEC_CVT_128_256(__v1) \ + _mm_cvtepi32_ps(__v1.i) + +#define VEC_SET1_VAL(__val) \ + _mm_set1_ps(__val) + +#define VEC_POPCVT_CHAR(__ch) \ + _mm_cvtepi32_ps(_mm_set1_epi32(__ch)) + +#define VEC_SET_LSE(__val) \ + _mm_set_ps(zero, zero, zero, __val); + +#define VEC_LDPOPCVT_CHAR(__addr) \ + _mm_cvtepi32_ps(_mm_loadu_si128((__m128i const *)__addr)) + +#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ + __vdst = _mm_cvtpi32x2_ps(__vsLow, __vsHigh) + +#define VEC_SHIFT_LEFT_1BIT(__vs) \ + __vs = _mm_slli_pi32(__vs, 1) + diff --git a/PairHMM_JNI/headers.h b/PairHMM_JNI/headers.h index 42485fc51..f502ff5ce 100644 --- a/PairHMM_JNI/headers.h +++ b/PairHMM_JNI/headers.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/PairHMM_JNI/hmm_mask.cc b/PairHMM_JNI/hmm_mask.cc deleted file mode 100644 index c335a0160..000000000 --- a/PairHMM_JNI/hmm_mask.cc +++ /dev/null @@ -1,451 +0,0 @@ -#include -#include -#include -#include - - -// /usr/intel/pkgs/icc/13.0.0e/bin/icc -o ed -O3 ed.cpp -xAVX -openmp -openmp-link static - - -#include - -#include -#include -#include -#include -#include -#include - - -#include -#include -#include -#include -#include - - -#include -#include -#include -#include -#include -#include -#include "template.h" - -using namespace std ; - -#define CHECK_MASK_CORRECTNESS - -template -string getBinaryStr (T val, int numBitsToWrite) { - - ostringstream oss ; - uint64_t mask = ((T) 0x1) << (numBitsToWrite-1) ; - for (int i=numBitsToWrite-1; i >= 0; --i) { - oss << ((val & mask) >> i) ; - mask >>= 1 ; - } - return oss.str() ; -} - -int normalize(char c) -{ - return ((int) (c - 33)); -} - - -uint8_t ConvertChar::conversionTable[255] ; - -int read_testcase(testcase *tc, FILE* ifp) -{ - char *q, *i, *d, *c, *line = NULL; - int _q, _i, _d, _c; - int x, size = 0; - ssize_t read; - - read = getline(&line, (size_t *) &size, ifp == 0 ? stdin : ifp); - if (read == -1) - return -1; - - - tc->hap = (char *) malloc(size); - tc->rs = (char *) malloc(size); - q = (char *) malloc(size); - i = (char *) malloc(size); - d = (char *) malloc(size); - c = (char *) malloc(size); - - if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6) - return -1; - - tc->haplen = strlen(tc->hap); - tc->rslen = strlen(tc->rs); - assert(tc->rslen < MROWS); - tc->ihap = (int *) malloc(tc->haplen*sizeof(int)); - tc->irs = (int *) malloc(tc->rslen*sizeof(int)); - - //tc->q = (int *) malloc(sizeof(int) * tc->rslen); - //tc->i = (int *) malloc(sizeof(int) * tc->rslen); - //tc->d = (int *) malloc(sizeof(int) * tc->rslen); - //tc->c = (int *) malloc(sizeof(int) * tc->rslen); - - for (x = 0; x < tc->rslen; x++) - { - _q = normalize(q[x]); - _i = normalize(i[x]); - _d = normalize(d[x]); - _c = normalize(c[x]); - tc->q[x] = (_q < 6) ? 6 : _q; - tc->i[x] = _i; - tc->d[x] = _d; - tc->c[x] = _c; - tc->irs[x] = tc->rs[x]; - } - for (x = 0; x < tc->haplen; x++) - tc->ihap[x] = tc->hap[x]; - - - free(q); - free(i); - free(d); - free(c); - free(line); - - return 0; -} - - - - -#define ALIGN __attribute__ ((aligned(32))) - - -typedef union ALIGN { - //__m256i vi; - __m128 vf ; - __m128i vi ; - __m128d vd; - - uint8_t b[16]; - //uint16_t hw[8] ; - uint32_t w[4] ; - uint64_t dw[2] ; - - float f[4] ; - //double d[2] ; - -} v128 ; - -typedef union ALIGN { - __m256i vi; - __m256 vf ; - __m256d vd; - //__m128i vi128[2] ; - - uint8_t b[32]; - //uint16_t hw[16] ; - uint32_t w[8] ; - uint64_t dw[4] ; - - float f[8] ; - double d[4] ; - -} v256 ; - - -#define NUM_DISTINCT_CHARS 5 -#define AMBIG_CHAR 4 - -#define VEC_ENTRY_CNT 8 -#define VEC_LEN 256 -#define VTYPE vf -#define VTYPEI vi -#define VENTRY f -#define VENTRYI w -#define MTYPE uint32_t -#define MASK_ALL_ONES 0xFFFFFFFF - - -#define VECTOR v256 -#define VECTOR_SSE v128 - -#define SET_VEC_ZERO(__vec) \ - __vec= _mm256_setzero_ps() - -#define SET_VEC_ONES(__vec) \ - __vec = _mm256_set1_epi32(0xFFFFFFFF) - -#define VEC_OR(__v1, __v2) \ - _mm256_or_ps(__v1, __v2) - -#define VEC_ADD(__v1, __v2) \ - _mm256_add_ps(__v1, __v2) - -#define VEC_MUL(__v1, __v2) \ - _mm256_mul_ps(__v1, __v2) - -#define VEC_BLENDV(__v1, __v2, __maskV) \ - _mm256_blendv_ps(__v1, __v2, __maskV) - -#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ - __vdst = _mm256_castps128_ps256(__vsLow) ; \ - __vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ; - -#define VECS_SHIFT_LEFT_1BIT(__vs) \ - __vs = _mm_slli_epi32(__vs, 1) - - -#define VECS_SHIFT_RIGHT_WHOLE(__vs, __cnt) { \ - uint64_t __mask = ; \ - uint64_t __shiftWord = ((((uint64_t) 0x1) << __cnt) - 1 ) & __vs.dw[1] ; \ - __shiftWord <<= (64-cnt) ; \ - __vs = _mm_slri_epi64(__vs, __cnt) ; \ - __vs.dw[0] |= __shiftWord ; \ - } - - -#define GET_MASK_WORD_OLD(__mask, __lastShiftOut, __shiftBy, __maskBitCnt){ \ - MTYPE __bitMask = (((MTYPE)0x1) << __shiftBy) - 1 ; \ - MTYPE __nextShiftOut = (__mask & __bitMask) << (__maskBitCnt - __shiftBy) ; \ - __mask >>= __shiftBy ; \ - __mask |= __lastShiftOut ; \ - __lastShiftOut = __nextShiftOut ; \ - } - - -#define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \ - MTYPE __bitMask = (((MTYPE)0x1) << __shiftBy) - 1 ; \ - MTYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \ - __dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ; \ - __lastShiftOut = __nextShiftOut ; \ -} - - -void precompute_masks_avx(const testcase& tc, int COLS, int numMaskVecs, - MTYPE (*maskArr)[NUM_DISTINCT_CHARS]) { - - const int maskBitCnt = VEC_LEN / VEC_ENTRY_CNT ; - - for (int vi=0; vi < numMaskVecs; ++vi) { - for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) { - maskArr[vi][rs] = 0 ; - } - maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ; - } - - for (int col=1; col < COLS; ++col) { - int mIndex = (col-1) / maskBitCnt ; - int mOffset = (col-1) % maskBitCnt ; - MTYPE bitMask = ((MTYPE)0x1) << (maskBitCnt-1-mOffset) ; - - char hapChar = tc.hap[col-1] ; - - if (hapChar == AMBIG_CHAR) { - maskArr[mIndex][0] |= bitMask ; - maskArr[mIndex][1] |= bitMask ; - maskArr[mIndex][2] |= bitMask ; - maskArr[mIndex][3] |= bitMask ; - } - - //cout << hapChar << " " << mIndex << " " << getBinaryStr(bitMask, 32) - // << endl ; - //cout << getBinaryStr(maskArr[0][hapChar],32) << endl ; - //exit(0) ; - - maskArr[mIndex][ConvertChar::get(hapChar)] |= bitMask ; - - - // bit corresponding to col 1 will be the MSB of the mask 0 - // bit corresponding to col 2 will be the MSB-1 of the mask 0 - // ... - // bit corresponding to col 32 will be the LSB of the mask 0 - // bit corresponding to col 33 will be the MSB of the mask 1 - // ... - } - -} - - -void test_mask_computations (testcase& tc, int tcID, bool printDebug=false) { - - int ROWS = tc.rslen + 1 ; - int COLS = tc.haplen + 1 ; - - // only for testing - VECTOR mismatchData, matchData ; - //SET_VEC_ZERO(mismatchData.VTYPE) ; - //SET_VEC_ONES(matchData.VTYPEI) ; - for (int ei=0; ei < VEC_ENTRY_CNT; ++ei) { - matchData.VENTRY[ei] = 1.0 ; - mismatchData.VENTRY[ei] = 0.0 ; - } - - const int maskBitCnt = VEC_LEN / VEC_ENTRY_CNT ; - const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function - - MTYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ; - precompute_masks_avx(tc, COLS, numMaskVecs, maskArr) ; - -#ifdef DEBUG - if (printDebug) { - cout << "The first 32 hap chars are: " ; - for (int i=0; i < 32; ++i) { - cout << tc.hap[i] ; - } - cout << endl ; - - cout << "Masks computed for A, C, T, G, N are: " << endl ; - cout << getBinaryStr(maskArr[0][0], 32) << endl ; - cout << getBinaryStr(maskArr[0][1], 32) << endl ; - cout << getBinaryStr(maskArr[0][2], 32) << endl ; - cout << getBinaryStr(maskArr[0][3], 32) << endl ; - cout << getBinaryStr(maskArr[0][4], 32) << endl ; - } -#endif // #ifdef DEBUG - - int beginRowIndex = 1 ; - while (beginRowIndex < ROWS) { - - int numRowsToProcess = min(VEC_ENTRY_CNT, ROWS - beginRowIndex) ; - - char rsArr[VEC_ENTRY_CNT] ; - for (int ri=0; ri < numRowsToProcess; ++ri) { - rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ; - } - - // Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors - VECTOR_SSE currMaskVecLow ; // corresponding to entries 0-3 - VECTOR_SSE currMaskVecHigh ; // corresponding to entries 4-7 - - MTYPE lastMaskShiftOut[VEC_ENTRY_CNT] ; - for (int ei=0; ei < VEC_ENTRY_CNT; ++ei) - lastMaskShiftOut[ei] = 0 ; - - int col = 1 ; - int diag = 1 ; - for (int maskIndex=0; maskIndex < numMaskVecs; ++maskIndex) { - // set up the mask vectors for the next maskBitCnt columns - - // For AVX, maskBitCnt = 32 (so, the operation below is amortized over 32 cols) - for (int ei=0; ei < VEC_ENTRY_CNT/2; ++ei) { - SET_MASK_WORD(currMaskVecLow.VENTRYI[ei], maskArr[maskIndex][rsArr[ei]], - lastMaskShiftOut[ei], ei, maskBitCnt) ; - - int ei2 = ei + VEC_ENTRY_CNT/2 ; // the second entry index - SET_MASK_WORD(currMaskVecHigh.VENTRYI[ei], maskArr[maskIndex][rsArr[ei2]], - lastMaskShiftOut[ei2], ei2, maskBitCnt) ; - } - -#ifdef DEBUG - if (printDebug && maskIndex == 0) { - cout << "The masks for entry 1: " << endl - << getBinaryStr(maskArr[0][rsArr[1]], 32) << endl - << getBinaryStr(currMaskVecLow.VENTRYI[1], 32) << endl ; - - } -#endif // #ifdef DEBUG - - // iterate over mask bit indices and columns - for (int mbi=0; mbi < maskBitCnt && diag < COLS + ROWS -2; ++mbi, ++diag) { - - VECTOR maskV ; - VEC_SSE_TO_AVX(currMaskVecLow.VTYPE, currMaskVecHigh.VTYPE, maskV.VTYPE) ; - - VECTOR testData ; - testData.VTYPE = VEC_BLENDV(mismatchData.VTYPE, matchData.VTYPE, - maskV.VTYPE) ; - - VECS_SHIFT_LEFT_1BIT(currMaskVecLow.VTYPEI) ; - VECS_SHIFT_LEFT_1BIT(currMaskVecHigh.VTYPEI) ; - -#ifdef DEBUG - if (printDebug && maskIndex == 0) { - cout << "The mask for entry 1, mbi=" << mbi << ": " - << getBinaryStr(maskV.VENTRYI[1], 32) << endl ; - - } -#endif // #ifdef DEBUG - -#ifdef CHECK_MASK_CORRECTNESS - - int firstRowIndex = (diag < COLS) ? 0 : (diag - COLS) ; - int lastRowIndex = min(col-1, numRowsToProcess-1) ; - - for (int ri=firstRowIndex; ri <= lastRowIndex; ++ri) { - int currRow = beginRowIndex + ri ; - int currCol = col - ri + firstRowIndex ; - - char hapChar = tc.hap[currCol-1] ; - char rsChar = tc.rs[currRow-1] ; - - bool match = (hapChar == rsChar || hapChar == 'N' || rsChar == 'N') ; - - if ((bool) testData.VENTRYI[ri] != match) { - cout << "Error: Incorrect mask for tc " << tcID << ", diag = " << diag - << " (" << currRow << ", " << currCol << ")" << endl ; - - cout << "The chars are: " << hapChar << " and " << rsChar << endl ; - cout << "The selected value is: " << testData.VENTRYI[ri] << endl ; - - exit(0) ; - } - } -#endif // #ifdef CHECK_MASK_CORRECTNESS - - if (diag < COLS) - ++col ; - - } // mbi - } // maskIndex - - beginRowIndex += VEC_ENTRY_CNT ; - } // end of stripe - - //cout << "Finished validating entry " << endl ; -} - -#ifdef HMM_MASK_MAIN -int main () { - - #define BATCH_SIZE 10000 - - ConvertChar::init() ; - - testcase* tcBatch = new testcase[BATCH_SIZE] ; - - int numBatches = 0 ; - int numRead = 0 ; - - const int DEBUG_TC = -1 ; - - FILE* ifp = stdin ; - - do { - numRead = 0 ; - - while (numRead < BATCH_SIZE) { - if (read_testcase(tcBatch+numRead, ifp) < 0) - break ; - - ++numRead ; - } - - if (numRead == 0) - break ; - - for (int ti=0; ti < numRead; ++ti) { - - int tcID = numBatches * BATCH_SIZE + ti ; - test_mask_computations(tcBatch[ti], tcID, tcID == DEBUG_TC) ; - } - - ++numBatches ; - } while (numRead == BATCH_SIZE) ; - - fclose(ifp) ; - - delete[] tcBatch ; - - return 0 ; -} -#endif diff --git a/PairHMM_JNI/libJNILoglessPairHMM.so b/PairHMM_JNI/libJNILoglessPairHMM.so deleted file mode 100755 index 9b6b8e693..000000000 Binary files a/PairHMM_JNI/libJNILoglessPairHMM.so and /dev/null differ diff --git a/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc index 432a528bb..a2dbff785 100644 --- a/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc +++ b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc @@ -5,20 +5,13 @@ #define ENABLE_ASSERTIONS 1 #define DO_PROFILING 1 -#define DEBUG 1 +//#define DEBUG 1 //#define DEBUG0_1 1 //#define DEBUG3 1 #include "template.h" #include "utils.h" - -#include "define-float.h" -#include "avx_function_prototypes.h" - -#include "define-double.h" -#include "avx_function_prototypes.h" - using namespace std; class LoadTimeInitializer @@ -33,32 +26,18 @@ class LoadTimeInitializer m_sumSquareNumHaplotypes = 0; m_sumNumTestcases = 0; m_sumSquareNumTestcases = 0; + m_maxNumTestcases = 0; m_num_invocations = 0; + m_filename_to_fptr.clear(); - if(is_avx_supported()) - { - cout << "Using AVX accelerated implementation of PairHMM\n"; - g_compute_full_prob_float = compute_full_prob_avxs; - g_compute_full_prob_double = compute_full_prob_avxd; - } - else - { - cout << "Using un-vectorized C++ implementation of PairHMM\n"; - g_compute_full_prob_float = compute_full_prob; - g_compute_full_prob_double = compute_full_prob; - } + initialize_function_pointers(); cout.flush(); - //if(is_sse42_supported()) - //{ - //g_compute_full_prob_float = compute_full_prob_avxs; - //g_compute_full_prob_double = compute_full_prob_avxd; - //} } void print_profiling() { double mean_val; cout <<"Invocations : "<::iterator mI = m_filename_to_fptr.find(filename); + ofstream* fptr = 0; + if(mI == m_filename_to_fptr.end()) + { + m_filename_to_fptr[filename] = new ofstream(); + fptr = m_filename_to_fptr[filename]; + fptr->open(filename.c_str(), to_append ? ios::app : ios::out); + assert(fptr->is_open()); + } + else + fptr = (*mI).second; + //ofstream fptr; + //fptr.open(filename.c_str(), to_append ? ofstream::app : ofstream::out); + (*fptr) << s; + if(add_newline) + (*fptr) << "\n"; + //fptr.close(); + } + void debug_close() + { + for(map::iterator mB = m_filename_to_fptr.begin(), mE = m_filename_to_fptr.end(); + mB != mE;mB++) + { + (*mB).second->close(); + delete (*mB).second; + } + m_filename_to_fptr.clear(); } jfieldID m_readBasesFID; jfieldID m_readQualsFID; @@ -89,8 +93,10 @@ class LoadTimeInitializer double m_sumSquareNumHaplotypes; double m_sumNumTestcases; double m_sumSquareNumTestcases; + unsigned m_maxNumTestcases; unsigned m_num_invocations; - + private: + map m_filename_to_fptr; }; LoadTimeInitializer g_load_time_initializer; @@ -174,10 +180,10 @@ Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadL tc.c[i] = (int)overallGCPArray[i]; } - double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)(&tc); + double result_avxd = g_compute_full_prob_double(&tc, 0); double result = log10(result_avxd) - log10(ldexp(1.0, 1020)); #ifdef DEBUG - debug_dump("return_values_jni.txt",to_string(result),true); + g_load_time_initializer.debug_dump("return_values_jni.txt",to_string(result),true); #endif @@ -255,7 +261,7 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai haplotypeBasesArrayVector[j] = make_pair(haplotypeBases, haplotypeBasesArray); #ifdef DEBUG3 for(unsigned k=0;kGetArrayLength(haplotypeBases);++k) - debug_dump("haplotype_bases_jni.txt",to_string((int)haplotypeBasesArray[k]),true); + g_load_time_initializer.debug_dump("haplotype_bases_jni.txt",to_string((int)haplotypeBasesArray[k]),true); #endif } @@ -310,11 +316,11 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai #ifdef DEBUG3 for(unsigned j=0;j g_load_time_initializer.m_maxNumTestcases ? numTestCases + : g_load_time_initializer.m_maxNumTestcases; ++(g_load_time_initializer.m_num_invocations); #endif +#ifdef DEBUG + g_load_time_initializer.debug_close(); +#endif +} + +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniClose + (JNIEnv* env, jobject thisObject) +{ +#ifdef DO_PROFILING + g_load_time_initializer.print_profiling(); +#endif +#ifdef DEBUG + g_load_time_initializer.debug_close(); +#endif } diff --git a/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h index 1fff9fa5a..b50f6a5a9 100644 --- a/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h +++ b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h @@ -70,6 +70,14 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniComputeLikelihoods (JNIEnv *, jobject, jint, jint, jobjectArray, jobjectArray, jdoubleArray, jint); +/* + * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM + * Method: jniClose + * Signature: ()V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniClose + (JNIEnv *, jobject); + #ifdef __cplusplus } #endif diff --git a/PairHMM_JNI/pairhmm-1-base.cc b/PairHMM_JNI/pairhmm-1-base.cc index a3b5e0ad5..3030b640e 100644 --- a/PairHMM_JNI/pairhmm-1-base.cc +++ b/PairHMM_JNI/pairhmm-1-base.cc @@ -5,12 +5,6 @@ #include "template.h" #include "utils.h" -#include "define-float.h" -#include "avx_function_prototypes.h" - -#include "define-double.h" -#include "avx_function_prototypes.h" - using namespace std; class LoadTimeInitializer { @@ -36,16 +30,7 @@ int main(int argc, char** argv) if(argc >= 3 && string(argv[2]) == "1") use_old_read_testcase = true; - if(true) - { - g_compute_full_prob_double = GEN_INTRINSIC(compute_full_prob_avx, d); - g_compute_full_prob_float = GEN_INTRINSIC(compute_full_prob_avx, s); - } - else - { - g_compute_full_prob_double = compute_full_prob; - g_compute_full_prob_float = compute_full_prob; - } + initialize_function_pointers(); std::ifstream ifptr; FILE* fptr = 0; @@ -86,70 +71,5 @@ int main(int argc, char** argv) else ifptr.close(); return 0; - -#if 0 - float result[BATCH_SIZE], result_avxf; - double result_avxd; - struct timeval start, end; - long long aggregateTimeRead = 0L; - long long aggregateTimeCompute = 0L; - long long aggregateTimeWrite = 0L; - - bool noMoreData = false; - int count =0; - while (!noMoreData) - { - int read_count = BATCH_SIZE; - gettimeofday(&start, NULL); - for (int b=0;b(&tc[b]); - -#ifdef RUN_HYBRID -#define MIN_ACCEPTED 1e-28f - if (result_avxf < MIN_ACCEPTED) { - count++; - result_avxd = compute_full_prob(&tc[b]); - result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f)); - } - else - result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); -#endif - -#ifndef RUN_HYBRID - result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); -#endif - - } - gettimeofday(&end, NULL); - aggregateTimeCompute += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)); - - gettimeofday(&start, NULL); - for (int b=0;b #include +//#define DEBUG +#define MUSTAFA +#define KARTHIK + /* template string getBinaryStr (T val, int numBitsToWrite) { @@ -17,8 +21,9 @@ string getBinaryStr (T val, int numBitsToWrite) { return oss.str() ; } */ +#ifdef MUSTAFA -void GEN_INTRINSIC(precompute_masks_, PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) { +void GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) { const int maskBitCnt = MAIN_TYPE_SIZE ; @@ -52,7 +57,7 @@ void GEN_INTRINSIC(precompute_masks_, PRECISION)(const testcase& tc, int COLS, i } -void GEN_INTRINSIC(init_masks_for_row_, PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess) { +void GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess) { for (int ri=0; ri < numRowsToProcess; ++ri) { rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ; @@ -63,6 +68,7 @@ void GEN_INTRINSIC(init_masks_for_row_, PRECISION)(const testcase& tc, char* rsA } } + #define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \ MASK_TYPE __bitMask = (((MASK_TYPE)0x1) << __shiftBy) - 1 ; \ MASK_TYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \ @@ -71,33 +77,43 @@ void GEN_INTRINSIC(init_masks_for_row_, PRECISION)(const testcase& tc, char* rsA } -void GEN_INTRINSIC(update_masks_for_cols_, PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt) { +void GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_, SIMD_TYPE), PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt) { for (int ei=0; ei < AVX_LENGTH/2; ++ei) { SET_MASK_WORD(currMaskVecLow.masks[ei], maskArr[maskIndex][rsArr[ei]], - lastMaskShiftOut[ei], ei, maskBitCnt) ; - + lastMaskShiftOut[ei], ei, maskBitCnt) ; + int ei2 = ei + AVX_LENGTH/2 ; // the second entry index SET_MASK_WORD(currMaskVecHigh.masks[ei], maskArr[maskIndex][rsArr[ei2]], - lastMaskShiftOut[ei2], ei2, maskBitCnt) ; + lastMaskShiftOut[ei2], ei2, maskBitCnt) ; } } + //void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen, const _256_TYPE& distmSel, int firstRowIndex, int lastRowIndex) { -inline void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen) { +inline void GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec, SIMD_TYPE), PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen) { //#define computeDistVec() { _256_TYPE maskV ; VEC_SSE_TO_AVX(currMaskVecLow.vecf, currMaskVecHigh.vecf, maskV) ; +#ifdef DEBUGG + long long *temp1 = (long long *)(&maskV); + double *temp2 = (double *)(&distm); + double *temp3 = (double *)(&_1_distm); + printf("***\n%lx\n%lx\n%f\n%f\n%f\n%f\n***\n", temp1[0], temp1[1], temp2[0], temp2[1], temp3[0], temp3[1]); +#endif + distmChosen = VEC_BLENDV(distm, _1_distm, maskV) ; /*COMPARE_VECS(distmChosen, distmSel, firstRowIndex, lastRowIndex) ;*/ VEC_SHIFT_LEFT_1BIT(currMaskVecLow.vec) ; VEC_SHIFT_LEFT_1BIT(currMaskVecHigh.vec) ; + + _mm_empty(); } @@ -120,8 +136,9 @@ struct HmmData { } ; */ +#endif // MUSTAFA -template void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context ctx, testcase *tc, _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D) +template void GEN_INTRINSIC(GEN_INTRINSIC(initializeVectors, SIMD_TYPE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context ctx, testcase *tc, _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D) { NUMBER zero = ctx._(0.0); NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); @@ -157,18 +174,14 @@ template void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS *(ptr_p_GAPM+r-1) = ctx._(1.0) - ctx.ph2pr[_c]; *(ptr_p_MX+r-1) = ctx.ph2pr[_i]; *(ptr_p_XX+r-1) = ctx.ph2pr[_c]; - *(ptr_p_MY+r-1) = ctx.ph2pr[_d]; - *(ptr_p_YY+r-1) = ctx.ph2pr[_c]; -#ifdef DEBUG3 - debug_dump("transitions_jni.txt",to_string(*(ptr_p_MM+r-1) ),true); - debug_dump("transitions_jni.txt",to_string(*(ptr_p_GAPM+r-1)),true); - debug_dump("transitions_jni.txt",to_string(*(ptr_p_MX+r-1) ),true); - debug_dump("transitions_jni.txt",to_string(*(ptr_p_XX+r-1) ),true); - debug_dump("transitions_jni.txt",to_string(*(ptr_p_MY+r-1) ),true); - debug_dump("transitions_jni.txt",to_string(*(ptr_p_YY+r-1) ),true); -#endif - //*(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d]; - //*(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c]; + #ifdef KARTHIK + *(ptr_p_MY+r-1) = ctx.ph2pr[_d]; + *(ptr_p_YY+r-1) = ctx.ph2pr[_c]; + #else + *(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d]; + *(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c]; + #endif + } NUMBER *ptr_distm1D = (NUMBER *)distm1D; @@ -176,14 +189,11 @@ template void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS { int _q = tc->q[r-1] & 127; ptr_distm1D[r-1] = ctx.ph2pr[_q]; -#ifdef DEBUG3 - debug_dump("priors_jni.txt",to_string(ptr_distm1D[r-1]),true); -#endif } } -template inline void GEN_INTRINSIC(stripINITIALIZATION, PRECISION)( +template inline void GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION, SIMD_TYPE), PRECISION)( int stripIdx, Context ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY, _256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm, _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM , _256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, @@ -200,19 +210,18 @@ template inline void GEN_INTRINSIC(stripINITIALIZATION, PRECISION) NUMBER zero = ctx._(0.0); NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); UNION_TYPE packed1; packed1.d = VEC_SET1_VAL(1.0); -#define TRISTATE_CORRECTION_FACTOR 3.0 - UNION_TYPE packed3; packed3.d = VEC_SET1_VAL(TRISTATE_CORRECTION_FACTOR); - + UNION_TYPE packed3; packed3.d = VEC_SET1_VAL(3.0); /* compare rs and N */ - //rs = VEC_LDPOPCVT_CHAR((tc->irs+i*AVX_LENGTH)); - //rsN.d = VEC_CMP_EQ(N_packed256, rs); - +#ifndef MUSTAFA + rs = VEC_LDPOPCVT_CHAR((tc->irs+i*AVX_LENGTH)); + rsN.d = VEC_CMP_EQ(N_packed256, rs); +#endif distm = distm1D[i]; - _1_distm = VEC_SUB(packed1.d, distm); -#ifndef DO_NOT_USE_TRISTATE_CORRECTION - distm = VEC_DIV(distm, packed3.d); -#endif - + _1_distm = VEC_SUB(packed1.d, distm); + + #ifdef KARTHIK + distm = VEC_DIV(distm, packed3.d); + #endif /* initialize M_t_2, M_t_1, X_t_2, X_t_1, Y_t_2, Y_t_1 */ M_t_2.d = VEC_SET1_VAL(zero); X_t_2.d = VEC_SET1_VAL(zero); @@ -234,7 +243,7 @@ template inline void GEN_INTRINSIC(stripINITIALIZATION, PRECISION) -inline _256_TYPE GEN_INTRINSIC(computeDISTM, PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256, +inline _256_TYPE GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM, SIMD_TYPE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256, _256_TYPE distm, _256_TYPE _1_distm) { UNION_TYPE hapN, rshap; @@ -263,12 +272,21 @@ inline _256_TYPE GEN_INTRINSIC(computeDISTM, PRECISION)(int d, int COLS, testcas } -inline void GEN_INTRINSIC(computeMXY, PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, +inline void GEN_INTRINSIC(GEN_INTRINSIC(computeMXY, SIMD_TYPE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1, _256_TYPE pMM, _256_TYPE pGAPM, _256_TYPE pMX, _256_TYPE pXX, _256_TYPE pMY, _256_TYPE pYY, _256_TYPE distmSel) { /* Compute M_t <= distm * (p_MM*M_t_2 + p_GAPM*X_t_2 + p_GAPM*Y_t_2) */ M_t.d = VEC_MUL(VEC_ADD(VEC_ADD(VEC_MUL(M_t_2.d, pMM), VEC_MUL(X_t_2.d, pGAPM)), VEC_MUL(Y_t_2.d, pGAPM)), distmSel); + +#ifdef DEBUG + double *temp1 = (double *)(&pGAPM); + double *temp2 = (double *)(&pMM); + double *temp3 = (double *)(&distmSel); + printf("%f\n%f\n%f\n%f\n%f\n%f\n", temp1[0], temp1[1], temp2[0], temp2[1], temp3[0], temp3[1]); + //printf("%f\n%f\n%f\n%f\n", X_t_2.f[0], X_t_2.f[1], Y_t_2.f[0], Y_t_2.f[1]); + printf("%f\n%f\n----------------------------------------------------------------------------\n", M_t.f[0], M_t.f[1]); +#endif M_t_y = M_t; /* Compute X_t */ @@ -278,7 +296,7 @@ inline void GEN_INTRINSIC(computeMXY, PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_ Y_t.d = VEC_ADD(VEC_MUL(M_t_1_y.d, pMY) , VEC_MUL(Y_t_1.d, pYY)); } -template NUMBER GEN_INTRINSIC(compute_full_prob_avx, PRECISION) (testcase *tc, NUMBER *before_last_log = NULL) +template NUMBER GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_,SIMD_TYPE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL) { _256_TYPE p_MM [MAVX_COUNT], p_GAPM [MAVX_COUNT], p_MX [MAVX_COUNT]; _256_TYPE p_XX [MAVX_COUNT], p_MY [MAVX_COUNT], p_YY [MAVX_COUNT]; @@ -306,54 +324,51 @@ template NUMBER GEN_INTRINSIC(compute_full_prob_avx, PRECISION) (t int remainingRows = (ROWS-1) % AVX_LENGTH; int strip_cnt = ((ROWS-1) / AVX_LENGTH) + (remainingRows!=0); +#ifdef MUSTAFA const int maskBitCnt = MAIN_TYPE_SIZE ; const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function MASK_TYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ; - GEN_INTRINSIC(precompute_masks_, PRECISION)(*tc, COLS, numMaskVecs, maskArr) ; + GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(*tc, COLS, numMaskVecs, maskArr) ; char rsArr[AVX_LENGTH] ; MASK_TYPE lastMaskShiftOut[AVX_LENGTH] ; - - GEN_INTRINSIC(initializeVectors, PRECISION)(ROWS, COLS, shiftOutM, shiftOutX, shiftOutY, +#endif + GEN_INTRINSIC(GEN_INTRINSIC(initializeVectors,SIMD_TYPE), PRECISION)(ROWS, COLS, shiftOutM, shiftOutX, shiftOutY, ctx, tc, p_MM, p_GAPM, p_MX, p_XX, p_MY, p_YY, distm1D); //for (int __ii=0; __ii < 10; ++__ii) for (int i=0;i NUMBER GEN_INTRINSIC(compute_full_prob_avx, PRECISION) (t int i = strip_cnt-1; { //STRIP_INITIALIZATION - GEN_INTRINSIC(stripINITIALIZATION, PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM , + GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION,SIMD_TYPE), PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM , p_MX, p_XX , p_MY, p_YY, M_t_2, X_t_2, M_t_1, X_t_1, Y_t_2, Y_t_1, M_t_1_y, shiftOutX, shiftOutM); if (remainingRows==0) remainingRows=AVX_LENGTH; - - GEN_INTRINSIC(init_masks_for_row_, PRECISION)(*tc, rsArr, lastMaskShiftOut, - i*AVX_LENGTH+1, remainingRows) ; - +#ifdef MUSTAFA + GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(*tc, rsArr, lastMaskShiftOut, i*AVX_LENGTH+1, remainingRows) ; +#endif _256_TYPE sumM, sumX; sumM = VEC_SET1_VAL(zero); sumX = VEC_SET1_VAL(zero); @@ -382,24 +396,25 @@ template NUMBER GEN_INTRINSIC(compute_full_prob_avx, PRECISION) (t for (int d=1;d #include "headers.h" -#include -#include -#include #include "template.h" +#include "vector_defs.h" -#include "define-float.h" -#include "shift_template.c" -#include "avx_function_prototypes.h" -#include "define-double.h" -#include "shift_template.c" -#include "avx_function_prototypes.h" +#define BATCH_SIZE 10 +#define RUN_HYBRID -#define BATCH_SIZE 10000 -//#define RUN_HYBRID - -//uint8_t ConvertChar::conversionTable[255] ; int thread_level_parallelism_enabled = false ; double getCurrClk() { @@ -26,11 +15,45 @@ double getCurrClk() { return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; } +void print128b_F(__m128 x) +{ + float *p = (float *)(&x); + for (int i=3;i>=0;i--) + printf("%f ", *(p+i)); + printf("\n"); +} + +void print128b_D(__m128d x) +{ + double *p = (double *)(&x); + for (int i=1;i>=0;i--) + printf("%f ", *(p+i)); + printf("\n"); +} int main() { - - testcase tc[BATCH_SIZE]; +/* + IF_128f x; + x.f = _mm_set_ps(1.0, 2.0, 3.0, 4.0); + IF_32 shiftIn, shiftOut; + shiftIn.f = 5.0f; + print128b_F(x.f); + GEN_INTRINSIC(_vector_shift, s)(x, shiftIn, shiftOut); + print128b_F(x.f); + + IF_128d y; + y.f = _mm_set_pd(10.0, 11.0); + IF_64 shiftInd, shiftOutd; + shiftInd.f = 12.0; + print128b_D(y.f); + GEN_INTRINSIC(_vector_shift, d)(y, shiftInd, shiftOutd); + print128b_D(y.f); + + exit(0); +*/ + + testcase tc[BATCH_SIZE]; float result[BATCH_SIZE], result_avxf; double result_avxd; //struct timeval start, end; @@ -41,12 +64,12 @@ int main() // Need to call it once to initialize the static array ConvertChar::init() ; + - - char* ompEnvVar = getenv("OMP_NUM_THREADS") ; - if (ompEnvVar != NULL && ompEnvVar != "" && ompEnvVar != "1" ) { - thread_level_parallelism_enabled = true ; - } +// char* ompEnvVar = getenv("OMP_NUM_THREADS") ; +// if (ompEnvVar != NULL && ompEnvVar != "" && ompEnvVar != "1" ) { +// thread_level_parallelism_enabled = true ; +// } bool noMoreData = false; int count =0; @@ -56,7 +79,7 @@ int main() lastClk = getCurrClk() ; for (int b=0;b(&tc[b]); + result_avxf = GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_, SIMD_TYPE), s)(&tc[b]); #ifdef RUN_HYBRID #define MIN_ACCEPTED 1e-28f if (result_avxf < MIN_ACCEPTED) { + //printf("**************** RUNNING DOUBLE ******************\n"); count++; - result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)(&tc[b]); + result_avxd = GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_, SIMD_TYPE), d)(&tc[b]); result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f)); } else diff --git a/PairHMM_JNI/run.sh b/PairHMM_JNI/run.sh index 495a3761b..2bc45efad 100755 --- a/PairHMM_JNI/run.sh +++ b/PairHMM_JNI/run.sh @@ -2,18 +2,21 @@ rm -f *.txt export GSA_ROOT_DIR=/home/karthikg/broad/gsa-unstable #-Djava.library.path is needed if you are using JNI_LOGLESS_CACHING, else not needed -java -Djava.library.path=${GSA_ROOT_DIR}/PairHMM_JNI -jar ${GSA_ROOT_DIR}/dist/GenomeAnalysisTK.jar -T HaplotypeCaller \ +java -Djava.library.path=${GSA_ROOT_DIR}/PairHMM_JNI -jar $GSA_ROOT_DIR/dist/GenomeAnalysisTK.jar -T HaplotypeCaller \ -R /opt/Genomics/ohsu/dnapipeline/humanrefgenome/human_g1k_v37.fasta \ --I /data/broad/samples/joint_variant_calling/NA12878_low_coverage_alignment/NA12878.chrom11.ILLUMINA.bwa.CEU.low_coverage.20121211.bam \ +-I /data/broad/samples/joint_variant_calling/NA12878_high_coverage_alignment/NA12878.mapped.ILLUMINA.bwa.CEU.high_coverage_pcr_free.20130906.bam \ --dbsnp /data/broad/samples/joint_variant_calling/dbSNP/00-All.vcf \ -stand_call_conf 50.0 \ -stand_emit_conf 10.0 \ --pair_hmm_implementation JNI_LOGLESS_CACHING \ +-XL unmapped \ -o output.raw.snps.indels.vcf #--pair_hmm_implementation JNI_LOGLESS_CACHING \ #-I /data/simulated/sim1M_pairs_final.bam \ #-I /data/broad/samples/joint_variant_calling/NA12878_low_coverage_alignment/NA12878.chrom11.ILLUMINA.bwa.CEU.low_coverage.20121211.bam \ +#-I /data/broad/samples/joint_variant_calling/NA12878_high_coverage_alignment/NA12878.mapped.ILLUMINA.bwa.CEU.high_coverage_pcr_free.20130906.bam \ +#-R /data/broad/samples/joint_variant_calling/broad_reference/Homo_sapiens_assembly18.fasta \ #-R /data/broad/samples/joint_variant_calling/broad_reference/Homo_sapiens_assembly19.fasta \ #-R /data/broad/samples/joint_variant_calling/broad_reference/ucsc.hg19.fasta \ #-R /opt/Genomics/ohsu/dnapipeline/humanrefgenome/human_g1k_v37.fasta \ diff --git a/PairHMM_JNI/shift_template.c b/PairHMM_JNI/shift_template.c index 90160dcb3..db4a4f6f7 100644 --- a/PairHMM_JNI/shift_template.c +++ b/PairHMM_JNI/shift_template.c @@ -1,9 +1,8 @@ #ifdef PRECISION -#include "template.h" +#ifdef SIMD_TYPE_AVX - -inline void GEN_INTRINSIC(_vector_shift, PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut) +inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift,SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut) { IF_128 xlow , xhigh; @@ -33,7 +32,8 @@ inline void GEN_INTRINSIC(_vector_shift, PRECISION) (UNION_TYPE &x, MAIN_TYPE sh x.d = VEC_INSERT_VAL(x.d, xhigh.f, 1); } -inline void GEN_INTRINSIC(_vector_shift_last, PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn) + +inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn) { IF_128 xlow , xhigh; @@ -44,10 +44,6 @@ inline void GEN_INTRINSIC(_vector_shift_last, PRECISION) (UNION_TYPE &x, MAIN_TY /* extract xlow[3] */ IF_128 shiftOutL128; shiftOutL128.i = _mm_srli_si128(xlow.i, SHIFT_CONST1); - /* extract xhigh[3] */ -// IF_MAIN_TYPE shiftOutH; -// shiftOutH.i = VEC_EXTRACT_UNIT(xhigh.i, SHIFT_CONST2); -// shiftOut = shiftOutH.f; /* shift xlow */ xlow.i = _mm_slli_si128 (xlow.i, SHIFT_CONST3); /* shift xhigh */ @@ -63,6 +59,32 @@ inline void GEN_INTRINSIC(_vector_shift_last, PRECISION) (UNION_TYPE &x, MAIN_TY x.d = VEC_INSERT_VAL(x.d, xhigh.f, 1); } +#endif + +#ifdef SIMD_TYPE_SSE + +inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift, SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut) +{ + IF_MAIN_TYPE tempIn, tempOut; + tempIn.f = shiftIn; + /* extratc H */ + tempOut.i = VEC_EXTRACT_UNIT(x.i, SHIFT_CONST1); + shiftOut = tempOut.f; + /* shift */ + x.i = _mm_slli_si128(x.i, SHIFT_CONST2); + /* insert L */ + x.i = VEC_INSERT_UNIT(x.i , tempIn.i, SHIFT_CONST3); +} + +inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn) +{ + IF_MAIN_TYPE temp; temp.f = shiftIn; + /* shift */ + x.i = _mm_slli_si128(x.i, SHIFT_CONST2); + /* insert L */ + x.i = VEC_INSERT_UNIT(x.i , temp.i, SHIFT_CONST3); +} #endif +#endif diff --git a/PairHMM_JNI/sse_function_instantiations.cc b/PairHMM_JNI/sse_function_instantiations.cc new file mode 100644 index 000000000..38686ada2 --- /dev/null +++ b/PairHMM_JNI/sse_function_instantiations.cc @@ -0,0 +1,18 @@ +#include "template.h" + +#undef SIMD_TYPE +#undef SIMD_TYPE_AVX + +#define SIMD_TYPE sse +#define SIMD_TYPE_SSE + +#include "define-sse-float.h" +#include "shift_template.c" +#include "pairhmm-template-kernel.cc" + +#include "define-sse-double.h" +#include "shift_template.c" +#include "pairhmm-template-kernel.cc" + +template double compute_full_prob_ssed(testcase* tc, double* nextlog); +template float compute_full_prob_sses(testcase* tc, float* nextlog); diff --git a/PairHMM_JNI/template.h b/PairHMM_JNI/template.h index 52a4e4650..0ba4f843d 100644 --- a/PairHMM_JNI/template.h +++ b/PairHMM_JNI/template.h @@ -25,12 +25,25 @@ typedef union __attribute__((aligned(32))) { ALIGNED __m256i ALIGNED i; } ALIGNED mix_F ALIGNED; +typedef union __attribute__((aligned(32))) { + ALIGNED __m128 ALIGNED d; + ALIGNED __m64 ALIGNED s[2]; + ALIGNED float ALIGNED f[4]; + ALIGNED __m128i ALIGNED i; +} ALIGNED mix_F128 ALIGNED; + typedef union ALIGNED { __m128i vec ; __m128 vecf ; uint32_t masks[4] ; } MaskVec_F ; +typedef union ALIGNED { + __m64 vec ; + __m64 vecf ; + uint32_t masks[2] ; +} MaskVec_F128 ; + typedef union ALIGNED { ALIGNED __m128i ALIGNED i; @@ -50,12 +63,25 @@ typedef union __attribute__((aligned(32))) { ALIGNED __m256i ALIGNED i; } ALIGNED mix_D ALIGNED; +typedef union __attribute__((aligned(32))) { + ALIGNED __m128d ALIGNED d; + ALIGNED __m64 ALIGNED s[2]; + ALIGNED double ALIGNED f[2]; + ALIGNED __m128i ALIGNED i; +} ALIGNED mix_D128 ALIGNED; + typedef union ALIGNED { __m128i vec ; __m128d vecf ; uint64_t masks[2] ; } MaskVec_D ; +typedef union ALIGNED { + __m64 vec ; + __m64 vecf ; + uint64_t masks[1] ; +} MaskVec_D128 ; + typedef union ALIGNED { ALIGNED __m128i ALIGNED i; @@ -120,7 +146,6 @@ struct Context }; - typedef struct { int rslen, haplen; @@ -131,24 +156,11 @@ typedef struct int *irs; } testcase; - -template -std::string to_string(T obj) -{ - std::stringstream ss; - std::string ret_string; - ss.clear(); - ss << std::scientific << obj; - ss >> ret_string; - ss.clear(); - return ret_string; -} -void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true); - int normalize(char c); -int read_testcase(testcase *tc, FILE* ifp); -int read_mod_testcase(std::ifstream& fptr, testcase* tc, bool reformat=false); +int read_testcase(testcase *tc, FILE* ifp=0); + +#define MIN_ACCEPTED 1e-28f #define NUM_DISTINCT_CHARS 5 #define AMBIG_CHAR 4 @@ -176,7 +188,6 @@ public: }; - #endif diff --git a/PairHMM_JNI/utils.cc b/PairHMM_JNI/utils.cc index 5b91fc7e9..94e2144c0 100644 --- a/PairHMM_JNI/utils.cc +++ b/PairHMM_JNI/utils.cc @@ -1,5 +1,7 @@ #include "headers.h" #include "template.h" +#include "utils.h" +#include "vector_defs.h" uint8_t ConvertChar::conversionTable[255]; float (*g_compute_full_prob_float)(testcase *tc, float* before_last_log) = 0; @@ -31,6 +33,30 @@ bool is_sse42_supported() return ((ecx >> 20)&1) == 1; } +void initialize_function_pointers() +{ + //if(is_avx_supported()) + if(false) + { + cout << "Using AVX accelerated implementation of PairHMM\n"; + g_compute_full_prob_float = compute_full_prob_avxs; + g_compute_full_prob_double = compute_full_prob_avxd; + } + else + if(is_sse42_supported()) + { + cout << "Using SSE4.2 accelerated implementation of PairHMM\n"; + g_compute_full_prob_float = compute_full_prob_sses; + g_compute_full_prob_double = compute_full_prob_ssed; + } + else + { + cout << "Using un-vectorized C++ implementation of PairHMM\n"; + g_compute_full_prob_float = compute_full_prob; + g_compute_full_prob_double = compute_full_prob; + } +} + int normalize(char c) { return ((int) (c - 33)); @@ -214,12 +240,3 @@ int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat) return tokens.size(); } -void debug_dump(string filename, string s, bool to_append, bool add_newline) -{ - ofstream fptr; - fptr.open(filename.c_str(), to_append ? ofstream::app : ofstream::out); - fptr << s; - if(add_newline) - fptr << "\n"; - fptr.close(); -} diff --git a/PairHMM_JNI/utils.h b/PairHMM_JNI/utils.h index 8c244333e..8da8269b8 100644 --- a/PairHMM_JNI/utils.h +++ b/PairHMM_JNI/utils.h @@ -1,7 +1,22 @@ #ifndef PAIRHMM_UTIL_H #define PAIRHMM_UTIL_H -#define MIN_ACCEPTED 1e-28f + +template +std::string to_string(T obj) +{ + std::stringstream ss; + std::string ret_string; + ss.clear(); + ss << std::scientific << obj; + ss >> ret_string; + ss.clear(); + return ret_string; +} +void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true); + +int read_mod_testcase(std::ifstream& fptr, testcase* tc, bool reformat=false); + bool is_avx_supported(); bool is_sse42_supported(); extern float (*g_compute_full_prob_float)(testcase *tc, float *before_last_log); @@ -9,5 +24,5 @@ extern double (*g_compute_full_prob_double)(testcase *tc, double* before_last_lo void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline); template NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log=0); - +void initialize_function_pointers(); #endif diff --git a/PairHMM_JNI/vector_defs.h b/PairHMM_JNI/vector_defs.h new file mode 100644 index 000000000..c89f7f932 --- /dev/null +++ b/PairHMM_JNI/vector_defs.h @@ -0,0 +1,26 @@ +#undef SIMD_TYPE +#undef SIMD_TYPE_AVX +#undef SIMD_TYPE_SSE + +#define SIMD_TYPE avx +#define SIMD_TYPE_AVX + +#include "define-float.h" +#include "vector_function_prototypes.h" + +#include "define-double.h" +#include "vector_function_prototypes.h" + +#undef SIMD_TYPE +#undef SIMD_TYPE_AVX + +#define SIMD_TYPE sse +#define SIMD_TYPE_SSE + +#include "define-sse-float.h" +#include "vector_function_prototypes.h" + +#include "define-sse-double.h" +#include "vector_function_prototypes.h" + + diff --git a/PairHMM_JNI/vector_function_prototypes.h b/PairHMM_JNI/vector_function_prototypes.h new file mode 100644 index 000000000..2593986a6 --- /dev/null +++ b/PairHMM_JNI/vector_function_prototypes.h @@ -0,0 +1,19 @@ +void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift,SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut); +void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last,SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn); +void GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]); +void GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess); +void GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_,SIMD_TYPE), PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt); +void GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec,SIMD_TYPE), PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen); +template void GEN_INTRINSIC(GEN_INTRINSIC(initializeVectors,SIMD_TYPE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context ctx, testcase *tc, _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D); +template void GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION,SIMD_TYPE), PRECISION)( + int stripIdx, Context ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY, + _256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm, _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM , + _256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, + UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM); +_256_TYPE GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM,SIMD_TYPE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256, + _256_TYPE distm, _256_TYPE _1_distm); +void GEN_INTRINSIC(GEN_INTRINSIC(computeMXY,SIMD_TYPE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, + UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1, + _256_TYPE pMM, _256_TYPE pGAPM, _256_TYPE pMX, _256_TYPE pXX, _256_TYPE pMY, _256_TYPE pYY, _256_TYPE distmSel); +template NUMBER GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_,SIMD_TYPE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL); + diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 82015d153..b785b8d21 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -1086,7 +1086,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In referenceConfidenceModel.close(); //TODO remove the need to call close here for debugging, the likelihood output stream should be managed //TODO (open & close) at the walker, not the engine. - //likelihoodCalculationEngine.close(); + likelihoodCalculationEngine.close(); logger.info("Ran local assembly on " + result + " active regions"); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 0626f2268..04e64186f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -89,4 +89,6 @@ public interface LikelihoodCalculationEngine { */ public Map computeReadLikelihoods(AssemblyResultSet assemblyResultSet, Map> perSampleReadList); + + public void close(); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java index 0a82e1997..5ab49b531 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java @@ -165,8 +165,10 @@ public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculation } } + @Override public void close() { if ( likelihoodsStream != null ) likelihoodsStream.close(); + pairHMMThreadLocal.get().close(); } private void writeDebugLikelihoods(final GATKSAMRecord processedRead, final Haplotype haplotype, final double log10l){ diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java index a80800f82..ca3db5ed3 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java @@ -66,7 +66,8 @@ import java.util.Map; */ public class JNILoglessPairHMM extends LoglessPairHMM { - private static final boolean debug = true; //simulates ifdef + private static final boolean debug = false; //simulates ifdef + private static final boolean verify = debug || false; //simulates ifdef private static final boolean debug0_1 = false; //simulates ifdef private static final boolean debug1 = false; //simulates ifdef private static final boolean debug2 = false; @@ -90,6 +91,7 @@ public class JNILoglessPairHMM extends LoglessPairHMM { } private native void jniGlobalInit(Class readDataHolderClass, Class haplotypeDataHolderClass); + private native void jniClose(); private static boolean isJNILoglessPairHMMLibraryLoaded = false; @@ -104,6 +106,13 @@ public class JNILoglessPairHMM extends LoglessPairHMM { } } + @Override + public void close() + { + jniClose(); + debugClose(); + } + //Used to test parts of the compute kernel separately private native void jniInitialize(final int readMaxLength, final int haplotypeMaxLength); private native static void jniInitializeProbabilities(final double[][] transition, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP); @@ -126,7 +135,7 @@ public class JNILoglessPairHMM extends LoglessPairHMM { @Override public void initialize(final int readMaxLength, final int haplotypeMaxLength) { - if(debug) + if(verify) super.initialize(readMaxLength, haplotypeMaxLength); if(debug3) { @@ -150,14 +159,14 @@ public class JNILoglessPairHMM extends LoglessPairHMM { public PerReadAlleleLikelihoodMap computeLikelihoods(final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap) { // (re)initialize the pairHMM only if necessary - final int readMaxLength = debug ? findMaxReadLength(reads) : 0; - final int haplotypeMaxLength = debug ? findMaxHaplotypeLength(alleleHaplotypeMap) : 0; - if(debug) + final int readMaxLength = verify ? findMaxReadLength(reads) : 0; + final int haplotypeMaxLength = verify ? findMaxHaplotypeLength(alleleHaplotypeMap) : 0; + if(verify) { if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) { initialize(readMaxLength, haplotypeMaxLength); } if ( ! initialized ) - throw new IllegalStateException("Must call initialize before calling jniComputeLikelihoods in debug mode"); + throw new IllegalStateException("Must call initialize before calling jniComputeLikelihoods in debug/verify mode"); } int readListSize = reads.size(); int alleleHaplotypeMapSize = alleleHaplotypeMap.size(); @@ -221,7 +230,7 @@ public class JNILoglessPairHMM extends LoglessPairHMM { likelihoodMap.add(read, currEntry.getKey(), likelihoodArray[idx]); ++idx; } - if(debug) + if(verify) { //to compare values likelihoodMap = super.computeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap); @@ -273,6 +282,7 @@ public class JNILoglessPairHMM extends LoglessPairHMM { } ++numComputeLikelihoodCalls; //if(numComputeLikelihoodCalls == 5) + //jniClose(); //System.exit(0); return likelihoodMap; } @@ -299,11 +309,11 @@ public class JNILoglessPairHMM extends LoglessPairHMM { //} //System.out.println("#### END STACK TRACE ####"); // - if(debug1) - jniSubComputeReadLikelihoodGivenHaplotypeLog10(readBases.length, haplotypeBases.length, - readBases, haplotypeBases, readQuals, - insertionGOP, deletionGOP, overallGCP, - hapStartIndex); + if(debug1) + jniSubComputeReadLikelihoodGivenHaplotypeLog10(readBases.length, haplotypeBases.length, + readBases, haplotypeBases, readQuals, + insertionGOP, deletionGOP, overallGCP, + hapStartIndex); boolean doInitialization = (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length); if (doInitialization) { @@ -358,8 +368,8 @@ public class JNILoglessPairHMM extends LoglessPairHMM { ++numCalls; //if(numCalls == 100) //{ - //debugClose(); - //System.exit(0); + //debugClose(); + //System.exit(0); //} return Math.log10(finalSumProbabilities) - INITIAL_CONDITION_LOG10; } diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java index 3b4498776..7b09fe047 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java @@ -280,4 +280,7 @@ public abstract class PairHMM { return Math.min(haplotype1.length, haplotype2.length); } + + //Called at the end of all HC calls + public void close() { ; } }