From 5fab96b7ee5181e0164ccef6a135952f58726dad Mon Sep 17 00:00:00 2001 From: Karthik Gururaj Date: Tue, 14 Jan 2014 17:26:55 -0800 Subject: [PATCH] First import of AVX-JNI to git --- .gitignore | 6 + CHEATSHEET | 2 + Makefile | 43 ++ README | 41 ++ define-double.h | 154 ++++++ define-float.h | 155 ++++++ hmm_mask.cc | 484 ++++++++++++++++++ ...e_sting_utils_pairhmm_JNILoglessPairHMM.cc | 146 ++++++ ...te_sting_utils_pairhmm_JNILoglessPairHMM.h | 61 +++ pairhmm-1-base.cc | 276 ++++++++++ pairhmm-template-kernel.cc | 399 +++++++++++++++ pairhmm-template-main.cc | 114 +++++ shift_template.c | 68 +++ template.h | 226 ++++++++ 14 files changed, 2175 insertions(+) create mode 100644 .gitignore create mode 100644 CHEATSHEET create mode 100644 Makefile create mode 100644 README create mode 100644 define-double.h create mode 100644 define-float.h create mode 100644 hmm_mask.cc create mode 100644 org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc create mode 100644 org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h create mode 100644 pairhmm-1-base.cc create mode 100644 pairhmm-template-kernel.cc create mode 100644 pairhmm-template-main.cc create mode 100644 shift_template.c create mode 100644 template.h diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..16e86f25d --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.svn +*.o +*.so +tests +.deps +hmm_Mohammad diff --git a/CHEATSHEET b/CHEATSHEET new file mode 100644 index 000000000..f61bfdd7f --- /dev/null +++ b/CHEATSHEET @@ -0,0 +1,2 @@ +/opt/intel/bin/icc pairhmm-template-main.cc -o pairhmm-template-main.o -O3 -W -Wall -lm -xAVX; ./pairhmm-template-main.o < mediumTest.in > med.out; head -n -5 med.out > medium.out; lua checkLikelihoods.lua medium.out ~/hmm/mediumTest.out 6e-5 + diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..df5dbc463 --- /dev/null +++ b/Makefile @@ -0,0 +1,43 @@ +#OMPCFLAGS=-fopenmp +#OMPLDFLAGS=-lgomp + +#CFLAGS=-O2 -std=c++11 -W -Wall -march=corei7-avx -Wa,-q -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas +#CFLAGS=-O2 -W -Wall -march=corei7 -mfpmath=sse -msse4.2 -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas + +JAVA_ROOT=/opt/jdk1.7.0_25/ +JNI_COMPILATION_FLAGS=-D_REENTRANT -fPIC -I${JAVA_ROOT}/include -I${JAVA_ROOT}/include/linux + +CFLAGS=-g -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas -xAVX + +CXXFLAGS=$(CFLAGS) +CC=icc +CXX=icc + +LDFLAGS=-lm $(OMPLDFLAGS) + +#BIN:=pairhmm-1-base #pairhmm-2-omp pairhmm-3-hybrid-float-double pairhmm-4-hybrid-diagonal pairhmm-5-hybrid-diagonal-homogeneus pairhmm-6-onlythreediags pairhmm-7-presse pairhmm-8-sse #pairhmm-dev +BIN:=libJNILoglessPairHMM.so #pairhmm-2-omp pairhmm-3-hybrid-float-double pairhmm-4-hybrid-diagonal pairhmm-5-hybrid-diagonal-homogeneus pairhmm-6-onlythreediags pairhmm-7-presse pairhmm-8-sse #pairhmm-dev + +#SOURCES=pairhmm-1-base.cc input.cc +SOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc hmm_mask.cc +OBJECTS=$(SOURCES:.cc=.o) +DEPDIR=.deps +DF=$(DEPDIR)/$(*).d + +all: $(BIN) + +-include $(addprefix $(DEPDIR)/,$(SOURCES:.cc=.d)) + +pairhmm-1-base: pairhmm-1-base.o input.o + $(CXX) -o $@ $^ $(LDFLAGS) + +libJNILoglessPairHMM.so: $(OBJECTS) + $(CXX) -shared -o libJNILoglessPairHMM.so $(OBJECTS) + +%.o: %.cc + @mkdir -p $(DEPDIR) + $(COMPILE.cpp) -MMD -MF $(DF) $(JNI_COMPILATION_FLAGS) $(CXXFLAGS) $(OUTPUT_OPTION) $< + + +clean: + rm -f $(BIN) *.o diff --git a/README b/README new file mode 100644 index 000000000..e16df5a8d --- /dev/null +++ b/README @@ -0,0 +1,41 @@ +How to compile: +/opt/intel/bin/icc pairhmm-template-main.cc -o pairhmm-template-main.o -O3 -W -Wall -lm -xAVX + +------------------------------------ +How to run: +./pairhmm-template-main.o < ./test_data/largeTest.in > large.out +./pairhmm-template-main.o < ./test_data/mediumTest.in > medium.out +------------------------------------- +Results: +------------------------------------- +MEDIUM FLOAT: +AVX Read Time: 8019020 +AVX Compute Time: 29270238 +AVX Write Time: 242170 +AVX Total Time: 37531428 +# Double called: 0 +------------------------------------- +MEDIUM HYBRID: +AVX Read Time: 7596629 +AVX Compute Time: 29979220 +AVX Write Time: 248485 +AVX Total Time: 37824334 +# Double called: 20329 +------------------------------------- +LARGE FLOAT: +AVX Read Time: 111888586 +AVX Compute Time: 408131806 +AVX Write Time: 3477892 +AVX Total Time: 523498284 +# Double called: 0 +------------------------------------- +LARGE HYBRID: +AVX Read Time: 109728994 +AVX Compute Time: 419475317 +AVX Write Time: 3575364 +AVX Total Time: 532779675 +# Double called: 310042 + + + + diff --git a/define-double.h b/define-double.h new file mode 100644 index 000000000..b2de7520a --- /dev/null +++ b/define-double.h @@ -0,0 +1,154 @@ +#include + +#ifdef PRECISION + #undef PRECISION + #undef MAIN_TYPE + #undef MAIN_TYPE_SIZE + #undef UNION_TYPE + #undef IF_128 + #undef IF_MAIN_TYPE + #undef SHIFT_CONST1 + #undef SHIFT_CONST2 + #undef SHIFT_CONST3 + #undef _128_TYPE + #undef _256_TYPE + #undef AVX_LENGTH + #undef MAVX_COUNT + #undef HAP_TYPE + #undef MASK_TYPE + #undef MASK_ALL_ONES + + #undef SET_VEC_ZERO + #undef VEC_OR + #undef VEC_ADD + #undef VEC_SUB + #undef VEC_MUL + #undef VEC_BLEND + #undef VEC_BLENDV + #undef VEC_CAST_256_128 + #undef VEC_EXTRACT_128 + #undef VEC_EXTRACT_UNIT + #undef VEC_SET1_VAL128 + #undef VEC_MOVE + #undef VEC_CAST_128_256 + #undef VEC_INSERT_VAL + #undef VEC_CVT_128_256 + #undef VEC_SET1_VAL + #undef VEC_POPCVT_CHAR + #undef VEC_LDPOPCVT_CHAR + #undef VEC_CMP_EQ + #undef VEC_SET_LSE + #undef SHIFT_HAP + #undef print256b + #undef MASK_VEC + #undef VEC_SSE_TO_AVX + #undef VEC_SHIFT_LEFT_1BIT + #undef MASK_ALL_ONES + #undef COMPARE_VECS + #undef _256_INT_TYPE +#endif + +#define PRECISION d +#define MAIN_TYPE double +#define MAIN_TYPE_SIZE 64 +#define UNION_TYPE mix_D +#define IF_128 IF_128d +#define IF_MAIN_TYPE IF_64 +#define SHIFT_CONST1 8 +#define SHIFT_CONST2 1 +#define SHIFT_CONST3 8 +#define _128_TYPE __m128d +#define _256_TYPE __m256d +#define _256_INT_TYPE __m256i +#define AVX_LENGTH 4 +#define MAVX_COUNT (MROWS+7)/AVX_LENGTH +#define HAP_TYPE __m128i +#define MASK_TYPE uint64_t +#define MASK_ALL_ONES 0xFFFFFFFFFFFFFFFF +#define MASK_VEC MaskVec_D + +#define SET_VEC_ZERO(__vec) \ + __vec= _mm256_setzero_pd() + +#define VEC_OR(__v1, __v2) \ + _mm256_or_pd(__v1, __v2) + +#define VEC_ADD(__v1, __v2) \ + _mm256_add_pd(__v1, __v2) + +#define VEC_SUB(__v1, __v2) \ + _mm256_sub_pd(__v1, __v2) + +#define VEC_MUL(__v1, __v2) \ + _mm256_mul_pd(__v1, __v2) + +#define VEC_BLEND(__v1, __v2, __mask) \ + _mm256_blend_pd(__v1, __v2, __mask) + +#define VEC_BLENDV(__v1, __v2, __maskV) \ + _mm256_blendv_pd(__v1, __v2, __maskV) + +#define VEC_CAST_256_128(__v1) \ + _mm256_castpd256_pd128 (__v1) + +#define VEC_EXTRACT_128(__v1, __im) \ + _mm256_extractf128_pd (__v1, __im) + +#define VEC_EXTRACT_UNIT(__v1, __im) \ + _mm_extract_epi64(__v1, __im) + +#define VEC_SET1_VAL128(__val) \ + _mm_set1_pd(__val) + +#define VEC_MOVE(__v1, __val) \ + _mm_move_sd(__v1, __val) + +#define VEC_CAST_128_256(__v1) \ + _mm256_castpd128_pd256(__v1) + +#define VEC_INSERT_VAL(__v1, __val, __pos) \ + _mm256_insertf128_pd(__v1, __val, __pos) + +#define VEC_CVT_128_256(__v1) \ + _mm256_cvtepi32_pd(__v1) + +#define VEC_SET1_VAL(__val) \ + _mm256_set1_pd(__val) + +#define VEC_POPCVT_CHAR(__ch) \ + _mm256_cvtepi32_pd(_mm_set1_epi32(__ch)) + +#define VEC_LDPOPCVT_CHAR(__addr) \ + _mm256_cvtepi32_pd(_mm_load_si128((__m128i const *)__addr)) + +#define VEC_CMP_EQ(__v1, __v2) \ + _mm256_cmp_pd(__v1, __v2, _CMP_EQ_OQ) + +#define VEC_SET_LSE(__val) \ + _mm256_set_pd(zero, zero, zero, __val); + +#define SHIFT_HAP(__v1, __val) \ + __v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0) + +#define print256b(__v1) \ + print256bDP(__v1) + +#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ + __vdst = _mm256_castpd128_pd256(__vsLow) ; \ + __vdst = _mm256_insertf128_pd(__vdst, __vsHigh, 1) ; + +#define VEC_SHIFT_LEFT_1BIT(__vs) \ + __vs = _mm_slli_epi64(__vs, 1) + + +#define COMPARE_VECS(__v1, __v2, __first, __last) { \ + double* ptr1 = (double*) (&__v1) ; \ + double* ptr2 = (double*) (&__v2) ; \ + for (int ei=__first; ei <= __last; ++ei) { \ + if (ptr1[ei] != ptr2[ei]) { \ + std::cout << "Double Mismatch at " << ei << ": " \ + << ptr1[ei] << " vs. " << ptr2[ei] << std::endl ; \ + exit(0) ; \ + } \ + } \ + } diff --git a/define-float.h b/define-float.h new file mode 100644 index 000000000..4cba9a179 --- /dev/null +++ b/define-float.h @@ -0,0 +1,155 @@ +#include + +#ifdef PRECISION + #undef PRECISION + #undef MAIN_TYPE + #undef MAIN_TYPE_SIZE + #undef UNION_TYPE + #undef IF_128 + #undef IF_MAIN_TYPE + #undef SHIFT_CONST1 + #undef SHIFT_CONST2 + #undef SHIFT_CONST3 + #undef _128_TYPE + #undef _256_TYPE + #undef AVX_LENGTH + #undef MAVX_COUNT + #undef HAP_TYPE + #undef MASK_TYPE + #undef MASK_ALL_ONES + + #undef SET_VEC_ZERO(__vec) + #undef VEC_OR(__v1, __v2) + #undef VEC_ADD(__v1, __v2) + #undef VEC_SUB(__v1, __v2) + #undef VEC_MUL(__v1, __v2) + #undef VEC_BLEND(__v1, __v2, __mask) + #undef VEC_BLENDV(__v1, __v2, __maskV) + #undef VEC_CAST_256_128(__v1) + #undef VEC_EXTRACT_128(__v1, __im) + #undef VEC_EXTRACT_UNIT(__v1, __im) + #undef VEC_SET1_VAL128(__val) + #undef VEC_MOVE(__v1, __val) + #undef VEC_CAST_128_256(__v1) + #undef VEC_INSERT_VAL(__v1, __val, __pos) + #undef VEC_CVT_128_256(__v1) + #undef VEC_SET1_VAL(__val) + #undef VEC_POPCVT_CHAR(__ch) + #undef VEC_LDPOPCVT_CHAR(__addr) + #undef VEC_CMP_EQ(__v1, __v2) + #undef VEC_SET_LSE(__val) + #undef SHIFT_HAP(__v1, __val) + #undef print256b(__v1) + #undef MASK_VEC + #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) + #undef VEC_SHIFT_LEFT_1BIT(__vs) + #undef MASK_ALL_ONES + #undef COMPARE_VECS(__v1, __v2) + #undef _256_INT_TYPE +#endif + +#define PRECISION s + +#define MAIN_TYPE float +#define MAIN_TYPE_SIZE 32 +#define UNION_TYPE mix_F +#define IF_128 IF_128f +#define IF_MAIN_TYPE IF_32 +#define SHIFT_CONST1 12 +#define SHIFT_CONST2 3 +#define SHIFT_CONST3 4 +#define _128_TYPE __m128 +#define _256_TYPE __m256 +#define _256_INT_TYPE __m256i +#define AVX_LENGTH 8 +#define MAVX_COUNT (MROWS+7)/AVX_LENGTH +#define HAP_TYPE UNION_TYPE +#define MASK_TYPE uint32_t +#define MASK_ALL_ONES 0xFFFFFFFF +#define MASK_VEC MaskVec_F + +#define SET_VEC_ZERO(__vec) \ + __vec= _mm256_setzero_ps() + +#define VEC_OR(__v1, __v2) \ + _mm256_or_ps(__v1, __v2) + +#define VEC_ADD(__v1, __v2) \ + _mm256_add_ps(__v1, __v2) + +#define VEC_SUB(__v1, __v2) \ + _mm256_sub_ps(__v1, __v2) + +#define VEC_MUL(__v1, __v2) \ + _mm256_mul_ps(__v1, __v2) + +#define VEC_BLEND(__v1, __v2, __mask) \ + _mm256_blend_ps(__v1, __v2, __mask) + +#define VEC_BLENDV(__v1, __v2, __maskV) \ + _mm256_blendv_ps(__v1, __v2, __maskV) + +#define VEC_CAST_256_128(__v1) \ + _mm256_castps256_ps128 (__v1) + +#define VEC_EXTRACT_128(__v1, __im) \ + _mm256_extractf128_ps (__v1, __im) + +#define VEC_EXTRACT_UNIT(__v1, __im) \ + _mm_extract_epi32(__v1, __im) + +#define VEC_SET1_VAL128(__val) \ + _mm_set1_ps(__val) + +#define VEC_MOVE(__v1, __val) \ + _mm_move_ss(__v1, __val) + +#define VEC_CAST_128_256(__v1) \ + _mm256_castps128_ps256(__v1) + +#define VEC_INSERT_VAL(__v1, __val, __pos) \ + _mm256_insertf128_ps(__v1, __val, __pos) + +#define VEC_CVT_128_256(__v1) \ + _mm256_cvtepi32_ps(__v1.i) + +#define VEC_SET1_VAL(__val) \ + _mm256_set1_ps(__val) + +#define VEC_POPCVT_CHAR(__ch) \ + _mm256_cvtepi32_ps(_mm256_set1_epi32(__ch)) + +#define VEC_LDPOPCVT_CHAR(__addr) \ + _mm256_cvtepi32_ps(_mm256_loadu_si256((__m256i const *)__addr)) + +#define VEC_CMP_EQ(__v1, __v2) \ + _mm256_cmp_ps(__v1, __v2, _CMP_EQ_OQ) + +#define VEC_SET_LSE(__val) \ + _mm256_set_ps(zero, zero, zero, zero, zero, zero, zero, __val); + +#define SHIFT_HAP(__v1, __val) \ + _vector_shift_lasts(__v1, __val.f); + +#define print256b(__v1) \ + print256bFP(__v1) + +#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ + __vdst = _mm256_castps128_ps256(__vsLow) ; \ + __vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ; + +#define VEC_SHIFT_LEFT_1BIT(__vs) \ + __vs = _mm_slli_epi32(__vs, 1) + +#define COMPARE_VECS(__v1, __v2, __first, __last) { \ + float* ptr1 = (float*) (&__v1) ; \ + float* ptr2 = (float*) (&__v2) ; \ + for (int ei=__first; ei <= __last; ++ei) { \ + if (ptr1[ei] != ptr2[ei]) { \ + std::cout << "Float Mismatch at " << ei << ": " \ + << ptr1[ei] << " vs. " << ptr2[ei] << std::endl ; \ + exit(0) ; \ + } \ + } \ + } + diff --git a/hmm_mask.cc b/hmm_mask.cc new file mode 100644 index 000000000..aace7acf2 --- /dev/null +++ b/hmm_mask.cc @@ -0,0 +1,484 @@ +#include +#include +#include +#include + + +// /usr/intel/pkgs/icc/13.0.0e/bin/icc -o ed -O3 ed.cpp -xAVX -openmp -openmp-link static + + +#include + +#include +#include +#include +#include +#include +#include + + +#include +#include +#include +#include +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std ; + +#define CHECK_MASK_CORRECTNESS + +template +string getBinaryStr (T val, int numBitsToWrite) { + + ostringstream oss ; + uint64_t mask = ((T) 0x1) << (numBitsToWrite-1) ; + for (int i=numBitsToWrite-1; i >= 0; --i) { + oss << ((val & mask) >> i) ; + mask >>= 1 ; + } + return oss.str() ; +} + + +typedef struct +{ + int rslen, haplen, *q, *i, *d, *c; + char *hap, *rs; +} testcase; + +int normalize(char c) +{ + return ((int) (c - 33)); +} + +class ConvertChar { + + static uint8_t conversionTable[255] ; + + +public: + + static void init() { + conversionTable['A'] = 0 ; + conversionTable['C'] = 1 ; + conversionTable['T'] = 2 ; + conversionTable['G'] = 3 ; + conversionTable['N'] = 4 ; + } + + static inline uint8_t get(uint8_t input) { + return conversionTable[input] ; + } + +} ; + +uint8_t ConvertChar::conversionTable[255] ; + + + +int read_testcase(testcase *tc, FILE* ifp) +{ + char *q, *i, *d, *c, *line = NULL; + int _q, _i, _d, _c; + int x, size = 0; + ssize_t read; + + read = getline(&line, (size_t *) &size, ifp); + if (read == -1) + return -1; + + + tc->hap = (char *) malloc(size); + tc->rs = (char *) malloc(size); + q = (char *) malloc(size); + i = (char *) malloc(size); + d = (char *) malloc(size); + c = (char *) malloc(size); + + if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6) + return -1; + + tc->haplen = strlen(tc->hap); + tc->rslen = strlen(tc->rs); + tc->q = (int *) malloc(sizeof(int) * tc->rslen); + tc->i = (int *) malloc(sizeof(int) * tc->rslen); + tc->d = (int *) malloc(sizeof(int) * tc->rslen); + tc->c = (int *) malloc(sizeof(int) * tc->rslen); + + // Convert hap and rs to 3 bits + + /* + for (int ci=0; ci < tc->haplen; ++ci) + tc->hap[ci] = ConvertChar::get(tc->hap[ci]) ; + + for (int ci=0; ci < tc->rslen; ++ci) + tc->rs[ci] = ConvertChar::get(tc->rs[ci]) ; + */ + + for (x = 0; x < tc->rslen; x++) + { + _q = normalize(q[x]); + _i = normalize(i[x]); + _d = normalize(d[x]); + _c = normalize(c[x]); + tc->q[x] = (_q < 6) ? 6 : _q; + tc->i[x] = _i; + tc->d[x] = _d; + tc->c[x] = _c; + } + + + free(q); + free(i); + free(d); + free(c); + free(line); + + return 0; +} + + + + +#define ALIGN __attribute__ ((aligned(32))) + + +typedef union ALIGN { + //__m256i vi; + __m128 vf ; + __m128i vi ; + __m128d vd; + + uint8_t b[16]; + //uint16_t hw[8] ; + uint32_t w[4] ; + uint64_t dw[2] ; + + float f[4] ; + //double d[2] ; + +} v128 ; + +typedef union ALIGN { + __m256i vi; + __m256 vf ; + __m256d vd; + //__m128i vi128[2] ; + + uint8_t b[32]; + //uint16_t hw[16] ; + uint32_t w[8] ; + uint64_t dw[4] ; + + float f[8] ; + double d[4] ; + +} v256 ; + + +#define NUM_DISTINCT_CHARS 5 +#define AMBIG_CHAR 4 + +#define VEC_ENTRY_CNT 8 +#define VEC_LEN 256 +#define VTYPE vf +#define VTYPEI vi +#define VENTRY f +#define VENTRYI w +#define MTYPE uint32_t +#define MASK_ALL_ONES 0xFFFFFFFF + + +#define VECTOR v256 +#define VECTOR_SSE v128 + +#define SET_VEC_ZERO(__vec) \ + __vec= _mm256_setzero_ps() + +#define SET_VEC_ONES(__vec) \ + __vec = _mm256_set1_epi32(0xFFFFFFFF) + +#define VEC_OR(__v1, __v2) \ + _mm256_or_ps(__v1, __v2) + +#define VEC_ADD(__v1, __v2) \ + _mm256_add_ps(__v1, __v2) + +#define VEC_MUL(__v1, __v2) \ + _mm256_mul_ps(__v1, __v2) + +#define VEC_BLENDV(__v1, __v2, __maskV) \ + _mm256_blendv_ps(__v1, __v2, __maskV) + +#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ + __vdst = _mm256_castps128_ps256(__vsLow) ; \ + __vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ; + +#define VECS_SHIFT_LEFT_1BIT(__vs) \ + __vs = _mm_slli_epi32(__vs, 1) + + +#define VECS_SHIFT_RIGHT_WHOLE(__vs, __cnt) { \ + uint64_t __mask = ; \ + uint64_t __shiftWord = ((((uint64_t) 0x1) << __cnt) - 1 ) & __vs.dw[1] ; \ + __shiftWord <<= (64-cnt) ; \ + __vs = _mm_slri_epi64(__vs, __cnt) ; \ + __vs.dw[0] |= __shiftWord ; \ + } + + +#define GET_MASK_WORD_OLD(__mask, __lastShiftOut, __shiftBy, __maskBitCnt){ \ + MTYPE __bitMask = (((MTYPE)0x1) << __shiftBy) - 1 ; \ + MTYPE __nextShiftOut = (__mask & __bitMask) << (__maskBitCnt - __shiftBy) ; \ + __mask >>= __shiftBy ; \ + __mask |= __lastShiftOut ; \ + __lastShiftOut = __nextShiftOut ; \ + } + + +#define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \ + MTYPE __bitMask = (((MTYPE)0x1) << __shiftBy) - 1 ; \ + MTYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \ + __dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ; \ + __lastShiftOut = __nextShiftOut ; \ +} + + +void precompute_masks_avx(const testcase& tc, int COLS, int numMaskVecs, + MTYPE (*maskArr)[NUM_DISTINCT_CHARS]) { + + const int maskBitCnt = VEC_LEN / VEC_ENTRY_CNT ; + + for (int vi=0; vi < numMaskVecs; ++vi) { + for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) { + maskArr[vi][rs] = 0 ; + } + maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ; + } + + for (int col=1; col < COLS; ++col) { + int mIndex = (col-1) / maskBitCnt ; + int mOffset = (col-1) % maskBitCnt ; + MTYPE bitMask = ((MTYPE)0x1) << (maskBitCnt-1-mOffset) ; + + char hapChar = tc.hap[col-1] ; + + if (hapChar == AMBIG_CHAR) { + maskArr[mIndex][0] |= bitMask ; + maskArr[mIndex][1] |= bitMask ; + maskArr[mIndex][2] |= bitMask ; + maskArr[mIndex][3] |= bitMask ; + } + + //cout << hapChar << " " << mIndex << " " << getBinaryStr(bitMask, 32) + // << endl ; + //cout << getBinaryStr(maskArr[0][hapChar],32) << endl ; + //exit(0) ; + + maskArr[mIndex][ConvertChar::get(hapChar)] |= bitMask ; + + + // bit corresponding to col 1 will be the MSB of the mask 0 + // bit corresponding to col 2 will be the MSB-1 of the mask 0 + // ... + // bit corresponding to col 32 will be the LSB of the mask 0 + // bit corresponding to col 33 will be the MSB of the mask 1 + // ... + } + +} + + +void test_mask_computations (testcase& tc, int tcID, bool printDebug=false) { + + int ROWS = tc.rslen + 1 ; + int COLS = tc.haplen + 1 ; + + // only for testing + VECTOR mismatchData, matchData ; + //SET_VEC_ZERO(mismatchData.VTYPE) ; + //SET_VEC_ONES(matchData.VTYPEI) ; + for (int ei=0; ei < VEC_ENTRY_CNT; ++ei) { + matchData.VENTRY[ei] = 1.0 ; + mismatchData.VENTRY[ei] = 0.0 ; + } + + const int maskBitCnt = VEC_LEN / VEC_ENTRY_CNT ; + const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function + + MTYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ; + precompute_masks_avx(tc, COLS, numMaskVecs, maskArr) ; + +#ifdef DEBUG + if (printDebug) { + cout << "The first 32 hap chars are: " ; + for (int i=0; i < 32; ++i) { + cout << tc.hap[i] ; + } + cout << endl ; + + cout << "Masks computed for A, C, T, G, N are: " << endl ; + cout << getBinaryStr(maskArr[0][0], 32) << endl ; + cout << getBinaryStr(maskArr[0][1], 32) << endl ; + cout << getBinaryStr(maskArr[0][2], 32) << endl ; + cout << getBinaryStr(maskArr[0][3], 32) << endl ; + cout << getBinaryStr(maskArr[0][4], 32) << endl ; + } +#endif // #ifdef DEBUG + + int beginRowIndex = 1 ; + while (beginRowIndex < ROWS) { + + int numRowsToProcess = min(VEC_ENTRY_CNT, ROWS - beginRowIndex) ; + + char rsArr[VEC_ENTRY_CNT] ; + for (int ri=0; ri < numRowsToProcess; ++ri) { + rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ; + } + + // Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors + VECTOR_SSE currMaskVecLow ; // corresponding to entries 0-3 + VECTOR_SSE currMaskVecHigh ; // corresponding to entries 4-7 + + MTYPE lastMaskShiftOut[VEC_ENTRY_CNT] ; + for (int ei=0; ei < VEC_ENTRY_CNT; ++ei) + lastMaskShiftOut[ei] = 0 ; + + int col = 1 ; + int diag = 1 ; + for (int maskIndex=0; maskIndex < numMaskVecs; ++maskIndex) { + // set up the mask vectors for the next maskBitCnt columns + + // For AVX, maskBitCnt = 32 (so, the operation below is amortized over 32 cols) + for (int ei=0; ei < VEC_ENTRY_CNT/2; ++ei) { + SET_MASK_WORD(currMaskVecLow.VENTRYI[ei], maskArr[maskIndex][rsArr[ei]], + lastMaskShiftOut[ei], ei, maskBitCnt) ; + + int ei2 = ei + VEC_ENTRY_CNT/2 ; // the second entry index + SET_MASK_WORD(currMaskVecHigh.VENTRYI[ei], maskArr[maskIndex][rsArr[ei2]], + lastMaskShiftOut[ei2], ei2, maskBitCnt) ; + } + +#ifdef DEBUG + if (printDebug && maskIndex == 0) { + cout << "The masks for entry 1: " << endl + << getBinaryStr(maskArr[0][rsArr[1]], 32) << endl + << getBinaryStr(currMaskVecLow.VENTRYI[1], 32) << endl ; + + } +#endif // #ifdef DEBUG + + // iterate over mask bit indices and columns + for (int mbi=0; mbi < maskBitCnt && diag < COLS + ROWS -2; ++mbi, ++diag) { + + VECTOR maskV ; + VEC_SSE_TO_AVX(currMaskVecLow.VTYPE, currMaskVecHigh.VTYPE, maskV.VTYPE) ; + + VECTOR testData ; + testData.VTYPE = VEC_BLENDV(mismatchData.VTYPE, matchData.VTYPE, + maskV.VTYPE) ; + + VECS_SHIFT_LEFT_1BIT(currMaskVecLow.VTYPEI) ; + VECS_SHIFT_LEFT_1BIT(currMaskVecHigh.VTYPEI) ; + +#ifdef DEBUG + if (printDebug && maskIndex == 0) { + cout << "The mask for entry 1, mbi=" << mbi << ": " + << getBinaryStr(maskV.VENTRYI[1], 32) << endl ; + + } +#endif // #ifdef DEBUG + +#ifdef CHECK_MASK_CORRECTNESS + + int firstRowIndex = (diag < COLS) ? 0 : (diag - COLS) ; + int lastRowIndex = min(col-1, numRowsToProcess-1) ; + + for (int ri=firstRowIndex; ri <= lastRowIndex; ++ri) { + int currRow = beginRowIndex + ri ; + int currCol = col - ri + firstRowIndex ; + + char hapChar = tc.hap[currCol-1] ; + char rsChar = tc.rs[currRow-1] ; + + bool match = (hapChar == rsChar || hapChar == 'N' || rsChar == 'N') ; + + if ((bool) testData.VENTRYI[ri] != match) { + cout << "Error: Incorrect mask for tc " << tcID << ", diag = " << diag + << " (" << currRow << ", " << currCol << ")" << endl ; + + cout << "The chars are: " << hapChar << " and " << rsChar << endl ; + cout << "The selected value is: " << testData.VENTRYI[ri] << endl ; + + exit(0) ; + } + } +#endif // #ifdef CHECK_MASK_CORRECTNESS + + if (diag < COLS) + ++col ; + + } // mbi + } // maskIndex + + beginRowIndex += VEC_ENTRY_CNT ; + } // end of stripe + + //cout << "Finished validating entry " << endl ; +} + + +int main () { + + #define BATCH_SIZE 10000 + + ConvertChar::init() ; + + testcase* tcBatch = new testcase[BATCH_SIZE] ; + + int numBatches = 0 ; + int numRead = 0 ; + + const int DEBUG_TC = -1 ; + + FILE* ifp = stdin ; + + do { + numRead = 0 ; + + while (numRead < BATCH_SIZE) { + if (read_testcase(tcBatch+numRead, ifp) < 0) + break ; + + ++numRead ; + } + + if (numRead == 0) + break ; + + for (int ti=0; ti < numRead; ++ti) { + + int tcID = numBatches * BATCH_SIZE + ti ; + test_mask_computations(tcBatch[ti], tcID, tcID == DEBUG_TC) ; + } + + ++numBatches ; + } while (numRead == BATCH_SIZE) ; + + fclose(ifp) ; + + delete[] tcBatch ; + + return 0 ; +} diff --git a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc new file mode 100644 index 000000000..80065b8c9 --- /dev/null +++ b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc @@ -0,0 +1,146 @@ +#include +#include +#include +#include "org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "template.h" + +#include "define-double.h" +#include "shift_template.c" +#include "pairhmm-template-kernel.cc" + +using namespace std; + + +#define MM 0 +#define GapM 1 +#define MX 2 +#define XX 3 +#define MY 4 +#define YY 5 + +//#define DEBUG3 1 +#define DEBUG 1 + +template +string to_string(T obj) +{ + stringstream ss; + string ret_string; + ss.clear(); + ss << std::scientific << obj; + ss >> ret_string; + ss.clear(); + return ret_string; +} + +void debug_dump(string filename, string s, bool to_append, bool add_newline=true) +{ + ofstream fptr; + fptr.open(filename.c_str(), to_append ? ofstream::app : ofstream::out); + fptr << s; + if(add_newline) + fptr << "\n"; + fptr.close(); +} + +#define INT_STORE_ARRAY_SIZE 2048 +int insertionGOPIntArray[INT_STORE_ARRAY_SIZE]; +int deletionGOPIntArray[INT_STORE_ARRAY_SIZE]; +int overallGCPIntArray[INT_STORE_ARRAY_SIZE]; +int readQualsIntArray[INT_STORE_ARRAY_SIZE]; + +JNIEXPORT jdouble JNICALL +Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadLikelihoodGivenHaplotypeLog10( + JNIEnv* env, jobject thisObject, + jint readLength, jint haplotypeLength, + jbyteArray readBases, jbyteArray haplotypeBases, jbyteArray readQuals, + jbyteArray insertionGOP, jbyteArray deletionGOP, jbyteArray overallGCP, + jint hapStartIndex + ) +{ + jboolean is_copy = JNI_FALSE; + jbyte* readBasesArray = (env)->GetByteArrayElements(readBases, &is_copy); + jbyte* haplotypeBasesArray = (env)->GetByteArrayElements(haplotypeBases, &is_copy); + jbyte* readQualsArray = (env)->GetByteArrayElements(readQuals, &is_copy); + jbyte* insertionGOPArray = (env)->GetByteArrayElements(insertionGOP, &is_copy); + jbyte* deletionGOPArray = (env)->GetByteArrayElements(deletionGOP, &is_copy); + jbyte* overallGCPArray = (env)->GetByteArrayElements(overallGCP, &is_copy); +#ifdef DEBUG + assert(readBasesArray && "readBasesArray not initialized in JNI"); + assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); + assert(readQualsArray && "readQualsArray not initialized in JNI"); + assert(insertionGOPArray && "insertionGOP array not initialized in JNI"); + assert(deletionGOPArray && "deletionGOP array not initialized in JNI"); + assert(overallGCPArray && "OverallGCP array not initialized in JNI"); + assert(readLength < INT_STORE_ARRAY_SIZE); +#endif + for(unsigned i=0;i(&tc); + double result = log10(result_avxd) - log10(ldexp(1.0, 1020)); +#ifdef DEBUG + debug_dump("return_values_jni.txt",to_string(result),true); +#endif + + + env->ReleaseByteArrayElements(overallGCP, overallGCPArray, JNI_ABORT); + env->ReleaseByteArrayElements(deletionGOP, deletionGOPArray, JNI_ABORT); + env->ReleaseByteArrayElements(insertionGOP, insertionGOPArray, JNI_ABORT); + env->ReleaseByteArrayElements(readQuals, readQualsArray, JNI_ABORT); + env->ReleaseByteArrayElements(haplotypeBases, haplotypeBasesArray, JNI_ABORT); + env->ReleaseByteArrayElements(readBases, readBasesArray, JNI_ABORT); + + return 0.0; +} + +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeProbabilities +(JNIEnv* env, jclass thisObject, + jobjectArray transition, jbyteArray insertionGOP, jbyteArray deletionGOP, jbyteArray overallGCP + ) +{} + + JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitialize +(JNIEnv* env, jobject thisObject, + jint readMaxLength, jint haplotypeMaxLength) +{} + +JNIEXPORT jdouble JNICALL +Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializePriorsAndUpdateCells( + JNIEnv* env, jobject thisObject, + jboolean doInitialization, jint paddedReadLength, jint paddedHaplotypeLength, + jbyteArray readBases, jbyteArray haplotypeBases, jbyteArray readQuals, + jint hapStartIndex + ) + +{ return 0.0; } diff --git a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h new file mode 100644 index 000000000..5408d2f57 --- /dev/null +++ b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h @@ -0,0 +1,61 @@ +/* DO NOT EDIT THIS FILE - it is machine generated */ +#include +/* Header for class org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM */ + +#ifndef _Included_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM +#define _Included_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM +#ifdef __cplusplus +extern "C" { +#endif +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_TRISTATE_CORRECTION +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_TRISTATE_CORRECTION 3.0 +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToMatch +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToMatch 0L +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_indelToMatch +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_indelToMatch 1L +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToInsertion +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToInsertion 2L +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_insertionToInsertion +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_insertionToInsertion 3L +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToDeletion +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToDeletion 4L +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_deletionToDeletion +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_deletionToDeletion 5L +/* + * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM + * Method: jniInitialize + * Signature: (II)V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitialize + (JNIEnv *, jobject, jint, jint); + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM + * Method: jniInitializeProbabilities + * Signature: ([[D[B[B[B)V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeProbabilities + (JNIEnv *, jclass, jobjectArray, jbyteArray, jbyteArray, jbyteArray); + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM + * Method: jniInitializePriorsAndUpdateCells + * Signature: (ZII[B[B[BI)D + */ +JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializePriorsAndUpdateCells + (JNIEnv *, jobject, jboolean, jint, jint, jbyteArray, jbyteArray, jbyteArray, jint); + + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM + * Method: jniSubComputeReadLikelihoodGivenHaplotypeLog10 + * Signature: (II[B[B[B[B[B[BI)D + */ +JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadLikelihoodGivenHaplotypeLog10 + (JNIEnv *, jobject, jint, jint, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jint); + + +#ifdef __cplusplus +} +#endif +#endif diff --git a/pairhmm-1-base.cc b/pairhmm-1-base.cc new file mode 100644 index 000000000..dacd02536 --- /dev/null +++ b/pairhmm-1-base.cc @@ -0,0 +1,276 @@ +#include +#include +#include +#include + +#include + +#define MM 0 +#define GapM 1 +#define MX 2 +#define XX 3 +#define MY 4 +#define YY 5 + +//#define DEBUG + +/* + q: read quality + i: insertion penalty + d: deletion penalty + c: gap continuation penalty +*/ + +typedef struct +{ + int rslen, haplen, *q, *i, *d, *c; + char *hap, *rs; +} testcase; + +int normalize(char c) +{ + return ((int) (c - 33)); +} + +int read_testcase(testcase *tc) +{ + char *q, *i, *d, *c, *line = NULL; + int _q, _i, _d, _c; + int x, size = 0; + ssize_t read; + + read = getline(&line, (size_t *) &size, stdin); + if (read == -1) + return -1; + + tc->hap = (char *) malloc(size); + tc->rs = (char *) malloc(size); + q = (char *) malloc(size); + i = (char *) malloc(size); + d = (char *) malloc(size); + c = (char *) malloc(size); + + if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6) + return -1; + + tc->haplen = strlen(tc->hap); + tc->rslen = strlen(tc->rs); + tc->q = (int *) malloc(sizeof(int) * tc->rslen); + tc->i = (int *) malloc(sizeof(int) * tc->rslen); + tc->d = (int *) malloc(sizeof(int) * tc->rslen); + tc->c = (int *) malloc(sizeof(int) * tc->rslen); + + for (x = 0; x < tc->rslen; x++) + { + _q = normalize(q[x]); + _i = normalize(i[x]); + _d = normalize(d[x]); + _c = normalize(c[x]); + tc->q[x] = (_q < 6) ? 6 : _q; + tc->i[x] = _i; + tc->d[x] = _d; + tc->c[x] = _c; + } + + free(q); + free(i); + free(d); + free(c); + free(line); + + return 0; +} + +template +struct Context{}; + +template<> +struct Context +{ + Context() + { + for (int x = 0; x < 128; x++) + ph2pr[x] = pow(10.0, -((double)x) / 10.0); + + INITIAL_CONSTANT = ldexp(1.0, 1020.0); + LOG10_INITIAL_CONSTANT = log10(INITIAL_CONSTANT); + RESULT_THRESHOLD = 0.0; + } + + double LOG10(double v){ return log10(v); } + + static double _(double n){ return n; } + static double _(float n){ return ((double) n); } + double ph2pr[128]; + double INITIAL_CONSTANT; + double LOG10_INITIAL_CONSTANT; + double RESULT_THRESHOLD; +}; + +template<> +struct Context +{ + Context() + { + for (int x = 0; x < 128; x++) + ph2pr[x] = powf(10.f, -((float)x) / 10.f); + + INITIAL_CONSTANT = ldexpf(1.f, 120.f); + LOG10_INITIAL_CONSTANT = log10f(INITIAL_CONSTANT); + RESULT_THRESHOLD = ldexpf(1.f, -110.f); + } + + float LOG10(float v){ return log10f(v); } + + static float _(double n){ return ((float) n); } + static float _(float n){ return n; } + float ph2pr[128]; + float INITIAL_CONSTANT; + float LOG10_INITIAL_CONSTANT; + float RESULT_THRESHOLD; +}; + +template +NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL) +{ + int r, c; + int ROWS = tc->rslen + 1; + int COLS = tc->haplen + 1; + + Context ctx; + + NUMBER M[ROWS][COLS]; + NUMBER X[ROWS][COLS]; + NUMBER Y[ROWS][COLS]; + NUMBER p[ROWS][6]; + + p[0][MM] = ctx._(0.0); + p[0][GapM] = ctx._(0.0); + p[0][MX] = ctx._(0.0); + p[0][XX] = ctx._(0.0); + p[0][MY] = ctx._(0.0); + p[0][YY] = ctx._(0.0); + for (r = 1; r < ROWS; r++) + { + int _i = tc->i[r-1] & 127; + int _d = tc->d[r-1] & 127; + int _c = tc->c[r-1] & 127; + p[r][MM] = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127]; + p[r][GapM] = ctx._(1.0) - ctx.ph2pr[_c]; + p[r][MX] = ctx.ph2pr[_i]; + p[r][XX] = ctx.ph2pr[_c]; + p[r][MY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d]; + p[r][YY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c]; + } + + for (c = 0; c < COLS; c++) + { + M[0][c] = ctx._(0.0); + X[0][c] = ctx._(0.0); + Y[0][c] = ctx.INITIAL_CONSTANT / (tc->haplen); + } + + for (r = 1; r < ROWS; r++) + { + M[r][0] = ctx._(0.0); + X[r][0] = X[r-1][0] * p[r][XX]; + Y[r][0] = ctx._(0.0); + } + + NUMBER result = ctx._(0.0); + + for (r = 1; r < ROWS; r++) + for (c = 1; c < COLS; c++) + { + char _rs = tc->rs[r-1]; + char _hap = tc->hap[c-1]; + int _q = tc->q[r-1] & 127; + NUMBER distm = ctx.ph2pr[_q]; + if (_rs == _hap || _rs == 'N' || _hap == 'N') + distm = ctx._(1.0) - distm; + M[r][c] = distm * (M[r-1][c-1] * p[r][MM] + X[r-1][c-1] * p[r][GapM] + Y[r-1][c-1] * p[r][GapM]); + X[r][c] = M[r-1][c] * p[r][MX] + X[r-1][c] * p[r][XX]; + Y[r][c] = M[r][c-1] * p[r][MY] + Y[r][c-1] * p[r][YY]; + } + + for (c = 0; c < COLS; c++) + { + result += M[ROWS-1][c] + X[ROWS-1][c]; + } + + if (before_last_log != NULL) + *before_last_log = result; + + return result; //ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT; +} + +#define BATCH_SIZE 10000 +#define RUN_HYBRID + +int main() +{ + testcase tc[BATCH_SIZE]; + float result[BATCH_SIZE], result_avxf; + double result_avxd; + struct timeval start, end; + long long aggregateTimeRead = 0L; + long long aggregateTimeCompute = 0L; + long long aggregateTimeWrite = 0L; + + bool noMoreData = false; + int count =0; + while (!noMoreData) + { + int read_count = BATCH_SIZE; + gettimeofday(&start, NULL); + for (int b=0;b(&tc[b]); + + #ifdef RUN_HYBRID + #define MIN_ACCEPTED 1e-28f + if (result_avxf < MIN_ACCEPTED) { + count++; + result_avxd = compute_full_prob(&tc[b]); + result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f)); + } + else + result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); + #endif + + #ifndef RUN_HYBRID + result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); + #endif + + } + gettimeofday(&end, NULL); + aggregateTimeCompute += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)); + + gettimeofday(&start, NULL); + for (int b=0;b +#include +#include + +/* +template +string getBinaryStr (T val, int numBitsToWrite) { + + ostringstream oss ; + uint64_t mask = ((T) 0x1) << (numBitsToWrite-1) ; + for (int i=numBitsToWrite-1; i >= 0; --i) { + oss << ((val & mask) >> i) ; + mask >>= 1 ; + } + return oss.str() ; +} +*/ + +void GEN_INTRINSIC(precompute_masks_, PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) { + + const int maskBitCnt = MAIN_TYPE_SIZE ; + + for (int vi=0; vi < numMaskVecs; ++vi) { + for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) { + maskArr[vi][rs] = 0 ; + } + maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ; + } + + for (int col=1; col < COLS; ++col) { + int mIndex = (col-1) / maskBitCnt ; + int mOffset = (col-1) % maskBitCnt ; + MASK_TYPE bitMask = ((MASK_TYPE)0x1) << (maskBitCnt-1-mOffset) ; + + char hapChar = tc.hap[col-1] ; + + if (hapChar == AMBIG_CHAR) { + for (int ci=0; ci < NUM_DISTINCT_CHARS; ++ci) + maskArr[mIndex][ci] |= bitMask ; + } + + maskArr[mIndex][ConvertChar::get(hapChar)] |= bitMask ; + // bit corresponding to col 1 will be the MSB of the mask 0 + // bit corresponding to col 2 will be the MSB-1 of the mask 0 + // ... + // bit corresponding to col 32 will be the LSB of the mask 0 + // bit corresponding to col 33 will be the MSB of the mask 1 + // ... + } + +} + +void GEN_INTRINSIC(init_masks_for_row_, PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess) { + + for (int ri=0; ri < numRowsToProcess; ++ri) { + rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ; + } + + for (int ei=0; ei < AVX_LENGTH; ++ei) { + lastMaskShiftOut[ei] = 0 ; + } +} + +#define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \ + MASK_TYPE __bitMask = (((MASK_TYPE)0x1) << __shiftBy) - 1 ; \ + MASK_TYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \ + __dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ; \ + __lastShiftOut = __nextShiftOut ; \ +} + + +void GEN_INTRINSIC(update_masks_for_cols_, PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt) { + + for (int ei=0; ei < AVX_LENGTH/2; ++ei) { + SET_MASK_WORD(currMaskVecLow.masks[ei], maskArr[maskIndex][rsArr[ei]], + lastMaskShiftOut[ei], ei, maskBitCnt) ; + + int ei2 = ei + AVX_LENGTH/2 ; // the second entry index + SET_MASK_WORD(currMaskVecHigh.masks[ei], maskArr[maskIndex][rsArr[ei2]], + lastMaskShiftOut[ei2], ei2, maskBitCnt) ; + } + +} + +//void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen, const _256_TYPE& distmSel, int firstRowIndex, int lastRowIndex) { + +inline void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen) { + //#define computeDistVec() { + + _256_TYPE maskV ; + VEC_SSE_TO_AVX(currMaskVecLow.vecf, currMaskVecHigh.vecf, maskV) ; + + distmChosen = VEC_BLENDV(distm, _1_distm, maskV) ; + + /*COMPARE_VECS(distmChosen, distmSel, firstRowIndex, lastRowIndex) ;*/ + + VEC_SHIFT_LEFT_1BIT(currMaskVecLow.vec) ; + VEC_SHIFT_LEFT_1BIT(currMaskVecHigh.vec) ; +} + + +/* +template +struct HmmData { + int ROWS ; + int COLS ; + + NUMBER shiftOutM[MROWS+MCOLS+AVX_LENGTH], shiftOutX[MROWS+MCOLS+AVX_LENGTH], shiftOutY[MROWS+MCOLS+AVX_LENGTH] ; + Context ctx ; + testcase* tc ; + _256_TYPE p_MM[MAVX_COUNT], p_GAPM[MAVX_COUNT], p_MX[MAVX_COUNT], p_XX[MAVX_COUNT], p_MY[MAVX_COUNT], p_YY[MAVX_COUNT], distm1D[MAVX_COUNT] ; + _256_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY ; + + UNION_TYPE M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y ; + UNION_TYPE rs , rsN ; + _256_TYPE distmSel; + _256_TYPE distm, _1_distm; + +} ; +*/ + +template void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context ctx, testcase *tc, _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D) +{ + NUMBER zero = ctx._(0.0); + NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); + for (int s=0;si[r-1] & 127; + int _d = tc->d[r-1] & 127; + int _c = tc->c[r-1] & 127; + + *(ptr_p_MM+r-1) = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127]; + *(ptr_p_GAPM+r-1) = ctx._(1.0) - ctx.ph2pr[_c]; + *(ptr_p_MX+r-1) = ctx.ph2pr[_i]; + *(ptr_p_XX+r-1) = ctx.ph2pr[_c]; + *(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d]; + *(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c]; + } + + NUMBER *ptr_distm1D = (NUMBER *)distm1D; + for (int r = 1; r < ROWS; r++) + { + int _q = tc->q[r-1] & 127; + ptr_distm1D[r-1] = ctx.ph2pr[_q]; + } +} + + +template inline void GEN_INTRINSIC(stripINITIALIZATION, PRECISION)( + int stripIdx, Context ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY, + _256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm, _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM , + _256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, + UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM) +{ + int i = stripIdx; + pGAPM = p_GAPM[i]; + pMM = p_MM[i]; + pMX = p_MX[i]; + pXX = p_XX[i]; + pMY = p_MY[i]; + pYY = p_YY[i]; + + NUMBER zero = ctx._(0.0); + NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); + UNION_TYPE packed1; packed1.d = VEC_SET1_VAL(1.0); + + /* compare rs and N */ + //rs = VEC_LDPOPCVT_CHAR((tc->irs+i*AVX_LENGTH)); + //rsN.d = VEC_CMP_EQ(N_packed256, rs); + + distm = distm1D[i]; + _1_distm = VEC_SUB(packed1.d, distm); + + /* initialize M_t_2, M_t_1, X_t_2, X_t_1, Y_t_2, Y_t_1 */ + M_t_2.d = VEC_SET1_VAL(zero); + X_t_2.d = VEC_SET1_VAL(zero); + + if (i==0) { + M_t_1.d = VEC_SET1_VAL(zero); + X_t_1.d = VEC_SET1_VAL(zero); + Y_t_2.d = VEC_SET_LSE(init_Y); + Y_t_1.d = VEC_SET1_VAL(zero); + } + else { + X_t_1.d = VEC_SET_LSE(shiftOutX[AVX_LENGTH]); + M_t_1.d = VEC_SET_LSE(shiftOutM[AVX_LENGTH]); + Y_t_2.d = VEC_SET1_VAL(zero); + Y_t_1.d = VEC_SET1_VAL(zero); + } + M_t_1_y = M_t_1; +} + + + +inline _256_TYPE GEN_INTRINSIC(computeDISTM, PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256, + _256_TYPE distm, _256_TYPE _1_distm) +{ + UNION_TYPE hapN, rshap; + _256_TYPE cond; + IF_32 shiftInHap; + + int *hap_ptr = tc->ihap; + + shiftInHap.i = (d NUMBER GEN_INTRINSIC(compute_full_prob_avx, PRECISION) (testcase *tc, NUMBER *before_last_log = NULL) +{ + _256_TYPE p_MM [MAVX_COUNT], p_GAPM [MAVX_COUNT], p_MX [MAVX_COUNT]; + _256_TYPE p_XX [MAVX_COUNT], p_MY [MAVX_COUNT], p_YY [MAVX_COUNT]; + _256_TYPE distm1D[MAVX_COUNT]; + NUMBER shiftOutM[MROWS+MCOLS+AVX_LENGTH], shiftOutX[MROWS+MCOLS+AVX_LENGTH], shiftOutY[MROWS+MCOLS+AVX_LENGTH]; + UNION_TYPE M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y; + _256_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY; + + struct timeval start, end; + NUMBER result_avx2; + Context ctx; + UNION_TYPE rs , rsN; + HAP_TYPE hap; + _256_TYPE distmSel, distmChosen ; + _256_TYPE distm, _1_distm; + + int r, c; + int ROWS = tc->rslen + 1; + int COLS = tc->haplen + 1; + int AVX_COUNT = (ROWS+7)/8; + NUMBER zero = ctx._(0.0); + UNION_TYPE packed1; packed1.d = VEC_SET1_VAL(1.0); + _256_TYPE N_packed256 = VEC_POPCVT_CHAR('N'); + NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); + int remainingRows = (ROWS-1) % AVX_LENGTH; + int strip_cnt = ((ROWS-1) / AVX_LENGTH) + (remainingRows!=0); + + const int maskBitCnt = MAIN_TYPE_SIZE ; + const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function + + MASK_TYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ; + GEN_INTRINSIC(precompute_masks_, PRECISION)(*tc, COLS, numMaskVecs, maskArr) ; + + char rsArr[AVX_LENGTH] ; + MASK_TYPE lastMaskShiftOut[AVX_LENGTH] ; + + GEN_INTRINSIC(initializeVectors, PRECISION)(ROWS, COLS, shiftOutM, shiftOutX, shiftOutY, + ctx, tc, p_MM, p_GAPM, p_MX, p_XX, p_MY, p_YY, distm1D); + + //for (int __ii=0; __ii < 10; ++__ii) + for (int i=0;i +#include +#include +#include + +#include "template.h" + +#include "define-float.h" +#include "shift_template.c" +#include "pairhmm-template-kernel.cc" + +#include "define-double.h" +#include "shift_template.c" +#include "pairhmm-template-kernel.cc" + +#define BATCH_SIZE 10000 +//#define RUN_HYBRID + +uint8_t ConvertChar::conversionTable[255] ; +int thread_level_parallelism_enabled = false ; + +double getCurrClk() { + struct timeval tv ; + gettimeofday(&tv, NULL); + return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +} + +int main() +{ + testcase tc[BATCH_SIZE]; + float result[BATCH_SIZE], result_avxf; + double result_avxd; + //struct timeval start, end; + double lastClk = 0.0 ; + double aggregateTimeRead = 0.0; + double aggregateTimeCompute = 0.0; + double aggregateTimeWrite = 0.0; + + // Need to call it once to initialize the static array + ConvertChar::init() ; + + + char* ompEnvVar = getenv("OMP_NUM_THREADS") ; + if (ompEnvVar != NULL && ompEnvVar != "" && ompEnvVar != "1" ) { + thread_level_parallelism_enabled = true ; + } + + bool noMoreData = false; + int count =0; + while (!noMoreData) + { + int read_count = BATCH_SIZE; + + lastClk = getCurrClk() ; + for (int b=0;b(&tc[b]); + + #ifdef RUN_HYBRID + #define MIN_ACCEPTED 1e-28f + if (result_avxf < MIN_ACCEPTED) { + count++; + result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)(&tc[b]); + result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f)); + } + else + result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); + #endif + + #ifndef RUN_HYBRID + result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); + #endif + + } + //gettimeofday(&end, NULL); + aggregateTimeCompute += (getCurrClk() - lastClk) ; + //((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)); + + //gettimeofday(&start, NULL); + lastClk = getCurrClk() ; + for (int b=0;b +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#define MROWS 500 +#define MCOLS 1000 + +#define CAT(X,Y) X####Y +#define GEN_INTRINSIC(X,Y) CAT(X,Y) + +#define ALIGNED __attribute__((aligned(32))) + +typedef union __attribute__((aligned(32))) { + ALIGNED __m256 ALIGNED d; + ALIGNED __m128i ALIGNED s[2]; + ALIGNED float ALIGNED f[8]; + ALIGNED __m256i ALIGNED i; +} ALIGNED mix_F ALIGNED; + +typedef union ALIGNED { + __m128i vec ; + __m128 vecf ; + uint32_t masks[4] ; +} MaskVec_F ; + +typedef union ALIGNED +{ + ALIGNED __m128i ALIGNED i; + ALIGNED __m128 ALIGNED f; +} ALIGNED IF_128f ALIGNED; + +typedef union ALIGNED +{ + ALIGNED int ALIGNED i; + ALIGNED float ALIGNED f; +} ALIGNED IF_32 ALIGNED; + +typedef union __attribute__((aligned(32))) { + ALIGNED __m256d ALIGNED d; + ALIGNED __m128i ALIGNED s[2]; + ALIGNED double ALIGNED f[4]; + ALIGNED __m256i ALIGNED i; +} ALIGNED mix_D ALIGNED; + +typedef union ALIGNED { + __m128i vec ; + __m128d vecf ; + uint64_t masks[2] ; +} MaskVec_D ; + +typedef union ALIGNED +{ + ALIGNED __m128i ALIGNED i; + ALIGNED __m128d ALIGNED f; +} ALIGNED IF_128d ALIGNED; + +typedef union ALIGNED +{ + ALIGNED int64_t ALIGNED i; + ALIGNED double ALIGNED f; +} ALIGNED IF_64 ALIGNED; + +template +struct Context{}; + +template<> +struct Context +{ + Context() + { + for (int x = 0; x < 128; x++) + ph2pr[x] = pow(10.0, -((double)x) / 10.0); + + INITIAL_CONSTANT = ldexp(1.0, 1020.0); + LOG10_INITIAL_CONSTANT = log10(INITIAL_CONSTANT); + RESULT_THRESHOLD = 0.0; + } + + double LOG10(double v){ return log10(v); } + + static double _(double n){ return n; } + static double _(float n){ return ((double) n); } + double ph2pr[128]; + double INITIAL_CONSTANT; + double LOG10_INITIAL_CONSTANT; + double RESULT_THRESHOLD; +}; + +template<> +struct Context +{ + Context() + { + for (int x = 0; x < 128; x++) + { + ph2pr[x] = powf(10.f, -((float)x) / 10.f); + } + + INITIAL_CONSTANT = ldexpf(1.f, 120.f); + LOG10_INITIAL_CONSTANT = log10f(INITIAL_CONSTANT); + RESULT_THRESHOLD = ldexpf(1.f, -110.f); + } + + float LOG10(float v){ return log10f(v); } + + static float _(double n){ return ((float) n); } + static float _(float n){ return n; } + float ph2pr[128]; + float INITIAL_CONSTANT; + float LOG10_INITIAL_CONSTANT; + float RESULT_THRESHOLD; +}; + + +typedef struct +{ + int rslen, haplen, *q, *i, *d, *c; + char *hap, *rs; + int *ihap; + int *irs; +} testcase; + +int normalize(char c); + + +int read_testcase(testcase *tc) +{ + char *q, *i, *d, *c, *line = NULL; + int _q, _i, _d, _c; + int x, size = 0; + ssize_t read; + + read = getline(&line, (size_t *) &size, stdin); + if (read == -1) + return -1; + + tc->hap = (char *) malloc(size); + tc->rs = (char *) malloc(size); + q = (char *) malloc(size); + i = (char *) malloc(size); + d = (char *) malloc(size); + c = (char *) malloc(size); + tc->ihap = (int *) malloc(size*sizeof(int)); + tc->irs = (int *) malloc(size*sizeof(int)); + + + if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6) + return -1; + + tc->haplen = strlen(tc->hap); + tc->rslen = strlen(tc->rs); + tc->q = (int *) malloc(sizeof(int) * tc->rslen); + tc->i = (int *) malloc(sizeof(int) * tc->rslen); + tc->d = (int *) malloc(sizeof(int) * tc->rslen); + tc->c = (int *) malloc(sizeof(int) * tc->rslen); + + tc->ihap = (int *) malloc(tc->haplen*sizeof(int)); + tc->irs = (int *) malloc(tc->rslen*sizeof(int)); + + for (x = 0; x < tc->rslen; x++) + { + _q = normalize(q[x]); + _i = normalize(i[x]); + _d = normalize(d[x]); + _c = normalize(c[x]); + tc->q[x] = (_q < 6) ? 6 : _q; + tc->i[x] = _i; + tc->d[x] = _d; + tc->c[x] = _c; + tc->irs[x] = tc->rs[x]; + } + + for (x = 0; x < tc->haplen; x++) + tc->ihap[x] = tc->hap[x]; + + free(q); + free(i); + free(d); + free(c); + free(line); + + return 0; +} + +#define NUM_DISTINCT_CHARS 5 +#define AMBIG_CHAR 4 + +class ConvertChar { + + static uint8_t conversionTable[255] ; + +public: + + static void init() { + assert (NUM_DISTINCT_CHARS == 5) ; + assert (AMBIG_CHAR == 4) ; + + conversionTable['A'] = 0 ; + conversionTable['C'] = 1 ; + conversionTable['T'] = 2 ; + conversionTable['G'] = 3 ; + conversionTable['N'] = 4 ; + } + + static inline uint8_t get(uint8_t input) { + return conversionTable[input] ; + } + +} ; + + +#endif + +