From 5fab96b7ee5181e0164ccef6a135952f58726dad Mon Sep 17 00:00:00 2001 From: Karthik Gururaj Date: Tue, 14 Jan 2014 17:26:55 -0800 Subject: [PATCH 1/6] First import of AVX-JNI to git --- .gitignore | 6 + CHEATSHEET | 2 + Makefile | 43 ++ README | 41 ++ define-double.h | 154 ++++++ define-float.h | 155 ++++++ hmm_mask.cc | 484 ++++++++++++++++++ ...e_sting_utils_pairhmm_JNILoglessPairHMM.cc | 146 ++++++ ...te_sting_utils_pairhmm_JNILoglessPairHMM.h | 61 +++ pairhmm-1-base.cc | 276 ++++++++++ pairhmm-template-kernel.cc | 399 +++++++++++++++ pairhmm-template-main.cc | 114 +++++ shift_template.c | 68 +++ template.h | 226 ++++++++ 14 files changed, 2175 insertions(+) create mode 100644 .gitignore create mode 100644 CHEATSHEET create mode 100644 Makefile create mode 100644 README create mode 100644 define-double.h create mode 100644 define-float.h create mode 100644 hmm_mask.cc create mode 100644 org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc create mode 100644 org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h create mode 100644 pairhmm-1-base.cc create mode 100644 pairhmm-template-kernel.cc create mode 100644 pairhmm-template-main.cc create mode 100644 shift_template.c create mode 100644 template.h diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..16e86f25d --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.svn +*.o +*.so +tests +.deps +hmm_Mohammad diff --git a/CHEATSHEET b/CHEATSHEET new file mode 100644 index 000000000..f61bfdd7f --- /dev/null +++ b/CHEATSHEET @@ -0,0 +1,2 @@ +/opt/intel/bin/icc pairhmm-template-main.cc -o pairhmm-template-main.o -O3 -W -Wall -lm -xAVX; ./pairhmm-template-main.o < mediumTest.in > med.out; head -n -5 med.out > medium.out; lua checkLikelihoods.lua medium.out ~/hmm/mediumTest.out 6e-5 + diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..df5dbc463 --- /dev/null +++ b/Makefile @@ -0,0 +1,43 @@ +#OMPCFLAGS=-fopenmp +#OMPLDFLAGS=-lgomp + +#CFLAGS=-O2 -std=c++11 -W -Wall -march=corei7-avx -Wa,-q -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas +#CFLAGS=-O2 -W -Wall -march=corei7 -mfpmath=sse -msse4.2 -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas + +JAVA_ROOT=/opt/jdk1.7.0_25/ +JNI_COMPILATION_FLAGS=-D_REENTRANT -fPIC -I${JAVA_ROOT}/include -I${JAVA_ROOT}/include/linux + +CFLAGS=-g -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas -xAVX + +CXXFLAGS=$(CFLAGS) +CC=icc +CXX=icc + +LDFLAGS=-lm $(OMPLDFLAGS) + +#BIN:=pairhmm-1-base #pairhmm-2-omp pairhmm-3-hybrid-float-double pairhmm-4-hybrid-diagonal pairhmm-5-hybrid-diagonal-homogeneus pairhmm-6-onlythreediags pairhmm-7-presse pairhmm-8-sse #pairhmm-dev +BIN:=libJNILoglessPairHMM.so #pairhmm-2-omp pairhmm-3-hybrid-float-double pairhmm-4-hybrid-diagonal pairhmm-5-hybrid-diagonal-homogeneus pairhmm-6-onlythreediags pairhmm-7-presse pairhmm-8-sse #pairhmm-dev + +#SOURCES=pairhmm-1-base.cc input.cc +SOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc hmm_mask.cc +OBJECTS=$(SOURCES:.cc=.o) +DEPDIR=.deps +DF=$(DEPDIR)/$(*).d + +all: $(BIN) + +-include $(addprefix $(DEPDIR)/,$(SOURCES:.cc=.d)) + +pairhmm-1-base: pairhmm-1-base.o input.o + $(CXX) -o $@ $^ $(LDFLAGS) + +libJNILoglessPairHMM.so: $(OBJECTS) + $(CXX) -shared -o libJNILoglessPairHMM.so $(OBJECTS) + +%.o: %.cc + @mkdir -p $(DEPDIR) + $(COMPILE.cpp) -MMD -MF $(DF) $(JNI_COMPILATION_FLAGS) $(CXXFLAGS) $(OUTPUT_OPTION) $< + + +clean: + rm -f $(BIN) *.o diff --git a/README b/README new file mode 100644 index 000000000..e16df5a8d --- /dev/null +++ b/README @@ -0,0 +1,41 @@ +How to compile: +/opt/intel/bin/icc pairhmm-template-main.cc -o pairhmm-template-main.o -O3 -W -Wall -lm -xAVX + +------------------------------------ +How to run: +./pairhmm-template-main.o < ./test_data/largeTest.in > large.out +./pairhmm-template-main.o < ./test_data/mediumTest.in > medium.out +------------------------------------- +Results: +------------------------------------- +MEDIUM FLOAT: +AVX Read Time: 8019020 +AVX Compute Time: 29270238 +AVX Write Time: 242170 +AVX Total Time: 37531428 +# Double called: 0 +------------------------------------- +MEDIUM HYBRID: +AVX Read Time: 7596629 +AVX Compute Time: 29979220 +AVX Write Time: 248485 +AVX Total Time: 37824334 +# Double called: 20329 +------------------------------------- +LARGE FLOAT: +AVX Read Time: 111888586 +AVX Compute Time: 408131806 +AVX Write Time: 3477892 +AVX Total Time: 523498284 +# Double called: 0 +------------------------------------- +LARGE HYBRID: +AVX Read Time: 109728994 +AVX Compute Time: 419475317 +AVX Write Time: 3575364 +AVX Total Time: 532779675 +# Double called: 310042 + + + + diff --git a/define-double.h b/define-double.h new file mode 100644 index 000000000..b2de7520a --- /dev/null +++ b/define-double.h @@ -0,0 +1,154 @@ +#include + +#ifdef PRECISION + #undef PRECISION + #undef MAIN_TYPE + #undef MAIN_TYPE_SIZE + #undef UNION_TYPE + #undef IF_128 + #undef IF_MAIN_TYPE + #undef SHIFT_CONST1 + #undef SHIFT_CONST2 + #undef SHIFT_CONST3 + #undef _128_TYPE + #undef _256_TYPE + #undef AVX_LENGTH + #undef MAVX_COUNT + #undef HAP_TYPE + #undef MASK_TYPE + #undef MASK_ALL_ONES + + #undef SET_VEC_ZERO + #undef VEC_OR + #undef VEC_ADD + #undef VEC_SUB + #undef VEC_MUL + #undef VEC_BLEND + #undef VEC_BLENDV + #undef VEC_CAST_256_128 + #undef VEC_EXTRACT_128 + #undef VEC_EXTRACT_UNIT + #undef VEC_SET1_VAL128 + #undef VEC_MOVE + #undef VEC_CAST_128_256 + #undef VEC_INSERT_VAL + #undef VEC_CVT_128_256 + #undef VEC_SET1_VAL + #undef VEC_POPCVT_CHAR + #undef VEC_LDPOPCVT_CHAR + #undef VEC_CMP_EQ + #undef VEC_SET_LSE + #undef SHIFT_HAP + #undef print256b + #undef MASK_VEC + #undef VEC_SSE_TO_AVX + #undef VEC_SHIFT_LEFT_1BIT + #undef MASK_ALL_ONES + #undef COMPARE_VECS + #undef _256_INT_TYPE +#endif + +#define PRECISION d +#define MAIN_TYPE double +#define MAIN_TYPE_SIZE 64 +#define UNION_TYPE mix_D +#define IF_128 IF_128d +#define IF_MAIN_TYPE IF_64 +#define SHIFT_CONST1 8 +#define SHIFT_CONST2 1 +#define SHIFT_CONST3 8 +#define _128_TYPE __m128d +#define _256_TYPE __m256d +#define _256_INT_TYPE __m256i +#define AVX_LENGTH 4 +#define MAVX_COUNT (MROWS+7)/AVX_LENGTH +#define HAP_TYPE __m128i +#define MASK_TYPE uint64_t +#define MASK_ALL_ONES 0xFFFFFFFFFFFFFFFF +#define MASK_VEC MaskVec_D + +#define SET_VEC_ZERO(__vec) \ + __vec= _mm256_setzero_pd() + +#define VEC_OR(__v1, __v2) \ + _mm256_or_pd(__v1, __v2) + +#define VEC_ADD(__v1, __v2) \ + _mm256_add_pd(__v1, __v2) + +#define VEC_SUB(__v1, __v2) \ + _mm256_sub_pd(__v1, __v2) + +#define VEC_MUL(__v1, __v2) \ + _mm256_mul_pd(__v1, __v2) + +#define VEC_BLEND(__v1, __v2, __mask) \ + _mm256_blend_pd(__v1, __v2, __mask) + +#define VEC_BLENDV(__v1, __v2, __maskV) \ + _mm256_blendv_pd(__v1, __v2, __maskV) + +#define VEC_CAST_256_128(__v1) \ + _mm256_castpd256_pd128 (__v1) + +#define VEC_EXTRACT_128(__v1, __im) \ + _mm256_extractf128_pd (__v1, __im) + +#define VEC_EXTRACT_UNIT(__v1, __im) \ + _mm_extract_epi64(__v1, __im) + +#define VEC_SET1_VAL128(__val) \ + _mm_set1_pd(__val) + +#define VEC_MOVE(__v1, __val) \ + _mm_move_sd(__v1, __val) + +#define VEC_CAST_128_256(__v1) \ + _mm256_castpd128_pd256(__v1) + +#define VEC_INSERT_VAL(__v1, __val, __pos) \ + _mm256_insertf128_pd(__v1, __val, __pos) + +#define VEC_CVT_128_256(__v1) \ + _mm256_cvtepi32_pd(__v1) + +#define VEC_SET1_VAL(__val) \ + _mm256_set1_pd(__val) + +#define VEC_POPCVT_CHAR(__ch) \ + _mm256_cvtepi32_pd(_mm_set1_epi32(__ch)) + +#define VEC_LDPOPCVT_CHAR(__addr) \ + _mm256_cvtepi32_pd(_mm_load_si128((__m128i const *)__addr)) + +#define VEC_CMP_EQ(__v1, __v2) \ + _mm256_cmp_pd(__v1, __v2, _CMP_EQ_OQ) + +#define VEC_SET_LSE(__val) \ + _mm256_set_pd(zero, zero, zero, __val); + +#define SHIFT_HAP(__v1, __val) \ + __v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0) + +#define print256b(__v1) \ + print256bDP(__v1) + +#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ + __vdst = _mm256_castpd128_pd256(__vsLow) ; \ + __vdst = _mm256_insertf128_pd(__vdst, __vsHigh, 1) ; + +#define VEC_SHIFT_LEFT_1BIT(__vs) \ + __vs = _mm_slli_epi64(__vs, 1) + + +#define COMPARE_VECS(__v1, __v2, __first, __last) { \ + double* ptr1 = (double*) (&__v1) ; \ + double* ptr2 = (double*) (&__v2) ; \ + for (int ei=__first; ei <= __last; ++ei) { \ + if (ptr1[ei] != ptr2[ei]) { \ + std::cout << "Double Mismatch at " << ei << ": " \ + << ptr1[ei] << " vs. " << ptr2[ei] << std::endl ; \ + exit(0) ; \ + } \ + } \ + } diff --git a/define-float.h b/define-float.h new file mode 100644 index 000000000..4cba9a179 --- /dev/null +++ b/define-float.h @@ -0,0 +1,155 @@ +#include + +#ifdef PRECISION + #undef PRECISION + #undef MAIN_TYPE + #undef MAIN_TYPE_SIZE + #undef UNION_TYPE + #undef IF_128 + #undef IF_MAIN_TYPE + #undef SHIFT_CONST1 + #undef SHIFT_CONST2 + #undef SHIFT_CONST3 + #undef _128_TYPE + #undef _256_TYPE + #undef AVX_LENGTH + #undef MAVX_COUNT + #undef HAP_TYPE + #undef MASK_TYPE + #undef MASK_ALL_ONES + + #undef SET_VEC_ZERO(__vec) + #undef VEC_OR(__v1, __v2) + #undef VEC_ADD(__v1, __v2) + #undef VEC_SUB(__v1, __v2) + #undef VEC_MUL(__v1, __v2) + #undef VEC_BLEND(__v1, __v2, __mask) + #undef VEC_BLENDV(__v1, __v2, __maskV) + #undef VEC_CAST_256_128(__v1) + #undef VEC_EXTRACT_128(__v1, __im) + #undef VEC_EXTRACT_UNIT(__v1, __im) + #undef VEC_SET1_VAL128(__val) + #undef VEC_MOVE(__v1, __val) + #undef VEC_CAST_128_256(__v1) + #undef VEC_INSERT_VAL(__v1, __val, __pos) + #undef VEC_CVT_128_256(__v1) + #undef VEC_SET1_VAL(__val) + #undef VEC_POPCVT_CHAR(__ch) + #undef VEC_LDPOPCVT_CHAR(__addr) + #undef VEC_CMP_EQ(__v1, __v2) + #undef VEC_SET_LSE(__val) + #undef SHIFT_HAP(__v1, __val) + #undef print256b(__v1) + #undef MASK_VEC + #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) + #undef VEC_SHIFT_LEFT_1BIT(__vs) + #undef MASK_ALL_ONES + #undef COMPARE_VECS(__v1, __v2) + #undef _256_INT_TYPE +#endif + +#define PRECISION s + +#define MAIN_TYPE float +#define MAIN_TYPE_SIZE 32 +#define UNION_TYPE mix_F +#define IF_128 IF_128f +#define IF_MAIN_TYPE IF_32 +#define SHIFT_CONST1 12 +#define SHIFT_CONST2 3 +#define SHIFT_CONST3 4 +#define _128_TYPE __m128 +#define _256_TYPE __m256 +#define _256_INT_TYPE __m256i +#define AVX_LENGTH 8 +#define MAVX_COUNT (MROWS+7)/AVX_LENGTH +#define HAP_TYPE UNION_TYPE +#define MASK_TYPE uint32_t +#define MASK_ALL_ONES 0xFFFFFFFF +#define MASK_VEC MaskVec_F + +#define SET_VEC_ZERO(__vec) \ + __vec= _mm256_setzero_ps() + +#define VEC_OR(__v1, __v2) \ + _mm256_or_ps(__v1, __v2) + +#define VEC_ADD(__v1, __v2) \ + _mm256_add_ps(__v1, __v2) + +#define VEC_SUB(__v1, __v2) \ + _mm256_sub_ps(__v1, __v2) + +#define VEC_MUL(__v1, __v2) \ + _mm256_mul_ps(__v1, __v2) + +#define VEC_BLEND(__v1, __v2, __mask) \ + _mm256_blend_ps(__v1, __v2, __mask) + +#define VEC_BLENDV(__v1, __v2, __maskV) \ + _mm256_blendv_ps(__v1, __v2, __maskV) + +#define VEC_CAST_256_128(__v1) \ + _mm256_castps256_ps128 (__v1) + +#define VEC_EXTRACT_128(__v1, __im) \ + _mm256_extractf128_ps (__v1, __im) + +#define VEC_EXTRACT_UNIT(__v1, __im) \ + _mm_extract_epi32(__v1, __im) + +#define VEC_SET1_VAL128(__val) \ + _mm_set1_ps(__val) + +#define VEC_MOVE(__v1, __val) \ + _mm_move_ss(__v1, __val) + +#define VEC_CAST_128_256(__v1) \ + _mm256_castps128_ps256(__v1) + +#define VEC_INSERT_VAL(__v1, __val, __pos) \ + _mm256_insertf128_ps(__v1, __val, __pos) + +#define VEC_CVT_128_256(__v1) \ + _mm256_cvtepi32_ps(__v1.i) + +#define VEC_SET1_VAL(__val) \ + _mm256_set1_ps(__val) + +#define VEC_POPCVT_CHAR(__ch) \ + _mm256_cvtepi32_ps(_mm256_set1_epi32(__ch)) + +#define VEC_LDPOPCVT_CHAR(__addr) \ + _mm256_cvtepi32_ps(_mm256_loadu_si256((__m256i const *)__addr)) + +#define VEC_CMP_EQ(__v1, __v2) \ + _mm256_cmp_ps(__v1, __v2, _CMP_EQ_OQ) + +#define VEC_SET_LSE(__val) \ + _mm256_set_ps(zero, zero, zero, zero, zero, zero, zero, __val); + +#define SHIFT_HAP(__v1, __val) \ + _vector_shift_lasts(__v1, __val.f); + +#define print256b(__v1) \ + print256bFP(__v1) + +#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ + __vdst = _mm256_castps128_ps256(__vsLow) ; \ + __vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ; + +#define VEC_SHIFT_LEFT_1BIT(__vs) \ + __vs = _mm_slli_epi32(__vs, 1) + +#define COMPARE_VECS(__v1, __v2, __first, __last) { \ + float* ptr1 = (float*) (&__v1) ; \ + float* ptr2 = (float*) (&__v2) ; \ + for (int ei=__first; ei <= __last; ++ei) { \ + if (ptr1[ei] != ptr2[ei]) { \ + std::cout << "Float Mismatch at " << ei << ": " \ + << ptr1[ei] << " vs. " << ptr2[ei] << std::endl ; \ + exit(0) ; \ + } \ + } \ + } + diff --git a/hmm_mask.cc b/hmm_mask.cc new file mode 100644 index 000000000..aace7acf2 --- /dev/null +++ b/hmm_mask.cc @@ -0,0 +1,484 @@ +#include +#include +#include +#include + + +// /usr/intel/pkgs/icc/13.0.0e/bin/icc -o ed -O3 ed.cpp -xAVX -openmp -openmp-link static + + +#include + +#include +#include +#include +#include +#include +#include + + +#include +#include +#include +#include +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std ; + +#define CHECK_MASK_CORRECTNESS + +template +string getBinaryStr (T val, int numBitsToWrite) { + + ostringstream oss ; + uint64_t mask = ((T) 0x1) << (numBitsToWrite-1) ; + for (int i=numBitsToWrite-1; i >= 0; --i) { + oss << ((val & mask) >> i) ; + mask >>= 1 ; + } + return oss.str() ; +} + + +typedef struct +{ + int rslen, haplen, *q, *i, *d, *c; + char *hap, *rs; +} testcase; + +int normalize(char c) +{ + return ((int) (c - 33)); +} + +class ConvertChar { + + static uint8_t conversionTable[255] ; + + +public: + + static void init() { + conversionTable['A'] = 0 ; + conversionTable['C'] = 1 ; + conversionTable['T'] = 2 ; + conversionTable['G'] = 3 ; + conversionTable['N'] = 4 ; + } + + static inline uint8_t get(uint8_t input) { + return conversionTable[input] ; + } + +} ; + +uint8_t ConvertChar::conversionTable[255] ; + + + +int read_testcase(testcase *tc, FILE* ifp) +{ + char *q, *i, *d, *c, *line = NULL; + int _q, _i, _d, _c; + int x, size = 0; + ssize_t read; + + read = getline(&line, (size_t *) &size, ifp); + if (read == -1) + return -1; + + + tc->hap = (char *) malloc(size); + tc->rs = (char *) malloc(size); + q = (char *) malloc(size); + i = (char *) malloc(size); + d = (char *) malloc(size); + c = (char *) malloc(size); + + if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6) + return -1; + + tc->haplen = strlen(tc->hap); + tc->rslen = strlen(tc->rs); + tc->q = (int *) malloc(sizeof(int) * tc->rslen); + tc->i = (int *) malloc(sizeof(int) * tc->rslen); + tc->d = (int *) malloc(sizeof(int) * tc->rslen); + tc->c = (int *) malloc(sizeof(int) * tc->rslen); + + // Convert hap and rs to 3 bits + + /* + for (int ci=0; ci < tc->haplen; ++ci) + tc->hap[ci] = ConvertChar::get(tc->hap[ci]) ; + + for (int ci=0; ci < tc->rslen; ++ci) + tc->rs[ci] = ConvertChar::get(tc->rs[ci]) ; + */ + + for (x = 0; x < tc->rslen; x++) + { + _q = normalize(q[x]); + _i = normalize(i[x]); + _d = normalize(d[x]); + _c = normalize(c[x]); + tc->q[x] = (_q < 6) ? 6 : _q; + tc->i[x] = _i; + tc->d[x] = _d; + tc->c[x] = _c; + } + + + free(q); + free(i); + free(d); + free(c); + free(line); + + return 0; +} + + + + +#define ALIGN __attribute__ ((aligned(32))) + + +typedef union ALIGN { + //__m256i vi; + __m128 vf ; + __m128i vi ; + __m128d vd; + + uint8_t b[16]; + //uint16_t hw[8] ; + uint32_t w[4] ; + uint64_t dw[2] ; + + float f[4] ; + //double d[2] ; + +} v128 ; + +typedef union ALIGN { + __m256i vi; + __m256 vf ; + __m256d vd; + //__m128i vi128[2] ; + + uint8_t b[32]; + //uint16_t hw[16] ; + uint32_t w[8] ; + uint64_t dw[4] ; + + float f[8] ; + double d[4] ; + +} v256 ; + + +#define NUM_DISTINCT_CHARS 5 +#define AMBIG_CHAR 4 + +#define VEC_ENTRY_CNT 8 +#define VEC_LEN 256 +#define VTYPE vf +#define VTYPEI vi +#define VENTRY f +#define VENTRYI w +#define MTYPE uint32_t +#define MASK_ALL_ONES 0xFFFFFFFF + + +#define VECTOR v256 +#define VECTOR_SSE v128 + +#define SET_VEC_ZERO(__vec) \ + __vec= _mm256_setzero_ps() + +#define SET_VEC_ONES(__vec) \ + __vec = _mm256_set1_epi32(0xFFFFFFFF) + +#define VEC_OR(__v1, __v2) \ + _mm256_or_ps(__v1, __v2) + +#define VEC_ADD(__v1, __v2) \ + _mm256_add_ps(__v1, __v2) + +#define VEC_MUL(__v1, __v2) \ + _mm256_mul_ps(__v1, __v2) + +#define VEC_BLENDV(__v1, __v2, __maskV) \ + _mm256_blendv_ps(__v1, __v2, __maskV) + +#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ + __vdst = _mm256_castps128_ps256(__vsLow) ; \ + __vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ; + +#define VECS_SHIFT_LEFT_1BIT(__vs) \ + __vs = _mm_slli_epi32(__vs, 1) + + +#define VECS_SHIFT_RIGHT_WHOLE(__vs, __cnt) { \ + uint64_t __mask = ; \ + uint64_t __shiftWord = ((((uint64_t) 0x1) << __cnt) - 1 ) & __vs.dw[1] ; \ + __shiftWord <<= (64-cnt) ; \ + __vs = _mm_slri_epi64(__vs, __cnt) ; \ + __vs.dw[0] |= __shiftWord ; \ + } + + +#define GET_MASK_WORD_OLD(__mask, __lastShiftOut, __shiftBy, __maskBitCnt){ \ + MTYPE __bitMask = (((MTYPE)0x1) << __shiftBy) - 1 ; \ + MTYPE __nextShiftOut = (__mask & __bitMask) << (__maskBitCnt - __shiftBy) ; \ + __mask >>= __shiftBy ; \ + __mask |= __lastShiftOut ; \ + __lastShiftOut = __nextShiftOut ; \ + } + + +#define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \ + MTYPE __bitMask = (((MTYPE)0x1) << __shiftBy) - 1 ; \ + MTYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \ + __dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ; \ + __lastShiftOut = __nextShiftOut ; \ +} + + +void precompute_masks_avx(const testcase& tc, int COLS, int numMaskVecs, + MTYPE (*maskArr)[NUM_DISTINCT_CHARS]) { + + const int maskBitCnt = VEC_LEN / VEC_ENTRY_CNT ; + + for (int vi=0; vi < numMaskVecs; ++vi) { + for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) { + maskArr[vi][rs] = 0 ; + } + maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ; + } + + for (int col=1; col < COLS; ++col) { + int mIndex = (col-1) / maskBitCnt ; + int mOffset = (col-1) % maskBitCnt ; + MTYPE bitMask = ((MTYPE)0x1) << (maskBitCnt-1-mOffset) ; + + char hapChar = tc.hap[col-1] ; + + if (hapChar == AMBIG_CHAR) { + maskArr[mIndex][0] |= bitMask ; + maskArr[mIndex][1] |= bitMask ; + maskArr[mIndex][2] |= bitMask ; + maskArr[mIndex][3] |= bitMask ; + } + + //cout << hapChar << " " << mIndex << " " << getBinaryStr(bitMask, 32) + // << endl ; + //cout << getBinaryStr(maskArr[0][hapChar],32) << endl ; + //exit(0) ; + + maskArr[mIndex][ConvertChar::get(hapChar)] |= bitMask ; + + + // bit corresponding to col 1 will be the MSB of the mask 0 + // bit corresponding to col 2 will be the MSB-1 of the mask 0 + // ... + // bit corresponding to col 32 will be the LSB of the mask 0 + // bit corresponding to col 33 will be the MSB of the mask 1 + // ... + } + +} + + +void test_mask_computations (testcase& tc, int tcID, bool printDebug=false) { + + int ROWS = tc.rslen + 1 ; + int COLS = tc.haplen + 1 ; + + // only for testing + VECTOR mismatchData, matchData ; + //SET_VEC_ZERO(mismatchData.VTYPE) ; + //SET_VEC_ONES(matchData.VTYPEI) ; + for (int ei=0; ei < VEC_ENTRY_CNT; ++ei) { + matchData.VENTRY[ei] = 1.0 ; + mismatchData.VENTRY[ei] = 0.0 ; + } + + const int maskBitCnt = VEC_LEN / VEC_ENTRY_CNT ; + const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function + + MTYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ; + precompute_masks_avx(tc, COLS, numMaskVecs, maskArr) ; + +#ifdef DEBUG + if (printDebug) { + cout << "The first 32 hap chars are: " ; + for (int i=0; i < 32; ++i) { + cout << tc.hap[i] ; + } + cout << endl ; + + cout << "Masks computed for A, C, T, G, N are: " << endl ; + cout << getBinaryStr(maskArr[0][0], 32) << endl ; + cout << getBinaryStr(maskArr[0][1], 32) << endl ; + cout << getBinaryStr(maskArr[0][2], 32) << endl ; + cout << getBinaryStr(maskArr[0][3], 32) << endl ; + cout << getBinaryStr(maskArr[0][4], 32) << endl ; + } +#endif // #ifdef DEBUG + + int beginRowIndex = 1 ; + while (beginRowIndex < ROWS) { + + int numRowsToProcess = min(VEC_ENTRY_CNT, ROWS - beginRowIndex) ; + + char rsArr[VEC_ENTRY_CNT] ; + for (int ri=0; ri < numRowsToProcess; ++ri) { + rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ; + } + + // Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors + VECTOR_SSE currMaskVecLow ; // corresponding to entries 0-3 + VECTOR_SSE currMaskVecHigh ; // corresponding to entries 4-7 + + MTYPE lastMaskShiftOut[VEC_ENTRY_CNT] ; + for (int ei=0; ei < VEC_ENTRY_CNT; ++ei) + lastMaskShiftOut[ei] = 0 ; + + int col = 1 ; + int diag = 1 ; + for (int maskIndex=0; maskIndex < numMaskVecs; ++maskIndex) { + // set up the mask vectors for the next maskBitCnt columns + + // For AVX, maskBitCnt = 32 (so, the operation below is amortized over 32 cols) + for (int ei=0; ei < VEC_ENTRY_CNT/2; ++ei) { + SET_MASK_WORD(currMaskVecLow.VENTRYI[ei], maskArr[maskIndex][rsArr[ei]], + lastMaskShiftOut[ei], ei, maskBitCnt) ; + + int ei2 = ei + VEC_ENTRY_CNT/2 ; // the second entry index + SET_MASK_WORD(currMaskVecHigh.VENTRYI[ei], maskArr[maskIndex][rsArr[ei2]], + lastMaskShiftOut[ei2], ei2, maskBitCnt) ; + } + +#ifdef DEBUG + if (printDebug && maskIndex == 0) { + cout << "The masks for entry 1: " << endl + << getBinaryStr(maskArr[0][rsArr[1]], 32) << endl + << getBinaryStr(currMaskVecLow.VENTRYI[1], 32) << endl ; + + } +#endif // #ifdef DEBUG + + // iterate over mask bit indices and columns + for (int mbi=0; mbi < maskBitCnt && diag < COLS + ROWS -2; ++mbi, ++diag) { + + VECTOR maskV ; + VEC_SSE_TO_AVX(currMaskVecLow.VTYPE, currMaskVecHigh.VTYPE, maskV.VTYPE) ; + + VECTOR testData ; + testData.VTYPE = VEC_BLENDV(mismatchData.VTYPE, matchData.VTYPE, + maskV.VTYPE) ; + + VECS_SHIFT_LEFT_1BIT(currMaskVecLow.VTYPEI) ; + VECS_SHIFT_LEFT_1BIT(currMaskVecHigh.VTYPEI) ; + +#ifdef DEBUG + if (printDebug && maskIndex == 0) { + cout << "The mask for entry 1, mbi=" << mbi << ": " + << getBinaryStr(maskV.VENTRYI[1], 32) << endl ; + + } +#endif // #ifdef DEBUG + +#ifdef CHECK_MASK_CORRECTNESS + + int firstRowIndex = (diag < COLS) ? 0 : (diag - COLS) ; + int lastRowIndex = min(col-1, numRowsToProcess-1) ; + + for (int ri=firstRowIndex; ri <= lastRowIndex; ++ri) { + int currRow = beginRowIndex + ri ; + int currCol = col - ri + firstRowIndex ; + + char hapChar = tc.hap[currCol-1] ; + char rsChar = tc.rs[currRow-1] ; + + bool match = (hapChar == rsChar || hapChar == 'N' || rsChar == 'N') ; + + if ((bool) testData.VENTRYI[ri] != match) { + cout << "Error: Incorrect mask for tc " << tcID << ", diag = " << diag + << " (" << currRow << ", " << currCol << ")" << endl ; + + cout << "The chars are: " << hapChar << " and " << rsChar << endl ; + cout << "The selected value is: " << testData.VENTRYI[ri] << endl ; + + exit(0) ; + } + } +#endif // #ifdef CHECK_MASK_CORRECTNESS + + if (diag < COLS) + ++col ; + + } // mbi + } // maskIndex + + beginRowIndex += VEC_ENTRY_CNT ; + } // end of stripe + + //cout << "Finished validating entry " << endl ; +} + + +int main () { + + #define BATCH_SIZE 10000 + + ConvertChar::init() ; + + testcase* tcBatch = new testcase[BATCH_SIZE] ; + + int numBatches = 0 ; + int numRead = 0 ; + + const int DEBUG_TC = -1 ; + + FILE* ifp = stdin ; + + do { + numRead = 0 ; + + while (numRead < BATCH_SIZE) { + if (read_testcase(tcBatch+numRead, ifp) < 0) + break ; + + ++numRead ; + } + + if (numRead == 0) + break ; + + for (int ti=0; ti < numRead; ++ti) { + + int tcID = numBatches * BATCH_SIZE + ti ; + test_mask_computations(tcBatch[ti], tcID, tcID == DEBUG_TC) ; + } + + ++numBatches ; + } while (numRead == BATCH_SIZE) ; + + fclose(ifp) ; + + delete[] tcBatch ; + + return 0 ; +} diff --git a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc new file mode 100644 index 000000000..80065b8c9 --- /dev/null +++ b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc @@ -0,0 +1,146 @@ +#include +#include +#include +#include "org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "template.h" + +#include "define-double.h" +#include "shift_template.c" +#include "pairhmm-template-kernel.cc" + +using namespace std; + + +#define MM 0 +#define GapM 1 +#define MX 2 +#define XX 3 +#define MY 4 +#define YY 5 + +//#define DEBUG3 1 +#define DEBUG 1 + +template +string to_string(T obj) +{ + stringstream ss; + string ret_string; + ss.clear(); + ss << std::scientific << obj; + ss >> ret_string; + ss.clear(); + return ret_string; +} + +void debug_dump(string filename, string s, bool to_append, bool add_newline=true) +{ + ofstream fptr; + fptr.open(filename.c_str(), to_append ? ofstream::app : ofstream::out); + fptr << s; + if(add_newline) + fptr << "\n"; + fptr.close(); +} + +#define INT_STORE_ARRAY_SIZE 2048 +int insertionGOPIntArray[INT_STORE_ARRAY_SIZE]; +int deletionGOPIntArray[INT_STORE_ARRAY_SIZE]; +int overallGCPIntArray[INT_STORE_ARRAY_SIZE]; +int readQualsIntArray[INT_STORE_ARRAY_SIZE]; + +JNIEXPORT jdouble JNICALL +Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadLikelihoodGivenHaplotypeLog10( + JNIEnv* env, jobject thisObject, + jint readLength, jint haplotypeLength, + jbyteArray readBases, jbyteArray haplotypeBases, jbyteArray readQuals, + jbyteArray insertionGOP, jbyteArray deletionGOP, jbyteArray overallGCP, + jint hapStartIndex + ) +{ + jboolean is_copy = JNI_FALSE; + jbyte* readBasesArray = (env)->GetByteArrayElements(readBases, &is_copy); + jbyte* haplotypeBasesArray = (env)->GetByteArrayElements(haplotypeBases, &is_copy); + jbyte* readQualsArray = (env)->GetByteArrayElements(readQuals, &is_copy); + jbyte* insertionGOPArray = (env)->GetByteArrayElements(insertionGOP, &is_copy); + jbyte* deletionGOPArray = (env)->GetByteArrayElements(deletionGOP, &is_copy); + jbyte* overallGCPArray = (env)->GetByteArrayElements(overallGCP, &is_copy); +#ifdef DEBUG + assert(readBasesArray && "readBasesArray not initialized in JNI"); + assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); + assert(readQualsArray && "readQualsArray not initialized in JNI"); + assert(insertionGOPArray && "insertionGOP array not initialized in JNI"); + assert(deletionGOPArray && "deletionGOP array not initialized in JNI"); + assert(overallGCPArray && "OverallGCP array not initialized in JNI"); + assert(readLength < INT_STORE_ARRAY_SIZE); +#endif + for(unsigned i=0;i(&tc); + double result = log10(result_avxd) - log10(ldexp(1.0, 1020)); +#ifdef DEBUG + debug_dump("return_values_jni.txt",to_string(result),true); +#endif + + + env->ReleaseByteArrayElements(overallGCP, overallGCPArray, JNI_ABORT); + env->ReleaseByteArrayElements(deletionGOP, deletionGOPArray, JNI_ABORT); + env->ReleaseByteArrayElements(insertionGOP, insertionGOPArray, JNI_ABORT); + env->ReleaseByteArrayElements(readQuals, readQualsArray, JNI_ABORT); + env->ReleaseByteArrayElements(haplotypeBases, haplotypeBasesArray, JNI_ABORT); + env->ReleaseByteArrayElements(readBases, readBasesArray, JNI_ABORT); + + return 0.0; +} + +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeProbabilities +(JNIEnv* env, jclass thisObject, + jobjectArray transition, jbyteArray insertionGOP, jbyteArray deletionGOP, jbyteArray overallGCP + ) +{} + + JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitialize +(JNIEnv* env, jobject thisObject, + jint readMaxLength, jint haplotypeMaxLength) +{} + +JNIEXPORT jdouble JNICALL +Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializePriorsAndUpdateCells( + JNIEnv* env, jobject thisObject, + jboolean doInitialization, jint paddedReadLength, jint paddedHaplotypeLength, + jbyteArray readBases, jbyteArray haplotypeBases, jbyteArray readQuals, + jint hapStartIndex + ) + +{ return 0.0; } diff --git a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h new file mode 100644 index 000000000..5408d2f57 --- /dev/null +++ b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h @@ -0,0 +1,61 @@ +/* DO NOT EDIT THIS FILE - it is machine generated */ +#include +/* Header for class org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM */ + +#ifndef _Included_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM +#define _Included_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM +#ifdef __cplusplus +extern "C" { +#endif +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_TRISTATE_CORRECTION +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_TRISTATE_CORRECTION 3.0 +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToMatch +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToMatch 0L +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_indelToMatch +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_indelToMatch 1L +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToInsertion +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToInsertion 2L +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_insertionToInsertion +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_insertionToInsertion 3L +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToDeletion +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToDeletion 4L +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_deletionToDeletion +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_deletionToDeletion 5L +/* + * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM + * Method: jniInitialize + * Signature: (II)V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitialize + (JNIEnv *, jobject, jint, jint); + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM + * Method: jniInitializeProbabilities + * Signature: ([[D[B[B[B)V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeProbabilities + (JNIEnv *, jclass, jobjectArray, jbyteArray, jbyteArray, jbyteArray); + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM + * Method: jniInitializePriorsAndUpdateCells + * Signature: (ZII[B[B[BI)D + */ +JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializePriorsAndUpdateCells + (JNIEnv *, jobject, jboolean, jint, jint, jbyteArray, jbyteArray, jbyteArray, jint); + + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM + * Method: jniSubComputeReadLikelihoodGivenHaplotypeLog10 + * Signature: (II[B[B[B[B[B[BI)D + */ +JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadLikelihoodGivenHaplotypeLog10 + (JNIEnv *, jobject, jint, jint, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jint); + + +#ifdef __cplusplus +} +#endif +#endif diff --git a/pairhmm-1-base.cc b/pairhmm-1-base.cc new file mode 100644 index 000000000..dacd02536 --- /dev/null +++ b/pairhmm-1-base.cc @@ -0,0 +1,276 @@ +#include +#include +#include +#include + +#include + +#define MM 0 +#define GapM 1 +#define MX 2 +#define XX 3 +#define MY 4 +#define YY 5 + +//#define DEBUG + +/* + q: read quality + i: insertion penalty + d: deletion penalty + c: gap continuation penalty +*/ + +typedef struct +{ + int rslen, haplen, *q, *i, *d, *c; + char *hap, *rs; +} testcase; + +int normalize(char c) +{ + return ((int) (c - 33)); +} + +int read_testcase(testcase *tc) +{ + char *q, *i, *d, *c, *line = NULL; + int _q, _i, _d, _c; + int x, size = 0; + ssize_t read; + + read = getline(&line, (size_t *) &size, stdin); + if (read == -1) + return -1; + + tc->hap = (char *) malloc(size); + tc->rs = (char *) malloc(size); + q = (char *) malloc(size); + i = (char *) malloc(size); + d = (char *) malloc(size); + c = (char *) malloc(size); + + if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6) + return -1; + + tc->haplen = strlen(tc->hap); + tc->rslen = strlen(tc->rs); + tc->q = (int *) malloc(sizeof(int) * tc->rslen); + tc->i = (int *) malloc(sizeof(int) * tc->rslen); + tc->d = (int *) malloc(sizeof(int) * tc->rslen); + tc->c = (int *) malloc(sizeof(int) * tc->rslen); + + for (x = 0; x < tc->rslen; x++) + { + _q = normalize(q[x]); + _i = normalize(i[x]); + _d = normalize(d[x]); + _c = normalize(c[x]); + tc->q[x] = (_q < 6) ? 6 : _q; + tc->i[x] = _i; + tc->d[x] = _d; + tc->c[x] = _c; + } + + free(q); + free(i); + free(d); + free(c); + free(line); + + return 0; +} + +template +struct Context{}; + +template<> +struct Context +{ + Context() + { + for (int x = 0; x < 128; x++) + ph2pr[x] = pow(10.0, -((double)x) / 10.0); + + INITIAL_CONSTANT = ldexp(1.0, 1020.0); + LOG10_INITIAL_CONSTANT = log10(INITIAL_CONSTANT); + RESULT_THRESHOLD = 0.0; + } + + double LOG10(double v){ return log10(v); } + + static double _(double n){ return n; } + static double _(float n){ return ((double) n); } + double ph2pr[128]; + double INITIAL_CONSTANT; + double LOG10_INITIAL_CONSTANT; + double RESULT_THRESHOLD; +}; + +template<> +struct Context +{ + Context() + { + for (int x = 0; x < 128; x++) + ph2pr[x] = powf(10.f, -((float)x) / 10.f); + + INITIAL_CONSTANT = ldexpf(1.f, 120.f); + LOG10_INITIAL_CONSTANT = log10f(INITIAL_CONSTANT); + RESULT_THRESHOLD = ldexpf(1.f, -110.f); + } + + float LOG10(float v){ return log10f(v); } + + static float _(double n){ return ((float) n); } + static float _(float n){ return n; } + float ph2pr[128]; + float INITIAL_CONSTANT; + float LOG10_INITIAL_CONSTANT; + float RESULT_THRESHOLD; +}; + +template +NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL) +{ + int r, c; + int ROWS = tc->rslen + 1; + int COLS = tc->haplen + 1; + + Context ctx; + + NUMBER M[ROWS][COLS]; + NUMBER X[ROWS][COLS]; + NUMBER Y[ROWS][COLS]; + NUMBER p[ROWS][6]; + + p[0][MM] = ctx._(0.0); + p[0][GapM] = ctx._(0.0); + p[0][MX] = ctx._(0.0); + p[0][XX] = ctx._(0.0); + p[0][MY] = ctx._(0.0); + p[0][YY] = ctx._(0.0); + for (r = 1; r < ROWS; r++) + { + int _i = tc->i[r-1] & 127; + int _d = tc->d[r-1] & 127; + int _c = tc->c[r-1] & 127; + p[r][MM] = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127]; + p[r][GapM] = ctx._(1.0) - ctx.ph2pr[_c]; + p[r][MX] = ctx.ph2pr[_i]; + p[r][XX] = ctx.ph2pr[_c]; + p[r][MY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d]; + p[r][YY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c]; + } + + for (c = 0; c < COLS; c++) + { + M[0][c] = ctx._(0.0); + X[0][c] = ctx._(0.0); + Y[0][c] = ctx.INITIAL_CONSTANT / (tc->haplen); + } + + for (r = 1; r < ROWS; r++) + { + M[r][0] = ctx._(0.0); + X[r][0] = X[r-1][0] * p[r][XX]; + Y[r][0] = ctx._(0.0); + } + + NUMBER result = ctx._(0.0); + + for (r = 1; r < ROWS; r++) + for (c = 1; c < COLS; c++) + { + char _rs = tc->rs[r-1]; + char _hap = tc->hap[c-1]; + int _q = tc->q[r-1] & 127; + NUMBER distm = ctx.ph2pr[_q]; + if (_rs == _hap || _rs == 'N' || _hap == 'N') + distm = ctx._(1.0) - distm; + M[r][c] = distm * (M[r-1][c-1] * p[r][MM] + X[r-1][c-1] * p[r][GapM] + Y[r-1][c-1] * p[r][GapM]); + X[r][c] = M[r-1][c] * p[r][MX] + X[r-1][c] * p[r][XX]; + Y[r][c] = M[r][c-1] * p[r][MY] + Y[r][c-1] * p[r][YY]; + } + + for (c = 0; c < COLS; c++) + { + result += M[ROWS-1][c] + X[ROWS-1][c]; + } + + if (before_last_log != NULL) + *before_last_log = result; + + return result; //ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT; +} + +#define BATCH_SIZE 10000 +#define RUN_HYBRID + +int main() +{ + testcase tc[BATCH_SIZE]; + float result[BATCH_SIZE], result_avxf; + double result_avxd; + struct timeval start, end; + long long aggregateTimeRead = 0L; + long long aggregateTimeCompute = 0L; + long long aggregateTimeWrite = 0L; + + bool noMoreData = false; + int count =0; + while (!noMoreData) + { + int read_count = BATCH_SIZE; + gettimeofday(&start, NULL); + for (int b=0;b(&tc[b]); + + #ifdef RUN_HYBRID + #define MIN_ACCEPTED 1e-28f + if (result_avxf < MIN_ACCEPTED) { + count++; + result_avxd = compute_full_prob(&tc[b]); + result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f)); + } + else + result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); + #endif + + #ifndef RUN_HYBRID + result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); + #endif + + } + gettimeofday(&end, NULL); + aggregateTimeCompute += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)); + + gettimeofday(&start, NULL); + for (int b=0;b +#include +#include + +/* +template +string getBinaryStr (T val, int numBitsToWrite) { + + ostringstream oss ; + uint64_t mask = ((T) 0x1) << (numBitsToWrite-1) ; + for (int i=numBitsToWrite-1; i >= 0; --i) { + oss << ((val & mask) >> i) ; + mask >>= 1 ; + } + return oss.str() ; +} +*/ + +void GEN_INTRINSIC(precompute_masks_, PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) { + + const int maskBitCnt = MAIN_TYPE_SIZE ; + + for (int vi=0; vi < numMaskVecs; ++vi) { + for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) { + maskArr[vi][rs] = 0 ; + } + maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ; + } + + for (int col=1; col < COLS; ++col) { + int mIndex = (col-1) / maskBitCnt ; + int mOffset = (col-1) % maskBitCnt ; + MASK_TYPE bitMask = ((MASK_TYPE)0x1) << (maskBitCnt-1-mOffset) ; + + char hapChar = tc.hap[col-1] ; + + if (hapChar == AMBIG_CHAR) { + for (int ci=0; ci < NUM_DISTINCT_CHARS; ++ci) + maskArr[mIndex][ci] |= bitMask ; + } + + maskArr[mIndex][ConvertChar::get(hapChar)] |= bitMask ; + // bit corresponding to col 1 will be the MSB of the mask 0 + // bit corresponding to col 2 will be the MSB-1 of the mask 0 + // ... + // bit corresponding to col 32 will be the LSB of the mask 0 + // bit corresponding to col 33 will be the MSB of the mask 1 + // ... + } + +} + +void GEN_INTRINSIC(init_masks_for_row_, PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess) { + + for (int ri=0; ri < numRowsToProcess; ++ri) { + rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ; + } + + for (int ei=0; ei < AVX_LENGTH; ++ei) { + lastMaskShiftOut[ei] = 0 ; + } +} + +#define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \ + MASK_TYPE __bitMask = (((MASK_TYPE)0x1) << __shiftBy) - 1 ; \ + MASK_TYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \ + __dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ; \ + __lastShiftOut = __nextShiftOut ; \ +} + + +void GEN_INTRINSIC(update_masks_for_cols_, PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt) { + + for (int ei=0; ei < AVX_LENGTH/2; ++ei) { + SET_MASK_WORD(currMaskVecLow.masks[ei], maskArr[maskIndex][rsArr[ei]], + lastMaskShiftOut[ei], ei, maskBitCnt) ; + + int ei2 = ei + AVX_LENGTH/2 ; // the second entry index + SET_MASK_WORD(currMaskVecHigh.masks[ei], maskArr[maskIndex][rsArr[ei2]], + lastMaskShiftOut[ei2], ei2, maskBitCnt) ; + } + +} + +//void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen, const _256_TYPE& distmSel, int firstRowIndex, int lastRowIndex) { + +inline void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen) { + //#define computeDistVec() { + + _256_TYPE maskV ; + VEC_SSE_TO_AVX(currMaskVecLow.vecf, currMaskVecHigh.vecf, maskV) ; + + distmChosen = VEC_BLENDV(distm, _1_distm, maskV) ; + + /*COMPARE_VECS(distmChosen, distmSel, firstRowIndex, lastRowIndex) ;*/ + + VEC_SHIFT_LEFT_1BIT(currMaskVecLow.vec) ; + VEC_SHIFT_LEFT_1BIT(currMaskVecHigh.vec) ; +} + + +/* +template +struct HmmData { + int ROWS ; + int COLS ; + + NUMBER shiftOutM[MROWS+MCOLS+AVX_LENGTH], shiftOutX[MROWS+MCOLS+AVX_LENGTH], shiftOutY[MROWS+MCOLS+AVX_LENGTH] ; + Context ctx ; + testcase* tc ; + _256_TYPE p_MM[MAVX_COUNT], p_GAPM[MAVX_COUNT], p_MX[MAVX_COUNT], p_XX[MAVX_COUNT], p_MY[MAVX_COUNT], p_YY[MAVX_COUNT], distm1D[MAVX_COUNT] ; + _256_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY ; + + UNION_TYPE M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y ; + UNION_TYPE rs , rsN ; + _256_TYPE distmSel; + _256_TYPE distm, _1_distm; + +} ; +*/ + +template void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context ctx, testcase *tc, _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D) +{ + NUMBER zero = ctx._(0.0); + NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); + for (int s=0;si[r-1] & 127; + int _d = tc->d[r-1] & 127; + int _c = tc->c[r-1] & 127; + + *(ptr_p_MM+r-1) = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127]; + *(ptr_p_GAPM+r-1) = ctx._(1.0) - ctx.ph2pr[_c]; + *(ptr_p_MX+r-1) = ctx.ph2pr[_i]; + *(ptr_p_XX+r-1) = ctx.ph2pr[_c]; + *(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d]; + *(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c]; + } + + NUMBER *ptr_distm1D = (NUMBER *)distm1D; + for (int r = 1; r < ROWS; r++) + { + int _q = tc->q[r-1] & 127; + ptr_distm1D[r-1] = ctx.ph2pr[_q]; + } +} + + +template inline void GEN_INTRINSIC(stripINITIALIZATION, PRECISION)( + int stripIdx, Context ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY, + _256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm, _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM , + _256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, + UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM) +{ + int i = stripIdx; + pGAPM = p_GAPM[i]; + pMM = p_MM[i]; + pMX = p_MX[i]; + pXX = p_XX[i]; + pMY = p_MY[i]; + pYY = p_YY[i]; + + NUMBER zero = ctx._(0.0); + NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); + UNION_TYPE packed1; packed1.d = VEC_SET1_VAL(1.0); + + /* compare rs and N */ + //rs = VEC_LDPOPCVT_CHAR((tc->irs+i*AVX_LENGTH)); + //rsN.d = VEC_CMP_EQ(N_packed256, rs); + + distm = distm1D[i]; + _1_distm = VEC_SUB(packed1.d, distm); + + /* initialize M_t_2, M_t_1, X_t_2, X_t_1, Y_t_2, Y_t_1 */ + M_t_2.d = VEC_SET1_VAL(zero); + X_t_2.d = VEC_SET1_VAL(zero); + + if (i==0) { + M_t_1.d = VEC_SET1_VAL(zero); + X_t_1.d = VEC_SET1_VAL(zero); + Y_t_2.d = VEC_SET_LSE(init_Y); + Y_t_1.d = VEC_SET1_VAL(zero); + } + else { + X_t_1.d = VEC_SET_LSE(shiftOutX[AVX_LENGTH]); + M_t_1.d = VEC_SET_LSE(shiftOutM[AVX_LENGTH]); + Y_t_2.d = VEC_SET1_VAL(zero); + Y_t_1.d = VEC_SET1_VAL(zero); + } + M_t_1_y = M_t_1; +} + + + +inline _256_TYPE GEN_INTRINSIC(computeDISTM, PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256, + _256_TYPE distm, _256_TYPE _1_distm) +{ + UNION_TYPE hapN, rshap; + _256_TYPE cond; + IF_32 shiftInHap; + + int *hap_ptr = tc->ihap; + + shiftInHap.i = (d NUMBER GEN_INTRINSIC(compute_full_prob_avx, PRECISION) (testcase *tc, NUMBER *before_last_log = NULL) +{ + _256_TYPE p_MM [MAVX_COUNT], p_GAPM [MAVX_COUNT], p_MX [MAVX_COUNT]; + _256_TYPE p_XX [MAVX_COUNT], p_MY [MAVX_COUNT], p_YY [MAVX_COUNT]; + _256_TYPE distm1D[MAVX_COUNT]; + NUMBER shiftOutM[MROWS+MCOLS+AVX_LENGTH], shiftOutX[MROWS+MCOLS+AVX_LENGTH], shiftOutY[MROWS+MCOLS+AVX_LENGTH]; + UNION_TYPE M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y; + _256_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY; + + struct timeval start, end; + NUMBER result_avx2; + Context ctx; + UNION_TYPE rs , rsN; + HAP_TYPE hap; + _256_TYPE distmSel, distmChosen ; + _256_TYPE distm, _1_distm; + + int r, c; + int ROWS = tc->rslen + 1; + int COLS = tc->haplen + 1; + int AVX_COUNT = (ROWS+7)/8; + NUMBER zero = ctx._(0.0); + UNION_TYPE packed1; packed1.d = VEC_SET1_VAL(1.0); + _256_TYPE N_packed256 = VEC_POPCVT_CHAR('N'); + NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); + int remainingRows = (ROWS-1) % AVX_LENGTH; + int strip_cnt = ((ROWS-1) / AVX_LENGTH) + (remainingRows!=0); + + const int maskBitCnt = MAIN_TYPE_SIZE ; + const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function + + MASK_TYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ; + GEN_INTRINSIC(precompute_masks_, PRECISION)(*tc, COLS, numMaskVecs, maskArr) ; + + char rsArr[AVX_LENGTH] ; + MASK_TYPE lastMaskShiftOut[AVX_LENGTH] ; + + GEN_INTRINSIC(initializeVectors, PRECISION)(ROWS, COLS, shiftOutM, shiftOutX, shiftOutY, + ctx, tc, p_MM, p_GAPM, p_MX, p_XX, p_MY, p_YY, distm1D); + + //for (int __ii=0; __ii < 10; ++__ii) + for (int i=0;i +#include +#include +#include + +#include "template.h" + +#include "define-float.h" +#include "shift_template.c" +#include "pairhmm-template-kernel.cc" + +#include "define-double.h" +#include "shift_template.c" +#include "pairhmm-template-kernel.cc" + +#define BATCH_SIZE 10000 +//#define RUN_HYBRID + +uint8_t ConvertChar::conversionTable[255] ; +int thread_level_parallelism_enabled = false ; + +double getCurrClk() { + struct timeval tv ; + gettimeofday(&tv, NULL); + return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +} + +int main() +{ + testcase tc[BATCH_SIZE]; + float result[BATCH_SIZE], result_avxf; + double result_avxd; + //struct timeval start, end; + double lastClk = 0.0 ; + double aggregateTimeRead = 0.0; + double aggregateTimeCompute = 0.0; + double aggregateTimeWrite = 0.0; + + // Need to call it once to initialize the static array + ConvertChar::init() ; + + + char* ompEnvVar = getenv("OMP_NUM_THREADS") ; + if (ompEnvVar != NULL && ompEnvVar != "" && ompEnvVar != "1" ) { + thread_level_parallelism_enabled = true ; + } + + bool noMoreData = false; + int count =0; + while (!noMoreData) + { + int read_count = BATCH_SIZE; + + lastClk = getCurrClk() ; + for (int b=0;b(&tc[b]); + + #ifdef RUN_HYBRID + #define MIN_ACCEPTED 1e-28f + if (result_avxf < MIN_ACCEPTED) { + count++; + result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)(&tc[b]); + result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f)); + } + else + result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); + #endif + + #ifndef RUN_HYBRID + result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); + #endif + + } + //gettimeofday(&end, NULL); + aggregateTimeCompute += (getCurrClk() - lastClk) ; + //((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)); + + //gettimeofday(&start, NULL); + lastClk = getCurrClk() ; + for (int b=0;b +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#define MROWS 500 +#define MCOLS 1000 + +#define CAT(X,Y) X####Y +#define GEN_INTRINSIC(X,Y) CAT(X,Y) + +#define ALIGNED __attribute__((aligned(32))) + +typedef union __attribute__((aligned(32))) { + ALIGNED __m256 ALIGNED d; + ALIGNED __m128i ALIGNED s[2]; + ALIGNED float ALIGNED f[8]; + ALIGNED __m256i ALIGNED i; +} ALIGNED mix_F ALIGNED; + +typedef union ALIGNED { + __m128i vec ; + __m128 vecf ; + uint32_t masks[4] ; +} MaskVec_F ; + +typedef union ALIGNED +{ + ALIGNED __m128i ALIGNED i; + ALIGNED __m128 ALIGNED f; +} ALIGNED IF_128f ALIGNED; + +typedef union ALIGNED +{ + ALIGNED int ALIGNED i; + ALIGNED float ALIGNED f; +} ALIGNED IF_32 ALIGNED; + +typedef union __attribute__((aligned(32))) { + ALIGNED __m256d ALIGNED d; + ALIGNED __m128i ALIGNED s[2]; + ALIGNED double ALIGNED f[4]; + ALIGNED __m256i ALIGNED i; +} ALIGNED mix_D ALIGNED; + +typedef union ALIGNED { + __m128i vec ; + __m128d vecf ; + uint64_t masks[2] ; +} MaskVec_D ; + +typedef union ALIGNED +{ + ALIGNED __m128i ALIGNED i; + ALIGNED __m128d ALIGNED f; +} ALIGNED IF_128d ALIGNED; + +typedef union ALIGNED +{ + ALIGNED int64_t ALIGNED i; + ALIGNED double ALIGNED f; +} ALIGNED IF_64 ALIGNED; + +template +struct Context{}; + +template<> +struct Context +{ + Context() + { + for (int x = 0; x < 128; x++) + ph2pr[x] = pow(10.0, -((double)x) / 10.0); + + INITIAL_CONSTANT = ldexp(1.0, 1020.0); + LOG10_INITIAL_CONSTANT = log10(INITIAL_CONSTANT); + RESULT_THRESHOLD = 0.0; + } + + double LOG10(double v){ return log10(v); } + + static double _(double n){ return n; } + static double _(float n){ return ((double) n); } + double ph2pr[128]; + double INITIAL_CONSTANT; + double LOG10_INITIAL_CONSTANT; + double RESULT_THRESHOLD; +}; + +template<> +struct Context +{ + Context() + { + for (int x = 0; x < 128; x++) + { + ph2pr[x] = powf(10.f, -((float)x) / 10.f); + } + + INITIAL_CONSTANT = ldexpf(1.f, 120.f); + LOG10_INITIAL_CONSTANT = log10f(INITIAL_CONSTANT); + RESULT_THRESHOLD = ldexpf(1.f, -110.f); + } + + float LOG10(float v){ return log10f(v); } + + static float _(double n){ return ((float) n); } + static float _(float n){ return n; } + float ph2pr[128]; + float INITIAL_CONSTANT; + float LOG10_INITIAL_CONSTANT; + float RESULT_THRESHOLD; +}; + + +typedef struct +{ + int rslen, haplen, *q, *i, *d, *c; + char *hap, *rs; + int *ihap; + int *irs; +} testcase; + +int normalize(char c); + + +int read_testcase(testcase *tc) +{ + char *q, *i, *d, *c, *line = NULL; + int _q, _i, _d, _c; + int x, size = 0; + ssize_t read; + + read = getline(&line, (size_t *) &size, stdin); + if (read == -1) + return -1; + + tc->hap = (char *) malloc(size); + tc->rs = (char *) malloc(size); + q = (char *) malloc(size); + i = (char *) malloc(size); + d = (char *) malloc(size); + c = (char *) malloc(size); + tc->ihap = (int *) malloc(size*sizeof(int)); + tc->irs = (int *) malloc(size*sizeof(int)); + + + if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6) + return -1; + + tc->haplen = strlen(tc->hap); + tc->rslen = strlen(tc->rs); + tc->q = (int *) malloc(sizeof(int) * tc->rslen); + tc->i = (int *) malloc(sizeof(int) * tc->rslen); + tc->d = (int *) malloc(sizeof(int) * tc->rslen); + tc->c = (int *) malloc(sizeof(int) * tc->rslen); + + tc->ihap = (int *) malloc(tc->haplen*sizeof(int)); + tc->irs = (int *) malloc(tc->rslen*sizeof(int)); + + for (x = 0; x < tc->rslen; x++) + { + _q = normalize(q[x]); + _i = normalize(i[x]); + _d = normalize(d[x]); + _c = normalize(c[x]); + tc->q[x] = (_q < 6) ? 6 : _q; + tc->i[x] = _i; + tc->d[x] = _d; + tc->c[x] = _c; + tc->irs[x] = tc->rs[x]; + } + + for (x = 0; x < tc->haplen; x++) + tc->ihap[x] = tc->hap[x]; + + free(q); + free(i); + free(d); + free(c); + free(line); + + return 0; +} + +#define NUM_DISTINCT_CHARS 5 +#define AMBIG_CHAR 4 + +class ConvertChar { + + static uint8_t conversionTable[255] ; + +public: + + static void init() { + assert (NUM_DISTINCT_CHARS == 5) ; + assert (AMBIG_CHAR == 4) ; + + conversionTable['A'] = 0 ; + conversionTable['C'] = 1 ; + conversionTable['T'] = 2 ; + conversionTable['G'] = 3 ; + conversionTable['N'] = 4 ; + } + + static inline uint8_t get(uint8_t input) { + return conversionTable[input] ; + } + +} ; + + +#endif + + From 8240ea826ebe1f47ba7548cb8971f4d5803b0fbb Mon Sep 17 00:00:00 2001 From: Karthik Gururaj Date: Wed, 15 Jan 2014 10:48:58 -0800 Subject: [PATCH 2/6] Changes: 1. Added TRISTATE_CORRECTION in pairhmm-template-kernel.cc (function stripINITIALIZATION) 2. Added VEC_DIV macros to define-double.h and define-float.h 3. Edited initializeVectors to match Java C++ original: *(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d]; *(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c]; Modified: *(ptr_p_MY+r-1) = ctx.ph2pr[_d]; *(ptr_p_YY+r-1) = ctx.ph2pr[_c]; --- .gitignore | 2 ++ Makefile | 19 +++++++------ define-double.h | 4 +++ define-float.h | 4 +++ hmm_mask.cc | 3 +- ...e_sting_utils_pairhmm_JNILoglessPairHMM.cc | 28 +++++++++---------- pairhmm-template-kernel.cc | 24 ++++++++++++++-- pairhmm-template-main.cc | 2 +- template.h | 15 ++++++++++ 9 files changed, 72 insertions(+), 29 deletions(-) diff --git a/.gitignore b/.gitignore index 16e86f25d..dcc44daa4 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,5 @@ tests .deps hmm_Mohammad +pairhmm-template-main +*.swp diff --git a/Makefile b/Makefile index df5dbc463..f858a7241 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -#OMPCFLAGS=-fopenmp +OMPCFLAGS=-fopenmp #OMPLDFLAGS=-lgomp #CFLAGS=-O2 -std=c++11 -W -Wall -march=corei7-avx -Wa,-q -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas @@ -7,7 +7,7 @@ JAVA_ROOT=/opt/jdk1.7.0_25/ JNI_COMPILATION_FLAGS=-D_REENTRANT -fPIC -I${JAVA_ROOT}/include -I${JAVA_ROOT}/include/linux -CFLAGS=-g -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas -xAVX +CFLAGS=-O3 -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas -xAVX CXXFLAGS=$(CFLAGS) CC=icc @@ -16,11 +16,12 @@ CXX=icc LDFLAGS=-lm $(OMPLDFLAGS) #BIN:=pairhmm-1-base #pairhmm-2-omp pairhmm-3-hybrid-float-double pairhmm-4-hybrid-diagonal pairhmm-5-hybrid-diagonal-homogeneus pairhmm-6-onlythreediags pairhmm-7-presse pairhmm-8-sse #pairhmm-dev -BIN:=libJNILoglessPairHMM.so #pairhmm-2-omp pairhmm-3-hybrid-float-double pairhmm-4-hybrid-diagonal pairhmm-5-hybrid-diagonal-homogeneus pairhmm-6-onlythreediags pairhmm-7-presse pairhmm-8-sse #pairhmm-dev +BIN:=libJNILoglessPairHMM.so pairhmm-template-main #SOURCES=pairhmm-1-base.cc input.cc -SOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc hmm_mask.cc -OBJECTS=$(SOURCES:.cc=.o) +LIBSOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc hmm_mask.cc +SOURCES=$(LIBSOURCES) pairhmm-template-main.cc +LIBOBJECTS=$(LIBSOURCES:.cc=.o) DEPDIR=.deps DF=$(DEPDIR)/$(*).d @@ -28,11 +29,11 @@ all: $(BIN) -include $(addprefix $(DEPDIR)/,$(SOURCES:.cc=.d)) -pairhmm-1-base: pairhmm-1-base.o input.o - $(CXX) -o $@ $^ $(LDFLAGS) +pairhmm-template-main: pairhmm-template-main.o hmm_mask.o + $(CXX) -fopenmp -o $@ $^ $(LDFLAGS) -libJNILoglessPairHMM.so: $(OBJECTS) - $(CXX) -shared -o libJNILoglessPairHMM.so $(OBJECTS) +libJNILoglessPairHMM.so: $(LIBOBJECTS) + $(CXX) -shared -o $@ $(LIBOBJECTS) %.o: %.cc @mkdir -p $(DEPDIR) diff --git a/define-double.h b/define-double.h index b2de7520a..5d31741d6 100644 --- a/define-double.h +++ b/define-double.h @@ -23,6 +23,7 @@ #undef VEC_ADD #undef VEC_SUB #undef VEC_MUL + #undef VEC_DIV #undef VEC_BLEND #undef VEC_BLENDV #undef VEC_CAST_256_128 @@ -82,6 +83,9 @@ #define VEC_MUL(__v1, __v2) \ _mm256_mul_pd(__v1, __v2) +#define VEC_DIV(__v1, __v2) \ + _mm256_div_pd(__v1, __v2) + #define VEC_BLEND(__v1, __v2, __mask) \ _mm256_blend_pd(__v1, __v2, __mask) diff --git a/define-float.h b/define-float.h index 4cba9a179..0e2822b2d 100644 --- a/define-float.h +++ b/define-float.h @@ -23,6 +23,7 @@ #undef VEC_ADD(__v1, __v2) #undef VEC_SUB(__v1, __v2) #undef VEC_MUL(__v1, __v2) + #undef VEC_DIV(__v1, __v2) #undef VEC_BLEND(__v1, __v2, __mask) #undef VEC_BLENDV(__v1, __v2, __maskV) #undef VEC_CAST_256_128(__v1) @@ -83,6 +84,9 @@ #define VEC_MUL(__v1, __v2) \ _mm256_mul_ps(__v1, __v2) +#define VEC_DIV(__v1, __v2) \ + _mm256_div_ps(__v1, __v2) + #define VEC_BLEND(__v1, __v2, __mask) \ _mm256_blend_ps(__v1, __v2, __mask) diff --git a/hmm_mask.cc b/hmm_mask.cc index aace7acf2..f128a39d9 100644 --- a/hmm_mask.cc +++ b/hmm_mask.cc @@ -438,7 +438,7 @@ void test_mask_computations (testcase& tc, int tcID, bool printDebug=false) { //cout << "Finished validating entry " << endl ; } - +#ifdef HMM_MASK_MAIN int main () { #define BATCH_SIZE 10000 @@ -482,3 +482,4 @@ int main () { return 0 ; } +#endif diff --git a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc index 80065b8c9..29668a306 100644 --- a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc +++ b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc @@ -15,6 +15,10 @@ #include #include #include +using namespace std; +//#define DEBUG3 1 +#define DEBUG 1 + #include "template.h" @@ -22,7 +26,6 @@ #include "shift_template.c" #include "pairhmm-template-kernel.cc" -using namespace std; #define MM 0 @@ -32,22 +35,17 @@ using namespace std; #define MY 4 #define YY 5 -//#define DEBUG3 1 -#define DEBUG 1 - -template -string to_string(T obj) +class LoadTimeInitializer { - stringstream ss; - string ret_string; - ss.clear(); - ss << std::scientific << obj; - ss >> ret_string; - ss.clear(); - return ret_string; -} + public: + LoadTimeInitializer() //will be called when library is loaded + { + ConvertChar::init(); + } +}; +LoadTimeInitializer g_load_time_initializer; -void debug_dump(string filename, string s, bool to_append, bool add_newline=true) +void debug_dump(string filename, string s, bool to_append, bool add_newline) { ofstream fptr; fptr.open(filename.c_str(), to_append ? ofstream::app : ofstream::out); diff --git a/pairhmm-template-kernel.cc b/pairhmm-template-kernel.cc index dd71373b0..24bcf7d1a 100644 --- a/pairhmm-template-kernel.cc +++ b/pairhmm-template-kernel.cc @@ -157,8 +157,18 @@ template void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS *(ptr_p_GAPM+r-1) = ctx._(1.0) - ctx.ph2pr[_c]; *(ptr_p_MX+r-1) = ctx.ph2pr[_i]; *(ptr_p_XX+r-1) = ctx.ph2pr[_c]; - *(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d]; - *(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c]; + *(ptr_p_MY+r-1) = ctx.ph2pr[_d]; + *(ptr_p_YY+r-1) = ctx.ph2pr[_c]; +#ifdef DEBUG3 + debug_dump("transitions_jni.txt",to_string(*(ptr_p_MM+r-1) ),true); + debug_dump("transitions_jni.txt",to_string(*(ptr_p_GAPM+r-1)),true); + debug_dump("transitions_jni.txt",to_string(*(ptr_p_MX+r-1) ),true); + debug_dump("transitions_jni.txt",to_string(*(ptr_p_XX+r-1) ),true); + debug_dump("transitions_jni.txt",to_string(*(ptr_p_MY+r-1) ),true); + debug_dump("transitions_jni.txt",to_string(*(ptr_p_YY+r-1) ),true); +#endif + //*(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d]; + //*(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c]; } NUMBER *ptr_distm1D = (NUMBER *)distm1D; @@ -166,6 +176,9 @@ template void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS { int _q = tc->q[r-1] & 127; ptr_distm1D[r-1] = ctx.ph2pr[_q]; +#ifdef DEBUG3 + debug_dump("priors_jni.txt",to_string(ptr_distm1D[r-1]),true); +#endif } } @@ -187,13 +200,18 @@ template inline void GEN_INTRINSIC(stripINITIALIZATION, PRECISION) NUMBER zero = ctx._(0.0); NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); UNION_TYPE packed1; packed1.d = VEC_SET1_VAL(1.0); +#define TRISTATE_CORRECTION_FACTOR 3.0 + UNION_TYPE packed3; packed3.d = VEC_SET1_VAL(TRISTATE_CORRECTION_FACTOR); /* compare rs and N */ //rs = VEC_LDPOPCVT_CHAR((tc->irs+i*AVX_LENGTH)); //rsN.d = VEC_CMP_EQ(N_packed256, rs); distm = distm1D[i]; - _1_distm = VEC_SUB(packed1.d, distm); + _1_distm = VEC_SUB(packed1.d, distm); +#ifndef DO_NOT_USE_TRISTATE_CORRECTION + distm = VEC_DIV(distm, packed3.d); +#endif /* initialize M_t_2, M_t_1, X_t_2, X_t_1, Y_t_2, Y_t_1 */ M_t_2.d = VEC_SET1_VAL(zero); diff --git a/pairhmm-template-main.cc b/pairhmm-template-main.cc index f20755b6b..87d50eb5b 100644 --- a/pairhmm-template-main.cc +++ b/pairhmm-template-main.cc @@ -16,7 +16,7 @@ #define BATCH_SIZE 10000 //#define RUN_HYBRID -uint8_t ConvertChar::conversionTable[255] ; +//uint8_t ConvertChar::conversionTable[255] ; int thread_level_parallelism_enabled = false ; double getCurrClk() { diff --git a/template.h b/template.h index ee8036a11..4c8a4b5e3 100644 --- a/template.h +++ b/template.h @@ -13,6 +13,7 @@ #include #include +#include #define MROWS 500 #define MCOLS 1000 @@ -132,6 +133,20 @@ typedef struct int *irs; } testcase; + +template +std::string to_string(T obj) +{ + stringstream ss; + string ret_string; + ss.clear(); + ss << std::scientific << obj; + ss >> ret_string; + ss.clear(); + return ret_string; +} +void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true); + int normalize(char c); From e90405cd1fd1f8d824194f91f971aa32da667cc7 Mon Sep 17 00:00:00 2001 From: Karthik Gururaj Date: Thu, 16 Jan 2014 19:53:50 -0800 Subject: [PATCH 3/6] 1. Nested loops over reads and haplotypes moved to C++ through JNI 2. OpenMP support added 3. Using direct access to Java primitive arrays 4. Debug messages disabled --- .gitignore | 2 + Makefile | 17 +- convert_char.cc | 186 ++++++++ hmm_mask.cc | 60 +-- ...e_sting_utils_pairhmm_JNILoglessPairHMM.cc | 417 +++++++++++++++--- ...te_sting_utils_pairhmm_JNILoglessPairHMM.h | 15 + pairhmm-1-base.cc | 324 ++++++-------- pairhmm-template-main.cc | 8 +- template.h | 79 +--- 9 files changed, 737 insertions(+), 371 deletions(-) create mode 100644 convert_char.cc diff --git a/.gitignore b/.gitignore index dcc44daa4..cb70b53a7 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ tests hmm_Mohammad pairhmm-template-main *.swp +checker +reformat diff --git a/Makefile b/Makefile index f858a7241..70bf96dd3 100644 --- a/Makefile +++ b/Makefile @@ -16,11 +16,11 @@ CXX=icc LDFLAGS=-lm $(OMPLDFLAGS) #BIN:=pairhmm-1-base #pairhmm-2-omp pairhmm-3-hybrid-float-double pairhmm-4-hybrid-diagonal pairhmm-5-hybrid-diagonal-homogeneus pairhmm-6-onlythreediags pairhmm-7-presse pairhmm-8-sse #pairhmm-dev -BIN:=libJNILoglessPairHMM.so pairhmm-template-main +BIN:=libJNILoglessPairHMM.so pairhmm-template-main checker #SOURCES=pairhmm-1-base.cc input.cc -LIBSOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc hmm_mask.cc -SOURCES=$(LIBSOURCES) pairhmm-template-main.cc +LIBSOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc convert_char.cc +SOURCES=$(LIBSOURCES) pairhmm-template-main.cc pairhmm-1-base.cc LIBOBJECTS=$(LIBSOURCES:.cc=.o) DEPDIR=.deps DF=$(DEPDIR)/$(*).d @@ -29,11 +29,14 @@ all: $(BIN) -include $(addprefix $(DEPDIR)/,$(SOURCES:.cc=.d)) -pairhmm-template-main: pairhmm-template-main.o hmm_mask.o - $(CXX) -fopenmp -o $@ $^ $(LDFLAGS) +checker: pairhmm-1-base.o convert_char.o + $(CXX) $(OMPCFLAGS) -o $@ $^ $(LDFLAGS) + +pairhmm-template-main: pairhmm-template-main.o convert_char.o + $(CXX) $(OMPCFLAGS) -o $@ $^ $(LDFLAGS) libJNILoglessPairHMM.so: $(LIBOBJECTS) - $(CXX) -shared -o $@ $(LIBOBJECTS) + $(CXX) $(OMPCFLAGS) -shared -o $@ $(LIBOBJECTS) %.o: %.cc @mkdir -p $(DEPDIR) @@ -41,4 +44,4 @@ libJNILoglessPairHMM.so: $(LIBOBJECTS) clean: - rm -f $(BIN) *.o + rm -rf $(BIN) *.o $(DEPDIR) diff --git a/convert_char.cc b/convert_char.cc new file mode 100644 index 000000000..01c5a5137 --- /dev/null +++ b/convert_char.cc @@ -0,0 +1,186 @@ +#include "template.h" +uint8_t ConvertChar::conversionTable[255]; +using namespace std; + +int normalize(char c) +{ + return ((int) (c - 33)); +} + +int read_testcase(testcase *tc, FILE* ifp) +{ + char *q, *i, *d, *c, *line = NULL; + int _q, _i, _d, _c; + int x, size = 0; + ssize_t read; + + read = getline(&line, (size_t *) &size, ifp == 0 ? stdin : ifp); + if (read == -1) + return -1; + + + tc->hap = (char *) malloc(size); + tc->rs = (char *) malloc(size); + q = (char *) malloc(size); + i = (char *) malloc(size); + d = (char *) malloc(size); + c = (char *) malloc(size); + + if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6) + return -1; + + tc->haplen = strlen(tc->hap); + tc->rslen = strlen(tc->rs); + assert(tc->rslen < MROWS); + tc->ihap = (int *) malloc(tc->haplen*sizeof(int)); + tc->irs = (int *) malloc(tc->rslen*sizeof(int)); + + //tc->q = (int *) malloc(sizeof(int) * tc->rslen); + //tc->i = (int *) malloc(sizeof(int) * tc->rslen); + //tc->d = (int *) malloc(sizeof(int) * tc->rslen); + //tc->c = (int *) malloc(sizeof(int) * tc->rslen); + + for (x = 0; x < tc->rslen; x++) + { + _q = normalize(q[x]); + _i = normalize(i[x]); + _d = normalize(d[x]); + _c = normalize(c[x]); + tc->q[x] = (_q < 6) ? 6 : _q; + tc->i[x] = _i; + tc->d[x] = _d; + tc->c[x] = _c; + tc->irs[x] = tc->rs[x]; + } + for (x = 0; x < tc->haplen; x++) + tc->ihap[x] = tc->hap[x]; + + + free(q); + free(i); + free(d); + free(c); + free(line); + + return 0; +} + +unsigned MAX_LINE_LENGTH = 65536; +int convToInt(std::string s) +{ + int i; + std::istringstream strin(s); + strin >> i; + return i; +} + +void tokenize(std::ifstream& fptr, std::vector& tokens) +{ + int i = 0; + std::string tmp; + std::vector myVec; + vector line; + line.clear(); + line.resize(MAX_LINE_LENGTH); + vector tmpline; + tmpline.clear(); + tmpline.resize(MAX_LINE_LENGTH); + myVec.clear(); + + while(!fptr.eof()) + { + i = 0; + bool still_read_line = true; + unsigned line_position = 0; + while(still_read_line) + { + fptr.getline(&(tmpline[0]), MAX_LINE_LENGTH); + if(line_position + MAX_LINE_LENGTH > line.size()) + line.resize(2*line.size()); + for(unsigned i=0;i> std::skipws >> tmp; + if(tmp != "") + { + myVec.push_back(tmp); + ++i; + //std::cout < 0) + break; + } + tokens.clear(); + //std::cout << "Why "< tokens; + tokens.clear(); + tokenize(fptr, tokens); + if(tokens.size() == 0) + return -1; + tc->hap = new char[tokens[0].size()+2]; + tc->haplen = tokens[0].size(); + memcpy(tc->hap, tokens[0].c_str(), tokens[0].size()); + tc->rs = new char[tokens[1].size()+2]; + tc->rslen = tokens[1].size(); + //cout << "Lengths "<haplen <<" "<rslen<<"\n"; + memcpy(tc->rs, tokens[1].c_str(),tokens[1].size()); + assert(tokens.size() == 2 + 4*(tc->rslen)); + assert(tc->rslen < MROWS); + for(unsigned j=0;jrslen;++j) + tc->q[j] = convToInt(tokens[2+0*tc->rslen+j]); + for(unsigned j=0;jrslen;++j) + tc->i[j] = convToInt(tokens[2+1*tc->rslen+j]); + for(unsigned j=0;jrslen;++j) + tc->d[j] = convToInt(tokens[2+2*tc->rslen+j]); + for(unsigned j=0;jrslen;++j) + tc->c[j] = convToInt(tokens[2+3*tc->rslen+j]); + + if(reformat) + { + ofstream ofptr; + ofptr.open("reformat/debug_dump.txt",first_call ? ios::out : ios::app); + assert(ofptr.is_open()); + ofptr << tokens[0] << " "; + ofptr << tokens[1] << " "; + for(unsigned j=0;jrslen;++j) + ofptr << ((char)(tc->q[j]+33)); + ofptr << " "; + for(unsigned j=0;jrslen;++j) + ofptr << ((char)(tc->i[j]+33)); + ofptr << " "; + for(unsigned j=0;jrslen;++j) + ofptr << ((char)(tc->d[j]+33)); + ofptr << " "; + for(unsigned j=0;jrslen;++j) + ofptr << ((char)(tc->c[j]+33)); + ofptr << " 0 false\n"; + + ofptr.close(); + first_call = false; + } + + + return tokens.size(); +} diff --git a/hmm_mask.cc b/hmm_mask.cc index f128a39d9..c335a0160 100644 --- a/hmm_mask.cc +++ b/hmm_mask.cc @@ -30,9 +30,7 @@ #include #include #include -#include -#include -#include +#include "template.h" using namespace std ; @@ -50,43 +48,14 @@ string getBinaryStr (T val, int numBitsToWrite) { return oss.str() ; } - -typedef struct -{ - int rslen, haplen, *q, *i, *d, *c; - char *hap, *rs; -} testcase; - int normalize(char c) { return ((int) (c - 33)); } -class ConvertChar { - - static uint8_t conversionTable[255] ; - - -public: - - static void init() { - conversionTable['A'] = 0 ; - conversionTable['C'] = 1 ; - conversionTable['T'] = 2 ; - conversionTable['G'] = 3 ; - conversionTable['N'] = 4 ; - } - - static inline uint8_t get(uint8_t input) { - return conversionTable[input] ; - } - -} ; uint8_t ConvertChar::conversionTable[255] ; - - int read_testcase(testcase *tc, FILE* ifp) { char *q, *i, *d, *c, *line = NULL; @@ -94,7 +63,7 @@ int read_testcase(testcase *tc, FILE* ifp) int x, size = 0; ssize_t read; - read = getline(&line, (size_t *) &size, ifp); + read = getline(&line, (size_t *) &size, ifp == 0 ? stdin : ifp); if (read == -1) return -1; @@ -111,20 +80,14 @@ int read_testcase(testcase *tc, FILE* ifp) tc->haplen = strlen(tc->hap); tc->rslen = strlen(tc->rs); - tc->q = (int *) malloc(sizeof(int) * tc->rslen); - tc->i = (int *) malloc(sizeof(int) * tc->rslen); - tc->d = (int *) malloc(sizeof(int) * tc->rslen); - tc->c = (int *) malloc(sizeof(int) * tc->rslen); + assert(tc->rslen < MROWS); + tc->ihap = (int *) malloc(tc->haplen*sizeof(int)); + tc->irs = (int *) malloc(tc->rslen*sizeof(int)); - // Convert hap and rs to 3 bits - - /* - for (int ci=0; ci < tc->haplen; ++ci) - tc->hap[ci] = ConvertChar::get(tc->hap[ci]) ; - - for (int ci=0; ci < tc->rslen; ++ci) - tc->rs[ci] = ConvertChar::get(tc->rs[ci]) ; - */ + //tc->q = (int *) malloc(sizeof(int) * tc->rslen); + //tc->i = (int *) malloc(sizeof(int) * tc->rslen); + //tc->d = (int *) malloc(sizeof(int) * tc->rslen); + //tc->c = (int *) malloc(sizeof(int) * tc->rslen); for (x = 0; x < tc->rslen; x++) { @@ -136,8 +99,11 @@ int read_testcase(testcase *tc, FILE* ifp) tc->i[x] = _i; tc->d[x] = _d; tc->c[x] = _c; + tc->irs[x] = tc->rs[x]; } - + for (x = 0; x < tc->haplen; x++) + tc->ihap[x] = tc->hap[x]; + free(q); free(i); diff --git a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc index 29668a306..b8298161e 100644 --- a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc +++ b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc @@ -2,6 +2,7 @@ #include #include #include "org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h" +#include #include #include @@ -12,12 +13,14 @@ #include #include + #include #include #include using namespace std; +//#define DEBUG 1 +//#define DEBUG0_1 1 //#define DEBUG3 1 -#define DEBUG 1 #include "template.h" @@ -42,6 +45,12 @@ class LoadTimeInitializer { ConvertChar::init(); } + jfieldID m_readBasesFID; + jfieldID m_readQualsFID; + jfieldID m_insertionGOPFID; + jfieldID m_deletionGOPFID; + jfieldID m_overallGCPFID; + jfieldID m_haplotypeBasesFID; }; LoadTimeInitializer g_load_time_initializer; @@ -55,73 +64,6 @@ void debug_dump(string filename, string s, bool to_append, bool add_newline) fptr.close(); } -#define INT_STORE_ARRAY_SIZE 2048 -int insertionGOPIntArray[INT_STORE_ARRAY_SIZE]; -int deletionGOPIntArray[INT_STORE_ARRAY_SIZE]; -int overallGCPIntArray[INT_STORE_ARRAY_SIZE]; -int readQualsIntArray[INT_STORE_ARRAY_SIZE]; - -JNIEXPORT jdouble JNICALL -Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadLikelihoodGivenHaplotypeLog10( - JNIEnv* env, jobject thisObject, - jint readLength, jint haplotypeLength, - jbyteArray readBases, jbyteArray haplotypeBases, jbyteArray readQuals, - jbyteArray insertionGOP, jbyteArray deletionGOP, jbyteArray overallGCP, - jint hapStartIndex - ) -{ - jboolean is_copy = JNI_FALSE; - jbyte* readBasesArray = (env)->GetByteArrayElements(readBases, &is_copy); - jbyte* haplotypeBasesArray = (env)->GetByteArrayElements(haplotypeBases, &is_copy); - jbyte* readQualsArray = (env)->GetByteArrayElements(readQuals, &is_copy); - jbyte* insertionGOPArray = (env)->GetByteArrayElements(insertionGOP, &is_copy); - jbyte* deletionGOPArray = (env)->GetByteArrayElements(deletionGOP, &is_copy); - jbyte* overallGCPArray = (env)->GetByteArrayElements(overallGCP, &is_copy); -#ifdef DEBUG - assert(readBasesArray && "readBasesArray not initialized in JNI"); - assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); - assert(readQualsArray && "readQualsArray not initialized in JNI"); - assert(insertionGOPArray && "insertionGOP array not initialized in JNI"); - assert(deletionGOPArray && "deletionGOP array not initialized in JNI"); - assert(overallGCPArray && "OverallGCP array not initialized in JNI"); - assert(readLength < INT_STORE_ARRAY_SIZE); -#endif - for(unsigned i=0;i(&tc); - double result = log10(result_avxd) - log10(ldexp(1.0, 1020)); -#ifdef DEBUG - debug_dump("return_values_jni.txt",to_string(result),true); -#endif - - - env->ReleaseByteArrayElements(overallGCP, overallGCPArray, JNI_ABORT); - env->ReleaseByteArrayElements(deletionGOP, deletionGOPArray, JNI_ABORT); - env->ReleaseByteArrayElements(insertionGOP, insertionGOPArray, JNI_ABORT); - env->ReleaseByteArrayElements(readQuals, readQualsArray, JNI_ABORT); - env->ReleaseByteArrayElements(haplotypeBases, haplotypeBasesArray, JNI_ABORT); - env->ReleaseByteArrayElements(readBases, readBasesArray, JNI_ABORT); - - return 0.0; -} - JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeProbabilities (JNIEnv* env, jclass thisObject, jobjectArray transition, jbyteArray insertionGOP, jbyteArray deletionGOP, jbyteArray overallGCP @@ -142,3 +84,342 @@ Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializePrior ) { return 0.0; } + +#define DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY 1 + +#ifdef DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY +//Gets direct access to Java arrays +#define GET_BYTE_ARRAY_ELEMENTS env->GetPrimitiveArrayCritical +#define RELEASE_BYTE_ARRAY_ELEMENTS env->ReleasePrimitiveArrayCritical +#define JNI_RELEASE_MODE JNI_ABORT +#define GET_DOUBLE_ARRAY_ELEMENTS env->GetPrimitiveArrayCritical +#define RELEASE_DOUBLE_ARRAY_ELEMENTS env->ReleasePrimitiveArrayCritical + +#else +//Likely makes copy of Java arrays to JNI C++ space +#define GET_BYTE_ARRAY_ELEMENTS env->GetByteArrayElements +#define RELEASE_BYTE_ARRAY_ELEMENTS env->ReleaseByteArrayElements +#define JNI_RELEASE_MODE JNI_ABORT +#define GET_DOUBLE_ARRAY_ELEMENTS env->GetDoubleArrayElements +#define RELEASE_DOUBLE_ARRAY_ELEMENTS env->ReleaseDoubleArrayElements + +#endif //ifdef DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY + +JNIEXPORT jdouble JNICALL +Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadLikelihoodGivenHaplotypeLog10( + JNIEnv* env, jobject thisObject, + jint readLength, jint haplotypeLength, + jbyteArray readBases, jbyteArray haplotypeBases, jbyteArray readQuals, + jbyteArray insertionGOP, jbyteArray deletionGOP, jbyteArray overallGCP, + jint hapStartIndex + ) +{ + jboolean is_copy = JNI_FALSE; + jbyte* readBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(readBases, &is_copy); + jbyte* haplotypeBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(haplotypeBases, &is_copy); + jbyte* readQualsArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(readQuals, &is_copy); + jbyte* insertionGOPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(insertionGOP, &is_copy); + jbyte* deletionGOPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(deletionGOP, &is_copy); + jbyte* overallGCPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(overallGCP, &is_copy); +#ifdef DEBUG + assert(readBasesArray && "readBasesArray not initialized in JNI"); + assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); + assert(readQualsArray && "readQualsArray not initialized in JNI"); + assert(insertionGOPArray && "insertionGOP array not initialized in JNI"); + assert(deletionGOPArray && "deletionGOP array not initialized in JNI"); + assert(overallGCPArray && "OverallGCP array not initialized in JNI"); + assert(readLength < MROWS); +#endif + testcase tc; + tc.rslen = readLength; + tc.haplen = haplotypeLength; + tc.rs = (char*)readBasesArray; + tc.hap = (char*)haplotypeBasesArray; + for(unsigned i=0;i(&tc); + double result = log10(result_avxd) - log10(ldexp(1.0, 1020)); +#ifdef DEBUG + debug_dump("return_values_jni.txt",to_string(result),true); +#endif + + + RELEASE_BYTE_ARRAY_ELEMENTS(overallGCP, overallGCPArray, JNI_RELEASE_MODE); + RELEASE_BYTE_ARRAY_ELEMENTS(deletionGOP, deletionGOPArray, JNI_RELEASE_MODE); + RELEASE_BYTE_ARRAY_ELEMENTS(insertionGOP, insertionGOPArray, JNI_RELEASE_MODE); + RELEASE_BYTE_ARRAY_ELEMENTS(readQuals, readQualsArray, JNI_RELEASE_MODE); + RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBases, haplotypeBasesArray, JNI_RELEASE_MODE); + RELEASE_BYTE_ARRAY_ELEMENTS(readBases, readBasesArray, JNI_RELEASE_MODE); + + return 0.0; +} + +//Should be called only once for the whole Java process - initializes field ids for the classes JNIReadDataHolderClass +//and JNIHaplotypeDataHolderClass +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniGlobalInit + (JNIEnv* env, jobject thisObject, jclass readDataHolderClass, jclass haplotypeDataHolderClass) +{ + assert(readDataHolderClass); + assert(haplotypeDataHolderClass); + jfieldID fid; + fid = env->GetFieldID(readDataHolderClass, "readBases", "[B"); + assert(fid && "JNI pairHMM: Could not get FID for readBases"); + g_load_time_initializer.m_readBasesFID = fid; + fid = env->GetFieldID(readDataHolderClass, "readQuals", "[B"); + assert(fid && "JNI pairHMM: Could not get FID for readQuals"); + g_load_time_initializer.m_readQualsFID = fid; + fid = env->GetFieldID(readDataHolderClass, "insertionGOP", "[B"); + assert(fid && "JNI pairHMM: Could not get FID for insertionGOP"); + g_load_time_initializer.m_insertionGOPFID = fid; + fid = env->GetFieldID(readDataHolderClass, "deletionGOP", "[B"); + assert(fid && "JNI pairHMM: Could not get FID for deletionGOP"); + g_load_time_initializer.m_deletionGOPFID = fid; + fid = env->GetFieldID(readDataHolderClass, "overallGCP", "[B"); + assert(fid && "JNI pairHMM: Could not get FID for overallGCP"); + g_load_time_initializer.m_overallGCPFID = fid; + + fid = env->GetFieldID(haplotypeDataHolderClass, "haplotypeBases", "[B"); + assert(fid && "JNI pairHMM: Could not get FID for haplotypeBases"); + g_load_time_initializer.m_haplotypeBasesFID = fid; +} + + +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniComputeLikelihoods + (JNIEnv* env, jobject thisObject, jint numReads, jint numHaplotypes, + jobjectArray readDataArray, jobjectArray haplotypeDataArray, jdoubleArray likelihoodArray, jint maxNumThreadsToUse) +{ + jboolean is_copy = JNI_FALSE; + //To ensure, GET_BYTE_ARRAY_ELEMENTS is invoked only once for each haplotype, store bytearrays in a vector + vector > haplotypeBasesArrayVector; + haplotypeBasesArrayVector.clear(); + haplotypeBasesArrayVector.resize(numHaplotypes); +#ifdef DEBUG0_1 + cout << "JNI numReads "<GetObjectArrayElement(haplotypeDataArray, j); + jbyteArray haplotypeBases = (jbyteArray)env->GetObjectField(haplotypeObject, g_load_time_initializer.m_haplotypeBasesFID); +#ifdef DEBUG + assert(haplotypeBases && ("haplotypeBases is NULL at index : "+to_string(j)+"\n").c_str()); +#endif + jbyte* haplotypeBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(haplotypeBases, &is_copy); +#ifdef DEBUG + assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); + assert(env->GetArrayLength(haplotypeBases) < MCOLS); +#ifdef DEBUG0_1 + cout << "JNI haplotype length "<GetArrayLength(haplotypeBases)<<"\n"; +#endif +#endif + haplotypeBasesArrayVector[j] = make_pair(haplotypeBases, haplotypeBasesArray); +#ifdef DEBUG3 + for(unsigned k=0;kGetArrayLength(haplotypeBases);++k) + debug_dump("haplotype_bases_jni.txt",to_string((int)haplotypeBasesArray[k]),true); +#endif + } + + unsigned numTestCases = numReads*numHaplotypes; + vector tc_array; + tc_array.clear(); + tc_array.resize(numTestCases); + unsigned tc_idx = 0; + //Store arrays for release later + vector > > readBasesArrayVector; + readBasesArrayVector.clear(); + readBasesArrayVector.resize(numReads); + for(unsigned i=0;iGetObjectArrayElement(readDataArray, i); + jbyteArray readBases = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_readBasesFID); + jbyteArray insertionGOP = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_insertionGOPFID); + jbyteArray deletionGOP = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_deletionGOPFID); + jbyteArray overallGCP = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_overallGCPFID); + jbyteArray readQuals = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_readQualsFID); + +#ifdef DEBUG + assert(readBases && ("readBases is NULL at index : "+to_string(i)+"\n").c_str()); + assert(insertionGOP && ("insertionGOP is NULL at index : "+to_string(i)+"\n").c_str()); + assert(deletionGOP && ("deletionGOP is NULL at index : "+to_string(i)+"\n").c_str()); + assert(overallGCP && ("overallGCP is NULL at index : "+to_string(i)+"\n").c_str()); + assert(readQuals && ("readQuals is NULL at index : "+to_string(i)+"\n").c_str()); +#endif + jsize readLength = env->GetArrayLength(readBases); + + jbyte* readBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(readBases, &is_copy); //order of GET-RELEASE is important + jbyte* readQualsArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(readQuals, &is_copy); + jbyte* insertionGOPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(insertionGOP, &is_copy); + jbyte* deletionGOPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(deletionGOP, &is_copy); + jbyte* overallGCPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(overallGCP, &is_copy); +#ifdef DEBUG + assert(readBasesArray && "readBasesArray not initialized in JNI"); + assert(readQualsArray && "readQualsArray not initialized in JNI"); + assert(insertionGOPArray && "insertionGOP array not initialized in JNI"); + assert(deletionGOPArray && "deletionGOP array not initialized in JNI"); + assert(overallGCPArray && "overallGCP array not initialized in JNI"); + assert(readLength < MROWS); + assert(readLength == env->GetArrayLength(readQuals)); + assert(readLength == env->GetArrayLength(insertionGOP)); + assert(readLength == env->GetArrayLength(deletionGOP)); + assert(readLength == env->GetArrayLength(overallGCP)); +#ifdef DEBUG0_1 + cout << "JNI read length "<GetArrayLength(haplotypeBasesArrayVector[j].first); + jbyte* haplotypeBasesArray = haplotypeBasesArrayVector[j].second; + tc_array[tc_idx].rslen = (int)readLength; + tc_array[tc_idx].haplen = (int)haplotypeLength; + tc_array[tc_idx].rs = (char*)readBasesArray; + tc_array[tc_idx].hap = (char*)haplotypeBasesArray; + //Can be avoided + for(unsigned k=0;kGetArrayLength(likelihoodArray) == numTestCases); +#pragma omp parallel for schedule (guided,2) private(tc_idx) num_threads(maxNumThreadsToUse) + for(tc_idx=0;tc_idx(&(tc_array[tc_idx])); + double result = log10(result_avxd) - log10(ldexp(1.0, 1020)); + likelihoodDoubleArray[tc_idx] = result; + } +#ifdef DEBUG + for(tc_idx=0;tc_idx=0;--i)//note the order - reverse of GET + { + for(int j=readBasesArrayVector.size()-1;j>=0;--j) + RELEASE_BYTE_ARRAY_ELEMENTS(readBasesArrayVector[i][j].first, readBasesArrayVector[i][j].second, JNI_RELEASE_MODE); + readBasesArrayVector[i].clear(); + } + readBasesArrayVector.clear(); + + //Now release haplotype arrays + for(int j=numHaplotypes-1;j>=0;--j) //note the order - reverse of GET + RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBasesArrayVector[j].first, haplotypeBasesArrayVector[j].second, JNI_RELEASE_MODE); + haplotypeBasesArrayVector.clear(); + tc_array.clear(); +} + + + + + /* + protected final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final boolean recacheReadValues, + final byte[] nextHaploytpeBases) { + + paddedReadLength = readBases.length + 1; + paddedHaplotypeLength = haplotypeBases.length + 1; + double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, 0, true, 0); + if ( ! MathUtils.goodLog10Probability(result) ) + throw new IllegalStateException("PairHMM Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f", Arrays.toString(haplotypeBases), Arrays.toString(readBases), result)); + // Warning: Careful if using the PairHMM in parallel! (this update has to be taken care of). + // Warning: This assumes no downstream modification of the haplotype bases (saves us from copying the array). It is okay for the haplotype caller and the Unified Genotyper. + // KG: seems pointless is not being used anywhere + previousHaplotypeBases = haplotypeBases; + return result; + return 0; + }*/ + +//public PerReadAlleleLikelihoodMap computeLikelihoods(final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap) + //{ + ////PerReadAlleleLikelihoodMap retValue = super.computeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap); + //// (re)initialize the pairHMM only if necessary + //final int readMaxLength = findMaxReadLength(reads); + //final int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap); + //if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) + //{ initialize(readMaxLength, haplotypeMaxLength); } + + //if ( ! initialized ) + //throw new IllegalStateException("Must call initialize before calling jniComputeLikelihoods"); + //final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); + //jniComputeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap, likelihoodMap); + //for(GATKSAMRecord read : reads){ + //final byte[] readBases = read.getReadBases(); + //final byte[] readQuals = read.getBaseQualities(); + //final byte[] readInsQuals = read.getBaseInsertionQualities(); + //final byte[] readDelQuals = read.getBaseDeletionQualities(); + //final byte[] overallGCP = GCPArrayMap.get(read); + + //// peak at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation) + //byte[] currentHaplotypeBases = null; + //boolean isFirstHaplotype = true; + //Allele currentAllele = null; + //double log10l; + //for (final Allele allele : alleleHaplotypeMap.keySet()){ + //final Haplotype haplotype = alleleHaplotypeMap.get(allele); + //final byte[] nextHaplotypeBases = haplotype.getBases(); + //if (currentHaplotypeBases != null) { + //log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, + //readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases); + //likelihoodMap.add(read, currentAllele, log10l); + //} + //// update the current haplotype + //currentHaplotypeBases = nextHaplotypeBases; + //currentAllele = allele; + //} + //// process the final haplotype + //if (currentHaplotypeBases != null) { + + //// there is no next haplotype, so pass null for nextHaplotypeBases. + //log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, + //readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, null); + //likelihoodMap.add(read, currentAllele, log10l); + //} + //} + //if(debug) + //debugClose(); + //return likelihoodMap; + //} diff --git a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h index 5408d2f57..1fff9fa5a 100644 --- a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h +++ b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h @@ -54,6 +54,21 @@ JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILogless JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadLikelihoodGivenHaplotypeLog10 (JNIEnv *, jobject, jint, jint, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jint); +/* + * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM + * Method: jniGlobalInit + * Signature: (Ljava/lang/Class;Ljava/lang/Class;)V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniGlobalInit + (JNIEnv *, jobject, jclass, jclass); + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM + * Method: jniComputeLikelihoods + * Signature: ([Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIReadDataHolderClass;[Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIHaplotypeDataHolderClass;[D)V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniComputeLikelihoods + (JNIEnv *, jobject, jint, jint, jobjectArray, jobjectArray, jdoubleArray, jint); #ifdef __cplusplus } diff --git a/pairhmm-1-base.cc b/pairhmm-1-base.cc index dacd02536..90f95aeec 100644 --- a/pairhmm-1-base.cc +++ b/pairhmm-1-base.cc @@ -1,10 +1,6 @@ -#include -#include -#include -#include - -#include - +//#define DEBUG 1 +//#define DEBUG0_1 1 +//#define DEBUG3 1 #define MM 0 #define GapM 1 #define MX 2 @@ -12,123 +8,34 @@ #define MY 4 #define YY 5 -//#define DEBUG +#include +#include +#include +#include +#include +#include +#include "template.h" -/* - q: read quality - i: insertion penalty - d: deletion penalty - c: gap continuation penalty -*/ +//#include "define-float.h" +//#include "shift_template.c" +//#include "pairhmm-template-kernel.cc" -typedef struct +#include "define-double.h" +#include "shift_template.c" +#include "pairhmm-template-kernel.cc" + + +using namespace std; +class LoadTimeInitializer { - int rslen, haplen, *q, *i, *d, *c; - char *hap, *rs; -} testcase; - -int normalize(char c) -{ - return ((int) (c - 33)); -} - -int read_testcase(testcase *tc) -{ - char *q, *i, *d, *c, *line = NULL; - int _q, _i, _d, _c; - int x, size = 0; - ssize_t read; - - read = getline(&line, (size_t *) &size, stdin); - if (read == -1) - return -1; - - tc->hap = (char *) malloc(size); - tc->rs = (char *) malloc(size); - q = (char *) malloc(size); - i = (char *) malloc(size); - d = (char *) malloc(size); - c = (char *) malloc(size); - - if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6) - return -1; - - tc->haplen = strlen(tc->hap); - tc->rslen = strlen(tc->rs); - tc->q = (int *) malloc(sizeof(int) * tc->rslen); - tc->i = (int *) malloc(sizeof(int) * tc->rslen); - tc->d = (int *) malloc(sizeof(int) * tc->rslen); - tc->c = (int *) malloc(sizeof(int) * tc->rslen); - - for (x = 0; x < tc->rslen; x++) - { - _q = normalize(q[x]); - _i = normalize(i[x]); - _d = normalize(d[x]); - _c = normalize(c[x]); - tc->q[x] = (_q < 6) ? 6 : _q; - tc->i[x] = _i; - tc->d[x] = _d; - tc->c[x] = _c; - } - - free(q); - free(i); - free(d); - free(c); - free(line); - - return 0; -} - -template -struct Context{}; - -template<> -struct Context -{ - Context() - { - for (int x = 0; x < 128; x++) - ph2pr[x] = pow(10.0, -((double)x) / 10.0); - - INITIAL_CONSTANT = ldexp(1.0, 1020.0); - LOG10_INITIAL_CONSTANT = log10(INITIAL_CONSTANT); - RESULT_THRESHOLD = 0.0; - } - - double LOG10(double v){ return log10(v); } - - static double _(double n){ return n; } - static double _(float n){ return ((double) n); } - double ph2pr[128]; - double INITIAL_CONSTANT; - double LOG10_INITIAL_CONSTANT; - double RESULT_THRESHOLD; + public: + LoadTimeInitializer() //will be called when library is loaded + { + ConvertChar::init(); + } }; +LoadTimeInitializer g_load_time_initializer; -template<> -struct Context -{ - Context() - { - for (int x = 0; x < 128; x++) - ph2pr[x] = powf(10.f, -((float)x) / 10.f); - - INITIAL_CONSTANT = ldexpf(1.f, 120.f); - LOG10_INITIAL_CONSTANT = log10f(INITIAL_CONSTANT); - RESULT_THRESHOLD = ldexpf(1.f, -110.f); - } - - float LOG10(float v){ return log10f(v); } - - static float _(double n){ return ((float) n); } - static float _(float n){ return n; } - float ph2pr[128]; - float INITIAL_CONSTANT; - float LOG10_INITIAL_CONSTANT; - float RESULT_THRESHOLD; -}; template NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL) @@ -139,10 +46,10 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL) Context ctx; - NUMBER M[ROWS][COLS]; - NUMBER X[ROWS][COLS]; - NUMBER Y[ROWS][COLS]; - NUMBER p[ROWS][6]; + NUMBER M[MROWS][MCOLS]; + NUMBER X[MROWS][MCOLS]; + NUMBER Y[MROWS][MCOLS]; + NUMBER p[MROWS][6]; p[0][MM] = ctx._(0.0); p[0][GapM] = ctx._(0.0); @@ -159,8 +66,10 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL) p[r][GapM] = ctx._(1.0) - ctx.ph2pr[_c]; p[r][MX] = ctx.ph2pr[_i]; p[r][XX] = ctx.ph2pr[_c]; - p[r][MY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d]; - p[r][YY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c]; + p[r][MY] = ctx.ph2pr[_d]; + p[r][YY] = ctx.ph2pr[_c]; + //p[r][MY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d]; + //p[r][YY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c]; } for (c = 0; c < COLS; c++) @@ -188,6 +97,8 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL) NUMBER distm = ctx.ph2pr[_q]; if (_rs == _hap || _rs == 'N' || _hap == 'N') distm = ctx._(1.0) - distm; + else + distm = distm/3; M[r][c] = distm * (M[r-1][c-1] * p[r][MM] + X[r-1][c-1] * p[r][GapM] + Y[r-1][c-1] * p[r][GapM]); X[r][c] = M[r-1][c] * p[r][MX] + X[r-1][c] * p[r][XX]; Y[r][c] = M[r][c-1] * p[r][MY] + Y[r][c-1] * p[r][YY]; @@ -201,76 +112,127 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL) if (before_last_log != NULL) *before_last_log = result; - return result; //ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT; + return ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT; } #define BATCH_SIZE 10000 #define RUN_HYBRID -int main() +int main(int argc, char** argv) { - testcase tc[BATCH_SIZE]; - float result[BATCH_SIZE], result_avxf; - double result_avxd; - struct timeval start, end; - long long aggregateTimeRead = 0L; - long long aggregateTimeCompute = 0L; - long long aggregateTimeWrite = 0L; + if(argc < 2) + { + cerr << "Needs path to input file as argument\n"; + exit(0); + } + bool use_old_read_testcase = false; + if(argc >= 3 && string(argv[2]) == "1") + use_old_read_testcase = true; - bool noMoreData = false; - int count =0; - while (!noMoreData) - { - int read_count = BATCH_SIZE; - gettimeofday(&start, NULL); - for (int b=0;b= 0) + { + double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)(&tc); + double result = log10(result_avxd) - log10(ldexp(1.0, 1020)); - gettimeofday(&start, NULL); - for (int b=0;b(&tc[b]); + cout << std::scientific << compute_full_prob(&tc) << " "< tokens; + ifptr.open(argv[1]); + assert(ifptr.is_open()); + while(1) + { + tokens.clear(); + if(read_mod_testcase(ifptr, &tc, false) < 0) + break; + //double result = 0; + double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)(&tc); + double result = log10(result_avxd) - log10(ldexp(1.0, 1020)); - #ifdef RUN_HYBRID - #define MIN_ACCEPTED 1e-28f - if (result_avxf < MIN_ACCEPTED) { - count++; - result_avxd = compute_full_prob(&tc[b]); - result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f)); - } - else - result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); - #endif + cout << std::scientific << compute_full_prob(&tc) << " "<(&tc[b]); - } +#ifdef RUN_HYBRID +#define MIN_ACCEPTED 1e-28f + if (result_avxf < MIN_ACCEPTED) { + count++; + result_avxd = compute_full_prob(&tc[b]); + result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f)); + } + else + result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); +#endif - printf("AVX Read Time: %ld\n", aggregateTimeRead); - printf("AVX Compute Time: %ld\n", aggregateTimeCompute); - printf("AVX Write Time: %ld\n", aggregateTimeWrite); - printf("AVX Total Time: %ld\n", aggregateTimeRead + aggregateTimeCompute + aggregateTimeWrite); - printf("# Double called: %d\n", count); +#ifndef RUN_HYBRID + result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); +#endif - return 0; + } + gettimeofday(&end, NULL); + aggregateTimeCompute += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)); + + gettimeofday(&start, NULL); + for (int b=0;b #include +#include +#include +#include +#include + #define MROWS 500 #define MCOLS 1000 @@ -127,18 +132,20 @@ struct Context typedef struct { - int rslen, haplen, *q, *i, *d, *c; + int rslen, haplen; + /*int *q, *i, *d, *c;*/ + int q[MROWS], i[MROWS], d[MROWS], c[MROWS]; char *hap, *rs; - int *ihap; - int *irs; + int *ihap; + int *irs; } testcase; template std::string to_string(T obj) { - stringstream ss; - string ret_string; + std::stringstream ss; + std::string ret_string; ss.clear(); ss << std::scientific << obj; ss >> ret_string; @@ -148,66 +155,8 @@ std::string to_string(T obj) void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true); int normalize(char c); - - -int read_testcase(testcase *tc) -{ - char *q, *i, *d, *c, *line = NULL; - int _q, _i, _d, _c; - int x, size = 0; - ssize_t read; - - read = getline(&line, (size_t *) &size, stdin); - if (read == -1) - return -1; - - tc->hap = (char *) malloc(size); - tc->rs = (char *) malloc(size); - q = (char *) malloc(size); - i = (char *) malloc(size); - d = (char *) malloc(size); - c = (char *) malloc(size); - tc->ihap = (int *) malloc(size*sizeof(int)); - tc->irs = (int *) malloc(size*sizeof(int)); - - - if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6) - return -1; - - tc->haplen = strlen(tc->hap); - tc->rslen = strlen(tc->rs); - tc->q = (int *) malloc(sizeof(int) * tc->rslen); - tc->i = (int *) malloc(sizeof(int) * tc->rslen); - tc->d = (int *) malloc(sizeof(int) * tc->rslen); - tc->c = (int *) malloc(sizeof(int) * tc->rslen); - - tc->ihap = (int *) malloc(tc->haplen*sizeof(int)); - tc->irs = (int *) malloc(tc->rslen*sizeof(int)); - - for (x = 0; x < tc->rslen; x++) - { - _q = normalize(q[x]); - _i = normalize(i[x]); - _d = normalize(d[x]); - _c = normalize(c[x]); - tc->q[x] = (_q < 6) ? 6 : _q; - tc->i[x] = _i; - tc->d[x] = _d; - tc->c[x] = _c; - tc->irs[x] = tc->rs[x]; - } - - for (x = 0; x < tc->haplen; x++) - tc->ihap[x] = tc->hap[x]; - - free(q); - free(i); - free(d); - free(c); - free(line); - - return 0; -} +int read_testcase(testcase *tc, FILE* ifp); +int read_mod_testcase(std::ifstream& fptr, testcase* tc, bool reformat=false); #define NUM_DISTINCT_CHARS 5 #define AMBIG_CHAR 4 From 90938b86100dcfa8c63f26ac7b0c6ca6ec0f0dad Mon Sep 17 00:00:00 2001 From: Karthik Gururaj Date: Thu, 16 Jan 2014 19:58:04 -0800 Subject: [PATCH 4/6] Minor typo in comments fixed --- ...e_sting_utils_pairhmm_JNILoglessPairHMM.cc | 78 +------------------ 1 file changed, 1 insertion(+), 77 deletions(-) diff --git a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc index b8298161e..852caa00d 100644 --- a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc +++ b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc @@ -316,7 +316,7 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai jdouble* likelihoodDoubleArray = (jdouble*)GET_DOUBLE_ARRAY_ELEMENTS(likelihoodArray, &is_copy); assert(likelihoodDoubleArray && "likelihoodArray is NULL"); assert(env->GetArrayLength(likelihoodArray) == numTestCases); -#pragma omp parallel for schedule (guided,2) private(tc_idx) num_threads(maxNumThreadsToUse) +#pragma omp parallel for schedule (dynamic,10) private(tc_idx) num_threads(maxNumThreadsToUse) for(tc_idx=0;tc_idx(&(tc_array[tc_idx])); @@ -347,79 +347,3 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai tc_array.clear(); } - - - - /* - protected final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, - final byte[] readBases, - final byte[] readQuals, - final byte[] insertionGOP, - final byte[] deletionGOP, - final byte[] overallGCP, - final boolean recacheReadValues, - final byte[] nextHaploytpeBases) { - - paddedReadLength = readBases.length + 1; - paddedHaplotypeLength = haplotypeBases.length + 1; - double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, 0, true, 0); - if ( ! MathUtils.goodLog10Probability(result) ) - throw new IllegalStateException("PairHMM Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f", Arrays.toString(haplotypeBases), Arrays.toString(readBases), result)); - // Warning: Careful if using the PairHMM in parallel! (this update has to be taken care of). - // Warning: This assumes no downstream modification of the haplotype bases (saves us from copying the array). It is okay for the haplotype caller and the Unified Genotyper. - // KG: seems pointless is not being used anywhere - previousHaplotypeBases = haplotypeBases; - return result; - return 0; - }*/ - -//public PerReadAlleleLikelihoodMap computeLikelihoods(final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap) - //{ - ////PerReadAlleleLikelihoodMap retValue = super.computeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap); - //// (re)initialize the pairHMM only if necessary - //final int readMaxLength = findMaxReadLength(reads); - //final int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap); - //if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) - //{ initialize(readMaxLength, haplotypeMaxLength); } - - //if ( ! initialized ) - //throw new IllegalStateException("Must call initialize before calling jniComputeLikelihoods"); - //final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); - //jniComputeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap, likelihoodMap); - //for(GATKSAMRecord read : reads){ - //final byte[] readBases = read.getReadBases(); - //final byte[] readQuals = read.getBaseQualities(); - //final byte[] readInsQuals = read.getBaseInsertionQualities(); - //final byte[] readDelQuals = read.getBaseDeletionQualities(); - //final byte[] overallGCP = GCPArrayMap.get(read); - - //// peak at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation) - //byte[] currentHaplotypeBases = null; - //boolean isFirstHaplotype = true; - //Allele currentAllele = null; - //double log10l; - //for (final Allele allele : alleleHaplotypeMap.keySet()){ - //final Haplotype haplotype = alleleHaplotypeMap.get(allele); - //final byte[] nextHaplotypeBases = haplotype.getBases(); - //if (currentHaplotypeBases != null) { - //log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, - //readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases); - //likelihoodMap.add(read, currentAllele, log10l); - //} - //// update the current haplotype - //currentHaplotypeBases = nextHaplotypeBases; - //currentAllele = allele; - //} - //// process the final haplotype - //if (currentHaplotypeBases != null) { - - //// there is no next haplotype, so pass null for nextHaplotypeBases. - //log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, - //readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, null); - //likelihoodMap.add(read, currentAllele, log10l); - //} - //} - //if(debug) - //debugClose(); - //return likelihoodMap; - //} From 532485ca599cc38fdf9c92a61be51a080f268c18 Mon Sep 17 00:00:00 2001 From: Karthik Gururaj Date: Thu, 16 Jan 2014 20:26:41 -0800 Subject: [PATCH 5/6] Removed unnecessary files --- CHEATSHEET | 2 -- README | 41 ----------------------------------------- 2 files changed, 43 deletions(-) delete mode 100644 CHEATSHEET delete mode 100644 README diff --git a/CHEATSHEET b/CHEATSHEET deleted file mode 100644 index f61bfdd7f..000000000 --- a/CHEATSHEET +++ /dev/null @@ -1,2 +0,0 @@ -/opt/intel/bin/icc pairhmm-template-main.cc -o pairhmm-template-main.o -O3 -W -Wall -lm -xAVX; ./pairhmm-template-main.o < mediumTest.in > med.out; head -n -5 med.out > medium.out; lua checkLikelihoods.lua medium.out ~/hmm/mediumTest.out 6e-5 - diff --git a/README b/README deleted file mode 100644 index e16df5a8d..000000000 --- a/README +++ /dev/null @@ -1,41 +0,0 @@ -How to compile: -/opt/intel/bin/icc pairhmm-template-main.cc -o pairhmm-template-main.o -O3 -W -Wall -lm -xAVX - ------------------------------------- -How to run: -./pairhmm-template-main.o < ./test_data/largeTest.in > large.out -./pairhmm-template-main.o < ./test_data/mediumTest.in > medium.out -------------------------------------- -Results: -------------------------------------- -MEDIUM FLOAT: -AVX Read Time: 8019020 -AVX Compute Time: 29270238 -AVX Write Time: 242170 -AVX Total Time: 37531428 -# Double called: 0 -------------------------------------- -MEDIUM HYBRID: -AVX Read Time: 7596629 -AVX Compute Time: 29979220 -AVX Write Time: 248485 -AVX Total Time: 37824334 -# Double called: 20329 -------------------------------------- -LARGE FLOAT: -AVX Read Time: 111888586 -AVX Compute Time: 408131806 -AVX Write Time: 3477892 -AVX Total Time: 523498284 -# Double called: 0 -------------------------------------- -LARGE HYBRID: -AVX Read Time: 109728994 -AVX Compute Time: 419475317 -AVX Write Time: 3575364 -AVX Total Time: 532779675 -# Double called: 310042 - - - - From e6c6f8e313525e833c65b913d0fd5031cb855f19 Mon Sep 17 00:00:00 2001 From: Karthik Gururaj Date: Thu, 16 Jan 2014 20:28:50 -0800 Subject: [PATCH 6/6] Renamed directory --- .gitignore => PairHMM_JNI/.gitignore | 0 Makefile => PairHMM_JNI/Makefile | 0 convert_char.cc => PairHMM_JNI/convert_char.cc | 0 define-double.h => PairHMM_JNI/define-double.h | 0 define-float.h => PairHMM_JNI/define-float.h | 0 hmm_mask.cc => PairHMM_JNI/hmm_mask.cc | 0 .../org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc | 0 .../org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h | 0 pairhmm-1-base.cc => PairHMM_JNI/pairhmm-1-base.cc | 0 .../pairhmm-template-kernel.cc | 0 pairhmm-template-main.cc => PairHMM_JNI/pairhmm-template-main.cc | 0 shift_template.c => PairHMM_JNI/shift_template.c | 0 template.h => PairHMM_JNI/template.h | 0 13 files changed, 0 insertions(+), 0 deletions(-) rename .gitignore => PairHMM_JNI/.gitignore (100%) rename Makefile => PairHMM_JNI/Makefile (100%) rename convert_char.cc => PairHMM_JNI/convert_char.cc (100%) rename define-double.h => PairHMM_JNI/define-double.h (100%) rename define-float.h => PairHMM_JNI/define-float.h (100%) rename hmm_mask.cc => PairHMM_JNI/hmm_mask.cc (100%) rename org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc => PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc (100%) rename org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h => PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h (100%) rename pairhmm-1-base.cc => PairHMM_JNI/pairhmm-1-base.cc (100%) rename pairhmm-template-kernel.cc => PairHMM_JNI/pairhmm-template-kernel.cc (100%) rename pairhmm-template-main.cc => PairHMM_JNI/pairhmm-template-main.cc (100%) rename shift_template.c => PairHMM_JNI/shift_template.c (100%) rename template.h => PairHMM_JNI/template.h (100%) diff --git a/.gitignore b/PairHMM_JNI/.gitignore similarity index 100% rename from .gitignore rename to PairHMM_JNI/.gitignore diff --git a/Makefile b/PairHMM_JNI/Makefile similarity index 100% rename from Makefile rename to PairHMM_JNI/Makefile diff --git a/convert_char.cc b/PairHMM_JNI/convert_char.cc similarity index 100% rename from convert_char.cc rename to PairHMM_JNI/convert_char.cc diff --git a/define-double.h b/PairHMM_JNI/define-double.h similarity index 100% rename from define-double.h rename to PairHMM_JNI/define-double.h diff --git a/define-float.h b/PairHMM_JNI/define-float.h similarity index 100% rename from define-float.h rename to PairHMM_JNI/define-float.h diff --git a/hmm_mask.cc b/PairHMM_JNI/hmm_mask.cc similarity index 100% rename from hmm_mask.cc rename to PairHMM_JNI/hmm_mask.cc diff --git a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc similarity index 100% rename from org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc rename to PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc diff --git a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h similarity index 100% rename from org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h rename to PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h diff --git a/pairhmm-1-base.cc b/PairHMM_JNI/pairhmm-1-base.cc similarity index 100% rename from pairhmm-1-base.cc rename to PairHMM_JNI/pairhmm-1-base.cc diff --git a/pairhmm-template-kernel.cc b/PairHMM_JNI/pairhmm-template-kernel.cc similarity index 100% rename from pairhmm-template-kernel.cc rename to PairHMM_JNI/pairhmm-template-kernel.cc diff --git a/pairhmm-template-main.cc b/PairHMM_JNI/pairhmm-template-main.cc similarity index 100% rename from pairhmm-template-main.cc rename to PairHMM_JNI/pairhmm-template-main.cc diff --git a/shift_template.c b/PairHMM_JNI/shift_template.c similarity index 100% rename from shift_template.c rename to PairHMM_JNI/shift_template.c diff --git a/template.h b/PairHMM_JNI/template.h similarity index 100% rename from template.h rename to PairHMM_JNI/template.h