diff --git a/.gitignore b/.gitignore index 65f111587..ede62ef81 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,16 @@ dump/ lib/ out/ /atlassian-ide-plugin.xml +kg_tmp/ +maven-ant-tasks-2.1.3.jar +null-sequenceGraph.25.0.0.raw_readthreading_graph.dot +null-sequenceGraph.25.0.1.cleaned_readthreading_graph.dot +null-sequenceGraph.25.0.1.initial_seqgraph.dot +null-sequenceGraph.3.0.0.raw_readthreading_graph.dot +null-sequenceGraph.3.0.1.cleaned_readthreading_graph.dot +null-sequenceGraph.3.0.1.initial_seqgraph.dot +org/ +package-list +resources/ +velocity.log + diff --git a/PairHMM_JNI/.gitignore b/PairHMM_JNI/.gitignore index 9722ad970..ce903a6fa 100644 --- a/PairHMM_JNI/.gitignore +++ b/PairHMM_JNI/.gitignore @@ -1,6 +1,6 @@ .svn *.o -#*.so +*.so tests .deps hmm_Mohammad @@ -8,3 +8,6 @@ pairhmm-template-main *.swp checker reformat +subdir_checkout.sh +avx/ +sse/ diff --git a/PairHMM_JNI/LoadTimeInitializer.cc b/PairHMM_JNI/LoadTimeInitializer.cc new file mode 100644 index 000000000..534043371 --- /dev/null +++ b/PairHMM_JNI/LoadTimeInitializer.cc @@ -0,0 +1,76 @@ +#include "LoadTimeInitializer.h" +#include "utils.h" +using namespace std; + +LoadTimeInitializer g_load_time_initializer; + +LoadTimeInitializer::LoadTimeInitializer() //will be called when library is loaded +{ + ConvertChar::init(); + m_sumNumReads = 0; + m_sumSquareNumReads = 0; + m_sumNumHaplotypes = 0; + m_sumSquareNumHaplotypes = 0; + m_sumNumTestcases = 0; + m_sumSquareNumTestcases = 0; + m_maxNumTestcases = 0; + m_num_invocations = 0; + m_compute_time = 0; + m_data_transfer_time = 0; + + m_filename_to_fptr.clear(); + + initialize_function_pointers(); + cout.flush(); +} + +void LoadTimeInitializer::print_profiling() +{ + double mean_val; + cout << "Compute time "<::iterator mI = m_filename_to_fptr.find(filename); + ofstream* fptr = 0; + if(mI == m_filename_to_fptr.end()) + { + m_filename_to_fptr[filename] = new ofstream(); + fptr = m_filename_to_fptr[filename]; + fptr->open(filename.c_str(), to_append ? ios::app : ios::out); + assert(fptr->is_open()); + } + else + fptr = (*mI).second; + //ofstream fptr; + //fptr.open(filename.c_str(), to_append ? ofstream::app : ofstream::out); + (*fptr) << s; + if(add_newline) + (*fptr) << "\n"; + //fptr.close(); +} +void LoadTimeInitializer::debug_close() +{ + for(map::iterator mB = m_filename_to_fptr.begin(), mE = m_filename_to_fptr.end(); + mB != mE;mB++) + { + (*mB).second->close(); + delete (*mB).second; + } + m_filename_to_fptr.clear(); +} + diff --git a/PairHMM_JNI/LoadTimeInitializer.h b/PairHMM_JNI/LoadTimeInitializer.h new file mode 100644 index 000000000..cbda057a8 --- /dev/null +++ b/PairHMM_JNI/LoadTimeInitializer.h @@ -0,0 +1,37 @@ +#ifndef LOAD_TIME_INITIALIZER_H +#define LOAD_TIME_INITIALIZER_H +#include "headers.h" +#include +class LoadTimeInitializer +{ + public: + LoadTimeInitializer(); //will be called when library is loaded + void print_profiling(); + void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true); + void debug_close(); + + jfieldID m_readBasesFID; + jfieldID m_readQualsFID; + jfieldID m_insertionGOPFID; + jfieldID m_deletionGOPFID; + jfieldID m_overallGCPFID; + jfieldID m_haplotypeBasesFID; + //used to compute avg, variance of #testcases + double m_sumNumReads; + double m_sumSquareNumReads; + double m_sumNumHaplotypes; + double m_sumSquareNumHaplotypes; + double m_sumNumTestcases; + double m_sumSquareNumTestcases; + unsigned m_maxNumTestcases; + unsigned m_num_invocations; + //timing + uint64_t m_compute_time; + uint64_t m_data_transfer_time; + private: + std::map m_filename_to_fptr; +}; +extern LoadTimeInitializer g_load_time_initializer; + + +#endif diff --git a/PairHMM_JNI/Makefile b/PairHMM_JNI/Makefile index 70bf96dd3..9b461a833 100644 --- a/PairHMM_JNI/Makefile +++ b/PairHMM_JNI/Makefile @@ -1,5 +1,5 @@ -OMPCFLAGS=-fopenmp -#OMPLDFLAGS=-lgomp +#OMPCFLAGS=-fopenmp +#OMPLFLAGS=-fopenmp #-openmp-link static #CFLAGS=-O2 -std=c++11 -W -Wall -march=corei7-avx -Wa,-q -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas #CFLAGS=-O2 -W -Wall -march=corei7 -mfpmath=sse -msse4.2 -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas @@ -7,40 +7,59 @@ OMPCFLAGS=-fopenmp JAVA_ROOT=/opt/jdk1.7.0_25/ JNI_COMPILATION_FLAGS=-D_REENTRANT -fPIC -I${JAVA_ROOT}/include -I${JAVA_ROOT}/include/linux -CFLAGS=-O3 -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas -xAVX - -CXXFLAGS=$(CFLAGS) +COMMON_COMPILATION_FLAGS=$(JNI_COMPILATION_FLAGS) -O3 -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas CC=icc CXX=icc -LDFLAGS=-lm $(OMPLDFLAGS) +LDFLAGS=-lm -lrt $(OMPLDFLAGS) -#BIN:=pairhmm-1-base #pairhmm-2-omp pairhmm-3-hybrid-float-double pairhmm-4-hybrid-diagonal pairhmm-5-hybrid-diagonal-homogeneus pairhmm-6-onlythreediags pairhmm-7-presse pairhmm-8-sse #pairhmm-dev -BIN:=libJNILoglessPairHMM.so pairhmm-template-main checker +BIN=libJNILoglessPairHMM.so pairhmm-template-main checker +#BIN=checker -#SOURCES=pairhmm-1-base.cc input.cc -LIBSOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc convert_char.cc -SOURCES=$(LIBSOURCES) pairhmm-template-main.cc pairhmm-1-base.cc -LIBOBJECTS=$(LIBSOURCES:.cc=.o) DEPDIR=.deps DF=$(DEPDIR)/$(*).d +#Common across libJNI and sandbox +COMMON_SOURCES=utils.cc avx_function_instantiations.cc baseline.cc sse_function_instantiations.cc LoadTimeInitializer.cc +#Part of libJNI +LIBSOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc $(COMMON_SOURCES) +SOURCES=$(LIBSOURCES) pairhmm-template-main.cc pairhmm-1-base.cc +LIBOBJECTS=$(LIBSOURCES:.cc=.o) +COMMON_OBJECTS=$(COMMON_SOURCES:.cc=.o) + + +#No vectorization for these files +NO_VECTOR_SOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc pairhmm-template-main.cc pairhmm-1-base.cc utils.cc baseline.cc LoadTimeInitializer.cc +#Use -xAVX for these files +AVX_SOURCES=avx_function_instantiations.cc +#Use -xSSE4.2 for these files +SSE_SOURCES=sse_function_instantiations.cc + +NO_VECTOR_OBJECTS=$(NO_VECTOR_SOURCES:.cc=.o) +AVX_OBJECTS=$(AVX_SOURCES:.cc=.o) +SSE_OBJECTS=$(SSE_SOURCES:.cc=.o) +$(NO_VECTOR_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) +$(AVX_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) -xAVX +$(SSE_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) -xSSE4.2 +OBJECTS=$(NO_VECTOR_OBJECTS) $(AVX_OBJECTS) $(SSE_OBJECTS) + all: $(BIN) -include $(addprefix $(DEPDIR)/,$(SOURCES:.cc=.d)) -checker: pairhmm-1-base.o convert_char.o - $(CXX) $(OMPCFLAGS) -o $@ $^ $(LDFLAGS) +checker: pairhmm-1-base.o $(COMMON_OBJECTS) + $(CXX) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS) -pairhmm-template-main: pairhmm-template-main.o convert_char.o - $(CXX) $(OMPCFLAGS) -o $@ $^ $(LDFLAGS) +pairhmm-template-main: pairhmm-template-main.o $(COMMON_OBJECTS) + $(CXX) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS) libJNILoglessPairHMM.so: $(LIBOBJECTS) - $(CXX) $(OMPCFLAGS) -shared -o $@ $(LIBOBJECTS) + $(CXX) $(OMPLFLAGS) -shared -o $@ $(LIBOBJECTS) ${LDFLAGS} -%.o: %.cc + +$(OBJECTS): %.o: %.cc @mkdir -p $(DEPDIR) - $(COMPILE.cpp) -MMD -MF $(DF) $(JNI_COMPILATION_FLAGS) $(CXXFLAGS) $(OUTPUT_OPTION) $< + $(CXX) -c -MMD -MF $(DF) $(CXXFLAGS) $(OUTPUT_OPTION) $< clean: diff --git a/PairHMM_JNI/avx_function_instantiations.cc b/PairHMM_JNI/avx_function_instantiations.cc new file mode 100644 index 000000000..4118fc5cf --- /dev/null +++ b/PairHMM_JNI/avx_function_instantiations.cc @@ -0,0 +1,19 @@ +#include "template.h" + +#undef SIMD_TYPE +#undef SIMD_TYPE_SSE + +#define SIMD_TYPE avx +#define SIMD_TYPE_AVX + +#include "define-float.h" +#include "shift_template.c" +#include "pairhmm-template-kernel.cc" + +#include "define-double.h" +#include "shift_template.c" +#include "pairhmm-template-kernel.cc" + +template double compute_full_prob_avxd(testcase* tc, double* nextlog); +template float compute_full_prob_avxs(testcase* tc, float* nextlog); + diff --git a/PairHMM_JNI/baseline.cc b/PairHMM_JNI/baseline.cc new file mode 100644 index 000000000..b953c4436 --- /dev/null +++ b/PairHMM_JNI/baseline.cc @@ -0,0 +1,85 @@ +#include "headers.h" +#include "template.h" + +template +NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL) +{ + int r, c; + int ROWS = tc->rslen + 1; + int COLS = tc->haplen + 1; + + Context ctx; + + NUMBER M[MROWS][MCOLS]; + NUMBER X[MROWS][MCOLS]; + NUMBER Y[MROWS][MCOLS]; + NUMBER p[MROWS][6]; + + p[0][MM] = ctx._(0.0); + p[0][GapM] = ctx._(0.0); + p[0][MX] = ctx._(0.0); + p[0][XX] = ctx._(0.0); + p[0][MY] = ctx._(0.0); + p[0][YY] = ctx._(0.0); + for (r = 1; r < ROWS; r++) + { + int _i = tc->i[r-1] & 127; + int _d = tc->d[r-1] & 127; + int _c = tc->c[r-1] & 127; + p[r][MM] = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127]; + p[r][GapM] = ctx._(1.0) - ctx.ph2pr[_c]; + p[r][MX] = ctx.ph2pr[_i]; + p[r][XX] = ctx.ph2pr[_c]; + p[r][MY] = ctx.ph2pr[_d]; + p[r][YY] = ctx.ph2pr[_c]; + //p[r][MY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d]; + //p[r][YY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c]; + } + + for (c = 0; c < COLS; c++) + { + M[0][c] = ctx._(0.0); + X[0][c] = ctx._(0.0); + Y[0][c] = ctx.INITIAL_CONSTANT / (tc->haplen); + } + + for (r = 1; r < ROWS; r++) + { + M[r][0] = ctx._(0.0); + X[r][0] = X[r-1][0] * p[r][XX]; + Y[r][0] = ctx._(0.0); + } + + NUMBER result = ctx._(0.0); + + for (r = 1; r < ROWS; r++) + for (c = 1; c < COLS; c++) + { + char _rs = tc->rs[r-1]; + char _hap = tc->hap[c-1]; + int _q = tc->q[r-1] & 127; + NUMBER distm = ctx.ph2pr[_q]; + if (_rs == _hap || _rs == 'N' || _hap == 'N') + distm = ctx._(1.0) - distm; + else + distm = distm/3; + M[r][c] = distm * (M[r-1][c-1] * p[r][MM] + X[r-1][c-1] * p[r][GapM] + Y[r-1][c-1] * p[r][GapM]); + X[r][c] = M[r-1][c] * p[r][MX] + X[r-1][c] * p[r][XX]; + Y[r][c] = M[r][c-1] * p[r][MY] + Y[r][c-1] * p[r][YY]; + } + + for (c = 0; c < COLS; c++) + { + result += M[ROWS-1][c] + X[ROWS-1][c]; + } + + if (before_last_log != NULL) + *before_last_log = result; + + return result; + //return ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT; +} + +template double compute_full_prob(testcase* tc, double* nextbuf); +template float compute_full_prob(testcase* tc, float* nextbuf); + diff --git a/PairHMM_JNI/define-double.h b/PairHMM_JNI/define-double.h index 5d31741d6..502b919fe 100644 --- a/PairHMM_JNI/define-double.h +++ b/PairHMM_JNI/define-double.h @@ -18,35 +18,36 @@ #undef MASK_TYPE #undef MASK_ALL_ONES - #undef SET_VEC_ZERO - #undef VEC_OR - #undef VEC_ADD - #undef VEC_SUB - #undef VEC_MUL - #undef VEC_DIV - #undef VEC_BLEND - #undef VEC_BLENDV - #undef VEC_CAST_256_128 - #undef VEC_EXTRACT_128 - #undef VEC_EXTRACT_UNIT - #undef VEC_SET1_VAL128 - #undef VEC_MOVE - #undef VEC_CAST_128_256 - #undef VEC_INSERT_VAL - #undef VEC_CVT_128_256 - #undef VEC_SET1_VAL - #undef VEC_POPCVT_CHAR - #undef VEC_LDPOPCVT_CHAR - #undef VEC_CMP_EQ - #undef VEC_SET_LSE - #undef SHIFT_HAP - #undef print256b + #undef SET_VEC_ZERO(__vec) + #undef VEC_OR(__v1, __v2) + #undef VEC_ADD(__v1, __v2) + #undef VEC_SUB(__v1, __v2) + #undef VEC_MUL(__v1, __v2) + #undef VEC_DIV(__v1, __v2) + #undef VEC_BLEND(__v1, __v2, __mask) + #undef VEC_BLENDV(__v1, __v2, __maskV) + #undef VEC_CAST_256_128(__v1) + #undef VEC_EXTRACT_128(__v1, __im) + #undef VEC_EXTRACT_UNIT(__v1, __im) + #undef VEC_SET1_VAL128(__val) + #undef VEC_MOVE(__v1, __val) + #undef VEC_CAST_128_256(__v1) + #undef VEC_INSERT_VAL(__v1, __val, __pos) + #undef VEC_CVT_128_256(__v1) + #undef VEC_SET1_VAL(__val) + #undef VEC_POPCVT_CHAR(__ch) + #undef VEC_LDPOPCVT_CHAR(__addr) + #undef VEC_CMP_EQ(__v1, __v2) + #undef VEC_SET_LSE(__val) + #undef SHIFT_HAP(__v1, __val) + #undef print256b(__v1) #undef MASK_VEC - #undef VEC_SSE_TO_AVX - #undef VEC_SHIFT_LEFT_1BIT + #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) + #undef VEC_SHIFT_LEFT_1BIT(__vs) #undef MASK_ALL_ONES - #undef COMPARE_VECS + #undef COMPARE_VECS(__v1, __v2) #undef _256_INT_TYPE + #undef BITMASK_VEC #endif #define PRECISION d @@ -83,7 +84,7 @@ #define VEC_MUL(__v1, __v2) \ _mm256_mul_pd(__v1, __v2) -#define VEC_DIV(__v1, __v2) \ +#define VEC_DIV(__v1, __v2) \ _mm256_div_pd(__v1, __v2) #define VEC_BLEND(__v1, __v2, __mask) \ @@ -156,3 +157,32 @@ } \ } \ } + +class BitMaskVec_double { + + MASK_VEC low_, high_ ; + _256_TYPE combined_ ; + +public: + + inline MASK_TYPE& getLowEntry(int index) { + return low_.masks[index] ; + } + inline MASK_TYPE& getHighEntry(int index) { + return high_.masks[index] ; + } + + inline const _256_TYPE& getCombinedMask() { + VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ; + + return combined_ ; + } + + inline void shift_left_1bit() { + VEC_SHIFT_LEFT_1BIT(low_.vec) ; + VEC_SHIFT_LEFT_1BIT(high_.vec) ; + } + +} ; + +#define BITMASK_VEC BitMaskVec_double diff --git a/PairHMM_JNI/define-float.h b/PairHMM_JNI/define-float.h index 0e2822b2d..3cc57ec38 100644 --- a/PairHMM_JNI/define-float.h +++ b/PairHMM_JNI/define-float.h @@ -47,6 +47,7 @@ #undef MASK_ALL_ONES #undef COMPARE_VECS(__v1, __v2) #undef _256_INT_TYPE + #undef BITMASK_VEC #endif #define PRECISION s @@ -84,7 +85,7 @@ #define VEC_MUL(__v1, __v2) \ _mm256_mul_ps(__v1, __v2) -#define VEC_DIV(__v1, __v2) \ +#define VEC_DIV(__v1, __v2) \ _mm256_div_ps(__v1, __v2) #define VEC_BLEND(__v1, __v2, __mask) \ @@ -133,7 +134,7 @@ _mm256_set_ps(zero, zero, zero, zero, zero, zero, zero, __val); #define SHIFT_HAP(__v1, __val) \ - _vector_shift_lasts(__v1, __val.f); + _vector_shift_lastavxs(__v1, __val.f); #define print256b(__v1) \ print256bFP(__v1) @@ -157,3 +158,31 @@ } \ } +class BitMaskVec_float { + + MASK_VEC low_, high_ ; + _256_TYPE combined_ ; + +public: + + inline MASK_TYPE& getLowEntry(int index) { + return low_.masks[index] ; + } + inline MASK_TYPE& getHighEntry(int index) { + return high_.masks[index] ; + } + + inline const _256_TYPE& getCombinedMask() { + VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ; + + return combined_ ; + } + + inline void shift_left_1bit() { + VEC_SHIFT_LEFT_1BIT(low_.vec) ; + VEC_SHIFT_LEFT_1BIT(high_.vec) ; + } + +} ; + +#define BITMASK_VEC BitMaskVec_float diff --git a/PairHMM_JNI/define-sse-double.h b/PairHMM_JNI/define-sse-double.h new file mode 100644 index 000000000..a30b2e5f5 --- /dev/null +++ b/PairHMM_JNI/define-sse-double.h @@ -0,0 +1,153 @@ +#ifdef PRECISION + #undef PRECISION + #undef MAIN_TYPE + #undef MAIN_TYPE_SIZE + #undef UNION_TYPE + #undef IF_128 + #undef IF_MAIN_TYPE + #undef SHIFT_CONST1 + #undef SHIFT_CONST2 + #undef SHIFT_CONST3 + #undef _128_TYPE + #undef _256_TYPE + #undef AVX_LENGTH + #undef MAVX_COUNT + #undef HAP_TYPE + #undef MASK_TYPE + #undef MASK_ALL_ONES + + #undef VEC_EXTRACT_UNIT(__v1, __im) + #undef VEC_INSERT_UNIT(__v1,__ins,__im) + #undef SET_VEC_ZERO(__vec) + #undef VEC_OR(__v1, __v2) + #undef VEC_ADD(__v1, __v2) + #undef VEC_SUB(__v1, __v2) + #undef VEC_MUL(__v1, __v2) + #undef VEC_DIV(__v1, __v2) + #undef VEC_BLEND(__v1, __v2, __mask) + #undef VEC_BLENDV(__v1, __v2, __maskV) + #undef VEC_CAST_256_128(__v1) + #undef VEC_EXTRACT_128(__v1, __im) + #undef VEC_EXTRACT_UNIT(__v1, __im) + #undef VEC_SET1_VAL128(__val) + #undef VEC_MOVE(__v1, __val) + #undef VEC_CAST_128_256(__v1) + #undef VEC_INSERT_VAL(__v1, __val, __pos) + #undef VEC_CVT_128_256(__v1) + #undef VEC_SET1_VAL(__val) + #undef VEC_POPCVT_CHAR(__ch) + #undef VEC_LDPOPCVT_CHAR(__addr) + #undef VEC_CMP_EQ(__v1, __v2) + #undef VEC_SET_LSE(__val) + #undef SHIFT_HAP(__v1, __val) + #undef print256b(__v1) + #undef MASK_VEC + #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) + #undef VEC_SHIFT_LEFT_1BIT(__vs) + #undef MASK_ALL_ONES + #undef COMPARE_VECS(__v1, __v2) + #undef _256_INT_TYPE + #undef BITMASK_VEC +#endif + +#define SSE +#define PRECISION d + +#define MAIN_TYPE double +#define MAIN_TYPE_SIZE 64 +#define UNION_TYPE mix_D128 +#define IF_128 IF_128d +#define IF_MAIN_TYPE IF_64 +#define SHIFT_CONST1 1 +#define SHIFT_CONST2 8 +#define SHIFT_CONST3 0 +#define _128_TYPE __m128d +#define _256_TYPE __m128d +#define _256_INT_TYPE __m128i +#define AVX_LENGTH 2 +#define MAVX_COUNT (MROWS+3)/AVX_LENGTH +#define HAP_TYPE __m128i +#define MASK_TYPE uint64_t +#define MASK_ALL_ONES 0xFFFFFFFFFFFFFFFFL +#define MASK_VEC MaskVec_D + +#define VEC_EXTRACT_UNIT(__v1, __im) \ + _mm_extract_epi64(__v1, __im) + +#define VEC_INSERT_UNIT(__v1,__ins,__im) \ + _mm_insert_epi64(__v1,__ins,__im) + +#define VEC_OR(__v1, __v2) \ + _mm_or_pd(__v1, __v2) + +#define VEC_ADD(__v1, __v2) \ + _mm_add_pd(__v1, __v2) + +#define VEC_SUB(__v1, __v2) \ + _mm_sub_pd(__v1, __v2) + +#define VEC_MUL(__v1, __v2) \ + _mm_mul_pd(__v1, __v2) + +#define VEC_DIV(__v1, __v2) \ + _mm_div_pd(__v1, __v2) + +#define VEC_CMP_EQ(__v1, __v2) \ + _mm_cmpeq_pd(__v1, __v2) + +#define VEC_BLEND(__v1, __v2, __mask) \ + _mm_blend_pd(__v1, __v2, __mask) + +#define VEC_BLENDV(__v1, __v2, __maskV) \ + _mm_blendv_pd(__v1, __v2, __maskV) + +#define SHIFT_HAP(__v1, __val) \ + __v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0) + +#define VEC_CVT_128_256(__v1) \ + _mm_cvtepi32_pd(__v1) + +#define VEC_SET1_VAL(__val) \ + _mm_set1_pd(__val) + +#define VEC_POPCVT_CHAR(__ch) \ + _mm_cvtepi32_pd(_mm_set1_epi32(__ch)) + +#define VEC_SET_LSE(__val) \ + _mm_set_pd(zero, __val); + +#define VEC_LDPOPCVT_CHAR(__addr) \ + _mm_cvtepi32_pd(_mm_loadu_si128((__m128i const *)__addr)) + +#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ + __vdst = _mm_castsi128_pd(_mm_set_epi64(__vsHigh, __vsLow)) + +#define VEC_SHIFT_LEFT_1BIT(__vs) \ + __vs = _mm_slli_epi64(__vs, 1) + + +class BitMaskVec_sse_double { + + MASK_VEC combined_ ; + +public: + + inline MASK_TYPE& getLowEntry(int index) { + return combined_.masks[index] ; + } + inline MASK_TYPE& getHighEntry(int index) { + return combined_.masks[AVX_LENGTH/2+index] ; + } + + inline const _256_TYPE& getCombinedMask() { + return combined_.vecf ; + } + + inline void shift_left_1bit() { + VEC_SHIFT_LEFT_1BIT(combined_.vec) ; + } + +} ; + +#define BITMASK_VEC BitMaskVec_sse_double + diff --git a/PairHMM_JNI/define-sse-float.h b/PairHMM_JNI/define-sse-float.h new file mode 100644 index 000000000..6612b28e6 --- /dev/null +++ b/PairHMM_JNI/define-sse-float.h @@ -0,0 +1,151 @@ +#ifdef PRECISION + #undef PRECISION + #undef MAIN_TYPE + #undef MAIN_TYPE_SIZE + #undef UNION_TYPE + #undef IF_128 + #undef IF_MAIN_TYPE + #undef SHIFT_CONST1 + #undef SHIFT_CONST2 + #undef SHIFT_CONST3 + #undef _128_TYPE + #undef _256_TYPE + #undef AVX_LENGTH + #undef MAVX_COUNT + #undef HAP_TYPE + #undef MASK_TYPE + #undef MASK_ALL_ONES + + #undef VEC_EXTRACT_UNIT(__v1, __im) + #undef VEC_INSERT_UNIT(__v1,__ins,__im) + #undef SET_VEC_ZERO(__vec) + #undef VEC_OR(__v1, __v2) + #undef VEC_ADD(__v1, __v2) + #undef VEC_SUB(__v1, __v2) + #undef VEC_MUL(__v1, __v2) + #undef VEC_DIV(__v1, __v2) + #undef VEC_BLEND(__v1, __v2, __mask) + #undef VEC_BLENDV(__v1, __v2, __maskV) + #undef VEC_CAST_256_128(__v1) + #undef VEC_EXTRACT_128(__v1, __im) + #undef VEC_EXTRACT_UNIT(__v1, __im) + #undef VEC_SET1_VAL128(__val) + #undef VEC_MOVE(__v1, __val) + #undef VEC_CAST_128_256(__v1) + #undef VEC_INSERT_VAL(__v1, __val, __pos) + #undef VEC_CVT_128_256(__v1) + #undef VEC_SET1_VAL(__val) + #undef VEC_POPCVT_CHAR(__ch) + #undef VEC_LDPOPCVT_CHAR(__addr) + #undef VEC_CMP_EQ(__v1, __v2) + #undef VEC_SET_LSE(__val) + #undef SHIFT_HAP(__v1, __val) + #undef print256b(__v1) + #undef MASK_VEC + #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) + #undef VEC_SHIFT_LEFT_1BIT(__vs) + #undef MASK_ALL_ONES + #undef COMPARE_VECS(__v1, __v2) + #undef _256_INT_TYPE + #undef BITMASK_VEC +#endif + +#define SSE +#define PRECISION s + +#define MAIN_TYPE float +#define MAIN_TYPE_SIZE 32 +#define UNION_TYPE mix_F128 +#define IF_128 IF_128f +#define IF_MAIN_TYPE IF_32 +#define SHIFT_CONST1 3 +#define SHIFT_CONST2 4 +#define SHIFT_CONST3 0 +#define _128_TYPE __m128 +#define _256_TYPE __m128 +#define _256_INT_TYPE __m128i +#define AVX_LENGTH 4 +#define MAVX_COUNT (MROWS+3)/AVX_LENGTH +#define HAP_TYPE UNION_TYPE +#define MASK_TYPE uint32_t +#define MASK_ALL_ONES 0xFFFFFFFF +#define MASK_VEC MaskVec_F + +#define VEC_EXTRACT_UNIT(__v1, __im) \ + _mm_extract_epi32(__v1, __im) + +#define VEC_INSERT_UNIT(__v1,__ins,__im) \ + _mm_insert_epi32(__v1,__ins,__im) + +#define VEC_OR(__v1, __v2) \ + _mm_or_ps(__v1, __v2) + +#define VEC_ADD(__v1, __v2) \ + _mm_add_ps(__v1, __v2) + +#define VEC_SUB(__v1, __v2) \ + _mm_sub_ps(__v1, __v2) + +#define VEC_MUL(__v1, __v2) \ + _mm_mul_ps(__v1, __v2) + +#define VEC_DIV(__v1, __v2) \ + _mm_div_ps(__v1, __v2) + +#define VEC_CMP_EQ(__v1, __v2) \ + _mm_cmpeq_ps(__v1, __v2) + +#define VEC_BLEND(__v1, __v2, __mask) \ + _mm_blend_ps(__v1, __v2, __mask) + +#define VEC_BLENDV(__v1, __v2, __maskV) \ + _mm_blendv_ps(__v1, __v2, __maskV) + +#define SHIFT_HAP(__v1, __val) \ + _vector_shift_lastsses(__v1, __val.f) + +#define VEC_CVT_128_256(__v1) \ + _mm_cvtepi32_ps(__v1.i) + +#define VEC_SET1_VAL(__val) \ + _mm_set1_ps(__val) + +#define VEC_POPCVT_CHAR(__ch) \ + _mm_cvtepi32_ps(_mm_set1_epi32(__ch)) + +#define VEC_SET_LSE(__val) \ + _mm_set_ps(zero, zero, zero, __val); + +#define VEC_LDPOPCVT_CHAR(__addr) \ + _mm_cvtepi32_ps(_mm_loadu_si128((__m128i const *)__addr)) + +#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ + __vdst = _mm_cvtpi32x2_ps(__vsLow, __vsHigh) + +#define VEC_SHIFT_LEFT_1BIT(__vs) \ + __vs = _mm_slli_epi32(__vs, 1) + +class BitMaskVec_sse_float { + + MASK_VEC combined_ ; + +public: + + inline MASK_TYPE& getLowEntry(int index) { + return combined_.masks[index] ; + } + inline MASK_TYPE& getHighEntry(int index) { + return combined_.masks[AVX_LENGTH/2+index] ; + } + + inline const _256_TYPE& getCombinedMask() { + return combined_.vecf ; + } + + inline void shift_left_1bit() { + VEC_SHIFT_LEFT_1BIT(combined_.vec) ; + } + +} ; + +#define BITMASK_VEC BitMaskVec_sse_float diff --git a/PairHMM_JNI/headers.h b/PairHMM_JNI/headers.h new file mode 100644 index 000000000..f502ff5ce --- /dev/null +++ b/PairHMM_JNI/headers.h @@ -0,0 +1,25 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + diff --git a/PairHMM_JNI/hmm_mask.cc b/PairHMM_JNI/hmm_mask.cc deleted file mode 100644 index c335a0160..000000000 --- a/PairHMM_JNI/hmm_mask.cc +++ /dev/null @@ -1,451 +0,0 @@ -#include -#include -#include -#include - - -// /usr/intel/pkgs/icc/13.0.0e/bin/icc -o ed -O3 ed.cpp -xAVX -openmp -openmp-link static - - -#include - -#include -#include -#include -#include -#include -#include - - -#include -#include -#include -#include -#include - - -#include -#include -#include -#include -#include -#include -#include "template.h" - -using namespace std ; - -#define CHECK_MASK_CORRECTNESS - -template -string getBinaryStr (T val, int numBitsToWrite) { - - ostringstream oss ; - uint64_t mask = ((T) 0x1) << (numBitsToWrite-1) ; - for (int i=numBitsToWrite-1; i >= 0; --i) { - oss << ((val & mask) >> i) ; - mask >>= 1 ; - } - return oss.str() ; -} - -int normalize(char c) -{ - return ((int) (c - 33)); -} - - -uint8_t ConvertChar::conversionTable[255] ; - -int read_testcase(testcase *tc, FILE* ifp) -{ - char *q, *i, *d, *c, *line = NULL; - int _q, _i, _d, _c; - int x, size = 0; - ssize_t read; - - read = getline(&line, (size_t *) &size, ifp == 0 ? stdin : ifp); - if (read == -1) - return -1; - - - tc->hap = (char *) malloc(size); - tc->rs = (char *) malloc(size); - q = (char *) malloc(size); - i = (char *) malloc(size); - d = (char *) malloc(size); - c = (char *) malloc(size); - - if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6) - return -1; - - tc->haplen = strlen(tc->hap); - tc->rslen = strlen(tc->rs); - assert(tc->rslen < MROWS); - tc->ihap = (int *) malloc(tc->haplen*sizeof(int)); - tc->irs = (int *) malloc(tc->rslen*sizeof(int)); - - //tc->q = (int *) malloc(sizeof(int) * tc->rslen); - //tc->i = (int *) malloc(sizeof(int) * tc->rslen); - //tc->d = (int *) malloc(sizeof(int) * tc->rslen); - //tc->c = (int *) malloc(sizeof(int) * tc->rslen); - - for (x = 0; x < tc->rslen; x++) - { - _q = normalize(q[x]); - _i = normalize(i[x]); - _d = normalize(d[x]); - _c = normalize(c[x]); - tc->q[x] = (_q < 6) ? 6 : _q; - tc->i[x] = _i; - tc->d[x] = _d; - tc->c[x] = _c; - tc->irs[x] = tc->rs[x]; - } - for (x = 0; x < tc->haplen; x++) - tc->ihap[x] = tc->hap[x]; - - - free(q); - free(i); - free(d); - free(c); - free(line); - - return 0; -} - - - - -#define ALIGN __attribute__ ((aligned(32))) - - -typedef union ALIGN { - //__m256i vi; - __m128 vf ; - __m128i vi ; - __m128d vd; - - uint8_t b[16]; - //uint16_t hw[8] ; - uint32_t w[4] ; - uint64_t dw[2] ; - - float f[4] ; - //double d[2] ; - -} v128 ; - -typedef union ALIGN { - __m256i vi; - __m256 vf ; - __m256d vd; - //__m128i vi128[2] ; - - uint8_t b[32]; - //uint16_t hw[16] ; - uint32_t w[8] ; - uint64_t dw[4] ; - - float f[8] ; - double d[4] ; - -} v256 ; - - -#define NUM_DISTINCT_CHARS 5 -#define AMBIG_CHAR 4 - -#define VEC_ENTRY_CNT 8 -#define VEC_LEN 256 -#define VTYPE vf -#define VTYPEI vi -#define VENTRY f -#define VENTRYI w -#define MTYPE uint32_t -#define MASK_ALL_ONES 0xFFFFFFFF - - -#define VECTOR v256 -#define VECTOR_SSE v128 - -#define SET_VEC_ZERO(__vec) \ - __vec= _mm256_setzero_ps() - -#define SET_VEC_ONES(__vec) \ - __vec = _mm256_set1_epi32(0xFFFFFFFF) - -#define VEC_OR(__v1, __v2) \ - _mm256_or_ps(__v1, __v2) - -#define VEC_ADD(__v1, __v2) \ - _mm256_add_ps(__v1, __v2) - -#define VEC_MUL(__v1, __v2) \ - _mm256_mul_ps(__v1, __v2) - -#define VEC_BLENDV(__v1, __v2, __maskV) \ - _mm256_blendv_ps(__v1, __v2, __maskV) - -#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ - __vdst = _mm256_castps128_ps256(__vsLow) ; \ - __vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ; - -#define VECS_SHIFT_LEFT_1BIT(__vs) \ - __vs = _mm_slli_epi32(__vs, 1) - - -#define VECS_SHIFT_RIGHT_WHOLE(__vs, __cnt) { \ - uint64_t __mask = ; \ - uint64_t __shiftWord = ((((uint64_t) 0x1) << __cnt) - 1 ) & __vs.dw[1] ; \ - __shiftWord <<= (64-cnt) ; \ - __vs = _mm_slri_epi64(__vs, __cnt) ; \ - __vs.dw[0] |= __shiftWord ; \ - } - - -#define GET_MASK_WORD_OLD(__mask, __lastShiftOut, __shiftBy, __maskBitCnt){ \ - MTYPE __bitMask = (((MTYPE)0x1) << __shiftBy) - 1 ; \ - MTYPE __nextShiftOut = (__mask & __bitMask) << (__maskBitCnt - __shiftBy) ; \ - __mask >>= __shiftBy ; \ - __mask |= __lastShiftOut ; \ - __lastShiftOut = __nextShiftOut ; \ - } - - -#define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \ - MTYPE __bitMask = (((MTYPE)0x1) << __shiftBy) - 1 ; \ - MTYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \ - __dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ; \ - __lastShiftOut = __nextShiftOut ; \ -} - - -void precompute_masks_avx(const testcase& tc, int COLS, int numMaskVecs, - MTYPE (*maskArr)[NUM_DISTINCT_CHARS]) { - - const int maskBitCnt = VEC_LEN / VEC_ENTRY_CNT ; - - for (int vi=0; vi < numMaskVecs; ++vi) { - for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) { - maskArr[vi][rs] = 0 ; - } - maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ; - } - - for (int col=1; col < COLS; ++col) { - int mIndex = (col-1) / maskBitCnt ; - int mOffset = (col-1) % maskBitCnt ; - MTYPE bitMask = ((MTYPE)0x1) << (maskBitCnt-1-mOffset) ; - - char hapChar = tc.hap[col-1] ; - - if (hapChar == AMBIG_CHAR) { - maskArr[mIndex][0] |= bitMask ; - maskArr[mIndex][1] |= bitMask ; - maskArr[mIndex][2] |= bitMask ; - maskArr[mIndex][3] |= bitMask ; - } - - //cout << hapChar << " " << mIndex << " " << getBinaryStr(bitMask, 32) - // << endl ; - //cout << getBinaryStr(maskArr[0][hapChar],32) << endl ; - //exit(0) ; - - maskArr[mIndex][ConvertChar::get(hapChar)] |= bitMask ; - - - // bit corresponding to col 1 will be the MSB of the mask 0 - // bit corresponding to col 2 will be the MSB-1 of the mask 0 - // ... - // bit corresponding to col 32 will be the LSB of the mask 0 - // bit corresponding to col 33 will be the MSB of the mask 1 - // ... - } - -} - - -void test_mask_computations (testcase& tc, int tcID, bool printDebug=false) { - - int ROWS = tc.rslen + 1 ; - int COLS = tc.haplen + 1 ; - - // only for testing - VECTOR mismatchData, matchData ; - //SET_VEC_ZERO(mismatchData.VTYPE) ; - //SET_VEC_ONES(matchData.VTYPEI) ; - for (int ei=0; ei < VEC_ENTRY_CNT; ++ei) { - matchData.VENTRY[ei] = 1.0 ; - mismatchData.VENTRY[ei] = 0.0 ; - } - - const int maskBitCnt = VEC_LEN / VEC_ENTRY_CNT ; - const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function - - MTYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ; - precompute_masks_avx(tc, COLS, numMaskVecs, maskArr) ; - -#ifdef DEBUG - if (printDebug) { - cout << "The first 32 hap chars are: " ; - for (int i=0; i < 32; ++i) { - cout << tc.hap[i] ; - } - cout << endl ; - - cout << "Masks computed for A, C, T, G, N are: " << endl ; - cout << getBinaryStr(maskArr[0][0], 32) << endl ; - cout << getBinaryStr(maskArr[0][1], 32) << endl ; - cout << getBinaryStr(maskArr[0][2], 32) << endl ; - cout << getBinaryStr(maskArr[0][3], 32) << endl ; - cout << getBinaryStr(maskArr[0][4], 32) << endl ; - } -#endif // #ifdef DEBUG - - int beginRowIndex = 1 ; - while (beginRowIndex < ROWS) { - - int numRowsToProcess = min(VEC_ENTRY_CNT, ROWS - beginRowIndex) ; - - char rsArr[VEC_ENTRY_CNT] ; - for (int ri=0; ri < numRowsToProcess; ++ri) { - rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ; - } - - // Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors - VECTOR_SSE currMaskVecLow ; // corresponding to entries 0-3 - VECTOR_SSE currMaskVecHigh ; // corresponding to entries 4-7 - - MTYPE lastMaskShiftOut[VEC_ENTRY_CNT] ; - for (int ei=0; ei < VEC_ENTRY_CNT; ++ei) - lastMaskShiftOut[ei] = 0 ; - - int col = 1 ; - int diag = 1 ; - for (int maskIndex=0; maskIndex < numMaskVecs; ++maskIndex) { - // set up the mask vectors for the next maskBitCnt columns - - // For AVX, maskBitCnt = 32 (so, the operation below is amortized over 32 cols) - for (int ei=0; ei < VEC_ENTRY_CNT/2; ++ei) { - SET_MASK_WORD(currMaskVecLow.VENTRYI[ei], maskArr[maskIndex][rsArr[ei]], - lastMaskShiftOut[ei], ei, maskBitCnt) ; - - int ei2 = ei + VEC_ENTRY_CNT/2 ; // the second entry index - SET_MASK_WORD(currMaskVecHigh.VENTRYI[ei], maskArr[maskIndex][rsArr[ei2]], - lastMaskShiftOut[ei2], ei2, maskBitCnt) ; - } - -#ifdef DEBUG - if (printDebug && maskIndex == 0) { - cout << "The masks for entry 1: " << endl - << getBinaryStr(maskArr[0][rsArr[1]], 32) << endl - << getBinaryStr(currMaskVecLow.VENTRYI[1], 32) << endl ; - - } -#endif // #ifdef DEBUG - - // iterate over mask bit indices and columns - for (int mbi=0; mbi < maskBitCnt && diag < COLS + ROWS -2; ++mbi, ++diag) { - - VECTOR maskV ; - VEC_SSE_TO_AVX(currMaskVecLow.VTYPE, currMaskVecHigh.VTYPE, maskV.VTYPE) ; - - VECTOR testData ; - testData.VTYPE = VEC_BLENDV(mismatchData.VTYPE, matchData.VTYPE, - maskV.VTYPE) ; - - VECS_SHIFT_LEFT_1BIT(currMaskVecLow.VTYPEI) ; - VECS_SHIFT_LEFT_1BIT(currMaskVecHigh.VTYPEI) ; - -#ifdef DEBUG - if (printDebug && maskIndex == 0) { - cout << "The mask for entry 1, mbi=" << mbi << ": " - << getBinaryStr(maskV.VENTRYI[1], 32) << endl ; - - } -#endif // #ifdef DEBUG - -#ifdef CHECK_MASK_CORRECTNESS - - int firstRowIndex = (diag < COLS) ? 0 : (diag - COLS) ; - int lastRowIndex = min(col-1, numRowsToProcess-1) ; - - for (int ri=firstRowIndex; ri <= lastRowIndex; ++ri) { - int currRow = beginRowIndex + ri ; - int currCol = col - ri + firstRowIndex ; - - char hapChar = tc.hap[currCol-1] ; - char rsChar = tc.rs[currRow-1] ; - - bool match = (hapChar == rsChar || hapChar == 'N' || rsChar == 'N') ; - - if ((bool) testData.VENTRYI[ri] != match) { - cout << "Error: Incorrect mask for tc " << tcID << ", diag = " << diag - << " (" << currRow << ", " << currCol << ")" << endl ; - - cout << "The chars are: " << hapChar << " and " << rsChar << endl ; - cout << "The selected value is: " << testData.VENTRYI[ri] << endl ; - - exit(0) ; - } - } -#endif // #ifdef CHECK_MASK_CORRECTNESS - - if (diag < COLS) - ++col ; - - } // mbi - } // maskIndex - - beginRowIndex += VEC_ENTRY_CNT ; - } // end of stripe - - //cout << "Finished validating entry " << endl ; -} - -#ifdef HMM_MASK_MAIN -int main () { - - #define BATCH_SIZE 10000 - - ConvertChar::init() ; - - testcase* tcBatch = new testcase[BATCH_SIZE] ; - - int numBatches = 0 ; - int numRead = 0 ; - - const int DEBUG_TC = -1 ; - - FILE* ifp = stdin ; - - do { - numRead = 0 ; - - while (numRead < BATCH_SIZE) { - if (read_testcase(tcBatch+numRead, ifp) < 0) - break ; - - ++numRead ; - } - - if (numRead == 0) - break ; - - for (int ti=0; ti < numRead; ++ti) { - - int tcID = numBatches * BATCH_SIZE + ti ; - test_mask_computations(tcBatch[ti], tcID, tcID == DEBUG_TC) ; - } - - ++numBatches ; - } while (numRead == BATCH_SIZE) ; - - fclose(ifp) ; - - delete[] tcBatch ; - - return 0 ; -} -#endif diff --git a/PairHMM_JNI/libJNILoglessPairHMM.so b/PairHMM_JNI/libJNILoglessPairHMM.so deleted file mode 100755 index 435034f78..000000000 Binary files a/PairHMM_JNI/libJNILoglessPairHMM.so and /dev/null differ diff --git a/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc index 59d00928e..d575b8271 100644 --- a/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc +++ b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc @@ -1,70 +1,19 @@ +#include "headers.h" #include -#include -#include #include "org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include -#include -#include -using namespace std; #define ENABLE_ASSERTIONS 1 +#define DO_PROFILING 1 //#define DEBUG 1 //#define DEBUG0_1 1 //#define DEBUG3 1 - #include "template.h" +#include "utils.h" +#include "LoadTimeInitializer.h" -#include "define-double.h" -#include "shift_template.c" -#include "pairhmm-template-kernel.cc" - - - -#define MM 0 -#define GapM 1 -#define MX 2 -#define XX 3 -#define MY 4 -#define YY 5 - -class LoadTimeInitializer -{ - public: - LoadTimeInitializer() //will be called when library is loaded - { - ConvertChar::init(); - } - jfieldID m_readBasesFID; - jfieldID m_readQualsFID; - jfieldID m_insertionGOPFID; - jfieldID m_deletionGOPFID; - jfieldID m_overallGCPFID; - jfieldID m_haplotypeBasesFID; -}; -LoadTimeInitializer g_load_time_initializer; - -void debug_dump(string filename, string s, bool to_append, bool add_newline) -{ - ofstream fptr; - fptr.open(filename.c_str(), to_append ? ofstream::app : ofstream::out); - fptr << s; - if(add_newline) - fptr << "\n"; - fptr.close(); -} +using namespace std; JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeProbabilities (JNIEnv* env, jclass thisObject, @@ -93,7 +42,7 @@ Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializePrior //Gets direct access to Java arrays #define GET_BYTE_ARRAY_ELEMENTS env->GetPrimitiveArrayCritical #define RELEASE_BYTE_ARRAY_ELEMENTS env->ReleasePrimitiveArrayCritical -#define JNI_RELEASE_MODE JNI_ABORT +#define JNI_RO_RELEASE_MODE JNI_ABORT #define GET_DOUBLE_ARRAY_ELEMENTS env->GetPrimitiveArrayCritical #define RELEASE_DOUBLE_ARRAY_ELEMENTS env->ReleasePrimitiveArrayCritical @@ -101,7 +50,7 @@ Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializePrior //Likely makes copy of Java arrays to JNI C++ space #define GET_BYTE_ARRAY_ELEMENTS env->GetByteArrayElements #define RELEASE_BYTE_ARRAY_ELEMENTS env->ReleaseByteArrayElements -#define JNI_RELEASE_MODE JNI_ABORT +#define JNI_RO_RELEASE_MODE JNI_ABORT #define GET_DOUBLE_ARRAY_ELEMENTS env->GetDoubleArrayElements #define RELEASE_DOUBLE_ARRAY_ELEMENTS env->ReleaseDoubleArrayElements @@ -145,19 +94,19 @@ Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadL tc.c[i] = (int)overallGCPArray[i]; } - double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)(&tc); + double result_avxd = g_compute_full_prob_double(&tc, 0); double result = log10(result_avxd) - log10(ldexp(1.0, 1020)); #ifdef DEBUG - debug_dump("return_values_jni.txt",to_string(result),true); + g_load_time_initializer.debug_dump("return_values_jni.txt",to_string(result),true); #endif - RELEASE_BYTE_ARRAY_ELEMENTS(overallGCP, overallGCPArray, JNI_RELEASE_MODE); - RELEASE_BYTE_ARRAY_ELEMENTS(deletionGOP, deletionGOPArray, JNI_RELEASE_MODE); - RELEASE_BYTE_ARRAY_ELEMENTS(insertionGOP, insertionGOPArray, JNI_RELEASE_MODE); - RELEASE_BYTE_ARRAY_ELEMENTS(readQuals, readQualsArray, JNI_RELEASE_MODE); - RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBases, haplotypeBasesArray, JNI_RELEASE_MODE); - RELEASE_BYTE_ARRAY_ELEMENTS(readBases, readBasesArray, JNI_RELEASE_MODE); + RELEASE_BYTE_ARRAY_ELEMENTS(overallGCP, overallGCPArray, JNI_RO_RELEASE_MODE); + RELEASE_BYTE_ARRAY_ELEMENTS(deletionGOP, deletionGOPArray, JNI_RO_RELEASE_MODE); + RELEASE_BYTE_ARRAY_ELEMENTS(insertionGOP, insertionGOPArray, JNI_RO_RELEASE_MODE); + RELEASE_BYTE_ARRAY_ELEMENTS(readQuals, readQualsArray, JNI_RO_RELEASE_MODE); + RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBases, haplotypeBasesArray, JNI_RO_RELEASE_MODE); + RELEASE_BYTE_ARRAY_ELEMENTS(readBases, readBasesArray, JNI_RO_RELEASE_MODE); return 0.0; } @@ -191,6 +140,46 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai g_load_time_initializer.m_haplotypeBasesFID = fid; } +//Since the list of haplotypes against which the reads are evaluated in PairHMM is the same for a region, +//transfer the list only once +vector > g_haplotypeBasesArrayVector; +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeHaplotypes + (JNIEnv * env, jobject thisObject, jint numHaplotypes, jobjectArray haplotypeDataArray) +{ + jboolean is_copy = JNI_FALSE; + //To ensure, GET_BYTE_ARRAY_ELEMENTS is invoked only once for each haplotype, store bytearrays in a vector + vector >& haplotypeBasesArrayVector = g_haplotypeBasesArrayVector; + haplotypeBasesArrayVector.clear(); + haplotypeBasesArrayVector.resize(numHaplotypes); + for(unsigned j=0;jGetObjectArrayElement(haplotypeDataArray, j); + jbyteArray haplotypeBases = (jbyteArray)env->GetObjectField(haplotypeObject, g_load_time_initializer.m_haplotypeBasesFID); +#ifdef ENABLE_ASSERTIONS + assert(haplotypeBases && ("haplotypeBases is NULL at index : "+to_string(j)+"\n").c_str()); +#endif + //Need a global reference as this will be accessed across multiple JNI calls to JNIComputeLikelihoods() + jbyteArray haplotypeBasesGlobalRef = (jbyteArray)env->NewGlobalRef(haplotypeBases); +#ifdef ENABLE_ASSERTIONS + assert(haplotypeBasesGlobalRef && ("Could not get global ref to haplotypeBases at index : "+to_string(j)+"\n").c_str()); +#endif + env->DeleteLocalRef(haplotypeBases); //free the local reference + jbyte* haplotypeBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(haplotypeBasesGlobalRef, &is_copy); +#ifdef ENABLE_ASSERTIONS + assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); + assert(env->GetArrayLength(haplotypeBasesGlobalRef) < MCOLS); +#endif +#ifdef DEBUG0_1 + cout << "JNI haplotype length "<GetArrayLength(haplotypeBasesGlobalRef)<<"\n"; +#endif + haplotypeBasesArrayVector[j] = make_pair(haplotypeBasesGlobalRef, haplotypeBasesArray); +#ifdef DEBUG3 + for(unsigned k=0;kGetArrayLength(haplotypeBases);++k) + g_load_time_initializer.debug_dump("haplotype_bases_jni.txt",to_string((int)haplotypeBasesArray[k]),true); +#endif + } +} + //JNI function to invoke compute_full_prob_avx //readDataArray - array of JNIReadDataHolderClass objects which contain the readBases, readQuals etc //haplotypeDataArray - array of JNIHaplotypeDataHolderClass objects which contain the haplotypeBases @@ -200,37 +189,16 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai (JNIEnv* env, jobject thisObject, jint numReads, jint numHaplotypes, jobjectArray readDataArray, jobjectArray haplotypeDataArray, jdoubleArray likelihoodArray, jint maxNumThreadsToUse) { - jboolean is_copy = JNI_FALSE; - //To ensure, GET_BYTE_ARRAY_ELEMENTS is invoked only once for each haplotype, store bytearrays in a vector - vector > haplotypeBasesArrayVector; - haplotypeBasesArrayVector.clear(); - haplotypeBasesArrayVector.resize(numHaplotypes); #ifdef DEBUG0_1 cout << "JNI numReads "<GetObjectArrayElement(haplotypeDataArray, j); - jbyteArray haplotypeBases = (jbyteArray)env->GetObjectField(haplotypeObject, g_load_time_initializer.m_haplotypeBasesFID); -#ifdef ENABLE_ASSERTIONS - assert(haplotypeBases && ("haplotypeBases is NULL at index : "+to_string(j)+"\n").c_str()); -#endif - jbyte* haplotypeBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(haplotypeBases, &is_copy); -#ifdef ENABLE_ASSERTIONS - assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); - assert(env->GetArrayLength(haplotypeBases) < MCOLS); -#endif -#ifdef DEBUG0_1 - cout << "JNI haplotype length "<GetArrayLength(haplotypeBases)<<"\n"; -#endif - haplotypeBasesArrayVector[j] = make_pair(haplotypeBases, haplotypeBasesArray); -#ifdef DEBUG3 - for(unsigned k=0;kGetArrayLength(haplotypeBases);++k) - debug_dump("haplotype_bases_jni.txt",to_string((int)haplotypeBasesArray[k]),true); -#endif - } + double start_time = 0; + //haplotype vector from earlier store - note the reference to vector, not copying + vector >& haplotypeBasesArrayVector = g_haplotypeBasesArrayVector; + jboolean is_copy = JNI_FALSE; unsigned numTestCases = numReads*numHaplotypes; + //vector to store results vector tc_array; tc_array.clear(); tc_array.resize(numTestCases); @@ -239,6 +207,9 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai vector > > readBasesArrayVector; readBasesArrayVector.clear(); readBasesArrayVector.resize(numReads); +#ifdef DO_PROFILING + start_time = get_time(); +#endif for(unsigned i=0;iGetArrayLength(likelihoodArray) == numTestCases); #endif +#ifdef DO_PROFILING + start_time = get_time(); +#endif #pragma omp parallel for schedule (dynamic,10) private(tc_idx) num_threads(maxNumThreadsToUse) for(tc_idx=0;tc_idx(&(tc_array[tc_idx])); - double result = log10(result_avxd) - log10(ldexp(1.0, 1020)); + float result_avxf = g_compute_full_prob_float(&(tc_array[tc_idx]), 0); + double result = 0; + if (result_avxf < MIN_ACCEPTED) { + double result_avxd = g_compute_full_prob_double(&(tc_array[tc_idx]), 0); + result = log10(result_avxd) - log10(ldexp(1.0, 1020.0)); + } + else + result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f))); + likelihoodDoubleArray[tc_idx] = result; } +#ifdef DO_PROFILING + g_load_time_initializer.m_compute_time += get_time(); +#endif #ifdef DEBUG for(tc_idx=0;tc_idx=0;--i)//note the order - reverse of GET { - for(int j=readBasesArrayVector.size()-1;j>=0;--j) - RELEASE_BYTE_ARRAY_ELEMENTS(readBasesArrayVector[i][j].first, readBasesArrayVector[i][j].second, JNI_RELEASE_MODE); + for(int j=readBasesArrayVector[i].size()-1;j>=0;--j) + RELEASE_BYTE_ARRAY_ELEMENTS(readBasesArrayVector[i][j].first, readBasesArrayVector[i][j].second, JNI_RO_RELEASE_MODE); readBasesArrayVector[i].clear(); } readBasesArrayVector.clear(); - - //Now release haplotype arrays - for(int j=numHaplotypes-1;j>=0;--j) //note the order - reverse of GET - RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBasesArrayVector[j].first, haplotypeBasesArrayVector[j].second, JNI_RELEASE_MODE); - haplotypeBasesArrayVector.clear(); +#ifdef DO_PROFILING + g_load_time_initializer.m_data_transfer_time += get_time(); +#endif tc_array.clear(); +#ifdef DO_PROFILING + g_load_time_initializer.m_sumNumReads += numReads; + g_load_time_initializer.m_sumSquareNumReads += numReads*numReads; + g_load_time_initializer.m_sumNumHaplotypes += numHaplotypes; + g_load_time_initializer.m_sumSquareNumHaplotypes += numHaplotypes*numHaplotypes; + g_load_time_initializer.m_sumNumTestcases += numTestCases; + g_load_time_initializer.m_sumSquareNumTestcases += numTestCases*numTestCases; + g_load_time_initializer.m_maxNumTestcases = numTestCases > g_load_time_initializer.m_maxNumTestcases ? numTestCases + : g_load_time_initializer.m_maxNumTestcases; + ++(g_load_time_initializer.m_num_invocations); +#endif +#ifdef DEBUG + g_load_time_initializer.debug_close(); +#endif +} + +//Release haplotypes at the end of a region +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniFinalizeRegion + (JNIEnv * env, jobject thisObject) +{ + vector >& haplotypeBasesArrayVector = g_haplotypeBasesArrayVector; + //Now release haplotype arrays + for(int j=haplotypeBasesArrayVector.size()-1;j>=0;--j) //note the order - reverse of GET + { + RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBasesArrayVector[j].first, haplotypeBasesArrayVector[j].second, JNI_RO_RELEASE_MODE); + env->DeleteGlobalRef(haplotypeBasesArrayVector[j].first); //free the global reference + } + haplotypeBasesArrayVector.clear(); +} + + +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniClose + (JNIEnv* env, jobject thisObject) +{ +#ifdef DO_PROFILING + g_load_time_initializer.print_profiling(); +#endif +#ifdef DEBUG + g_load_time_initializer.debug_close(); +#endif } diff --git a/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h index 1fff9fa5a..06ee99d7d 100644 --- a/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h +++ b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h @@ -21,6 +21,34 @@ extern "C" { #define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToDeletion 4L #undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_deletionToDeletion #define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_deletionToDeletion 5L +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug 0L +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_verify +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_verify 0L +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug0_1 +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug0_1 0L +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug1 +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug1 0L +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug2 +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug2 0L +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug3 +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug3 0L +/* + * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM + * Method: jniGlobalInit + * Signature: (Ljava/lang/Class;Ljava/lang/Class;)V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniGlobalInit + (JNIEnv *, jobject, jclass, jclass); + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM + * Method: jniClose + * Signature: ()V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniClose + (JNIEnv *, jobject); + /* * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM * Method: jniInitialize @@ -45,27 +73,34 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializePriorsAndUpdateCells (JNIEnv *, jobject, jboolean, jint, jint, jbyteArray, jbyteArray, jbyteArray, jint); - /* * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM * Method: jniSubComputeReadLikelihoodGivenHaplotypeLog10 * Signature: (II[B[B[B[B[B[BI)D */ JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadLikelihoodGivenHaplotypeLog10 - (JNIEnv *, jobject, jint, jint, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jint); + (JNIEnv *, jobject, jint, jint, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jint); /* * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM - * Method: jniGlobalInit - * Signature: (Ljava/lang/Class;Ljava/lang/Class;)V + * Method: jniInitializeHaplotypes + * Signature: (I[Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIHaplotypeDataHolderClass;)V */ -JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniGlobalInit - (JNIEnv *, jobject, jclass, jclass); +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeHaplotypes + (JNIEnv *, jobject, jint, jobjectArray); + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM + * Method: jniFinalizeRegion + * Signature: ()V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniFinalizeRegion + (JNIEnv *, jobject); /* * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM * Method: jniComputeLikelihoods - * Signature: ([Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIReadDataHolderClass;[Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIHaplotypeDataHolderClass;[D)V + * Signature: (II[Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIReadDataHolderClass;[Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIHaplotypeDataHolderClass;[DI)V */ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniComputeLikelihoods (JNIEnv *, jobject, jint, jint, jobjectArray, jobjectArray, jdoubleArray, jint); diff --git a/PairHMM_JNI/pairhmm-1-base.cc b/PairHMM_JNI/pairhmm-1-base.cc index 90f95aeec..aaf45848b 100644 --- a/PairHMM_JNI/pairhmm-1-base.cc +++ b/PairHMM_JNI/pairhmm-1-base.cc @@ -1,123 +1,19 @@ //#define DEBUG 1 //#define DEBUG0_1 1 //#define DEBUG3 1 -#define MM 0 -#define GapM 1 -#define MX 2 -#define XX 3 -#define MY 4 -#define YY 5 - -#include -#include -#include -#include -#include -#include +#include "headers.h" #include "template.h" - -//#include "define-float.h" -//#include "shift_template.c" -//#include "pairhmm-template-kernel.cc" - -#include "define-double.h" -#include "shift_template.c" -#include "pairhmm-template-kernel.cc" - +#include "utils.h" +#include "LoadTimeInitializer.h" using namespace std; -class LoadTimeInitializer -{ - public: - LoadTimeInitializer() //will be called when library is loaded - { - ConvertChar::init(); - } -}; -LoadTimeInitializer g_load_time_initializer; -template -NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL) -{ - int r, c; - int ROWS = tc->rslen + 1; - int COLS = tc->haplen + 1; - - Context ctx; - - NUMBER M[MROWS][MCOLS]; - NUMBER X[MROWS][MCOLS]; - NUMBER Y[MROWS][MCOLS]; - NUMBER p[MROWS][6]; - - p[0][MM] = ctx._(0.0); - p[0][GapM] = ctx._(0.0); - p[0][MX] = ctx._(0.0); - p[0][XX] = ctx._(0.0); - p[0][MY] = ctx._(0.0); - p[0][YY] = ctx._(0.0); - for (r = 1; r < ROWS; r++) - { - int _i = tc->i[r-1] & 127; - int _d = tc->d[r-1] & 127; - int _c = tc->c[r-1] & 127; - p[r][MM] = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127]; - p[r][GapM] = ctx._(1.0) - ctx.ph2pr[_c]; - p[r][MX] = ctx.ph2pr[_i]; - p[r][XX] = ctx.ph2pr[_c]; - p[r][MY] = ctx.ph2pr[_d]; - p[r][YY] = ctx.ph2pr[_c]; - //p[r][MY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d]; - //p[r][YY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c]; - } - - for (c = 0; c < COLS; c++) - { - M[0][c] = ctx._(0.0); - X[0][c] = ctx._(0.0); - Y[0][c] = ctx.INITIAL_CONSTANT / (tc->haplen); - } - - for (r = 1; r < ROWS; r++) - { - M[r][0] = ctx._(0.0); - X[r][0] = X[r-1][0] * p[r][XX]; - Y[r][0] = ctx._(0.0); - } - - NUMBER result = ctx._(0.0); - - for (r = 1; r < ROWS; r++) - for (c = 1; c < COLS; c++) - { - char _rs = tc->rs[r-1]; - char _hap = tc->hap[c-1]; - int _q = tc->q[r-1] & 127; - NUMBER distm = ctx.ph2pr[_q]; - if (_rs == _hap || _rs == 'N' || _hap == 'N') - distm = ctx._(1.0) - distm; - else - distm = distm/3; - M[r][c] = distm * (M[r-1][c-1] * p[r][MM] + X[r-1][c-1] * p[r][GapM] + Y[r-1][c-1] * p[r][GapM]); - X[r][c] = M[r-1][c] * p[r][MX] + X[r-1][c] * p[r][XX]; - Y[r][c] = M[r][c-1] * p[r][MY] + Y[r][c-1] * p[r][YY]; - } - - for (c = 0; c < COLS; c++) - { - result += M[ROWS-1][c] + X[ROWS-1][c]; - } - - if (before_last_log != NULL) - *before_last_log = result; - - return ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT; -} - #define BATCH_SIZE 10000 #define RUN_HYBRID + + int main(int argc, char** argv) { if(argc < 2) @@ -128,111 +24,82 @@ int main(int argc, char** argv) bool use_old_read_testcase = false; if(argc >= 3 && string(argv[2]) == "1") use_old_read_testcase = true; + unsigned chunk_size = 100; + if(argc >= 4) + chunk_size = strtol(argv[3],0,10); - testcase tc; + std::ifstream ifptr; + FILE* fptr = 0; if(use_old_read_testcase) { - FILE* fptr = fopen(argv[1],"r"); - while(!feof(fptr)) - { - if(read_testcase(&tc, fptr) >= 0) - { - double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)(&tc); - double result = log10(result_avxd) - log10(ldexp(1.0, 1020)); - - cout << std::scientific << compute_full_prob(&tc) << " "< tokens; ifptr.open(argv[1]); assert(ifptr.is_open()); - while(1) - { - tokens.clear(); - if(read_mod_testcase(ifptr, &tc, false) < 0) - break; - //double result = 0; - double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)(&tc); - double result = log10(result_avxd) - log10(ldexp(1.0, 1020)); - - cout << std::scientific << compute_full_prob(&tc) << " "< tc_vector; + tc_vector.clear(); + testcase tc; + uint64_t total_time = 0; + while(1) { - int read_count = BATCH_SIZE; - gettimeofday(&start, NULL); - for (int b=0;b= 0) + tc_vector.push_back(tc); + if(tc_vector.size() == BATCH_SIZE || (break_value < 0 && tc_vector.size() > 0)) { - result_avxf = compute_full_prob(&tc[b]); + vector results_vec; + results_vec.clear(); + results_vec.resize(tc_vector.size()); + get_time(); +#pragma omp parallel for schedule(dynamic,chunk_size) num_threads(12) + for(unsigned i=0;i(&tc[b]); - result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f)); + results_vec[i] = result; } - else - result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); -#endif - -#ifndef RUN_HYBRID - result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); -#endif - + total_time += get_time(); +#pragma omp parallel for schedule(dynamic,chunk_size) + for(unsigned i=0;i(&tc); + baseline_result = log10(baseline_result) - log10(ldexp(1.0, 1020.0)); + double abs_error = fabs(baseline_result-results_vec[i]); + double rel_error = (baseline_result != 0) ? fabs(abs_error/baseline_result) : 0; + if(abs_error > 1e-5 && rel_error > 1e-5) + cout << std::scientific << baseline_result << " "< #include +//#define DEBUG +#define MUSTAFA +#define KARTHIK + /* template string getBinaryStr (T val, int numBitsToWrite) { @@ -17,8 +21,10 @@ string getBinaryStr (T val, int numBitsToWrite) { return oss.str() ; } */ +#ifdef MUSTAFA -void GEN_INTRINSIC(precompute_masks_, PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) { + +void GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) { const int maskBitCnt = MAIN_TYPE_SIZE ; @@ -34,14 +40,14 @@ void GEN_INTRINSIC(precompute_masks_, PRECISION)(const testcase& tc, int COLS, i int mOffset = (col-1) % maskBitCnt ; MASK_TYPE bitMask = ((MASK_TYPE)0x1) << (maskBitCnt-1-mOffset) ; - char hapChar = tc.hap[col-1] ; + char hapChar = ConvertChar::get(tc.hap[col-1]); if (hapChar == AMBIG_CHAR) { for (int ci=0; ci < NUM_DISTINCT_CHARS; ++ci) maskArr[mIndex][ci] |= bitMask ; } - maskArr[mIndex][ConvertChar::get(hapChar)] |= bitMask ; + maskArr[mIndex][hapChar] |= bitMask ; // bit corresponding to col 1 will be the MSB of the mask 0 // bit corresponding to col 2 will be the MSB-1 of the mask 0 // ... @@ -52,7 +58,7 @@ void GEN_INTRINSIC(precompute_masks_, PRECISION)(const testcase& tc, int COLS, i } -void GEN_INTRINSIC(init_masks_for_row_, PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess) { +void GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess) { for (int ri=0; ri < numRowsToProcess; ++ri) { rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ; @@ -63,6 +69,7 @@ void GEN_INTRINSIC(init_masks_for_row_, PRECISION)(const testcase& tc, char* rsA } } + #define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \ MASK_TYPE __bitMask = (((MASK_TYPE)0x1) << __shiftBy) - 1 ; \ MASK_TYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \ @@ -71,33 +78,37 @@ void GEN_INTRINSIC(init_masks_for_row_, PRECISION)(const testcase& tc, char* rsA } -void GEN_INTRINSIC(update_masks_for_cols_, PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt) { +void GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_, SIMD_TYPE), PRECISION)(int maskIndex, BITMASK_VEC& bitMaskVec, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, int maskBitCnt) { for (int ei=0; ei < AVX_LENGTH/2; ++ei) { - SET_MASK_WORD(currMaskVecLow.masks[ei], maskArr[maskIndex][rsArr[ei]], + SET_MASK_WORD(bitMaskVec.getLowEntry(ei), maskArr[maskIndex][rsArr[ei]], lastMaskShiftOut[ei], ei, maskBitCnt) ; int ei2 = ei + AVX_LENGTH/2 ; // the second entry index - SET_MASK_WORD(currMaskVecHigh.masks[ei], maskArr[maskIndex][rsArr[ei2]], + SET_MASK_WORD(bitMaskVec.getHighEntry(ei), maskArr[maskIndex][rsArr[ei2]], lastMaskShiftOut[ei2], ei2, maskBitCnt) ; } } -//void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen, const _256_TYPE& distmSel, int firstRowIndex, int lastRowIndex) { -inline void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen) { +//void GEN_INTRINSIC(computeDistVec, PRECISION) (BITMASK_VEC& bitMaskVec, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen, const _256_TYPE& distmSel, int firstRowIndex, int lastRowIndex) { + +inline void GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec, SIMD_TYPE), PRECISION) (BITMASK_VEC& bitMaskVec, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen) { //#define computeDistVec() { - _256_TYPE maskV ; - VEC_SSE_TO_AVX(currMaskVecLow.vecf, currMaskVecHigh.vecf, maskV) ; +#ifdef DEBUGG + long long *temp1 = (long long *)(&maskV); + double *temp2 = (double *)(&distm); + double *temp3 = (double *)(&_1_distm); + printf("***\n%lx\n%lx\n%f\n%f\n%f\n%f\n***\n", temp1[0], temp1[1], temp2[0], temp2[1], temp3[0], temp3[1]); +#endif - distmChosen = VEC_BLENDV(distm, _1_distm, maskV) ; + distmChosen = VEC_BLENDV(distm, _1_distm, bitMaskVec.getCombinedMask()) ; /*COMPARE_VECS(distmChosen, distmSel, firstRowIndex, lastRowIndex) ;*/ - VEC_SHIFT_LEFT_1BIT(currMaskVecLow.vec) ; - VEC_SHIFT_LEFT_1BIT(currMaskVecHigh.vec) ; + bitMaskVec.shift_left_1bit() ; } @@ -120,8 +131,9 @@ struct HmmData { } ; */ +#endif // MUSTAFA -template void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context ctx, testcase *tc, _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D) +template void GEN_INTRINSIC(GEN_INTRINSIC(initializeVectors, SIMD_TYPE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context ctx, testcase *tc, _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D) { NUMBER zero = ctx._(0.0); NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); @@ -157,18 +169,14 @@ template void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS *(ptr_p_GAPM+r-1) = ctx._(1.0) - ctx.ph2pr[_c]; *(ptr_p_MX+r-1) = ctx.ph2pr[_i]; *(ptr_p_XX+r-1) = ctx.ph2pr[_c]; - *(ptr_p_MY+r-1) = ctx.ph2pr[_d]; - *(ptr_p_YY+r-1) = ctx.ph2pr[_c]; -#ifdef DEBUG3 - debug_dump("transitions_jni.txt",to_string(*(ptr_p_MM+r-1) ),true); - debug_dump("transitions_jni.txt",to_string(*(ptr_p_GAPM+r-1)),true); - debug_dump("transitions_jni.txt",to_string(*(ptr_p_MX+r-1) ),true); - debug_dump("transitions_jni.txt",to_string(*(ptr_p_XX+r-1) ),true); - debug_dump("transitions_jni.txt",to_string(*(ptr_p_MY+r-1) ),true); - debug_dump("transitions_jni.txt",to_string(*(ptr_p_YY+r-1) ),true); -#endif - //*(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d]; - //*(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c]; + #ifdef KARTHIK + *(ptr_p_MY+r-1) = ctx.ph2pr[_d]; + *(ptr_p_YY+r-1) = ctx.ph2pr[_c]; + #else + *(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d]; + *(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c]; + #endif + } NUMBER *ptr_distm1D = (NUMBER *)distm1D; @@ -176,14 +184,11 @@ template void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS { int _q = tc->q[r-1] & 127; ptr_distm1D[r-1] = ctx.ph2pr[_q]; -#ifdef DEBUG3 - debug_dump("priors_jni.txt",to_string(ptr_distm1D[r-1]),true); -#endif } } -template inline void GEN_INTRINSIC(stripINITIALIZATION, PRECISION)( +template inline void GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION, SIMD_TYPE), PRECISION)( int stripIdx, Context ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY, _256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm, _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM , _256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, @@ -200,19 +205,18 @@ template inline void GEN_INTRINSIC(stripINITIALIZATION, PRECISION) NUMBER zero = ctx._(0.0); NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); UNION_TYPE packed1; packed1.d = VEC_SET1_VAL(1.0); -#define TRISTATE_CORRECTION_FACTOR 3.0 - UNION_TYPE packed3; packed3.d = VEC_SET1_VAL(TRISTATE_CORRECTION_FACTOR); - + UNION_TYPE packed3; packed3.d = VEC_SET1_VAL(3.0); /* compare rs and N */ - //rs = VEC_LDPOPCVT_CHAR((tc->irs+i*AVX_LENGTH)); - //rsN.d = VEC_CMP_EQ(N_packed256, rs); - +#ifndef MUSTAFA + rs = VEC_LDPOPCVT_CHAR((tc->irs+i*AVX_LENGTH)); + rsN.d = VEC_CMP_EQ(N_packed256, rs); +#endif distm = distm1D[i]; - _1_distm = VEC_SUB(packed1.d, distm); -#ifndef DO_NOT_USE_TRISTATE_CORRECTION - distm = VEC_DIV(distm, packed3.d); -#endif - + _1_distm = VEC_SUB(packed1.d, distm); + + #ifdef KARTHIK + distm = VEC_DIV(distm, packed3.d); + #endif /* initialize M_t_2, M_t_1, X_t_2, X_t_1, Y_t_2, Y_t_1 */ M_t_2.d = VEC_SET1_VAL(zero); X_t_2.d = VEC_SET1_VAL(zero); @@ -234,7 +238,7 @@ template inline void GEN_INTRINSIC(stripINITIALIZATION, PRECISION) -inline _256_TYPE GEN_INTRINSIC(computeDISTM, PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256, +inline _256_TYPE GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM, SIMD_TYPE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256, _256_TYPE distm, _256_TYPE _1_distm) { UNION_TYPE hapN, rshap; @@ -263,12 +267,21 @@ inline _256_TYPE GEN_INTRINSIC(computeDISTM, PRECISION)(int d, int COLS, testcas } -inline void GEN_INTRINSIC(computeMXY, PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, +inline void GEN_INTRINSIC(GEN_INTRINSIC(computeMXY, SIMD_TYPE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1, _256_TYPE pMM, _256_TYPE pGAPM, _256_TYPE pMX, _256_TYPE pXX, _256_TYPE pMY, _256_TYPE pYY, _256_TYPE distmSel) { /* Compute M_t <= distm * (p_MM*M_t_2 + p_GAPM*X_t_2 + p_GAPM*Y_t_2) */ M_t.d = VEC_MUL(VEC_ADD(VEC_ADD(VEC_MUL(M_t_2.d, pMM), VEC_MUL(X_t_2.d, pGAPM)), VEC_MUL(Y_t_2.d, pGAPM)), distmSel); + +#ifdef DEBUG + double *temp1 = (double *)(&pGAPM); + double *temp2 = (double *)(&pMM); + double *temp3 = (double *)(&distmSel); + printf("%f\n%f\n%f\n%f\n%f\n%f\n", temp1[0], temp1[1], temp2[0], temp2[1], temp3[0], temp3[1]); + //printf("%f\n%f\n%f\n%f\n", X_t_2.f[0], X_t_2.f[1], Y_t_2.f[0], Y_t_2.f[1]); + printf("%f\n%f\n----------------------------------------------------------------------------\n", M_t.f[0], M_t.f[1]); +#endif M_t_y = M_t; /* Compute X_t */ @@ -278,7 +291,7 @@ inline void GEN_INTRINSIC(computeMXY, PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_ Y_t.d = VEC_ADD(VEC_MUL(M_t_1_y.d, pMY) , VEC_MUL(Y_t_1.d, pYY)); } -template NUMBER GEN_INTRINSIC(compute_full_prob_avx, PRECISION) (testcase *tc, NUMBER *before_last_log = NULL) +template NUMBER GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_,SIMD_TYPE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL) { _256_TYPE p_MM [MAVX_COUNT], p_GAPM [MAVX_COUNT], p_MX [MAVX_COUNT]; _256_TYPE p_XX [MAVX_COUNT], p_MY [MAVX_COUNT], p_YY [MAVX_COUNT]; @@ -306,105 +319,114 @@ template NUMBER GEN_INTRINSIC(compute_full_prob_avx, PRECISION) (t int remainingRows = (ROWS-1) % AVX_LENGTH; int strip_cnt = ((ROWS-1) / AVX_LENGTH) + (remainingRows!=0); +#ifdef MUSTAFA const int maskBitCnt = MAIN_TYPE_SIZE ; const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function MASK_TYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ; - GEN_INTRINSIC(precompute_masks_, PRECISION)(*tc, COLS, numMaskVecs, maskArr) ; + GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(*tc, COLS, numMaskVecs, maskArr) ; char rsArr[AVX_LENGTH] ; MASK_TYPE lastMaskShiftOut[AVX_LENGTH] ; - - GEN_INTRINSIC(initializeVectors, PRECISION)(ROWS, COLS, shiftOutM, shiftOutX, shiftOutY, +#endif + GEN_INTRINSIC(GEN_INTRINSIC(initializeVectors,SIMD_TYPE), PRECISION)(ROWS, COLS, shiftOutM, shiftOutX, shiftOutY, ctx, tc, p_MM, p_GAPM, p_MX, p_XX, p_MY, p_YY, distm1D); //for (int __ii=0; __ii < 10; ++__ii) for (int i=0;i -#include -#include -#include +#include "headers.h" #include "template.h" +#include "vector_defs.h" -#include "define-float.h" -#include "shift_template.c" -#include "pairhmm-template-kernel.cc" +#define SIMD_TYPE avx +#define SIMD_TYPE_AVX -#include "define-double.h" -#include "shift_template.c" -#include "pairhmm-template-kernel.cc" #define BATCH_SIZE 10000 -//#define RUN_HYBRID +#define RUN_HYBRID -//uint8_t ConvertChar::conversionTable[255] ; +double getCurrClk(); int thread_level_parallelism_enabled = false ; -double getCurrClk() { - struct timeval tv ; - gettimeofday(&tv, NULL); - return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +void print128b_F(__m128 x) +{ + float *p = (float *)(&x); + for (int i=3;i>=0;i--) + printf("%f ", *(p+i)); + printf("\n"); } +void print128b_D(__m128d x) +{ + double *p = (double *)(&x); + for (int i=1;i>=0;i--) + printf("%f ", *(p+i)); + printf("\n"); +} int main() { - - testcase tc[BATCH_SIZE]; +/* + IF_128f x; + x.f = _mm_set_ps(1.0, 2.0, 3.0, 4.0); + IF_32 shiftIn, shiftOut; + shiftIn.f = 5.0f; + print128b_F(x.f); + GEN_INTRINSIC(_vector_shift, s)(x, shiftIn, shiftOut); + print128b_F(x.f); + + IF_128d y; + y.f = _mm_set_pd(10.0, 11.0); + IF_64 shiftInd, shiftOutd; + shiftInd.f = 12.0; + print128b_D(y.f); + GEN_INTRINSIC(_vector_shift, d)(y, shiftInd, shiftOutd); + print128b_D(y.f); + + exit(0); +*/ + + testcase* tc = new testcase[BATCH_SIZE]; float result[BATCH_SIZE], result_avxf; double result_avxd; //struct timeval start, end; @@ -40,12 +62,12 @@ int main() // Need to call it once to initialize the static array ConvertChar::init() ; + - - char* ompEnvVar = getenv("OMP_NUM_THREADS") ; - if (ompEnvVar != NULL && ompEnvVar != "" && ompEnvVar != "1" ) { - thread_level_parallelism_enabled = true ; - } +// char* ompEnvVar = getenv("OMP_NUM_THREADS") ; +// if (ompEnvVar != NULL && ompEnvVar != "" && ompEnvVar != "1" ) { +// thread_level_parallelism_enabled = true ; +// } bool noMoreData = false; int count =0; @@ -55,7 +77,7 @@ int main() lastClk = getCurrClk() ; for (int b=0;b(&tc[b]); + result_avxf = GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_, SIMD_TYPE), s)(&tc[b]); #ifdef RUN_HYBRID #define MIN_ACCEPTED 1e-28f if (result_avxf < MIN_ACCEPTED) { + //printf("**************** RUNNING DOUBLE ******************\n"); count++; - result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)(&tc[b]); + result_avxd = GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_, SIMD_TYPE), d)(&tc[b]); result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f)); } else @@ -95,14 +118,15 @@ int main() //gettimeofday(&start, NULL); lastClk = getCurrClk() ; - for (int b=0;b(testcase* tc, double* nextlog); +template float compute_full_prob_sses(testcase* tc, float* nextlog); diff --git a/PairHMM_JNI/template.h b/PairHMM_JNI/template.h index e70784073..ba6370de8 100644 --- a/PairHMM_JNI/template.h +++ b/PairHMM_JNI/template.h @@ -1,24 +1,14 @@ #ifndef TEMPLATES_H_ #define TEMPLATES_H_ -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include -#include -#include -#include -#include -#include +#include "headers.h" +#define MM 0 +#define GapM 1 +#define MX 2 +#define XX 3 +#define MY 4 +#define YY 5 #define MROWS 500 #define MCOLS 1000 @@ -35,12 +25,25 @@ typedef union __attribute__((aligned(32))) { ALIGNED __m256i ALIGNED i; } ALIGNED mix_F ALIGNED; +typedef union __attribute__((aligned(32))) { + ALIGNED __m128 ALIGNED d; + ALIGNED __m64 ALIGNED s[2]; + ALIGNED float ALIGNED f[4]; + ALIGNED __m128i ALIGNED i; +} ALIGNED mix_F128 ALIGNED; + typedef union ALIGNED { __m128i vec ; __m128 vecf ; uint32_t masks[4] ; } MaskVec_F ; +typedef union ALIGNED { + __m64 vec ; + __m64 vecf ; + uint32_t masks[2] ; +} MaskVec_F128 ; + typedef union ALIGNED { ALIGNED __m128i ALIGNED i; @@ -60,12 +63,25 @@ typedef union __attribute__((aligned(32))) { ALIGNED __m256i ALIGNED i; } ALIGNED mix_D ALIGNED; +typedef union __attribute__((aligned(32))) { + ALIGNED __m128d ALIGNED d; + ALIGNED __m64 ALIGNED s[2]; + ALIGNED double ALIGNED f[2]; + ALIGNED __m128i ALIGNED i; +} ALIGNED mix_D128 ALIGNED; + typedef union ALIGNED { __m128i vec ; __m128d vecf ; uint64_t masks[2] ; } MaskVec_D ; +typedef union ALIGNED { + __m64 vec ; + __m64 vecf ; + uint64_t masks[1] ; +} MaskVec_D128 ; + typedef union ALIGNED { ALIGNED __m128i ALIGNED i; @@ -134,30 +150,18 @@ typedef struct { int rslen, haplen; /*int *q, *i, *d, *c;*/ - int q[MROWS], i[MROWS], d[MROWS], c[MROWS]; + /*int q[MROWS], i[MROWS], d[MROWS], c[MROWS];*/ + char *q, *i, *d, *c; char *hap, *rs; int *ihap; int *irs; } testcase; - -template -std::string to_string(T obj) -{ - std::stringstream ss; - std::string ret_string; - ss.clear(); - ss << std::scientific << obj; - ss >> ret_string; - ss.clear(); - return ret_string; -} -void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true); - int normalize(char c); -int read_testcase(testcase *tc, FILE* ifp); -int read_mod_testcase(std::ifstream& fptr, testcase* tc, bool reformat=false); +int read_testcase(testcase *tc, FILE* ifp=0); + +#define MIN_ACCEPTED 1e-28f #define NUM_DISTINCT_CHARS 5 #define AMBIG_CHAR 4 @@ -182,7 +186,7 @@ public: return conversionTable[input] ; } -} ; +}; #endif diff --git a/PairHMM_JNI/convert_char.cc b/PairHMM_JNI/utils.cc similarity index 62% rename from PairHMM_JNI/convert_char.cc rename to PairHMM_JNI/utils.cc index 01c5a5137..da6509763 100644 --- a/PairHMM_JNI/convert_char.cc +++ b/PairHMM_JNI/utils.cc @@ -1,7 +1,62 @@ +#include "headers.h" #include "template.h" +#include "utils.h" +#include "vector_defs.h" + uint8_t ConvertChar::conversionTable[255]; +float (*g_compute_full_prob_float)(testcase *tc, float* before_last_log) = 0; +double (*g_compute_full_prob_double)(testcase *tc, double* before_last_log) = 0; + using namespace std; +bool is_avx_supported() +{ + int ecx = 0, edx = 0, ebx = 0; + __asm__("cpuid" + : "=b" (ebx), + "=c" (ecx), + "=d" (edx) + : "a" (1) + ); + return ((ecx >> 28)&1) == 1; +} + +bool is_sse42_supported() +{ + int ecx = 0, edx = 0, ebx = 0; + __asm__("cpuid" + : "=b" (ebx), + "=c" (ecx), + "=d" (edx) + : "a" (1) + ); + return ((ecx >> 20)&1) == 1; +} + +void initialize_function_pointers() +{ + //if(false) + if(is_avx_supported()) + { + cout << "Using AVX accelerated implementation of PairHMM\n"; + g_compute_full_prob_float = compute_full_prob_avxs; + g_compute_full_prob_double = compute_full_prob_avxd; + } + else + if(is_sse42_supported()) + { + cout << "Using SSE4.2 accelerated implementation of PairHMM\n"; + g_compute_full_prob_float = compute_full_prob_sses; + g_compute_full_prob_double = compute_full_prob_ssed; + } + else + { + cout << "Using un-vectorized C++ implementation of PairHMM\n"; + g_compute_full_prob_float = compute_full_prob; + g_compute_full_prob_double = compute_full_prob; + } +} + int normalize(char c) { return ((int) (c - 33)); @@ -35,10 +90,10 @@ int read_testcase(testcase *tc, FILE* ifp) tc->ihap = (int *) malloc(tc->haplen*sizeof(int)); tc->irs = (int *) malloc(tc->rslen*sizeof(int)); - //tc->q = (int *) malloc(sizeof(int) * tc->rslen); - //tc->i = (int *) malloc(sizeof(int) * tc->rslen); - //tc->d = (int *) malloc(sizeof(int) * tc->rslen); - //tc->c = (int *) malloc(sizeof(int) * tc->rslen); + tc->q = (char *) malloc(sizeof(char) * tc->rslen); + tc->i = (char *) malloc(sizeof(char) * tc->rslen); + tc->d = (char *) malloc(sizeof(char) * tc->rslen); + tc->c = (char *) malloc(sizeof(char) * tc->rslen); for (x = 0; x < tc->rslen; x++) { @@ -144,18 +199,22 @@ int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat) memcpy(tc->hap, tokens[0].c_str(), tokens[0].size()); tc->rs = new char[tokens[1].size()+2]; tc->rslen = tokens[1].size(); + tc->q = new char[tc->rslen]; + tc->i = new char[tc->rslen]; + tc->d = new char[tc->rslen]; + tc->c = new char[tc->rslen]; //cout << "Lengths "<haplen <<" "<rslen<<"\n"; memcpy(tc->rs, tokens[1].c_str(),tokens[1].size()); assert(tokens.size() == 2 + 4*(tc->rslen)); assert(tc->rslen < MROWS); for(unsigned j=0;jrslen;++j) - tc->q[j] = convToInt(tokens[2+0*tc->rslen+j]); + tc->q[j] = (char)convToInt(tokens[2+0*tc->rslen+j]); for(unsigned j=0;jrslen;++j) - tc->i[j] = convToInt(tokens[2+1*tc->rslen+j]); + tc->i[j] = (char)convToInt(tokens[2+1*tc->rslen+j]); for(unsigned j=0;jrslen;++j) - tc->d[j] = convToInt(tokens[2+2*tc->rslen+j]); + tc->d[j] = (char)convToInt(tokens[2+2*tc->rslen+j]); for(unsigned j=0;jrslen;++j) - tc->c[j] = convToInt(tokens[2+3*tc->rslen+j]); + tc->c[j] = (char)convToInt(tokens[2+3*tc->rslen+j]); if(reformat) { @@ -184,3 +243,20 @@ int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat) return tokens.size(); } + +double getCurrClk() { + struct timeval tv ; + gettimeofday(&tv, NULL); + return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +} + +uint64_t get_time(struct timespec* store_struct) +{ + static struct timespec start_time; + struct timespec curr_time; + struct timespec* ptr = (store_struct == 0) ? &curr_time : store_struct; + clock_gettime(CLOCK_REALTIME, ptr); + uint64_t diff_time = (ptr->tv_sec-start_time.tv_sec)*1000000000+(ptr->tv_nsec-start_time.tv_nsec); + start_time = *ptr; + return diff_time; +} diff --git a/PairHMM_JNI/utils.h b/PairHMM_JNI/utils.h new file mode 100644 index 000000000..cc020ba40 --- /dev/null +++ b/PairHMM_JNI/utils.h @@ -0,0 +1,31 @@ +#ifndef PAIRHMM_UTIL_H +#define PAIRHMM_UTIL_H + +#include "template.h" + +template +std::string to_string(T obj) +{ + std::stringstream ss; + std::string ret_string; + ss.clear(); + ss << std::scientific << obj; + ss >> ret_string; + ss.clear(); + return ret_string; +} +void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true); + +int read_mod_testcase(std::ifstream& fptr, testcase* tc, bool reformat=false); + +bool is_avx_supported(); +bool is_sse42_supported(); +extern float (*g_compute_full_prob_float)(testcase *tc, float *before_last_log); +extern double (*g_compute_full_prob_double)(testcase *tc, double* before_last_log); +void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline); +template +NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log=0); +void initialize_function_pointers(); +double getCurrClk(); +uint64_t get_time(struct timespec* x=0); +#endif diff --git a/PairHMM_JNI/vector_defs.h b/PairHMM_JNI/vector_defs.h new file mode 100644 index 000000000..7958480f2 --- /dev/null +++ b/PairHMM_JNI/vector_defs.h @@ -0,0 +1,30 @@ +#undef SIMD_TYPE +#undef SIMD_TYPE_AVX +#undef SIMD_TYPE_SSE + +#define SIMD_TYPE avx +#define SIMD_TYPE_AVX + +#include "define-float.h" +#include "vector_function_prototypes.h" + +#include "define-double.h" +#include "vector_function_prototypes.h" + +#undef SIMD_TYPE +#undef SIMD_TYPE_AVX + +#define SIMD_TYPE sse +#define SIMD_TYPE_SSE + + +#include "define-sse-float.h" +#include "vector_function_prototypes.h" + +#include "define-sse-double.h" +#include "vector_function_prototypes.h" + +#undef SIMD_TYPE +#undef SIMD_TYPE_AVX +#undef SIMD_TYPE_SSE + diff --git a/PairHMM_JNI/vector_function_prototypes.h b/PairHMM_JNI/vector_function_prototypes.h new file mode 100644 index 000000000..ce9cc2abc --- /dev/null +++ b/PairHMM_JNI/vector_function_prototypes.h @@ -0,0 +1,19 @@ +inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift,SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut); +inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last,SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn); +inline void GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]); +inline void GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess); +inline void GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_,SIMD_TYPE), PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt); +inline void GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec,SIMD_TYPE), PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen); +template inline void GEN_INTRINSIC(GEN_INTRINSIC(initializeVectors,SIMD_TYPE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context ctx, testcase *tc, _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D); +template inline void GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION,SIMD_TYPE), PRECISION)( + int stripIdx, Context ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY, + _256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm, _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM , + _256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, + UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM); +inline _256_TYPE GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM,SIMD_TYPE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256, + _256_TYPE distm, _256_TYPE _1_distm); +inline void GEN_INTRINSIC(GEN_INTRINSIC(computeMXY,SIMD_TYPE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, + UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1, + _256_TYPE pMM, _256_TYPE pGAPM, _256_TYPE pMX, _256_TYPE pXX, _256_TYPE pMY, _256_TYPE pYY, _256_TYPE distmSel); +template NUMBER GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_,SIMD_TYPE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL); + diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 82015d153..b785b8d21 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -1086,7 +1086,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In referenceConfidenceModel.close(); //TODO remove the need to call close here for debugging, the likelihood output stream should be managed //TODO (open & close) at the walker, not the engine. - //likelihoodCalculationEngine.close(); + likelihoodCalculationEngine.close(); logger.info("Ran local assembly on " + result + " active regions"); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 0626f2268..04e64186f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -89,4 +89,6 @@ public interface LikelihoodCalculationEngine { */ public Map computeReadLikelihoods(AssemblyResultSet assemblyResultSet, Map> perSampleReadList); + + public void close(); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java index 0a82e1997..b91805a62 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java @@ -165,8 +165,10 @@ public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculation } } + @Override public void close() { if ( likelihoodsStream != null ) likelihoodsStream.close(); + pairHMMThreadLocal.get().close(); } private void writeDebugLikelihoods(final GATKSAMRecord processedRead, final Haplotype haplotype, final double log10l){ @@ -330,7 +332,12 @@ public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculation } // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases - pairHMMThreadLocal.get().initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); + pairHMMThreadLocal.get().initialize(haplotypes, perSampleReadList, X_METRIC_LENGTH, Y_METRIC_LENGTH); + } + + private void finalizePairHMM() + { + pairHMMThreadLocal.get().finalizeRegion(); } @@ -350,6 +357,9 @@ public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculation map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE); stratifiedReadMap.put(sampleEntry.getKey(), map); } + + //Used mostly by the JNI implementation(s) to free arrays + finalizePairHMM(); return stratifiedReadMap; } diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java index 58b8fc591..6cbd3510c 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java @@ -57,6 +57,7 @@ import org.broadinstitute.variant.variantcontext.Allele; import java.util.List; import java.util.Map; +import java.util.HashMap; /** @@ -67,12 +68,13 @@ import java.util.Map; public class JNILoglessPairHMM extends LoglessPairHMM { private static final boolean debug = false; //simulates ifdef + private static final boolean verify = debug || false; //simulates ifdef private static final boolean debug0_1 = false; //simulates ifdef private static final boolean debug1 = false; //simulates ifdef private static final boolean debug2 = false; private static final boolean debug3 = false; private int numComputeLikelihoodCalls = 0; - + //Used to copy references to byteArrays to JNI from reads protected class JNIReadDataHolderClass { @@ -90,6 +92,7 @@ public class JNILoglessPairHMM extends LoglessPairHMM { } private native void jniGlobalInit(Class readDataHolderClass, Class haplotypeDataHolderClass); + private native void jniClose(); private static boolean isJNILoglessPairHMMLibraryLoaded = false; @@ -104,6 +107,13 @@ public class JNILoglessPairHMM extends LoglessPairHMM { } } + @Override + public void close() + { + jniClose(); + debugClose(); + } + //Used to test parts of the compute kernel separately private native void jniInitialize(final int readMaxLength, final int haplotypeMaxLength); private native static void jniInitializeProbabilities(final double[][] transition, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP); @@ -126,7 +136,7 @@ public class JNILoglessPairHMM extends LoglessPairHMM { @Override public void initialize(final int readMaxLength, final int haplotypeMaxLength) { - if(debug) + if(verify) super.initialize(readMaxLength, haplotypeMaxLength); if(debug3) { @@ -138,6 +148,42 @@ public class JNILoglessPairHMM extends LoglessPairHMM { jniInitialize(readMaxLength, haplotypeMaxLength); } + private HashMap haplotypeToHaplotypeListIdxMap = new HashMap(); + private native void jniInitializeHaplotypes(final int numHaplotypes, JNIHaplotypeDataHolderClass[] haplotypeDataArray); + //Used to transfer data to JNI + //Since the haplotypes are the same for all calls to computeLikelihoods within a region, transfer the haplotypes only once to the JNI per region + /** + * {@inheritDoc} + */ + @Override + public void initialize( final List haplotypes, final Map> perSampleReadList, final int readMaxLength, final int haplotypeMaxLength ) { + if(verify) + super.initialize(haplotypes, perSampleReadList, readMaxLength, haplotypeMaxLength); + int numHaplotypes = haplotypes.size(); + JNIHaplotypeDataHolderClass[] haplotypeDataArray = new JNIHaplotypeDataHolderClass[numHaplotypes]; + int idx = 0; + for(final Haplotype currHaplotype : haplotypes) + { + haplotypeDataArray[idx] = new JNIHaplotypeDataHolderClass(); + haplotypeDataArray[idx].haplotypeBases = currHaplotype.getBases(); + haplotypeToHaplotypeListIdxMap.put(currHaplotype, idx); + ++idx; + } + jniInitializeHaplotypes(numHaplotypes, haplotypeDataArray); + } + + private native void jniFinalizeRegion(); + //Tell JNI to release arrays - really important if native code is directly accessing Java memory, if not + //accessing Java memory directly, still important to release memory from C++ + /** + * {@inheritDoc} + */ + @Override + public void finalizeRegion() + { + jniFinalizeRegion(); + } + //Real compute kernel private native void jniComputeLikelihoods(int numReads, int numHaplotypes, JNIReadDataHolderClass[] readDataArray, JNIHaplotypeDataHolderClass[] haplotypeDataArray, @@ -149,19 +195,20 @@ public class JNILoglessPairHMM extends LoglessPairHMM { public PerReadAlleleLikelihoodMap computeLikelihoods(final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap) { // (re)initialize the pairHMM only if necessary - final int readMaxLength = debug ? findMaxReadLength(reads) : 0; - final int haplotypeMaxLength = debug ? findMaxHaplotypeLength(alleleHaplotypeMap) : 0; - if(debug) + final int readMaxLength = verify ? findMaxReadLength(reads) : 0; + final int haplotypeMaxLength = verify ? findMaxHaplotypeLength(alleleHaplotypeMap) : 0; + if(verify) { if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) { initialize(readMaxLength, haplotypeMaxLength); } if ( ! initialized ) - throw new IllegalStateException("Must call initialize before calling jniComputeLikelihoods in debug mode"); + throw new IllegalStateException("Must call initialize before calling jniComputeLikelihoods in debug/verify mode"); } int readListSize = reads.size(); - int alleleHaplotypeMapSize = alleleHaplotypeMap.size(); + int numHaplotypes = alleleHaplotypeMap.size(); + int numTestcases = readListSize*numHaplotypes; if(debug0_1) - System.out.println("Java numReads "+readListSize+" numHaplotypes "+alleleHaplotypeMapSize); + System.out.println("Java numReads "+readListSize+" numHaplotypes "+numHaplotypes); JNIReadDataHolderClass[] readDataArray = new JNIReadDataHolderClass[readListSize]; int idx = 0; for(GATKSAMRecord read : reads) @@ -188,39 +235,68 @@ public class JNILoglessPairHMM extends LoglessPairHMM { } ++idx; } - JNIHaplotypeDataHolderClass[] haplotypeDataArray = new JNIHaplotypeDataHolderClass[alleleHaplotypeMapSize]; - idx = 0; - for (Map.Entry currEntry : alleleHaplotypeMap.entrySet()) //order is important - access in same order always + + JNIHaplotypeDataHolderClass[] haplotypeDataArray = new JNIHaplotypeDataHolderClass[numHaplotypes]; + if(verify) { - haplotypeDataArray[idx] = new JNIHaplotypeDataHolderClass(); - haplotypeDataArray[idx].haplotypeBases = currEntry.getValue().getBases(); - if(debug0_1) - System.out.println("Java haplotype length "+haplotypeDataArray[idx].haplotypeBases.length); - if(debug3) + idx = 0; + for (Map.Entry currEntry : alleleHaplotypeMap.entrySet()) //order is important - access in same order always { - for(int i=0;i currEntry : alleleHaplotypeMap.entrySet())//order is important - access in same order always { - likelihoodMap.add(read, currEntry.getKey(), likelihoodArray[idx]); + //Since the order of haplotypes in the List and alleleHaplotypeMap is different, + //get idx of current haplotype in the list and use this idx to get the right likelihoodValue + idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(currEntry.getValue()); + likelihoodMap.add(read, currEntry.getKey(), likelihoodArray[readIdx + idxInsideHaplotypeList]); ++idx; } - if(debug) + readIdx += numHaplotypes; + } + if(verify) { + //re-order values in likelihoodArray + double[] tmpArray = new double[numHaplotypes]; + idx = 0; + idxInsideHaplotypeList = 0; + readIdx = 0; + for(GATKSAMRecord read : reads) + { + for(int j=0;j currEntry : alleleHaplotypeMap.entrySet())//order is important - access in same order always + { + idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(currEntry.getValue()); + likelihoodArray[idx] = tmpArray[idxInsideHaplotypeList]; + ++idx; + } + readIdx += numHaplotypes; + } //to compare values likelihoodMap = super.computeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap); //for floating point values, no exact equality @@ -246,23 +322,23 @@ public class JNILoglessPairHMM extends LoglessPairHMM { if(toDump) { idx = 0; - System.out.println("Dump : Java numReads "+readListSize+" numHaplotypes "+alleleHaplotypeMapSize); + System.out.println("Dump : Java numReads "+readListSize+" numHaplotypes "+numHaplotypes); for(int i=0;i haplotypes, final Map> perSampleReadList, final int readMaxLength, final int haplotypeMaxLength ) { + initialize(readMaxLength, haplotypeMaxLength); + } + protected int findMaxReadLength(final List reads) { int listMaxReadLength = 0; for(GATKSAMRecord read : reads){ @@ -280,4 +300,7 @@ public abstract class PairHMM { return Math.min(haplotype1.length, haplotype2.length); } + + //Called at the end of all HC calls + public void close() { ; } }