diff --git a/PairHMM_JNI/.gitignore b/PairHMM_JNI/.gitignore new file mode 100644 index 000000000..cb70b53a7 --- /dev/null +++ b/PairHMM_JNI/.gitignore @@ -0,0 +1,10 @@ +.svn +*.o +*.so +tests +.deps +hmm_Mohammad +pairhmm-template-main +*.swp +checker +reformat diff --git a/PairHMM_JNI/Makefile b/PairHMM_JNI/Makefile new file mode 100644 index 000000000..70bf96dd3 --- /dev/null +++ b/PairHMM_JNI/Makefile @@ -0,0 +1,47 @@ +OMPCFLAGS=-fopenmp +#OMPLDFLAGS=-lgomp + +#CFLAGS=-O2 -std=c++11 -W -Wall -march=corei7-avx -Wa,-q -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas +#CFLAGS=-O2 -W -Wall -march=corei7 -mfpmath=sse -msse4.2 -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas + +JAVA_ROOT=/opt/jdk1.7.0_25/ +JNI_COMPILATION_FLAGS=-D_REENTRANT -fPIC -I${JAVA_ROOT}/include -I${JAVA_ROOT}/include/linux + +CFLAGS=-O3 -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas -xAVX + +CXXFLAGS=$(CFLAGS) +CC=icc +CXX=icc + +LDFLAGS=-lm $(OMPLDFLAGS) + +#BIN:=pairhmm-1-base #pairhmm-2-omp pairhmm-3-hybrid-float-double pairhmm-4-hybrid-diagonal pairhmm-5-hybrid-diagonal-homogeneus pairhmm-6-onlythreediags pairhmm-7-presse pairhmm-8-sse #pairhmm-dev +BIN:=libJNILoglessPairHMM.so pairhmm-template-main checker + +#SOURCES=pairhmm-1-base.cc input.cc +LIBSOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc convert_char.cc +SOURCES=$(LIBSOURCES) pairhmm-template-main.cc pairhmm-1-base.cc +LIBOBJECTS=$(LIBSOURCES:.cc=.o) +DEPDIR=.deps +DF=$(DEPDIR)/$(*).d + +all: $(BIN) + +-include $(addprefix $(DEPDIR)/,$(SOURCES:.cc=.d)) + +checker: pairhmm-1-base.o convert_char.o + $(CXX) $(OMPCFLAGS) -o $@ $^ $(LDFLAGS) + +pairhmm-template-main: pairhmm-template-main.o convert_char.o + $(CXX) $(OMPCFLAGS) -o $@ $^ $(LDFLAGS) + +libJNILoglessPairHMM.so: $(LIBOBJECTS) + $(CXX) $(OMPCFLAGS) -shared -o $@ $(LIBOBJECTS) + +%.o: %.cc + @mkdir -p $(DEPDIR) + $(COMPILE.cpp) -MMD -MF $(DF) $(JNI_COMPILATION_FLAGS) $(CXXFLAGS) $(OUTPUT_OPTION) $< + + +clean: + rm -rf $(BIN) *.o $(DEPDIR) diff --git a/PairHMM_JNI/convert_char.cc b/PairHMM_JNI/convert_char.cc new file mode 100644 index 000000000..01c5a5137 --- /dev/null +++ b/PairHMM_JNI/convert_char.cc @@ -0,0 +1,186 @@ +#include "template.h" +uint8_t ConvertChar::conversionTable[255]; +using namespace std; + +int normalize(char c) +{ + return ((int) (c - 33)); +} + +int read_testcase(testcase *tc, FILE* ifp) +{ + char *q, *i, *d, *c, *line = NULL; + int _q, _i, _d, _c; + int x, size = 0; + ssize_t read; + + read = getline(&line, (size_t *) &size, ifp == 0 ? stdin : ifp); + if (read == -1) + return -1; + + + tc->hap = (char *) malloc(size); + tc->rs = (char *) malloc(size); + q = (char *) malloc(size); + i = (char *) malloc(size); + d = (char *) malloc(size); + c = (char *) malloc(size); + + if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6) + return -1; + + tc->haplen = strlen(tc->hap); + tc->rslen = strlen(tc->rs); + assert(tc->rslen < MROWS); + tc->ihap = (int *) malloc(tc->haplen*sizeof(int)); + tc->irs = (int *) malloc(tc->rslen*sizeof(int)); + + //tc->q = (int *) malloc(sizeof(int) * tc->rslen); + //tc->i = (int *) malloc(sizeof(int) * tc->rslen); + //tc->d = (int *) malloc(sizeof(int) * tc->rslen); + //tc->c = (int *) malloc(sizeof(int) * tc->rslen); + + for (x = 0; x < tc->rslen; x++) + { + _q = normalize(q[x]); + _i = normalize(i[x]); + _d = normalize(d[x]); + _c = normalize(c[x]); + tc->q[x] = (_q < 6) ? 6 : _q; + tc->i[x] = _i; + tc->d[x] = _d; + tc->c[x] = _c; + tc->irs[x] = tc->rs[x]; + } + for (x = 0; x < tc->haplen; x++) + tc->ihap[x] = tc->hap[x]; + + + free(q); + free(i); + free(d); + free(c); + free(line); + + return 0; +} + +unsigned MAX_LINE_LENGTH = 65536; +int convToInt(std::string s) +{ + int i; + std::istringstream strin(s); + strin >> i; + return i; +} + +void tokenize(std::ifstream& fptr, std::vector& tokens) +{ + int i = 0; + std::string tmp; + std::vector myVec; + vector line; + line.clear(); + line.resize(MAX_LINE_LENGTH); + vector tmpline; + tmpline.clear(); + tmpline.resize(MAX_LINE_LENGTH); + myVec.clear(); + + while(!fptr.eof()) + { + i = 0; + bool still_read_line = true; + unsigned line_position = 0; + while(still_read_line) + { + fptr.getline(&(tmpline[0]), MAX_LINE_LENGTH); + if(line_position + MAX_LINE_LENGTH > line.size()) + line.resize(2*line.size()); + for(unsigned i=0;i> std::skipws >> tmp; + if(tmp != "") + { + myVec.push_back(tmp); + ++i; + //std::cout < 0) + break; + } + tokens.clear(); + //std::cout << "Why "< tokens; + tokens.clear(); + tokenize(fptr, tokens); + if(tokens.size() == 0) + return -1; + tc->hap = new char[tokens[0].size()+2]; + tc->haplen = tokens[0].size(); + memcpy(tc->hap, tokens[0].c_str(), tokens[0].size()); + tc->rs = new char[tokens[1].size()+2]; + tc->rslen = tokens[1].size(); + //cout << "Lengths "<haplen <<" "<rslen<<"\n"; + memcpy(tc->rs, tokens[1].c_str(),tokens[1].size()); + assert(tokens.size() == 2 + 4*(tc->rslen)); + assert(tc->rslen < MROWS); + for(unsigned j=0;jrslen;++j) + tc->q[j] = convToInt(tokens[2+0*tc->rslen+j]); + for(unsigned j=0;jrslen;++j) + tc->i[j] = convToInt(tokens[2+1*tc->rslen+j]); + for(unsigned j=0;jrslen;++j) + tc->d[j] = convToInt(tokens[2+2*tc->rslen+j]); + for(unsigned j=0;jrslen;++j) + tc->c[j] = convToInt(tokens[2+3*tc->rslen+j]); + + if(reformat) + { + ofstream ofptr; + ofptr.open("reformat/debug_dump.txt",first_call ? ios::out : ios::app); + assert(ofptr.is_open()); + ofptr << tokens[0] << " "; + ofptr << tokens[1] << " "; + for(unsigned j=0;jrslen;++j) + ofptr << ((char)(tc->q[j]+33)); + ofptr << " "; + for(unsigned j=0;jrslen;++j) + ofptr << ((char)(tc->i[j]+33)); + ofptr << " "; + for(unsigned j=0;jrslen;++j) + ofptr << ((char)(tc->d[j]+33)); + ofptr << " "; + for(unsigned j=0;jrslen;++j) + ofptr << ((char)(tc->c[j]+33)); + ofptr << " 0 false\n"; + + ofptr.close(); + first_call = false; + } + + + return tokens.size(); +} diff --git a/PairHMM_JNI/define-double.h b/PairHMM_JNI/define-double.h new file mode 100644 index 000000000..5d31741d6 --- /dev/null +++ b/PairHMM_JNI/define-double.h @@ -0,0 +1,158 @@ +#include + +#ifdef PRECISION + #undef PRECISION + #undef MAIN_TYPE + #undef MAIN_TYPE_SIZE + #undef UNION_TYPE + #undef IF_128 + #undef IF_MAIN_TYPE + #undef SHIFT_CONST1 + #undef SHIFT_CONST2 + #undef SHIFT_CONST3 + #undef _128_TYPE + #undef _256_TYPE + #undef AVX_LENGTH + #undef MAVX_COUNT + #undef HAP_TYPE + #undef MASK_TYPE + #undef MASK_ALL_ONES + + #undef SET_VEC_ZERO + #undef VEC_OR + #undef VEC_ADD + #undef VEC_SUB + #undef VEC_MUL + #undef VEC_DIV + #undef VEC_BLEND + #undef VEC_BLENDV + #undef VEC_CAST_256_128 + #undef VEC_EXTRACT_128 + #undef VEC_EXTRACT_UNIT + #undef VEC_SET1_VAL128 + #undef VEC_MOVE + #undef VEC_CAST_128_256 + #undef VEC_INSERT_VAL + #undef VEC_CVT_128_256 + #undef VEC_SET1_VAL + #undef VEC_POPCVT_CHAR + #undef VEC_LDPOPCVT_CHAR + #undef VEC_CMP_EQ + #undef VEC_SET_LSE + #undef SHIFT_HAP + #undef print256b + #undef MASK_VEC + #undef VEC_SSE_TO_AVX + #undef VEC_SHIFT_LEFT_1BIT + #undef MASK_ALL_ONES + #undef COMPARE_VECS + #undef _256_INT_TYPE +#endif + +#define PRECISION d +#define MAIN_TYPE double +#define MAIN_TYPE_SIZE 64 +#define UNION_TYPE mix_D +#define IF_128 IF_128d +#define IF_MAIN_TYPE IF_64 +#define SHIFT_CONST1 8 +#define SHIFT_CONST2 1 +#define SHIFT_CONST3 8 +#define _128_TYPE __m128d +#define _256_TYPE __m256d +#define _256_INT_TYPE __m256i +#define AVX_LENGTH 4 +#define MAVX_COUNT (MROWS+7)/AVX_LENGTH +#define HAP_TYPE __m128i +#define MASK_TYPE uint64_t +#define MASK_ALL_ONES 0xFFFFFFFFFFFFFFFF +#define MASK_VEC MaskVec_D + +#define SET_VEC_ZERO(__vec) \ + __vec= _mm256_setzero_pd() + +#define VEC_OR(__v1, __v2) \ + _mm256_or_pd(__v1, __v2) + +#define VEC_ADD(__v1, __v2) \ + _mm256_add_pd(__v1, __v2) + +#define VEC_SUB(__v1, __v2) \ + _mm256_sub_pd(__v1, __v2) + +#define VEC_MUL(__v1, __v2) \ + _mm256_mul_pd(__v1, __v2) + +#define VEC_DIV(__v1, __v2) \ + _mm256_div_pd(__v1, __v2) + +#define VEC_BLEND(__v1, __v2, __mask) \ + _mm256_blend_pd(__v1, __v2, __mask) + +#define VEC_BLENDV(__v1, __v2, __maskV) \ + _mm256_blendv_pd(__v1, __v2, __maskV) + +#define VEC_CAST_256_128(__v1) \ + _mm256_castpd256_pd128 (__v1) + +#define VEC_EXTRACT_128(__v1, __im) \ + _mm256_extractf128_pd (__v1, __im) + +#define VEC_EXTRACT_UNIT(__v1, __im) \ + _mm_extract_epi64(__v1, __im) + +#define VEC_SET1_VAL128(__val) \ + _mm_set1_pd(__val) + +#define VEC_MOVE(__v1, __val) \ + _mm_move_sd(__v1, __val) + +#define VEC_CAST_128_256(__v1) \ + _mm256_castpd128_pd256(__v1) + +#define VEC_INSERT_VAL(__v1, __val, __pos) \ + _mm256_insertf128_pd(__v1, __val, __pos) + +#define VEC_CVT_128_256(__v1) \ + _mm256_cvtepi32_pd(__v1) + +#define VEC_SET1_VAL(__val) \ + _mm256_set1_pd(__val) + +#define VEC_POPCVT_CHAR(__ch) \ + _mm256_cvtepi32_pd(_mm_set1_epi32(__ch)) + +#define VEC_LDPOPCVT_CHAR(__addr) \ + _mm256_cvtepi32_pd(_mm_load_si128((__m128i const *)__addr)) + +#define VEC_CMP_EQ(__v1, __v2) \ + _mm256_cmp_pd(__v1, __v2, _CMP_EQ_OQ) + +#define VEC_SET_LSE(__val) \ + _mm256_set_pd(zero, zero, zero, __val); + +#define SHIFT_HAP(__v1, __val) \ + __v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0) + +#define print256b(__v1) \ + print256bDP(__v1) + +#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ + __vdst = _mm256_castpd128_pd256(__vsLow) ; \ + __vdst = _mm256_insertf128_pd(__vdst, __vsHigh, 1) ; + +#define VEC_SHIFT_LEFT_1BIT(__vs) \ + __vs = _mm_slli_epi64(__vs, 1) + + +#define COMPARE_VECS(__v1, __v2, __first, __last) { \ + double* ptr1 = (double*) (&__v1) ; \ + double* ptr2 = (double*) (&__v2) ; \ + for (int ei=__first; ei <= __last; ++ei) { \ + if (ptr1[ei] != ptr2[ei]) { \ + std::cout << "Double Mismatch at " << ei << ": " \ + << ptr1[ei] << " vs. " << ptr2[ei] << std::endl ; \ + exit(0) ; \ + } \ + } \ + } diff --git a/PairHMM_JNI/define-float.h b/PairHMM_JNI/define-float.h new file mode 100644 index 000000000..0e2822b2d --- /dev/null +++ b/PairHMM_JNI/define-float.h @@ -0,0 +1,159 @@ +#include + +#ifdef PRECISION + #undef PRECISION + #undef MAIN_TYPE + #undef MAIN_TYPE_SIZE + #undef UNION_TYPE + #undef IF_128 + #undef IF_MAIN_TYPE + #undef SHIFT_CONST1 + #undef SHIFT_CONST2 + #undef SHIFT_CONST3 + #undef _128_TYPE + #undef _256_TYPE + #undef AVX_LENGTH + #undef MAVX_COUNT + #undef HAP_TYPE + #undef MASK_TYPE + #undef MASK_ALL_ONES + + #undef SET_VEC_ZERO(__vec) + #undef VEC_OR(__v1, __v2) + #undef VEC_ADD(__v1, __v2) + #undef VEC_SUB(__v1, __v2) + #undef VEC_MUL(__v1, __v2) + #undef VEC_DIV(__v1, __v2) + #undef VEC_BLEND(__v1, __v2, __mask) + #undef VEC_BLENDV(__v1, __v2, __maskV) + #undef VEC_CAST_256_128(__v1) + #undef VEC_EXTRACT_128(__v1, __im) + #undef VEC_EXTRACT_UNIT(__v1, __im) + #undef VEC_SET1_VAL128(__val) + #undef VEC_MOVE(__v1, __val) + #undef VEC_CAST_128_256(__v1) + #undef VEC_INSERT_VAL(__v1, __val, __pos) + #undef VEC_CVT_128_256(__v1) + #undef VEC_SET1_VAL(__val) + #undef VEC_POPCVT_CHAR(__ch) + #undef VEC_LDPOPCVT_CHAR(__addr) + #undef VEC_CMP_EQ(__v1, __v2) + #undef VEC_SET_LSE(__val) + #undef SHIFT_HAP(__v1, __val) + #undef print256b(__v1) + #undef MASK_VEC + #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) + #undef VEC_SHIFT_LEFT_1BIT(__vs) + #undef MASK_ALL_ONES + #undef COMPARE_VECS(__v1, __v2) + #undef _256_INT_TYPE +#endif + +#define PRECISION s + +#define MAIN_TYPE float +#define MAIN_TYPE_SIZE 32 +#define UNION_TYPE mix_F +#define IF_128 IF_128f +#define IF_MAIN_TYPE IF_32 +#define SHIFT_CONST1 12 +#define SHIFT_CONST2 3 +#define SHIFT_CONST3 4 +#define _128_TYPE __m128 +#define _256_TYPE __m256 +#define _256_INT_TYPE __m256i +#define AVX_LENGTH 8 +#define MAVX_COUNT (MROWS+7)/AVX_LENGTH +#define HAP_TYPE UNION_TYPE +#define MASK_TYPE uint32_t +#define MASK_ALL_ONES 0xFFFFFFFF +#define MASK_VEC MaskVec_F + +#define SET_VEC_ZERO(__vec) \ + __vec= _mm256_setzero_ps() + +#define VEC_OR(__v1, __v2) \ + _mm256_or_ps(__v1, __v2) + +#define VEC_ADD(__v1, __v2) \ + _mm256_add_ps(__v1, __v2) + +#define VEC_SUB(__v1, __v2) \ + _mm256_sub_ps(__v1, __v2) + +#define VEC_MUL(__v1, __v2) \ + _mm256_mul_ps(__v1, __v2) + +#define VEC_DIV(__v1, __v2) \ + _mm256_div_ps(__v1, __v2) + +#define VEC_BLEND(__v1, __v2, __mask) \ + _mm256_blend_ps(__v1, __v2, __mask) + +#define VEC_BLENDV(__v1, __v2, __maskV) \ + _mm256_blendv_ps(__v1, __v2, __maskV) + +#define VEC_CAST_256_128(__v1) \ + _mm256_castps256_ps128 (__v1) + +#define VEC_EXTRACT_128(__v1, __im) \ + _mm256_extractf128_ps (__v1, __im) + +#define VEC_EXTRACT_UNIT(__v1, __im) \ + _mm_extract_epi32(__v1, __im) + +#define VEC_SET1_VAL128(__val) \ + _mm_set1_ps(__val) + +#define VEC_MOVE(__v1, __val) \ + _mm_move_ss(__v1, __val) + +#define VEC_CAST_128_256(__v1) \ + _mm256_castps128_ps256(__v1) + +#define VEC_INSERT_VAL(__v1, __val, __pos) \ + _mm256_insertf128_ps(__v1, __val, __pos) + +#define VEC_CVT_128_256(__v1) \ + _mm256_cvtepi32_ps(__v1.i) + +#define VEC_SET1_VAL(__val) \ + _mm256_set1_ps(__val) + +#define VEC_POPCVT_CHAR(__ch) \ + _mm256_cvtepi32_ps(_mm256_set1_epi32(__ch)) + +#define VEC_LDPOPCVT_CHAR(__addr) \ + _mm256_cvtepi32_ps(_mm256_loadu_si256((__m256i const *)__addr)) + +#define VEC_CMP_EQ(__v1, __v2) \ + _mm256_cmp_ps(__v1, __v2, _CMP_EQ_OQ) + +#define VEC_SET_LSE(__val) \ + _mm256_set_ps(zero, zero, zero, zero, zero, zero, zero, __val); + +#define SHIFT_HAP(__v1, __val) \ + _vector_shift_lasts(__v1, __val.f); + +#define print256b(__v1) \ + print256bFP(__v1) + +#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ + __vdst = _mm256_castps128_ps256(__vsLow) ; \ + __vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ; + +#define VEC_SHIFT_LEFT_1BIT(__vs) \ + __vs = _mm_slli_epi32(__vs, 1) + +#define COMPARE_VECS(__v1, __v2, __first, __last) { \ + float* ptr1 = (float*) (&__v1) ; \ + float* ptr2 = (float*) (&__v2) ; \ + for (int ei=__first; ei <= __last; ++ei) { \ + if (ptr1[ei] != ptr2[ei]) { \ + std::cout << "Float Mismatch at " << ei << ": " \ + << ptr1[ei] << " vs. " << ptr2[ei] << std::endl ; \ + exit(0) ; \ + } \ + } \ + } + diff --git a/PairHMM_JNI/hmm_mask.cc b/PairHMM_JNI/hmm_mask.cc new file mode 100644 index 000000000..c335a0160 --- /dev/null +++ b/PairHMM_JNI/hmm_mask.cc @@ -0,0 +1,451 @@ +#include +#include +#include +#include + + +// /usr/intel/pkgs/icc/13.0.0e/bin/icc -o ed -O3 ed.cpp -xAVX -openmp -openmp-link static + + +#include + +#include +#include +#include +#include +#include +#include + + +#include +#include +#include +#include +#include + + +#include +#include +#include +#include +#include +#include +#include "template.h" + +using namespace std ; + +#define CHECK_MASK_CORRECTNESS + +template +string getBinaryStr (T val, int numBitsToWrite) { + + ostringstream oss ; + uint64_t mask = ((T) 0x1) << (numBitsToWrite-1) ; + for (int i=numBitsToWrite-1; i >= 0; --i) { + oss << ((val & mask) >> i) ; + mask >>= 1 ; + } + return oss.str() ; +} + +int normalize(char c) +{ + return ((int) (c - 33)); +} + + +uint8_t ConvertChar::conversionTable[255] ; + +int read_testcase(testcase *tc, FILE* ifp) +{ + char *q, *i, *d, *c, *line = NULL; + int _q, _i, _d, _c; + int x, size = 0; + ssize_t read; + + read = getline(&line, (size_t *) &size, ifp == 0 ? stdin : ifp); + if (read == -1) + return -1; + + + tc->hap = (char *) malloc(size); + tc->rs = (char *) malloc(size); + q = (char *) malloc(size); + i = (char *) malloc(size); + d = (char *) malloc(size); + c = (char *) malloc(size); + + if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6) + return -1; + + tc->haplen = strlen(tc->hap); + tc->rslen = strlen(tc->rs); + assert(tc->rslen < MROWS); + tc->ihap = (int *) malloc(tc->haplen*sizeof(int)); + tc->irs = (int *) malloc(tc->rslen*sizeof(int)); + + //tc->q = (int *) malloc(sizeof(int) * tc->rslen); + //tc->i = (int *) malloc(sizeof(int) * tc->rslen); + //tc->d = (int *) malloc(sizeof(int) * tc->rslen); + //tc->c = (int *) malloc(sizeof(int) * tc->rslen); + + for (x = 0; x < tc->rslen; x++) + { + _q = normalize(q[x]); + _i = normalize(i[x]); + _d = normalize(d[x]); + _c = normalize(c[x]); + tc->q[x] = (_q < 6) ? 6 : _q; + tc->i[x] = _i; + tc->d[x] = _d; + tc->c[x] = _c; + tc->irs[x] = tc->rs[x]; + } + for (x = 0; x < tc->haplen; x++) + tc->ihap[x] = tc->hap[x]; + + + free(q); + free(i); + free(d); + free(c); + free(line); + + return 0; +} + + + + +#define ALIGN __attribute__ ((aligned(32))) + + +typedef union ALIGN { + //__m256i vi; + __m128 vf ; + __m128i vi ; + __m128d vd; + + uint8_t b[16]; + //uint16_t hw[8] ; + uint32_t w[4] ; + uint64_t dw[2] ; + + float f[4] ; + //double d[2] ; + +} v128 ; + +typedef union ALIGN { + __m256i vi; + __m256 vf ; + __m256d vd; + //__m128i vi128[2] ; + + uint8_t b[32]; + //uint16_t hw[16] ; + uint32_t w[8] ; + uint64_t dw[4] ; + + float f[8] ; + double d[4] ; + +} v256 ; + + +#define NUM_DISTINCT_CHARS 5 +#define AMBIG_CHAR 4 + +#define VEC_ENTRY_CNT 8 +#define VEC_LEN 256 +#define VTYPE vf +#define VTYPEI vi +#define VENTRY f +#define VENTRYI w +#define MTYPE uint32_t +#define MASK_ALL_ONES 0xFFFFFFFF + + +#define VECTOR v256 +#define VECTOR_SSE v128 + +#define SET_VEC_ZERO(__vec) \ + __vec= _mm256_setzero_ps() + +#define SET_VEC_ONES(__vec) \ + __vec = _mm256_set1_epi32(0xFFFFFFFF) + +#define VEC_OR(__v1, __v2) \ + _mm256_or_ps(__v1, __v2) + +#define VEC_ADD(__v1, __v2) \ + _mm256_add_ps(__v1, __v2) + +#define VEC_MUL(__v1, __v2) \ + _mm256_mul_ps(__v1, __v2) + +#define VEC_BLENDV(__v1, __v2, __maskV) \ + _mm256_blendv_ps(__v1, __v2, __maskV) + +#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ + __vdst = _mm256_castps128_ps256(__vsLow) ; \ + __vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ; + +#define VECS_SHIFT_LEFT_1BIT(__vs) \ + __vs = _mm_slli_epi32(__vs, 1) + + +#define VECS_SHIFT_RIGHT_WHOLE(__vs, __cnt) { \ + uint64_t __mask = ; \ + uint64_t __shiftWord = ((((uint64_t) 0x1) << __cnt) - 1 ) & __vs.dw[1] ; \ + __shiftWord <<= (64-cnt) ; \ + __vs = _mm_slri_epi64(__vs, __cnt) ; \ + __vs.dw[0] |= __shiftWord ; \ + } + + +#define GET_MASK_WORD_OLD(__mask, __lastShiftOut, __shiftBy, __maskBitCnt){ \ + MTYPE __bitMask = (((MTYPE)0x1) << __shiftBy) - 1 ; \ + MTYPE __nextShiftOut = (__mask & __bitMask) << (__maskBitCnt - __shiftBy) ; \ + __mask >>= __shiftBy ; \ + __mask |= __lastShiftOut ; \ + __lastShiftOut = __nextShiftOut ; \ + } + + +#define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \ + MTYPE __bitMask = (((MTYPE)0x1) << __shiftBy) - 1 ; \ + MTYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \ + __dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ; \ + __lastShiftOut = __nextShiftOut ; \ +} + + +void precompute_masks_avx(const testcase& tc, int COLS, int numMaskVecs, + MTYPE (*maskArr)[NUM_DISTINCT_CHARS]) { + + const int maskBitCnt = VEC_LEN / VEC_ENTRY_CNT ; + + for (int vi=0; vi < numMaskVecs; ++vi) { + for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) { + maskArr[vi][rs] = 0 ; + } + maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ; + } + + for (int col=1; col < COLS; ++col) { + int mIndex = (col-1) / maskBitCnt ; + int mOffset = (col-1) % maskBitCnt ; + MTYPE bitMask = ((MTYPE)0x1) << (maskBitCnt-1-mOffset) ; + + char hapChar = tc.hap[col-1] ; + + if (hapChar == AMBIG_CHAR) { + maskArr[mIndex][0] |= bitMask ; + maskArr[mIndex][1] |= bitMask ; + maskArr[mIndex][2] |= bitMask ; + maskArr[mIndex][3] |= bitMask ; + } + + //cout << hapChar << " " << mIndex << " " << getBinaryStr(bitMask, 32) + // << endl ; + //cout << getBinaryStr(maskArr[0][hapChar],32) << endl ; + //exit(0) ; + + maskArr[mIndex][ConvertChar::get(hapChar)] |= bitMask ; + + + // bit corresponding to col 1 will be the MSB of the mask 0 + // bit corresponding to col 2 will be the MSB-1 of the mask 0 + // ... + // bit corresponding to col 32 will be the LSB of the mask 0 + // bit corresponding to col 33 will be the MSB of the mask 1 + // ... + } + +} + + +void test_mask_computations (testcase& tc, int tcID, bool printDebug=false) { + + int ROWS = tc.rslen + 1 ; + int COLS = tc.haplen + 1 ; + + // only for testing + VECTOR mismatchData, matchData ; + //SET_VEC_ZERO(mismatchData.VTYPE) ; + //SET_VEC_ONES(matchData.VTYPEI) ; + for (int ei=0; ei < VEC_ENTRY_CNT; ++ei) { + matchData.VENTRY[ei] = 1.0 ; + mismatchData.VENTRY[ei] = 0.0 ; + } + + const int maskBitCnt = VEC_LEN / VEC_ENTRY_CNT ; + const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function + + MTYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ; + precompute_masks_avx(tc, COLS, numMaskVecs, maskArr) ; + +#ifdef DEBUG + if (printDebug) { + cout << "The first 32 hap chars are: " ; + for (int i=0; i < 32; ++i) { + cout << tc.hap[i] ; + } + cout << endl ; + + cout << "Masks computed for A, C, T, G, N are: " << endl ; + cout << getBinaryStr(maskArr[0][0], 32) << endl ; + cout << getBinaryStr(maskArr[0][1], 32) << endl ; + cout << getBinaryStr(maskArr[0][2], 32) << endl ; + cout << getBinaryStr(maskArr[0][3], 32) << endl ; + cout << getBinaryStr(maskArr[0][4], 32) << endl ; + } +#endif // #ifdef DEBUG + + int beginRowIndex = 1 ; + while (beginRowIndex < ROWS) { + + int numRowsToProcess = min(VEC_ENTRY_CNT, ROWS - beginRowIndex) ; + + char rsArr[VEC_ENTRY_CNT] ; + for (int ri=0; ri < numRowsToProcess; ++ri) { + rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ; + } + + // Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors + VECTOR_SSE currMaskVecLow ; // corresponding to entries 0-3 + VECTOR_SSE currMaskVecHigh ; // corresponding to entries 4-7 + + MTYPE lastMaskShiftOut[VEC_ENTRY_CNT] ; + for (int ei=0; ei < VEC_ENTRY_CNT; ++ei) + lastMaskShiftOut[ei] = 0 ; + + int col = 1 ; + int diag = 1 ; + for (int maskIndex=0; maskIndex < numMaskVecs; ++maskIndex) { + // set up the mask vectors for the next maskBitCnt columns + + // For AVX, maskBitCnt = 32 (so, the operation below is amortized over 32 cols) + for (int ei=0; ei < VEC_ENTRY_CNT/2; ++ei) { + SET_MASK_WORD(currMaskVecLow.VENTRYI[ei], maskArr[maskIndex][rsArr[ei]], + lastMaskShiftOut[ei], ei, maskBitCnt) ; + + int ei2 = ei + VEC_ENTRY_CNT/2 ; // the second entry index + SET_MASK_WORD(currMaskVecHigh.VENTRYI[ei], maskArr[maskIndex][rsArr[ei2]], + lastMaskShiftOut[ei2], ei2, maskBitCnt) ; + } + +#ifdef DEBUG + if (printDebug && maskIndex == 0) { + cout << "The masks for entry 1: " << endl + << getBinaryStr(maskArr[0][rsArr[1]], 32) << endl + << getBinaryStr(currMaskVecLow.VENTRYI[1], 32) << endl ; + + } +#endif // #ifdef DEBUG + + // iterate over mask bit indices and columns + for (int mbi=0; mbi < maskBitCnt && diag < COLS + ROWS -2; ++mbi, ++diag) { + + VECTOR maskV ; + VEC_SSE_TO_AVX(currMaskVecLow.VTYPE, currMaskVecHigh.VTYPE, maskV.VTYPE) ; + + VECTOR testData ; + testData.VTYPE = VEC_BLENDV(mismatchData.VTYPE, matchData.VTYPE, + maskV.VTYPE) ; + + VECS_SHIFT_LEFT_1BIT(currMaskVecLow.VTYPEI) ; + VECS_SHIFT_LEFT_1BIT(currMaskVecHigh.VTYPEI) ; + +#ifdef DEBUG + if (printDebug && maskIndex == 0) { + cout << "The mask for entry 1, mbi=" << mbi << ": " + << getBinaryStr(maskV.VENTRYI[1], 32) << endl ; + + } +#endif // #ifdef DEBUG + +#ifdef CHECK_MASK_CORRECTNESS + + int firstRowIndex = (diag < COLS) ? 0 : (diag - COLS) ; + int lastRowIndex = min(col-1, numRowsToProcess-1) ; + + for (int ri=firstRowIndex; ri <= lastRowIndex; ++ri) { + int currRow = beginRowIndex + ri ; + int currCol = col - ri + firstRowIndex ; + + char hapChar = tc.hap[currCol-1] ; + char rsChar = tc.rs[currRow-1] ; + + bool match = (hapChar == rsChar || hapChar == 'N' || rsChar == 'N') ; + + if ((bool) testData.VENTRYI[ri] != match) { + cout << "Error: Incorrect mask for tc " << tcID << ", diag = " << diag + << " (" << currRow << ", " << currCol << ")" << endl ; + + cout << "The chars are: " << hapChar << " and " << rsChar << endl ; + cout << "The selected value is: " << testData.VENTRYI[ri] << endl ; + + exit(0) ; + } + } +#endif // #ifdef CHECK_MASK_CORRECTNESS + + if (diag < COLS) + ++col ; + + } // mbi + } // maskIndex + + beginRowIndex += VEC_ENTRY_CNT ; + } // end of stripe + + //cout << "Finished validating entry " << endl ; +} + +#ifdef HMM_MASK_MAIN +int main () { + + #define BATCH_SIZE 10000 + + ConvertChar::init() ; + + testcase* tcBatch = new testcase[BATCH_SIZE] ; + + int numBatches = 0 ; + int numRead = 0 ; + + const int DEBUG_TC = -1 ; + + FILE* ifp = stdin ; + + do { + numRead = 0 ; + + while (numRead < BATCH_SIZE) { + if (read_testcase(tcBatch+numRead, ifp) < 0) + break ; + + ++numRead ; + } + + if (numRead == 0) + break ; + + for (int ti=0; ti < numRead; ++ti) { + + int tcID = numBatches * BATCH_SIZE + ti ; + test_mask_computations(tcBatch[ti], tcID, tcID == DEBUG_TC) ; + } + + ++numBatches ; + } while (numRead == BATCH_SIZE) ; + + fclose(ifp) ; + + delete[] tcBatch ; + + return 0 ; +} +#endif diff --git a/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc new file mode 100644 index 000000000..852caa00d --- /dev/null +++ b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc @@ -0,0 +1,349 @@ +#include +#include +#include +#include "org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h" +#include + +#include +#include +#include +#include +#include +#include +#include +#include + + +#include +#include +#include +using namespace std; +//#define DEBUG 1 +//#define DEBUG0_1 1 +//#define DEBUG3 1 + + +#include "template.h" + +#include "define-double.h" +#include "shift_template.c" +#include "pairhmm-template-kernel.cc" + + + +#define MM 0 +#define GapM 1 +#define MX 2 +#define XX 3 +#define MY 4 +#define YY 5 + +class LoadTimeInitializer +{ + public: + LoadTimeInitializer() //will be called when library is loaded + { + ConvertChar::init(); + } + jfieldID m_readBasesFID; + jfieldID m_readQualsFID; + jfieldID m_insertionGOPFID; + jfieldID m_deletionGOPFID; + jfieldID m_overallGCPFID; + jfieldID m_haplotypeBasesFID; +}; +LoadTimeInitializer g_load_time_initializer; + +void debug_dump(string filename, string s, bool to_append, bool add_newline) +{ + ofstream fptr; + fptr.open(filename.c_str(), to_append ? ofstream::app : ofstream::out); + fptr << s; + if(add_newline) + fptr << "\n"; + fptr.close(); +} + +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeProbabilities +(JNIEnv* env, jclass thisObject, + jobjectArray transition, jbyteArray insertionGOP, jbyteArray deletionGOP, jbyteArray overallGCP + ) +{} + + JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitialize +(JNIEnv* env, jobject thisObject, + jint readMaxLength, jint haplotypeMaxLength) +{} + +JNIEXPORT jdouble JNICALL +Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializePriorsAndUpdateCells( + JNIEnv* env, jobject thisObject, + jboolean doInitialization, jint paddedReadLength, jint paddedHaplotypeLength, + jbyteArray readBases, jbyteArray haplotypeBases, jbyteArray readQuals, + jint hapStartIndex + ) + +{ return 0.0; } + +#define DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY 1 + +#ifdef DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY +//Gets direct access to Java arrays +#define GET_BYTE_ARRAY_ELEMENTS env->GetPrimitiveArrayCritical +#define RELEASE_BYTE_ARRAY_ELEMENTS env->ReleasePrimitiveArrayCritical +#define JNI_RELEASE_MODE JNI_ABORT +#define GET_DOUBLE_ARRAY_ELEMENTS env->GetPrimitiveArrayCritical +#define RELEASE_DOUBLE_ARRAY_ELEMENTS env->ReleasePrimitiveArrayCritical + +#else +//Likely makes copy of Java arrays to JNI C++ space +#define GET_BYTE_ARRAY_ELEMENTS env->GetByteArrayElements +#define RELEASE_BYTE_ARRAY_ELEMENTS env->ReleaseByteArrayElements +#define JNI_RELEASE_MODE JNI_ABORT +#define GET_DOUBLE_ARRAY_ELEMENTS env->GetDoubleArrayElements +#define RELEASE_DOUBLE_ARRAY_ELEMENTS env->ReleaseDoubleArrayElements + +#endif //ifdef DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY + +JNIEXPORT jdouble JNICALL +Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadLikelihoodGivenHaplotypeLog10( + JNIEnv* env, jobject thisObject, + jint readLength, jint haplotypeLength, + jbyteArray readBases, jbyteArray haplotypeBases, jbyteArray readQuals, + jbyteArray insertionGOP, jbyteArray deletionGOP, jbyteArray overallGCP, + jint hapStartIndex + ) +{ + jboolean is_copy = JNI_FALSE; + jbyte* readBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(readBases, &is_copy); + jbyte* haplotypeBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(haplotypeBases, &is_copy); + jbyte* readQualsArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(readQuals, &is_copy); + jbyte* insertionGOPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(insertionGOP, &is_copy); + jbyte* deletionGOPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(deletionGOP, &is_copy); + jbyte* overallGCPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(overallGCP, &is_copy); +#ifdef DEBUG + assert(readBasesArray && "readBasesArray not initialized in JNI"); + assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); + assert(readQualsArray && "readQualsArray not initialized in JNI"); + assert(insertionGOPArray && "insertionGOP array not initialized in JNI"); + assert(deletionGOPArray && "deletionGOP array not initialized in JNI"); + assert(overallGCPArray && "OverallGCP array not initialized in JNI"); + assert(readLength < MROWS); +#endif + testcase tc; + tc.rslen = readLength; + tc.haplen = haplotypeLength; + tc.rs = (char*)readBasesArray; + tc.hap = (char*)haplotypeBasesArray; + for(unsigned i=0;i(&tc); + double result = log10(result_avxd) - log10(ldexp(1.0, 1020)); +#ifdef DEBUG + debug_dump("return_values_jni.txt",to_string(result),true); +#endif + + + RELEASE_BYTE_ARRAY_ELEMENTS(overallGCP, overallGCPArray, JNI_RELEASE_MODE); + RELEASE_BYTE_ARRAY_ELEMENTS(deletionGOP, deletionGOPArray, JNI_RELEASE_MODE); + RELEASE_BYTE_ARRAY_ELEMENTS(insertionGOP, insertionGOPArray, JNI_RELEASE_MODE); + RELEASE_BYTE_ARRAY_ELEMENTS(readQuals, readQualsArray, JNI_RELEASE_MODE); + RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBases, haplotypeBasesArray, JNI_RELEASE_MODE); + RELEASE_BYTE_ARRAY_ELEMENTS(readBases, readBasesArray, JNI_RELEASE_MODE); + + return 0.0; +} + +//Should be called only once for the whole Java process - initializes field ids for the classes JNIReadDataHolderClass +//and JNIHaplotypeDataHolderClass +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniGlobalInit + (JNIEnv* env, jobject thisObject, jclass readDataHolderClass, jclass haplotypeDataHolderClass) +{ + assert(readDataHolderClass); + assert(haplotypeDataHolderClass); + jfieldID fid; + fid = env->GetFieldID(readDataHolderClass, "readBases", "[B"); + assert(fid && "JNI pairHMM: Could not get FID for readBases"); + g_load_time_initializer.m_readBasesFID = fid; + fid = env->GetFieldID(readDataHolderClass, "readQuals", "[B"); + assert(fid && "JNI pairHMM: Could not get FID for readQuals"); + g_load_time_initializer.m_readQualsFID = fid; + fid = env->GetFieldID(readDataHolderClass, "insertionGOP", "[B"); + assert(fid && "JNI pairHMM: Could not get FID for insertionGOP"); + g_load_time_initializer.m_insertionGOPFID = fid; + fid = env->GetFieldID(readDataHolderClass, "deletionGOP", "[B"); + assert(fid && "JNI pairHMM: Could not get FID for deletionGOP"); + g_load_time_initializer.m_deletionGOPFID = fid; + fid = env->GetFieldID(readDataHolderClass, "overallGCP", "[B"); + assert(fid && "JNI pairHMM: Could not get FID for overallGCP"); + g_load_time_initializer.m_overallGCPFID = fid; + + fid = env->GetFieldID(haplotypeDataHolderClass, "haplotypeBases", "[B"); + assert(fid && "JNI pairHMM: Could not get FID for haplotypeBases"); + g_load_time_initializer.m_haplotypeBasesFID = fid; +} + + +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniComputeLikelihoods + (JNIEnv* env, jobject thisObject, jint numReads, jint numHaplotypes, + jobjectArray readDataArray, jobjectArray haplotypeDataArray, jdoubleArray likelihoodArray, jint maxNumThreadsToUse) +{ + jboolean is_copy = JNI_FALSE; + //To ensure, GET_BYTE_ARRAY_ELEMENTS is invoked only once for each haplotype, store bytearrays in a vector + vector > haplotypeBasesArrayVector; + haplotypeBasesArrayVector.clear(); + haplotypeBasesArrayVector.resize(numHaplotypes); +#ifdef DEBUG0_1 + cout << "JNI numReads "<GetObjectArrayElement(haplotypeDataArray, j); + jbyteArray haplotypeBases = (jbyteArray)env->GetObjectField(haplotypeObject, g_load_time_initializer.m_haplotypeBasesFID); +#ifdef DEBUG + assert(haplotypeBases && ("haplotypeBases is NULL at index : "+to_string(j)+"\n").c_str()); +#endif + jbyte* haplotypeBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(haplotypeBases, &is_copy); +#ifdef DEBUG + assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); + assert(env->GetArrayLength(haplotypeBases) < MCOLS); +#ifdef DEBUG0_1 + cout << "JNI haplotype length "<GetArrayLength(haplotypeBases)<<"\n"; +#endif +#endif + haplotypeBasesArrayVector[j] = make_pair(haplotypeBases, haplotypeBasesArray); +#ifdef DEBUG3 + for(unsigned k=0;kGetArrayLength(haplotypeBases);++k) + debug_dump("haplotype_bases_jni.txt",to_string((int)haplotypeBasesArray[k]),true); +#endif + } + + unsigned numTestCases = numReads*numHaplotypes; + vector tc_array; + tc_array.clear(); + tc_array.resize(numTestCases); + unsigned tc_idx = 0; + //Store arrays for release later + vector > > readBasesArrayVector; + readBasesArrayVector.clear(); + readBasesArrayVector.resize(numReads); + for(unsigned i=0;iGetObjectArrayElement(readDataArray, i); + jbyteArray readBases = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_readBasesFID); + jbyteArray insertionGOP = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_insertionGOPFID); + jbyteArray deletionGOP = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_deletionGOPFID); + jbyteArray overallGCP = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_overallGCPFID); + jbyteArray readQuals = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_readQualsFID); + +#ifdef DEBUG + assert(readBases && ("readBases is NULL at index : "+to_string(i)+"\n").c_str()); + assert(insertionGOP && ("insertionGOP is NULL at index : "+to_string(i)+"\n").c_str()); + assert(deletionGOP && ("deletionGOP is NULL at index : "+to_string(i)+"\n").c_str()); + assert(overallGCP && ("overallGCP is NULL at index : "+to_string(i)+"\n").c_str()); + assert(readQuals && ("readQuals is NULL at index : "+to_string(i)+"\n").c_str()); +#endif + jsize readLength = env->GetArrayLength(readBases); + + jbyte* readBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(readBases, &is_copy); //order of GET-RELEASE is important + jbyte* readQualsArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(readQuals, &is_copy); + jbyte* insertionGOPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(insertionGOP, &is_copy); + jbyte* deletionGOPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(deletionGOP, &is_copy); + jbyte* overallGCPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(overallGCP, &is_copy); +#ifdef DEBUG + assert(readBasesArray && "readBasesArray not initialized in JNI"); + assert(readQualsArray && "readQualsArray not initialized in JNI"); + assert(insertionGOPArray && "insertionGOP array not initialized in JNI"); + assert(deletionGOPArray && "deletionGOP array not initialized in JNI"); + assert(overallGCPArray && "overallGCP array not initialized in JNI"); + assert(readLength < MROWS); + assert(readLength == env->GetArrayLength(readQuals)); + assert(readLength == env->GetArrayLength(insertionGOP)); + assert(readLength == env->GetArrayLength(deletionGOP)); + assert(readLength == env->GetArrayLength(overallGCP)); +#ifdef DEBUG0_1 + cout << "JNI read length "<GetArrayLength(haplotypeBasesArrayVector[j].first); + jbyte* haplotypeBasesArray = haplotypeBasesArrayVector[j].second; + tc_array[tc_idx].rslen = (int)readLength; + tc_array[tc_idx].haplen = (int)haplotypeLength; + tc_array[tc_idx].rs = (char*)readBasesArray; + tc_array[tc_idx].hap = (char*)haplotypeBasesArray; + //Can be avoided + for(unsigned k=0;kGetArrayLength(likelihoodArray) == numTestCases); +#pragma omp parallel for schedule (dynamic,10) private(tc_idx) num_threads(maxNumThreadsToUse) + for(tc_idx=0;tc_idx(&(tc_array[tc_idx])); + double result = log10(result_avxd) - log10(ldexp(1.0, 1020)); + likelihoodDoubleArray[tc_idx] = result; + } +#ifdef DEBUG + for(tc_idx=0;tc_idx=0;--i)//note the order - reverse of GET + { + for(int j=readBasesArrayVector.size()-1;j>=0;--j) + RELEASE_BYTE_ARRAY_ELEMENTS(readBasesArrayVector[i][j].first, readBasesArrayVector[i][j].second, JNI_RELEASE_MODE); + readBasesArrayVector[i].clear(); + } + readBasesArrayVector.clear(); + + //Now release haplotype arrays + for(int j=numHaplotypes-1;j>=0;--j) //note the order - reverse of GET + RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBasesArrayVector[j].first, haplotypeBasesArrayVector[j].second, JNI_RELEASE_MODE); + haplotypeBasesArrayVector.clear(); + tc_array.clear(); +} + diff --git a/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h new file mode 100644 index 000000000..1fff9fa5a --- /dev/null +++ b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h @@ -0,0 +1,76 @@ +/* DO NOT EDIT THIS FILE - it is machine generated */ +#include +/* Header for class org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM */ + +#ifndef _Included_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM +#define _Included_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM +#ifdef __cplusplus +extern "C" { +#endif +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_TRISTATE_CORRECTION +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_TRISTATE_CORRECTION 3.0 +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToMatch +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToMatch 0L +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_indelToMatch +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_indelToMatch 1L +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToInsertion +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToInsertion 2L +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_insertionToInsertion +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_insertionToInsertion 3L +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToDeletion +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToDeletion 4L +#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_deletionToDeletion +#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_deletionToDeletion 5L +/* + * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM + * Method: jniInitialize + * Signature: (II)V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitialize + (JNIEnv *, jobject, jint, jint); + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM + * Method: jniInitializeProbabilities + * Signature: ([[D[B[B[B)V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeProbabilities + (JNIEnv *, jclass, jobjectArray, jbyteArray, jbyteArray, jbyteArray); + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM + * Method: jniInitializePriorsAndUpdateCells + * Signature: (ZII[B[B[BI)D + */ +JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializePriorsAndUpdateCells + (JNIEnv *, jobject, jboolean, jint, jint, jbyteArray, jbyteArray, jbyteArray, jint); + + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM + * Method: jniSubComputeReadLikelihoodGivenHaplotypeLog10 + * Signature: (II[B[B[B[B[B[BI)D + */ +JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadLikelihoodGivenHaplotypeLog10 + (JNIEnv *, jobject, jint, jint, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jint); + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM + * Method: jniGlobalInit + * Signature: (Ljava/lang/Class;Ljava/lang/Class;)V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniGlobalInit + (JNIEnv *, jobject, jclass, jclass); + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM + * Method: jniComputeLikelihoods + * Signature: ([Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIReadDataHolderClass;[Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIHaplotypeDataHolderClass;[D)V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniComputeLikelihoods + (JNIEnv *, jobject, jint, jint, jobjectArray, jobjectArray, jdoubleArray, jint); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/PairHMM_JNI/pairhmm-1-base.cc b/PairHMM_JNI/pairhmm-1-base.cc new file mode 100644 index 000000000..90f95aeec --- /dev/null +++ b/PairHMM_JNI/pairhmm-1-base.cc @@ -0,0 +1,238 @@ +//#define DEBUG 1 +//#define DEBUG0_1 1 +//#define DEBUG3 1 +#define MM 0 +#define GapM 1 +#define MX 2 +#define XX 3 +#define MY 4 +#define YY 5 + +#include +#include +#include +#include +#include +#include +#include "template.h" + +//#include "define-float.h" +//#include "shift_template.c" +//#include "pairhmm-template-kernel.cc" + +#include "define-double.h" +#include "shift_template.c" +#include "pairhmm-template-kernel.cc" + + +using namespace std; +class LoadTimeInitializer +{ + public: + LoadTimeInitializer() //will be called when library is loaded + { + ConvertChar::init(); + } +}; +LoadTimeInitializer g_load_time_initializer; + + +template +NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL) +{ + int r, c; + int ROWS = tc->rslen + 1; + int COLS = tc->haplen + 1; + + Context ctx; + + NUMBER M[MROWS][MCOLS]; + NUMBER X[MROWS][MCOLS]; + NUMBER Y[MROWS][MCOLS]; + NUMBER p[MROWS][6]; + + p[0][MM] = ctx._(0.0); + p[0][GapM] = ctx._(0.0); + p[0][MX] = ctx._(0.0); + p[0][XX] = ctx._(0.0); + p[0][MY] = ctx._(0.0); + p[0][YY] = ctx._(0.0); + for (r = 1; r < ROWS; r++) + { + int _i = tc->i[r-1] & 127; + int _d = tc->d[r-1] & 127; + int _c = tc->c[r-1] & 127; + p[r][MM] = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127]; + p[r][GapM] = ctx._(1.0) - ctx.ph2pr[_c]; + p[r][MX] = ctx.ph2pr[_i]; + p[r][XX] = ctx.ph2pr[_c]; + p[r][MY] = ctx.ph2pr[_d]; + p[r][YY] = ctx.ph2pr[_c]; + //p[r][MY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d]; + //p[r][YY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c]; + } + + for (c = 0; c < COLS; c++) + { + M[0][c] = ctx._(0.0); + X[0][c] = ctx._(0.0); + Y[0][c] = ctx.INITIAL_CONSTANT / (tc->haplen); + } + + for (r = 1; r < ROWS; r++) + { + M[r][0] = ctx._(0.0); + X[r][0] = X[r-1][0] * p[r][XX]; + Y[r][0] = ctx._(0.0); + } + + NUMBER result = ctx._(0.0); + + for (r = 1; r < ROWS; r++) + for (c = 1; c < COLS; c++) + { + char _rs = tc->rs[r-1]; + char _hap = tc->hap[c-1]; + int _q = tc->q[r-1] & 127; + NUMBER distm = ctx.ph2pr[_q]; + if (_rs == _hap || _rs == 'N' || _hap == 'N') + distm = ctx._(1.0) - distm; + else + distm = distm/3; + M[r][c] = distm * (M[r-1][c-1] * p[r][MM] + X[r-1][c-1] * p[r][GapM] + Y[r-1][c-1] * p[r][GapM]); + X[r][c] = M[r-1][c] * p[r][MX] + X[r-1][c] * p[r][XX]; + Y[r][c] = M[r][c-1] * p[r][MY] + Y[r][c-1] * p[r][YY]; + } + + for (c = 0; c < COLS; c++) + { + result += M[ROWS-1][c] + X[ROWS-1][c]; + } + + if (before_last_log != NULL) + *before_last_log = result; + + return ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT; +} + +#define BATCH_SIZE 10000 +#define RUN_HYBRID + +int main(int argc, char** argv) +{ + if(argc < 2) + { + cerr << "Needs path to input file as argument\n"; + exit(0); + } + bool use_old_read_testcase = false; + if(argc >= 3 && string(argv[2]) == "1") + use_old_read_testcase = true; + + testcase tc; + if(use_old_read_testcase) + { + FILE* fptr = fopen(argv[1],"r"); + while(!feof(fptr)) + { + if(read_testcase(&tc, fptr) >= 0) + { + double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)(&tc); + double result = log10(result_avxd) - log10(ldexp(1.0, 1020)); + + cout << std::scientific << compute_full_prob(&tc) << " "< tokens; + ifptr.open(argv[1]); + assert(ifptr.is_open()); + while(1) + { + tokens.clear(); + if(read_mod_testcase(ifptr, &tc, false) < 0) + break; + //double result = 0; + double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)(&tc); + double result = log10(result_avxd) - log10(ldexp(1.0, 1020)); + + cout << std::scientific << compute_full_prob(&tc) << " "<(&tc[b]); + +#ifdef RUN_HYBRID +#define MIN_ACCEPTED 1e-28f + if (result_avxf < MIN_ACCEPTED) { + count++; + result_avxd = compute_full_prob(&tc[b]); + result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f)); + } + else + result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); +#endif + +#ifndef RUN_HYBRID + result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); +#endif + + } + gettimeofday(&end, NULL); + aggregateTimeCompute += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)); + + gettimeofday(&start, NULL); + for (int b=0;b +#include +#include + +/* +template +string getBinaryStr (T val, int numBitsToWrite) { + + ostringstream oss ; + uint64_t mask = ((T) 0x1) << (numBitsToWrite-1) ; + for (int i=numBitsToWrite-1; i >= 0; --i) { + oss << ((val & mask) >> i) ; + mask >>= 1 ; + } + return oss.str() ; +} +*/ + +void GEN_INTRINSIC(precompute_masks_, PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) { + + const int maskBitCnt = MAIN_TYPE_SIZE ; + + for (int vi=0; vi < numMaskVecs; ++vi) { + for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) { + maskArr[vi][rs] = 0 ; + } + maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ; + } + + for (int col=1; col < COLS; ++col) { + int mIndex = (col-1) / maskBitCnt ; + int mOffset = (col-1) % maskBitCnt ; + MASK_TYPE bitMask = ((MASK_TYPE)0x1) << (maskBitCnt-1-mOffset) ; + + char hapChar = tc.hap[col-1] ; + + if (hapChar == AMBIG_CHAR) { + for (int ci=0; ci < NUM_DISTINCT_CHARS; ++ci) + maskArr[mIndex][ci] |= bitMask ; + } + + maskArr[mIndex][ConvertChar::get(hapChar)] |= bitMask ; + // bit corresponding to col 1 will be the MSB of the mask 0 + // bit corresponding to col 2 will be the MSB-1 of the mask 0 + // ... + // bit corresponding to col 32 will be the LSB of the mask 0 + // bit corresponding to col 33 will be the MSB of the mask 1 + // ... + } + +} + +void GEN_INTRINSIC(init_masks_for_row_, PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess) { + + for (int ri=0; ri < numRowsToProcess; ++ri) { + rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ; + } + + for (int ei=0; ei < AVX_LENGTH; ++ei) { + lastMaskShiftOut[ei] = 0 ; + } +} + +#define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \ + MASK_TYPE __bitMask = (((MASK_TYPE)0x1) << __shiftBy) - 1 ; \ + MASK_TYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \ + __dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ; \ + __lastShiftOut = __nextShiftOut ; \ +} + + +void GEN_INTRINSIC(update_masks_for_cols_, PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt) { + + for (int ei=0; ei < AVX_LENGTH/2; ++ei) { + SET_MASK_WORD(currMaskVecLow.masks[ei], maskArr[maskIndex][rsArr[ei]], + lastMaskShiftOut[ei], ei, maskBitCnt) ; + + int ei2 = ei + AVX_LENGTH/2 ; // the second entry index + SET_MASK_WORD(currMaskVecHigh.masks[ei], maskArr[maskIndex][rsArr[ei2]], + lastMaskShiftOut[ei2], ei2, maskBitCnt) ; + } + +} + +//void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen, const _256_TYPE& distmSel, int firstRowIndex, int lastRowIndex) { + +inline void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen) { + //#define computeDistVec() { + + _256_TYPE maskV ; + VEC_SSE_TO_AVX(currMaskVecLow.vecf, currMaskVecHigh.vecf, maskV) ; + + distmChosen = VEC_BLENDV(distm, _1_distm, maskV) ; + + /*COMPARE_VECS(distmChosen, distmSel, firstRowIndex, lastRowIndex) ;*/ + + VEC_SHIFT_LEFT_1BIT(currMaskVecLow.vec) ; + VEC_SHIFT_LEFT_1BIT(currMaskVecHigh.vec) ; +} + + +/* +template +struct HmmData { + int ROWS ; + int COLS ; + + NUMBER shiftOutM[MROWS+MCOLS+AVX_LENGTH], shiftOutX[MROWS+MCOLS+AVX_LENGTH], shiftOutY[MROWS+MCOLS+AVX_LENGTH] ; + Context ctx ; + testcase* tc ; + _256_TYPE p_MM[MAVX_COUNT], p_GAPM[MAVX_COUNT], p_MX[MAVX_COUNT], p_XX[MAVX_COUNT], p_MY[MAVX_COUNT], p_YY[MAVX_COUNT], distm1D[MAVX_COUNT] ; + _256_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY ; + + UNION_TYPE M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y ; + UNION_TYPE rs , rsN ; + _256_TYPE distmSel; + _256_TYPE distm, _1_distm; + +} ; +*/ + +template void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context ctx, testcase *tc, _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D) +{ + NUMBER zero = ctx._(0.0); + NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); + for (int s=0;si[r-1] & 127; + int _d = tc->d[r-1] & 127; + int _c = tc->c[r-1] & 127; + + *(ptr_p_MM+r-1) = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127]; + *(ptr_p_GAPM+r-1) = ctx._(1.0) - ctx.ph2pr[_c]; + *(ptr_p_MX+r-1) = ctx.ph2pr[_i]; + *(ptr_p_XX+r-1) = ctx.ph2pr[_c]; + *(ptr_p_MY+r-1) = ctx.ph2pr[_d]; + *(ptr_p_YY+r-1) = ctx.ph2pr[_c]; +#ifdef DEBUG3 + debug_dump("transitions_jni.txt",to_string(*(ptr_p_MM+r-1) ),true); + debug_dump("transitions_jni.txt",to_string(*(ptr_p_GAPM+r-1)),true); + debug_dump("transitions_jni.txt",to_string(*(ptr_p_MX+r-1) ),true); + debug_dump("transitions_jni.txt",to_string(*(ptr_p_XX+r-1) ),true); + debug_dump("transitions_jni.txt",to_string(*(ptr_p_MY+r-1) ),true); + debug_dump("transitions_jni.txt",to_string(*(ptr_p_YY+r-1) ),true); +#endif + //*(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d]; + //*(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c]; + } + + NUMBER *ptr_distm1D = (NUMBER *)distm1D; + for (int r = 1; r < ROWS; r++) + { + int _q = tc->q[r-1] & 127; + ptr_distm1D[r-1] = ctx.ph2pr[_q]; +#ifdef DEBUG3 + debug_dump("priors_jni.txt",to_string(ptr_distm1D[r-1]),true); +#endif + } +} + + +template inline void GEN_INTRINSIC(stripINITIALIZATION, PRECISION)( + int stripIdx, Context ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY, + _256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm, _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM , + _256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, + UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM) +{ + int i = stripIdx; + pGAPM = p_GAPM[i]; + pMM = p_MM[i]; + pMX = p_MX[i]; + pXX = p_XX[i]; + pMY = p_MY[i]; + pYY = p_YY[i]; + + NUMBER zero = ctx._(0.0); + NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); + UNION_TYPE packed1; packed1.d = VEC_SET1_VAL(1.0); +#define TRISTATE_CORRECTION_FACTOR 3.0 + UNION_TYPE packed3; packed3.d = VEC_SET1_VAL(TRISTATE_CORRECTION_FACTOR); + + /* compare rs and N */ + //rs = VEC_LDPOPCVT_CHAR((tc->irs+i*AVX_LENGTH)); + //rsN.d = VEC_CMP_EQ(N_packed256, rs); + + distm = distm1D[i]; + _1_distm = VEC_SUB(packed1.d, distm); +#ifndef DO_NOT_USE_TRISTATE_CORRECTION + distm = VEC_DIV(distm, packed3.d); +#endif + + /* initialize M_t_2, M_t_1, X_t_2, X_t_1, Y_t_2, Y_t_1 */ + M_t_2.d = VEC_SET1_VAL(zero); + X_t_2.d = VEC_SET1_VAL(zero); + + if (i==0) { + M_t_1.d = VEC_SET1_VAL(zero); + X_t_1.d = VEC_SET1_VAL(zero); + Y_t_2.d = VEC_SET_LSE(init_Y); + Y_t_1.d = VEC_SET1_VAL(zero); + } + else { + X_t_1.d = VEC_SET_LSE(shiftOutX[AVX_LENGTH]); + M_t_1.d = VEC_SET_LSE(shiftOutM[AVX_LENGTH]); + Y_t_2.d = VEC_SET1_VAL(zero); + Y_t_1.d = VEC_SET1_VAL(zero); + } + M_t_1_y = M_t_1; +} + + + +inline _256_TYPE GEN_INTRINSIC(computeDISTM, PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256, + _256_TYPE distm, _256_TYPE _1_distm) +{ + UNION_TYPE hapN, rshap; + _256_TYPE cond; + IF_32 shiftInHap; + + int *hap_ptr = tc->ihap; + + shiftInHap.i = (d NUMBER GEN_INTRINSIC(compute_full_prob_avx, PRECISION) (testcase *tc, NUMBER *before_last_log = NULL) +{ + _256_TYPE p_MM [MAVX_COUNT], p_GAPM [MAVX_COUNT], p_MX [MAVX_COUNT]; + _256_TYPE p_XX [MAVX_COUNT], p_MY [MAVX_COUNT], p_YY [MAVX_COUNT]; + _256_TYPE distm1D[MAVX_COUNT]; + NUMBER shiftOutM[MROWS+MCOLS+AVX_LENGTH], shiftOutX[MROWS+MCOLS+AVX_LENGTH], shiftOutY[MROWS+MCOLS+AVX_LENGTH]; + UNION_TYPE M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y; + _256_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY; + + struct timeval start, end; + NUMBER result_avx2; + Context ctx; + UNION_TYPE rs , rsN; + HAP_TYPE hap; + _256_TYPE distmSel, distmChosen ; + _256_TYPE distm, _1_distm; + + int r, c; + int ROWS = tc->rslen + 1; + int COLS = tc->haplen + 1; + int AVX_COUNT = (ROWS+7)/8; + NUMBER zero = ctx._(0.0); + UNION_TYPE packed1; packed1.d = VEC_SET1_VAL(1.0); + _256_TYPE N_packed256 = VEC_POPCVT_CHAR('N'); + NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); + int remainingRows = (ROWS-1) % AVX_LENGTH; + int strip_cnt = ((ROWS-1) / AVX_LENGTH) + (remainingRows!=0); + + const int maskBitCnt = MAIN_TYPE_SIZE ; + const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function + + MASK_TYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ; + GEN_INTRINSIC(precompute_masks_, PRECISION)(*tc, COLS, numMaskVecs, maskArr) ; + + char rsArr[AVX_LENGTH] ; + MASK_TYPE lastMaskShiftOut[AVX_LENGTH] ; + + GEN_INTRINSIC(initializeVectors, PRECISION)(ROWS, COLS, shiftOutM, shiftOutX, shiftOutY, + ctx, tc, p_MM, p_GAPM, p_MX, p_XX, p_MY, p_YY, distm1D); + + //for (int __ii=0; __ii < 10; ++__ii) + for (int i=0;i +#include +#include +#include + +#include "template.h" + +#include "define-float.h" +#include "shift_template.c" +#include "pairhmm-template-kernel.cc" + +#include "define-double.h" +#include "shift_template.c" +#include "pairhmm-template-kernel.cc" + +#define BATCH_SIZE 10000 +//#define RUN_HYBRID + +//uint8_t ConvertChar::conversionTable[255] ; +int thread_level_parallelism_enabled = false ; + +double getCurrClk() { + struct timeval tv ; + gettimeofday(&tv, NULL); + return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +} + + +int main() +{ + + testcase tc[BATCH_SIZE]; + float result[BATCH_SIZE], result_avxf; + double result_avxd; + //struct timeval start, end; + double lastClk = 0.0 ; + double aggregateTimeRead = 0.0; + double aggregateTimeCompute = 0.0; + double aggregateTimeWrite = 0.0; + + // Need to call it once to initialize the static array + ConvertChar::init() ; + + + char* ompEnvVar = getenv("OMP_NUM_THREADS") ; + if (ompEnvVar != NULL && ompEnvVar != "" && ompEnvVar != "1" ) { + thread_level_parallelism_enabled = true ; + } + + bool noMoreData = false; + int count =0; + while (!noMoreData) + { + int read_count = BATCH_SIZE; + + lastClk = getCurrClk() ; + for (int b=0;b(&tc[b]); + + #ifdef RUN_HYBRID + #define MIN_ACCEPTED 1e-28f + if (result_avxf < MIN_ACCEPTED) { + count++; + result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)(&tc[b]); + result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f)); + } + else + result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); + #endif + + #ifndef RUN_HYBRID + result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); + #endif + + } + //gettimeofday(&end, NULL); + aggregateTimeCompute += (getCurrClk() - lastClk) ; + //((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)); + + //gettimeofday(&start, NULL); + lastClk = getCurrClk() ; + for (int b=0;b +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include + + +#define MROWS 500 +#define MCOLS 1000 + +#define CAT(X,Y) X####Y +#define GEN_INTRINSIC(X,Y) CAT(X,Y) + +#define ALIGNED __attribute__((aligned(32))) + +typedef union __attribute__((aligned(32))) { + ALIGNED __m256 ALIGNED d; + ALIGNED __m128i ALIGNED s[2]; + ALIGNED float ALIGNED f[8]; + ALIGNED __m256i ALIGNED i; +} ALIGNED mix_F ALIGNED; + +typedef union ALIGNED { + __m128i vec ; + __m128 vecf ; + uint32_t masks[4] ; +} MaskVec_F ; + +typedef union ALIGNED +{ + ALIGNED __m128i ALIGNED i; + ALIGNED __m128 ALIGNED f; +} ALIGNED IF_128f ALIGNED; + +typedef union ALIGNED +{ + ALIGNED int ALIGNED i; + ALIGNED float ALIGNED f; +} ALIGNED IF_32 ALIGNED; + +typedef union __attribute__((aligned(32))) { + ALIGNED __m256d ALIGNED d; + ALIGNED __m128i ALIGNED s[2]; + ALIGNED double ALIGNED f[4]; + ALIGNED __m256i ALIGNED i; +} ALIGNED mix_D ALIGNED; + +typedef union ALIGNED { + __m128i vec ; + __m128d vecf ; + uint64_t masks[2] ; +} MaskVec_D ; + +typedef union ALIGNED +{ + ALIGNED __m128i ALIGNED i; + ALIGNED __m128d ALIGNED f; +} ALIGNED IF_128d ALIGNED; + +typedef union ALIGNED +{ + ALIGNED int64_t ALIGNED i; + ALIGNED double ALIGNED f; +} ALIGNED IF_64 ALIGNED; + +template +struct Context{}; + +template<> +struct Context +{ + Context() + { + for (int x = 0; x < 128; x++) + ph2pr[x] = pow(10.0, -((double)x) / 10.0); + + INITIAL_CONSTANT = ldexp(1.0, 1020.0); + LOG10_INITIAL_CONSTANT = log10(INITIAL_CONSTANT); + RESULT_THRESHOLD = 0.0; + } + + double LOG10(double v){ return log10(v); } + + static double _(double n){ return n; } + static double _(float n){ return ((double) n); } + double ph2pr[128]; + double INITIAL_CONSTANT; + double LOG10_INITIAL_CONSTANT; + double RESULT_THRESHOLD; +}; + +template<> +struct Context +{ + Context() + { + for (int x = 0; x < 128; x++) + { + ph2pr[x] = powf(10.f, -((float)x) / 10.f); + } + + INITIAL_CONSTANT = ldexpf(1.f, 120.f); + LOG10_INITIAL_CONSTANT = log10f(INITIAL_CONSTANT); + RESULT_THRESHOLD = ldexpf(1.f, -110.f); + } + + float LOG10(float v){ return log10f(v); } + + static float _(double n){ return ((float) n); } + static float _(float n){ return n; } + float ph2pr[128]; + float INITIAL_CONSTANT; + float LOG10_INITIAL_CONSTANT; + float RESULT_THRESHOLD; +}; + + +typedef struct +{ + int rslen, haplen; + /*int *q, *i, *d, *c;*/ + int q[MROWS], i[MROWS], d[MROWS], c[MROWS]; + char *hap, *rs; + int *ihap; + int *irs; +} testcase; + + +template +std::string to_string(T obj) +{ + std::stringstream ss; + std::string ret_string; + ss.clear(); + ss << std::scientific << obj; + ss >> ret_string; + ss.clear(); + return ret_string; +} +void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true); + +int normalize(char c); +int read_testcase(testcase *tc, FILE* ifp); +int read_mod_testcase(std::ifstream& fptr, testcase* tc, bool reformat=false); + +#define NUM_DISTINCT_CHARS 5 +#define AMBIG_CHAR 4 + +class ConvertChar { + + static uint8_t conversionTable[255] ; + +public: + + static void init() { + assert (NUM_DISTINCT_CHARS == 5) ; + assert (AMBIG_CHAR == 4) ; + + conversionTable['A'] = 0 ; + conversionTable['C'] = 1 ; + conversionTable['T'] = 2 ; + conversionTable['G'] = 3 ; + conversionTable['N'] = 4 ; + } + + static inline uint8_t get(uint8_t input) { + return conversionTable[input] ; + } + +} ; + + +#endif + +