Merge branch 'intel_pairhmm' of /home/karthikg/broad/gsa-unstable into intel_pairhmm

Committing into central_repo
This commit is contained in:
Intel Repocontact 2014-01-22 23:08:32 -08:00
commit f7fa79e561
32 changed files with 1538 additions and 1056 deletions

13
.gitignore vendored
View File

@ -24,3 +24,16 @@ dump/
lib/
out/
/atlassian-ide-plugin.xml
kg_tmp/
maven-ant-tasks-2.1.3.jar
null-sequenceGraph.25.0.0.raw_readthreading_graph.dot
null-sequenceGraph.25.0.1.cleaned_readthreading_graph.dot
null-sequenceGraph.25.0.1.initial_seqgraph.dot
null-sequenceGraph.3.0.0.raw_readthreading_graph.dot
null-sequenceGraph.3.0.1.cleaned_readthreading_graph.dot
null-sequenceGraph.3.0.1.initial_seqgraph.dot
org/
package-list
resources/
velocity.log

View File

@ -1,6 +1,6 @@
.svn
*.o
#*.so
*.so
tests
.deps
hmm_Mohammad
@ -8,3 +8,6 @@ pairhmm-template-main
*.swp
checker
reformat
subdir_checkout.sh
avx/
sse/

View File

@ -0,0 +1,76 @@
#include "LoadTimeInitializer.h"
#include "utils.h"
using namespace std;
LoadTimeInitializer g_load_time_initializer;
LoadTimeInitializer::LoadTimeInitializer() //will be called when library is loaded
{
ConvertChar::init();
m_sumNumReads = 0;
m_sumSquareNumReads = 0;
m_sumNumHaplotypes = 0;
m_sumSquareNumHaplotypes = 0;
m_sumNumTestcases = 0;
m_sumSquareNumTestcases = 0;
m_maxNumTestcases = 0;
m_num_invocations = 0;
m_compute_time = 0;
m_data_transfer_time = 0;
m_filename_to_fptr.clear();
initialize_function_pointers();
cout.flush();
}
void LoadTimeInitializer::print_profiling()
{
double mean_val;
cout << "Compute time "<<m_compute_time*1e-9<<"\n";
cout << "Data initialization time "<<m_data_transfer_time*1e-9<<"\n";
cout <<"Invocations : "<<m_num_invocations<<"\n";
cout << "term\tsum\tsumSq\tmean\tvar\tmax\n";
mean_val = m_sumNumReads/m_num_invocations;
cout << "reads\t"<<m_sumNumReads<<"\t"<<m_sumSquareNumReads<<"\t"<<mean_val<<"\t"<<
(m_sumSquareNumReads/m_num_invocations)-mean_val*mean_val<<"\n";
mean_val = m_sumNumHaplotypes/m_num_invocations;
cout << "haplotypes\t"<<m_sumNumHaplotypes<<"\t"<<m_sumSquareNumHaplotypes<<"\t"<<mean_val<<"\t"<<
(m_sumSquareNumHaplotypes/m_num_invocations)-mean_val*mean_val<<"\n";
mean_val = m_sumNumTestcases/m_num_invocations;
cout << "numtestcases\t"<<m_sumNumTestcases<<"\t"<<m_sumSquareNumTestcases<<"\t"<<mean_val<<"\t"<<
(m_sumSquareNumTestcases/m_num_invocations)-mean_val*mean_val<<"\t"<<m_maxNumTestcases<<"\n";
cout.flush();
}
void LoadTimeInitializer::debug_dump(string filename, string s, bool to_append, bool add_newline)
{
map<string, ofstream*>::iterator mI = m_filename_to_fptr.find(filename);
ofstream* fptr = 0;
if(mI == m_filename_to_fptr.end())
{
m_filename_to_fptr[filename] = new ofstream();
fptr = m_filename_to_fptr[filename];
fptr->open(filename.c_str(), to_append ? ios::app : ios::out);
assert(fptr->is_open());
}
else
fptr = (*mI).second;
//ofstream fptr;
//fptr.open(filename.c_str(), to_append ? ofstream::app : ofstream::out);
(*fptr) << s;
if(add_newline)
(*fptr) << "\n";
//fptr.close();
}
void LoadTimeInitializer::debug_close()
{
for(map<string,ofstream*>::iterator mB = m_filename_to_fptr.begin(), mE = m_filename_to_fptr.end();
mB != mE;mB++)
{
(*mB).second->close();
delete (*mB).second;
}
m_filename_to_fptr.clear();
}

View File

@ -0,0 +1,37 @@
#ifndef LOAD_TIME_INITIALIZER_H
#define LOAD_TIME_INITIALIZER_H
#include "headers.h"
#include <jni.h>
class LoadTimeInitializer
{
public:
LoadTimeInitializer(); //will be called when library is loaded
void print_profiling();
void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true);
void debug_close();
jfieldID m_readBasesFID;
jfieldID m_readQualsFID;
jfieldID m_insertionGOPFID;
jfieldID m_deletionGOPFID;
jfieldID m_overallGCPFID;
jfieldID m_haplotypeBasesFID;
//used to compute avg, variance of #testcases
double m_sumNumReads;
double m_sumSquareNumReads;
double m_sumNumHaplotypes;
double m_sumSquareNumHaplotypes;
double m_sumNumTestcases;
double m_sumSquareNumTestcases;
unsigned m_maxNumTestcases;
unsigned m_num_invocations;
//timing
uint64_t m_compute_time;
uint64_t m_data_transfer_time;
private:
std::map<std::string, std::ofstream*> m_filename_to_fptr;
};
extern LoadTimeInitializer g_load_time_initializer;
#endif

View File

@ -1,5 +1,5 @@
OMPCFLAGS=-fopenmp
#OMPLDFLAGS=-lgomp
#OMPCFLAGS=-fopenmp
#OMPLFLAGS=-fopenmp #-openmp-link static
#CFLAGS=-O2 -std=c++11 -W -Wall -march=corei7-avx -Wa,-q -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas
#CFLAGS=-O2 -W -Wall -march=corei7 -mfpmath=sse -msse4.2 -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas
@ -7,40 +7,59 @@ OMPCFLAGS=-fopenmp
JAVA_ROOT=/opt/jdk1.7.0_25/
JNI_COMPILATION_FLAGS=-D_REENTRANT -fPIC -I${JAVA_ROOT}/include -I${JAVA_ROOT}/include/linux
CFLAGS=-O3 -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas -xAVX
CXXFLAGS=$(CFLAGS)
COMMON_COMPILATION_FLAGS=$(JNI_COMPILATION_FLAGS) -O3 -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas
CC=icc
CXX=icc
LDFLAGS=-lm $(OMPLDFLAGS)
LDFLAGS=-lm -lrt $(OMPLDFLAGS)
#BIN:=pairhmm-1-base #pairhmm-2-omp pairhmm-3-hybrid-float-double pairhmm-4-hybrid-diagonal pairhmm-5-hybrid-diagonal-homogeneus pairhmm-6-onlythreediags pairhmm-7-presse pairhmm-8-sse #pairhmm-dev
BIN:=libJNILoglessPairHMM.so pairhmm-template-main checker
BIN=libJNILoglessPairHMM.so pairhmm-template-main checker
#BIN=checker
#SOURCES=pairhmm-1-base.cc input.cc
LIBSOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc convert_char.cc
SOURCES=$(LIBSOURCES) pairhmm-template-main.cc pairhmm-1-base.cc
LIBOBJECTS=$(LIBSOURCES:.cc=.o)
DEPDIR=.deps
DF=$(DEPDIR)/$(*).d
#Common across libJNI and sandbox
COMMON_SOURCES=utils.cc avx_function_instantiations.cc baseline.cc sse_function_instantiations.cc LoadTimeInitializer.cc
#Part of libJNI
LIBSOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc $(COMMON_SOURCES)
SOURCES=$(LIBSOURCES) pairhmm-template-main.cc pairhmm-1-base.cc
LIBOBJECTS=$(LIBSOURCES:.cc=.o)
COMMON_OBJECTS=$(COMMON_SOURCES:.cc=.o)
#No vectorization for these files
NO_VECTOR_SOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc pairhmm-template-main.cc pairhmm-1-base.cc utils.cc baseline.cc LoadTimeInitializer.cc
#Use -xAVX for these files
AVX_SOURCES=avx_function_instantiations.cc
#Use -xSSE4.2 for these files
SSE_SOURCES=sse_function_instantiations.cc
NO_VECTOR_OBJECTS=$(NO_VECTOR_SOURCES:.cc=.o)
AVX_OBJECTS=$(AVX_SOURCES:.cc=.o)
SSE_OBJECTS=$(SSE_SOURCES:.cc=.o)
$(NO_VECTOR_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS)
$(AVX_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) -xAVX
$(SSE_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) -xSSE4.2
OBJECTS=$(NO_VECTOR_OBJECTS) $(AVX_OBJECTS) $(SSE_OBJECTS)
all: $(BIN)
-include $(addprefix $(DEPDIR)/,$(SOURCES:.cc=.d))
checker: pairhmm-1-base.o convert_char.o
$(CXX) $(OMPCFLAGS) -o $@ $^ $(LDFLAGS)
checker: pairhmm-1-base.o $(COMMON_OBJECTS)
$(CXX) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)
pairhmm-template-main: pairhmm-template-main.o convert_char.o
$(CXX) $(OMPCFLAGS) -o $@ $^ $(LDFLAGS)
pairhmm-template-main: pairhmm-template-main.o $(COMMON_OBJECTS)
$(CXX) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)
libJNILoglessPairHMM.so: $(LIBOBJECTS)
$(CXX) $(OMPCFLAGS) -shared -o $@ $(LIBOBJECTS)
$(CXX) $(OMPLFLAGS) -shared -o $@ $(LIBOBJECTS) ${LDFLAGS}
%.o: %.cc
$(OBJECTS): %.o: %.cc
@mkdir -p $(DEPDIR)
$(COMPILE.cpp) -MMD -MF $(DF) $(JNI_COMPILATION_FLAGS) $(CXXFLAGS) $(OUTPUT_OPTION) $<
$(CXX) -c -MMD -MF $(DF) $(CXXFLAGS) $(OUTPUT_OPTION) $<
clean:

View File

@ -0,0 +1,19 @@
#include "template.h"
#undef SIMD_TYPE
#undef SIMD_TYPE_SSE
#define SIMD_TYPE avx
#define SIMD_TYPE_AVX
#include "define-float.h"
#include "shift_template.c"
#include "pairhmm-template-kernel.cc"
#include "define-double.h"
#include "shift_template.c"
#include "pairhmm-template-kernel.cc"
template double compute_full_prob_avxd<double>(testcase* tc, double* nextlog);
template float compute_full_prob_avxs<float>(testcase* tc, float* nextlog);

View File

@ -0,0 +1,85 @@
#include "headers.h"
#include "template.h"
template<class NUMBER>
NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL)
{
int r, c;
int ROWS = tc->rslen + 1;
int COLS = tc->haplen + 1;
Context<NUMBER> ctx;
NUMBER M[MROWS][MCOLS];
NUMBER X[MROWS][MCOLS];
NUMBER Y[MROWS][MCOLS];
NUMBER p[MROWS][6];
p[0][MM] = ctx._(0.0);
p[0][GapM] = ctx._(0.0);
p[0][MX] = ctx._(0.0);
p[0][XX] = ctx._(0.0);
p[0][MY] = ctx._(0.0);
p[0][YY] = ctx._(0.0);
for (r = 1; r < ROWS; r++)
{
int _i = tc->i[r-1] & 127;
int _d = tc->d[r-1] & 127;
int _c = tc->c[r-1] & 127;
p[r][MM] = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127];
p[r][GapM] = ctx._(1.0) - ctx.ph2pr[_c];
p[r][MX] = ctx.ph2pr[_i];
p[r][XX] = ctx.ph2pr[_c];
p[r][MY] = ctx.ph2pr[_d];
p[r][YY] = ctx.ph2pr[_c];
//p[r][MY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
//p[r][YY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
}
for (c = 0; c < COLS; c++)
{
M[0][c] = ctx._(0.0);
X[0][c] = ctx._(0.0);
Y[0][c] = ctx.INITIAL_CONSTANT / (tc->haplen);
}
for (r = 1; r < ROWS; r++)
{
M[r][0] = ctx._(0.0);
X[r][0] = X[r-1][0] * p[r][XX];
Y[r][0] = ctx._(0.0);
}
NUMBER result = ctx._(0.0);
for (r = 1; r < ROWS; r++)
for (c = 1; c < COLS; c++)
{
char _rs = tc->rs[r-1];
char _hap = tc->hap[c-1];
int _q = tc->q[r-1] & 127;
NUMBER distm = ctx.ph2pr[_q];
if (_rs == _hap || _rs == 'N' || _hap == 'N')
distm = ctx._(1.0) - distm;
else
distm = distm/3;
M[r][c] = distm * (M[r-1][c-1] * p[r][MM] + X[r-1][c-1] * p[r][GapM] + Y[r-1][c-1] * p[r][GapM]);
X[r][c] = M[r-1][c] * p[r][MX] + X[r-1][c] * p[r][XX];
Y[r][c] = M[r][c-1] * p[r][MY] + Y[r][c-1] * p[r][YY];
}
for (c = 0; c < COLS; c++)
{
result += M[ROWS-1][c] + X[ROWS-1][c];
}
if (before_last_log != NULL)
*before_last_log = result;
return result;
//return ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT;
}
template double compute_full_prob<double>(testcase* tc, double* nextbuf);
template float compute_full_prob<float>(testcase* tc, float* nextbuf);

View File

@ -18,35 +18,36 @@
#undef MASK_TYPE
#undef MASK_ALL_ONES
#undef SET_VEC_ZERO
#undef VEC_OR
#undef VEC_ADD
#undef VEC_SUB
#undef VEC_MUL
#undef VEC_DIV
#undef VEC_BLEND
#undef VEC_BLENDV
#undef VEC_CAST_256_128
#undef VEC_EXTRACT_128
#undef VEC_EXTRACT_UNIT
#undef VEC_SET1_VAL128
#undef VEC_MOVE
#undef VEC_CAST_128_256
#undef VEC_INSERT_VAL
#undef VEC_CVT_128_256
#undef VEC_SET1_VAL
#undef VEC_POPCVT_CHAR
#undef VEC_LDPOPCVT_CHAR
#undef VEC_CMP_EQ
#undef VEC_SET_LSE
#undef SHIFT_HAP
#undef print256b
#undef SET_VEC_ZERO(__vec)
#undef VEC_OR(__v1, __v2)
#undef VEC_ADD(__v1, __v2)
#undef VEC_SUB(__v1, __v2)
#undef VEC_MUL(__v1, __v2)
#undef VEC_DIV(__v1, __v2)
#undef VEC_BLEND(__v1, __v2, __mask)
#undef VEC_BLENDV(__v1, __v2, __maskV)
#undef VEC_CAST_256_128(__v1)
#undef VEC_EXTRACT_128(__v1, __im)
#undef VEC_EXTRACT_UNIT(__v1, __im)
#undef VEC_SET1_VAL128(__val)
#undef VEC_MOVE(__v1, __val)
#undef VEC_CAST_128_256(__v1)
#undef VEC_INSERT_VAL(__v1, __val, __pos)
#undef VEC_CVT_128_256(__v1)
#undef VEC_SET1_VAL(__val)
#undef VEC_POPCVT_CHAR(__ch)
#undef VEC_LDPOPCVT_CHAR(__addr)
#undef VEC_CMP_EQ(__v1, __v2)
#undef VEC_SET_LSE(__val)
#undef SHIFT_HAP(__v1, __val)
#undef print256b(__v1)
#undef MASK_VEC
#undef VEC_SSE_TO_AVX
#undef VEC_SHIFT_LEFT_1BIT
#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
#undef VEC_SHIFT_LEFT_1BIT(__vs)
#undef MASK_ALL_ONES
#undef COMPARE_VECS
#undef COMPARE_VECS(__v1, __v2)
#undef _256_INT_TYPE
#undef BITMASK_VEC
#endif
#define PRECISION d
@ -83,7 +84,7 @@
#define VEC_MUL(__v1, __v2) \
_mm256_mul_pd(__v1, __v2)
#define VEC_DIV(__v1, __v2) \
#define VEC_DIV(__v1, __v2) \
_mm256_div_pd(__v1, __v2)
#define VEC_BLEND(__v1, __v2, __mask) \
@ -156,3 +157,32 @@
} \
} \
}
class BitMaskVec_double {
MASK_VEC low_, high_ ;
_256_TYPE combined_ ;
public:
inline MASK_TYPE& getLowEntry(int index) {
return low_.masks[index] ;
}
inline MASK_TYPE& getHighEntry(int index) {
return high_.masks[index] ;
}
inline const _256_TYPE& getCombinedMask() {
VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ;
return combined_ ;
}
inline void shift_left_1bit() {
VEC_SHIFT_LEFT_1BIT(low_.vec) ;
VEC_SHIFT_LEFT_1BIT(high_.vec) ;
}
} ;
#define BITMASK_VEC BitMaskVec_double

View File

@ -47,6 +47,7 @@
#undef MASK_ALL_ONES
#undef COMPARE_VECS(__v1, __v2)
#undef _256_INT_TYPE
#undef BITMASK_VEC
#endif
#define PRECISION s
@ -84,7 +85,7 @@
#define VEC_MUL(__v1, __v2) \
_mm256_mul_ps(__v1, __v2)
#define VEC_DIV(__v1, __v2) \
#define VEC_DIV(__v1, __v2) \
_mm256_div_ps(__v1, __v2)
#define VEC_BLEND(__v1, __v2, __mask) \
@ -133,7 +134,7 @@
_mm256_set_ps(zero, zero, zero, zero, zero, zero, zero, __val);
#define SHIFT_HAP(__v1, __val) \
_vector_shift_lasts(__v1, __val.f);
_vector_shift_lastavxs(__v1, __val.f);
#define print256b(__v1) \
print256bFP(__v1)
@ -157,3 +158,31 @@
} \
}
class BitMaskVec_float {
MASK_VEC low_, high_ ;
_256_TYPE combined_ ;
public:
inline MASK_TYPE& getLowEntry(int index) {
return low_.masks[index] ;
}
inline MASK_TYPE& getHighEntry(int index) {
return high_.masks[index] ;
}
inline const _256_TYPE& getCombinedMask() {
VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ;
return combined_ ;
}
inline void shift_left_1bit() {
VEC_SHIFT_LEFT_1BIT(low_.vec) ;
VEC_SHIFT_LEFT_1BIT(high_.vec) ;
}
} ;
#define BITMASK_VEC BitMaskVec_float

View File

@ -0,0 +1,153 @@
#ifdef PRECISION
#undef PRECISION
#undef MAIN_TYPE
#undef MAIN_TYPE_SIZE
#undef UNION_TYPE
#undef IF_128
#undef IF_MAIN_TYPE
#undef SHIFT_CONST1
#undef SHIFT_CONST2
#undef SHIFT_CONST3
#undef _128_TYPE
#undef _256_TYPE
#undef AVX_LENGTH
#undef MAVX_COUNT
#undef HAP_TYPE
#undef MASK_TYPE
#undef MASK_ALL_ONES
#undef VEC_EXTRACT_UNIT(__v1, __im)
#undef VEC_INSERT_UNIT(__v1,__ins,__im)
#undef SET_VEC_ZERO(__vec)
#undef VEC_OR(__v1, __v2)
#undef VEC_ADD(__v1, __v2)
#undef VEC_SUB(__v1, __v2)
#undef VEC_MUL(__v1, __v2)
#undef VEC_DIV(__v1, __v2)
#undef VEC_BLEND(__v1, __v2, __mask)
#undef VEC_BLENDV(__v1, __v2, __maskV)
#undef VEC_CAST_256_128(__v1)
#undef VEC_EXTRACT_128(__v1, __im)
#undef VEC_EXTRACT_UNIT(__v1, __im)
#undef VEC_SET1_VAL128(__val)
#undef VEC_MOVE(__v1, __val)
#undef VEC_CAST_128_256(__v1)
#undef VEC_INSERT_VAL(__v1, __val, __pos)
#undef VEC_CVT_128_256(__v1)
#undef VEC_SET1_VAL(__val)
#undef VEC_POPCVT_CHAR(__ch)
#undef VEC_LDPOPCVT_CHAR(__addr)
#undef VEC_CMP_EQ(__v1, __v2)
#undef VEC_SET_LSE(__val)
#undef SHIFT_HAP(__v1, __val)
#undef print256b(__v1)
#undef MASK_VEC
#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
#undef VEC_SHIFT_LEFT_1BIT(__vs)
#undef MASK_ALL_ONES
#undef COMPARE_VECS(__v1, __v2)
#undef _256_INT_TYPE
#undef BITMASK_VEC
#endif
#define SSE
#define PRECISION d
#define MAIN_TYPE double
#define MAIN_TYPE_SIZE 64
#define UNION_TYPE mix_D128
#define IF_128 IF_128d
#define IF_MAIN_TYPE IF_64
#define SHIFT_CONST1 1
#define SHIFT_CONST2 8
#define SHIFT_CONST3 0
#define _128_TYPE __m128d
#define _256_TYPE __m128d
#define _256_INT_TYPE __m128i
#define AVX_LENGTH 2
#define MAVX_COUNT (MROWS+3)/AVX_LENGTH
#define HAP_TYPE __m128i
#define MASK_TYPE uint64_t
#define MASK_ALL_ONES 0xFFFFFFFFFFFFFFFFL
#define MASK_VEC MaskVec_D
#define VEC_EXTRACT_UNIT(__v1, __im) \
_mm_extract_epi64(__v1, __im)
#define VEC_INSERT_UNIT(__v1,__ins,__im) \
_mm_insert_epi64(__v1,__ins,__im)
#define VEC_OR(__v1, __v2) \
_mm_or_pd(__v1, __v2)
#define VEC_ADD(__v1, __v2) \
_mm_add_pd(__v1, __v2)
#define VEC_SUB(__v1, __v2) \
_mm_sub_pd(__v1, __v2)
#define VEC_MUL(__v1, __v2) \
_mm_mul_pd(__v1, __v2)
#define VEC_DIV(__v1, __v2) \
_mm_div_pd(__v1, __v2)
#define VEC_CMP_EQ(__v1, __v2) \
_mm_cmpeq_pd(__v1, __v2)
#define VEC_BLEND(__v1, __v2, __mask) \
_mm_blend_pd(__v1, __v2, __mask)
#define VEC_BLENDV(__v1, __v2, __maskV) \
_mm_blendv_pd(__v1, __v2, __maskV)
#define SHIFT_HAP(__v1, __val) \
__v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0)
#define VEC_CVT_128_256(__v1) \
_mm_cvtepi32_pd(__v1)
#define VEC_SET1_VAL(__val) \
_mm_set1_pd(__val)
#define VEC_POPCVT_CHAR(__ch) \
_mm_cvtepi32_pd(_mm_set1_epi32(__ch))
#define VEC_SET_LSE(__val) \
_mm_set_pd(zero, __val);
#define VEC_LDPOPCVT_CHAR(__addr) \
_mm_cvtepi32_pd(_mm_loadu_si128((__m128i const *)__addr))
#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \
__vdst = _mm_castsi128_pd(_mm_set_epi64(__vsHigh, __vsLow))
#define VEC_SHIFT_LEFT_1BIT(__vs) \
__vs = _mm_slli_epi64(__vs, 1)
class BitMaskVec_sse_double {
MASK_VEC combined_ ;
public:
inline MASK_TYPE& getLowEntry(int index) {
return combined_.masks[index] ;
}
inline MASK_TYPE& getHighEntry(int index) {
return combined_.masks[AVX_LENGTH/2+index] ;
}
inline const _256_TYPE& getCombinedMask() {
return combined_.vecf ;
}
inline void shift_left_1bit() {
VEC_SHIFT_LEFT_1BIT(combined_.vec) ;
}
} ;
#define BITMASK_VEC BitMaskVec_sse_double

View File

@ -0,0 +1,151 @@
#ifdef PRECISION
#undef PRECISION
#undef MAIN_TYPE
#undef MAIN_TYPE_SIZE
#undef UNION_TYPE
#undef IF_128
#undef IF_MAIN_TYPE
#undef SHIFT_CONST1
#undef SHIFT_CONST2
#undef SHIFT_CONST3
#undef _128_TYPE
#undef _256_TYPE
#undef AVX_LENGTH
#undef MAVX_COUNT
#undef HAP_TYPE
#undef MASK_TYPE
#undef MASK_ALL_ONES
#undef VEC_EXTRACT_UNIT(__v1, __im)
#undef VEC_INSERT_UNIT(__v1,__ins,__im)
#undef SET_VEC_ZERO(__vec)
#undef VEC_OR(__v1, __v2)
#undef VEC_ADD(__v1, __v2)
#undef VEC_SUB(__v1, __v2)
#undef VEC_MUL(__v1, __v2)
#undef VEC_DIV(__v1, __v2)
#undef VEC_BLEND(__v1, __v2, __mask)
#undef VEC_BLENDV(__v1, __v2, __maskV)
#undef VEC_CAST_256_128(__v1)
#undef VEC_EXTRACT_128(__v1, __im)
#undef VEC_EXTRACT_UNIT(__v1, __im)
#undef VEC_SET1_VAL128(__val)
#undef VEC_MOVE(__v1, __val)
#undef VEC_CAST_128_256(__v1)
#undef VEC_INSERT_VAL(__v1, __val, __pos)
#undef VEC_CVT_128_256(__v1)
#undef VEC_SET1_VAL(__val)
#undef VEC_POPCVT_CHAR(__ch)
#undef VEC_LDPOPCVT_CHAR(__addr)
#undef VEC_CMP_EQ(__v1, __v2)
#undef VEC_SET_LSE(__val)
#undef SHIFT_HAP(__v1, __val)
#undef print256b(__v1)
#undef MASK_VEC
#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
#undef VEC_SHIFT_LEFT_1BIT(__vs)
#undef MASK_ALL_ONES
#undef COMPARE_VECS(__v1, __v2)
#undef _256_INT_TYPE
#undef BITMASK_VEC
#endif
#define SSE
#define PRECISION s
#define MAIN_TYPE float
#define MAIN_TYPE_SIZE 32
#define UNION_TYPE mix_F128
#define IF_128 IF_128f
#define IF_MAIN_TYPE IF_32
#define SHIFT_CONST1 3
#define SHIFT_CONST2 4
#define SHIFT_CONST3 0
#define _128_TYPE __m128
#define _256_TYPE __m128
#define _256_INT_TYPE __m128i
#define AVX_LENGTH 4
#define MAVX_COUNT (MROWS+3)/AVX_LENGTH
#define HAP_TYPE UNION_TYPE
#define MASK_TYPE uint32_t
#define MASK_ALL_ONES 0xFFFFFFFF
#define MASK_VEC MaskVec_F
#define VEC_EXTRACT_UNIT(__v1, __im) \
_mm_extract_epi32(__v1, __im)
#define VEC_INSERT_UNIT(__v1,__ins,__im) \
_mm_insert_epi32(__v1,__ins,__im)
#define VEC_OR(__v1, __v2) \
_mm_or_ps(__v1, __v2)
#define VEC_ADD(__v1, __v2) \
_mm_add_ps(__v1, __v2)
#define VEC_SUB(__v1, __v2) \
_mm_sub_ps(__v1, __v2)
#define VEC_MUL(__v1, __v2) \
_mm_mul_ps(__v1, __v2)
#define VEC_DIV(__v1, __v2) \
_mm_div_ps(__v1, __v2)
#define VEC_CMP_EQ(__v1, __v2) \
_mm_cmpeq_ps(__v1, __v2)
#define VEC_BLEND(__v1, __v2, __mask) \
_mm_blend_ps(__v1, __v2, __mask)
#define VEC_BLENDV(__v1, __v2, __maskV) \
_mm_blendv_ps(__v1, __v2, __maskV)
#define SHIFT_HAP(__v1, __val) \
_vector_shift_lastsses(__v1, __val.f)
#define VEC_CVT_128_256(__v1) \
_mm_cvtepi32_ps(__v1.i)
#define VEC_SET1_VAL(__val) \
_mm_set1_ps(__val)
#define VEC_POPCVT_CHAR(__ch) \
_mm_cvtepi32_ps(_mm_set1_epi32(__ch))
#define VEC_SET_LSE(__val) \
_mm_set_ps(zero, zero, zero, __val);
#define VEC_LDPOPCVT_CHAR(__addr) \
_mm_cvtepi32_ps(_mm_loadu_si128((__m128i const *)__addr))
#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \
__vdst = _mm_cvtpi32x2_ps(__vsLow, __vsHigh)
#define VEC_SHIFT_LEFT_1BIT(__vs) \
__vs = _mm_slli_epi32(__vs, 1)
class BitMaskVec_sse_float {
MASK_VEC combined_ ;
public:
inline MASK_TYPE& getLowEntry(int index) {
return combined_.masks[index] ;
}
inline MASK_TYPE& getHighEntry(int index) {
return combined_.masks[AVX_LENGTH/2+index] ;
}
inline const _256_TYPE& getCombinedMask() {
return combined_.vecf ;
}
inline void shift_left_1bit() {
VEC_SHIFT_LEFT_1BIT(combined_.vec) ;
}
} ;
#define BITMASK_VEC BitMaskVec_sse_float

View File

@ -0,0 +1,25 @@
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <stdint.h>
#include <assert.h>
#include <ctype.h>
#include <sys/time.h>
#include <immintrin.h>
#include <emmintrin.h>
#include <omp.h>
#include <string>
#include <iostream>
#include <sstream>
#include <fstream>
#include <vector>
#include <map>
#include <cstdio>
#include <cstring>
#include <cstdlib>
#include <cmath>

View File

@ -1,451 +0,0 @@
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
// /usr/intel/pkgs/icc/13.0.0e/bin/icc -o ed -O3 ed.cpp -xAVX -openmp -openmp-link static
#include <iostream>
#include <malloc.h>
#include <assert.h>
#include <sys/time.h>
#include <omp.h>
#include <stdlib.h>
#include <math.h>
#include <immintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>
#include <stdint.h>
#include <algorithm>
#include <vector>
#include <set>
#include <map>
#include <memory.h>
#include "template.h"
using namespace std ;
#define CHECK_MASK_CORRECTNESS
template <class T>
string getBinaryStr (T val, int numBitsToWrite) {
ostringstream oss ;
uint64_t mask = ((T) 0x1) << (numBitsToWrite-1) ;
for (int i=numBitsToWrite-1; i >= 0; --i) {
oss << ((val & mask) >> i) ;
mask >>= 1 ;
}
return oss.str() ;
}
int normalize(char c)
{
return ((int) (c - 33));
}
uint8_t ConvertChar::conversionTable[255] ;
int read_testcase(testcase *tc, FILE* ifp)
{
char *q, *i, *d, *c, *line = NULL;
int _q, _i, _d, _c;
int x, size = 0;
ssize_t read;
read = getline(&line, (size_t *) &size, ifp == 0 ? stdin : ifp);
if (read == -1)
return -1;
tc->hap = (char *) malloc(size);
tc->rs = (char *) malloc(size);
q = (char *) malloc(size);
i = (char *) malloc(size);
d = (char *) malloc(size);
c = (char *) malloc(size);
if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6)
return -1;
tc->haplen = strlen(tc->hap);
tc->rslen = strlen(tc->rs);
assert(tc->rslen < MROWS);
tc->ihap = (int *) malloc(tc->haplen*sizeof(int));
tc->irs = (int *) malloc(tc->rslen*sizeof(int));
//tc->q = (int *) malloc(sizeof(int) * tc->rslen);
//tc->i = (int *) malloc(sizeof(int) * tc->rslen);
//tc->d = (int *) malloc(sizeof(int) * tc->rslen);
//tc->c = (int *) malloc(sizeof(int) * tc->rslen);
for (x = 0; x < tc->rslen; x++)
{
_q = normalize(q[x]);
_i = normalize(i[x]);
_d = normalize(d[x]);
_c = normalize(c[x]);
tc->q[x] = (_q < 6) ? 6 : _q;
tc->i[x] = _i;
tc->d[x] = _d;
tc->c[x] = _c;
tc->irs[x] = tc->rs[x];
}
for (x = 0; x < tc->haplen; x++)
tc->ihap[x] = tc->hap[x];
free(q);
free(i);
free(d);
free(c);
free(line);
return 0;
}
#define ALIGN __attribute__ ((aligned(32)))
typedef union ALIGN {
//__m256i vi;
__m128 vf ;
__m128i vi ;
__m128d vd;
uint8_t b[16];
//uint16_t hw[8] ;
uint32_t w[4] ;
uint64_t dw[2] ;
float f[4] ;
//double d[2] ;
} v128 ;
typedef union ALIGN {
__m256i vi;
__m256 vf ;
__m256d vd;
//__m128i vi128[2] ;
uint8_t b[32];
//uint16_t hw[16] ;
uint32_t w[8] ;
uint64_t dw[4] ;
float f[8] ;
double d[4] ;
} v256 ;
#define NUM_DISTINCT_CHARS 5
#define AMBIG_CHAR 4
#define VEC_ENTRY_CNT 8
#define VEC_LEN 256
#define VTYPE vf
#define VTYPEI vi
#define VENTRY f
#define VENTRYI w
#define MTYPE uint32_t
#define MASK_ALL_ONES 0xFFFFFFFF
#define VECTOR v256
#define VECTOR_SSE v128
#define SET_VEC_ZERO(__vec) \
__vec= _mm256_setzero_ps()
#define SET_VEC_ONES(__vec) \
__vec = _mm256_set1_epi32(0xFFFFFFFF)
#define VEC_OR(__v1, __v2) \
_mm256_or_ps(__v1, __v2)
#define VEC_ADD(__v1, __v2) \
_mm256_add_ps(__v1, __v2)
#define VEC_MUL(__v1, __v2) \
_mm256_mul_ps(__v1, __v2)
#define VEC_BLENDV(__v1, __v2, __maskV) \
_mm256_blendv_ps(__v1, __v2, __maskV)
#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \
__vdst = _mm256_castps128_ps256(__vsLow) ; \
__vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ;
#define VECS_SHIFT_LEFT_1BIT(__vs) \
__vs = _mm_slli_epi32(__vs, 1)
#define VECS_SHIFT_RIGHT_WHOLE(__vs, __cnt) { \
uint64_t __mask = ; \
uint64_t __shiftWord = ((((uint64_t) 0x1) << __cnt) - 1 ) & __vs.dw[1] ; \
__shiftWord <<= (64-cnt) ; \
__vs = _mm_slri_epi64(__vs, __cnt) ; \
__vs.dw[0] |= __shiftWord ; \
}
#define GET_MASK_WORD_OLD(__mask, __lastShiftOut, __shiftBy, __maskBitCnt){ \
MTYPE __bitMask = (((MTYPE)0x1) << __shiftBy) - 1 ; \
MTYPE __nextShiftOut = (__mask & __bitMask) << (__maskBitCnt - __shiftBy) ; \
__mask >>= __shiftBy ; \
__mask |= __lastShiftOut ; \
__lastShiftOut = __nextShiftOut ; \
}
#define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \
MTYPE __bitMask = (((MTYPE)0x1) << __shiftBy) - 1 ; \
MTYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \
__dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ; \
__lastShiftOut = __nextShiftOut ; \
}
void precompute_masks_avx(const testcase& tc, int COLS, int numMaskVecs,
MTYPE (*maskArr)[NUM_DISTINCT_CHARS]) {
const int maskBitCnt = VEC_LEN / VEC_ENTRY_CNT ;
for (int vi=0; vi < numMaskVecs; ++vi) {
for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) {
maskArr[vi][rs] = 0 ;
}
maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ;
}
for (int col=1; col < COLS; ++col) {
int mIndex = (col-1) / maskBitCnt ;
int mOffset = (col-1) % maskBitCnt ;
MTYPE bitMask = ((MTYPE)0x1) << (maskBitCnt-1-mOffset) ;
char hapChar = tc.hap[col-1] ;
if (hapChar == AMBIG_CHAR) {
maskArr[mIndex][0] |= bitMask ;
maskArr[mIndex][1] |= bitMask ;
maskArr[mIndex][2] |= bitMask ;
maskArr[mIndex][3] |= bitMask ;
}
//cout << hapChar << " " << mIndex << " " << getBinaryStr<MTYPE>(bitMask, 32)
// << endl ;
//cout << getBinaryStr<MTYPE>(maskArr[0][hapChar],32) << endl ;
//exit(0) ;
maskArr[mIndex][ConvertChar::get(hapChar)] |= bitMask ;
// bit corresponding to col 1 will be the MSB of the mask 0
// bit corresponding to col 2 will be the MSB-1 of the mask 0
// ...
// bit corresponding to col 32 will be the LSB of the mask 0
// bit corresponding to col 33 will be the MSB of the mask 1
// ...
}
}
void test_mask_computations (testcase& tc, int tcID, bool printDebug=false) {
int ROWS = tc.rslen + 1 ;
int COLS = tc.haplen + 1 ;
// only for testing
VECTOR mismatchData, matchData ;
//SET_VEC_ZERO(mismatchData.VTYPE) ;
//SET_VEC_ONES(matchData.VTYPEI) ;
for (int ei=0; ei < VEC_ENTRY_CNT; ++ei) {
matchData.VENTRY[ei] = 1.0 ;
mismatchData.VENTRY[ei] = 0.0 ;
}
const int maskBitCnt = VEC_LEN / VEC_ENTRY_CNT ;
const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function
MTYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ;
precompute_masks_avx(tc, COLS, numMaskVecs, maskArr) ;
#ifdef DEBUG
if (printDebug) {
cout << "The first 32 hap chars are: " ;
for (int i=0; i < 32; ++i) {
cout << tc.hap[i] ;
}
cout << endl ;
cout << "Masks computed for A, C, T, G, N are: " << endl ;
cout << getBinaryStr<MTYPE>(maskArr[0][0], 32) << endl ;
cout << getBinaryStr<MTYPE>(maskArr[0][1], 32) << endl ;
cout << getBinaryStr<MTYPE>(maskArr[0][2], 32) << endl ;
cout << getBinaryStr<MTYPE>(maskArr[0][3], 32) << endl ;
cout << getBinaryStr<MTYPE>(maskArr[0][4], 32) << endl ;
}
#endif // #ifdef DEBUG
int beginRowIndex = 1 ;
while (beginRowIndex < ROWS) {
int numRowsToProcess = min(VEC_ENTRY_CNT, ROWS - beginRowIndex) ;
char rsArr[VEC_ENTRY_CNT] ;
for (int ri=0; ri < numRowsToProcess; ++ri) {
rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ;
}
// Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
VECTOR_SSE currMaskVecLow ; // corresponding to entries 0-3
VECTOR_SSE currMaskVecHigh ; // corresponding to entries 4-7
MTYPE lastMaskShiftOut[VEC_ENTRY_CNT] ;
for (int ei=0; ei < VEC_ENTRY_CNT; ++ei)
lastMaskShiftOut[ei] = 0 ;
int col = 1 ;
int diag = 1 ;
for (int maskIndex=0; maskIndex < numMaskVecs; ++maskIndex) {
// set up the mask vectors for the next maskBitCnt columns
// For AVX, maskBitCnt = 32 (so, the operation below is amortized over 32 cols)
for (int ei=0; ei < VEC_ENTRY_CNT/2; ++ei) {
SET_MASK_WORD(currMaskVecLow.VENTRYI[ei], maskArr[maskIndex][rsArr[ei]],
lastMaskShiftOut[ei], ei, maskBitCnt) ;
int ei2 = ei + VEC_ENTRY_CNT/2 ; // the second entry index
SET_MASK_WORD(currMaskVecHigh.VENTRYI[ei], maskArr[maskIndex][rsArr[ei2]],
lastMaskShiftOut[ei2], ei2, maskBitCnt) ;
}
#ifdef DEBUG
if (printDebug && maskIndex == 0) {
cout << "The masks for entry 1: " << endl
<< getBinaryStr<MTYPE>(maskArr[0][rsArr[1]], 32) << endl
<< getBinaryStr<MTYPE>(currMaskVecLow.VENTRYI[1], 32) << endl ;
}
#endif // #ifdef DEBUG
// iterate over mask bit indices and columns
for (int mbi=0; mbi < maskBitCnt && diag < COLS + ROWS -2; ++mbi, ++diag) {
VECTOR maskV ;
VEC_SSE_TO_AVX(currMaskVecLow.VTYPE, currMaskVecHigh.VTYPE, maskV.VTYPE) ;
VECTOR testData ;
testData.VTYPE = VEC_BLENDV(mismatchData.VTYPE, matchData.VTYPE,
maskV.VTYPE) ;
VECS_SHIFT_LEFT_1BIT(currMaskVecLow.VTYPEI) ;
VECS_SHIFT_LEFT_1BIT(currMaskVecHigh.VTYPEI) ;
#ifdef DEBUG
if (printDebug && maskIndex == 0) {
cout << "The mask for entry 1, mbi=" << mbi << ": "
<< getBinaryStr<MTYPE>(maskV.VENTRYI[1], 32) << endl ;
}
#endif // #ifdef DEBUG
#ifdef CHECK_MASK_CORRECTNESS
int firstRowIndex = (diag < COLS) ? 0 : (diag - COLS) ;
int lastRowIndex = min(col-1, numRowsToProcess-1) ;
for (int ri=firstRowIndex; ri <= lastRowIndex; ++ri) {
int currRow = beginRowIndex + ri ;
int currCol = col - ri + firstRowIndex ;
char hapChar = tc.hap[currCol-1] ;
char rsChar = tc.rs[currRow-1] ;
bool match = (hapChar == rsChar || hapChar == 'N' || rsChar == 'N') ;
if ((bool) testData.VENTRYI[ri] != match) {
cout << "Error: Incorrect mask for tc " << tcID << ", diag = " << diag
<< " (" << currRow << ", " << currCol << ")" << endl ;
cout << "The chars are: " << hapChar << " and " << rsChar << endl ;
cout << "The selected value is: " << testData.VENTRYI[ri] << endl ;
exit(0) ;
}
}
#endif // #ifdef CHECK_MASK_CORRECTNESS
if (diag < COLS)
++col ;
} // mbi
} // maskIndex
beginRowIndex += VEC_ENTRY_CNT ;
} // end of stripe
//cout << "Finished validating entry " << endl ;
}
#ifdef HMM_MASK_MAIN
int main () {
#define BATCH_SIZE 10000
ConvertChar::init() ;
testcase* tcBatch = new testcase[BATCH_SIZE] ;
int numBatches = 0 ;
int numRead = 0 ;
const int DEBUG_TC = -1 ;
FILE* ifp = stdin ;
do {
numRead = 0 ;
while (numRead < BATCH_SIZE) {
if (read_testcase(tcBatch+numRead, ifp) < 0)
break ;
++numRead ;
}
if (numRead == 0)
break ;
for (int ti=0; ti < numRead; ++ti) {
int tcID = numBatches * BATCH_SIZE + ti ;
test_mask_computations(tcBatch[ti], tcID, tcID == DEBUG_TC) ;
}
++numBatches ;
} while (numRead == BATCH_SIZE) ;
fclose(ifp) ;
delete[] tcBatch ;
return 0 ;
}
#endif

Binary file not shown.

View File

@ -1,70 +1,19 @@
#include "headers.h"
#include <jni.h>
#include <assert.h>
#include <stdio.h>
#include "org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h"
#include <vector>
#include <cstdio>
#include <string>
#include <cstring>
#include <cstdlib>
#include <cmath>
#include <iostream>
#include <fstream>
#include <sstream>
#include <immintrin.h>
#include <emmintrin.h>
#include <omp.h>
using namespace std;
#define ENABLE_ASSERTIONS 1
#define DO_PROFILING 1
//#define DEBUG 1
//#define DEBUG0_1 1
//#define DEBUG3 1
#include "template.h"
#include "utils.h"
#include "LoadTimeInitializer.h"
#include "define-double.h"
#include "shift_template.c"
#include "pairhmm-template-kernel.cc"
#define MM 0
#define GapM 1
#define MX 2
#define XX 3
#define MY 4
#define YY 5
class LoadTimeInitializer
{
public:
LoadTimeInitializer() //will be called when library is loaded
{
ConvertChar::init();
}
jfieldID m_readBasesFID;
jfieldID m_readQualsFID;
jfieldID m_insertionGOPFID;
jfieldID m_deletionGOPFID;
jfieldID m_overallGCPFID;
jfieldID m_haplotypeBasesFID;
};
LoadTimeInitializer g_load_time_initializer;
void debug_dump(string filename, string s, bool to_append, bool add_newline)
{
ofstream fptr;
fptr.open(filename.c_str(), to_append ? ofstream::app : ofstream::out);
fptr << s;
if(add_newline)
fptr << "\n";
fptr.close();
}
using namespace std;
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeProbabilities
(JNIEnv* env, jclass thisObject,
@ -93,7 +42,7 @@ Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializePrior
//Gets direct access to Java arrays
#define GET_BYTE_ARRAY_ELEMENTS env->GetPrimitiveArrayCritical
#define RELEASE_BYTE_ARRAY_ELEMENTS env->ReleasePrimitiveArrayCritical
#define JNI_RELEASE_MODE JNI_ABORT
#define JNI_RO_RELEASE_MODE JNI_ABORT
#define GET_DOUBLE_ARRAY_ELEMENTS env->GetPrimitiveArrayCritical
#define RELEASE_DOUBLE_ARRAY_ELEMENTS env->ReleasePrimitiveArrayCritical
@ -101,7 +50,7 @@ Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializePrior
//Likely makes copy of Java arrays to JNI C++ space
#define GET_BYTE_ARRAY_ELEMENTS env->GetByteArrayElements
#define RELEASE_BYTE_ARRAY_ELEMENTS env->ReleaseByteArrayElements
#define JNI_RELEASE_MODE JNI_ABORT
#define JNI_RO_RELEASE_MODE JNI_ABORT
#define GET_DOUBLE_ARRAY_ELEMENTS env->GetDoubleArrayElements
#define RELEASE_DOUBLE_ARRAY_ELEMENTS env->ReleaseDoubleArrayElements
@ -145,19 +94,19 @@ Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadL
tc.c[i] = (int)overallGCPArray[i];
}
double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&tc);
double result_avxd = g_compute_full_prob_double(&tc, 0);
double result = log10(result_avxd) - log10(ldexp(1.0, 1020));
#ifdef DEBUG
debug_dump("return_values_jni.txt",to_string(result),true);
g_load_time_initializer.debug_dump("return_values_jni.txt",to_string(result),true);
#endif
RELEASE_BYTE_ARRAY_ELEMENTS(overallGCP, overallGCPArray, JNI_RELEASE_MODE);
RELEASE_BYTE_ARRAY_ELEMENTS(deletionGOP, deletionGOPArray, JNI_RELEASE_MODE);
RELEASE_BYTE_ARRAY_ELEMENTS(insertionGOP, insertionGOPArray, JNI_RELEASE_MODE);
RELEASE_BYTE_ARRAY_ELEMENTS(readQuals, readQualsArray, JNI_RELEASE_MODE);
RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBases, haplotypeBasesArray, JNI_RELEASE_MODE);
RELEASE_BYTE_ARRAY_ELEMENTS(readBases, readBasesArray, JNI_RELEASE_MODE);
RELEASE_BYTE_ARRAY_ELEMENTS(overallGCP, overallGCPArray, JNI_RO_RELEASE_MODE);
RELEASE_BYTE_ARRAY_ELEMENTS(deletionGOP, deletionGOPArray, JNI_RO_RELEASE_MODE);
RELEASE_BYTE_ARRAY_ELEMENTS(insertionGOP, insertionGOPArray, JNI_RO_RELEASE_MODE);
RELEASE_BYTE_ARRAY_ELEMENTS(readQuals, readQualsArray, JNI_RO_RELEASE_MODE);
RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBases, haplotypeBasesArray, JNI_RO_RELEASE_MODE);
RELEASE_BYTE_ARRAY_ELEMENTS(readBases, readBasesArray, JNI_RO_RELEASE_MODE);
return 0.0;
}
@ -191,6 +140,46 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai
g_load_time_initializer.m_haplotypeBasesFID = fid;
}
//Since the list of haplotypes against which the reads are evaluated in PairHMM is the same for a region,
//transfer the list only once
vector<pair<jbyteArray, jbyte*> > g_haplotypeBasesArrayVector;
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeHaplotypes
(JNIEnv * env, jobject thisObject, jint numHaplotypes, jobjectArray haplotypeDataArray)
{
jboolean is_copy = JNI_FALSE;
//To ensure, GET_BYTE_ARRAY_ELEMENTS is invoked only once for each haplotype, store bytearrays in a vector
vector<pair<jbyteArray, jbyte*> >& haplotypeBasesArrayVector = g_haplotypeBasesArrayVector;
haplotypeBasesArrayVector.clear();
haplotypeBasesArrayVector.resize(numHaplotypes);
for(unsigned j=0;j<numHaplotypes;++j)
{
jobject haplotypeObject = env->GetObjectArrayElement(haplotypeDataArray, j);
jbyteArray haplotypeBases = (jbyteArray)env->GetObjectField(haplotypeObject, g_load_time_initializer.m_haplotypeBasesFID);
#ifdef ENABLE_ASSERTIONS
assert(haplotypeBases && ("haplotypeBases is NULL at index : "+to_string(j)+"\n").c_str());
#endif
//Need a global reference as this will be accessed across multiple JNI calls to JNIComputeLikelihoods()
jbyteArray haplotypeBasesGlobalRef = (jbyteArray)env->NewGlobalRef(haplotypeBases);
#ifdef ENABLE_ASSERTIONS
assert(haplotypeBasesGlobalRef && ("Could not get global ref to haplotypeBases at index : "+to_string(j)+"\n").c_str());
#endif
env->DeleteLocalRef(haplotypeBases); //free the local reference
jbyte* haplotypeBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(haplotypeBasesGlobalRef, &is_copy);
#ifdef ENABLE_ASSERTIONS
assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI");
assert(env->GetArrayLength(haplotypeBasesGlobalRef) < MCOLS);
#endif
#ifdef DEBUG0_1
cout << "JNI haplotype length "<<env->GetArrayLength(haplotypeBasesGlobalRef)<<"\n";
#endif
haplotypeBasesArrayVector[j] = make_pair(haplotypeBasesGlobalRef, haplotypeBasesArray);
#ifdef DEBUG3
for(unsigned k=0;k<env->GetArrayLength(haplotypeBases);++k)
g_load_time_initializer.debug_dump("haplotype_bases_jni.txt",to_string((int)haplotypeBasesArray[k]),true);
#endif
}
}
//JNI function to invoke compute_full_prob_avx
//readDataArray - array of JNIReadDataHolderClass objects which contain the readBases, readQuals etc
//haplotypeDataArray - array of JNIHaplotypeDataHolderClass objects which contain the haplotypeBases
@ -200,37 +189,16 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai
(JNIEnv* env, jobject thisObject, jint numReads, jint numHaplotypes,
jobjectArray readDataArray, jobjectArray haplotypeDataArray, jdoubleArray likelihoodArray, jint maxNumThreadsToUse)
{
jboolean is_copy = JNI_FALSE;
//To ensure, GET_BYTE_ARRAY_ELEMENTS is invoked only once for each haplotype, store bytearrays in a vector
vector<pair<jbyteArray, jbyte*> > haplotypeBasesArrayVector;
haplotypeBasesArrayVector.clear();
haplotypeBasesArrayVector.resize(numHaplotypes);
#ifdef DEBUG0_1
cout << "JNI numReads "<<numReads<<" numHaplotypes "<<numHaplotypes<<"\n";
#endif
for(unsigned j=0;j<numHaplotypes;++j)
{
jobject haplotypeObject = env->GetObjectArrayElement(haplotypeDataArray, j);
jbyteArray haplotypeBases = (jbyteArray)env->GetObjectField(haplotypeObject, g_load_time_initializer.m_haplotypeBasesFID);
#ifdef ENABLE_ASSERTIONS
assert(haplotypeBases && ("haplotypeBases is NULL at index : "+to_string(j)+"\n").c_str());
#endif
jbyte* haplotypeBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(haplotypeBases, &is_copy);
#ifdef ENABLE_ASSERTIONS
assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI");
assert(env->GetArrayLength(haplotypeBases) < MCOLS);
#endif
#ifdef DEBUG0_1
cout << "JNI haplotype length "<<env->GetArrayLength(haplotypeBases)<<"\n";
#endif
haplotypeBasesArrayVector[j] = make_pair(haplotypeBases, haplotypeBasesArray);
#ifdef DEBUG3
for(unsigned k=0;k<env->GetArrayLength(haplotypeBases);++k)
debug_dump("haplotype_bases_jni.txt",to_string((int)haplotypeBasesArray[k]),true);
#endif
}
double start_time = 0;
//haplotype vector from earlier store - note the reference to vector, not copying
vector<pair<jbyteArray, jbyte*> >& haplotypeBasesArrayVector = g_haplotypeBasesArrayVector;
jboolean is_copy = JNI_FALSE;
unsigned numTestCases = numReads*numHaplotypes;
//vector to store results
vector<testcase> tc_array;
tc_array.clear();
tc_array.resize(numTestCases);
@ -239,6 +207,9 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai
vector<vector<pair<jbyteArray,jbyte*> > > readBasesArrayVector;
readBasesArrayVector.clear();
readBasesArrayVector.resize(numReads);
#ifdef DO_PROFILING
start_time = get_time();
#endif
for(unsigned i=0;i<numReads;++i)
{
//Get bytearray fields from read
@ -281,11 +252,11 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai
#ifdef DEBUG3
for(unsigned j=0;j<readLength;++j)
{
debug_dump("reads_jni.txt",to_string((int)readBasesArray[j]),true);
debug_dump("reads_jni.txt",to_string((int)readQualsArray[j]),true);
debug_dump("reads_jni.txt",to_string((int)insertionGOPArray[j]),true);
debug_dump("reads_jni.txt",to_string((int)deletionGOPArray[j]),true);
debug_dump("reads_jni.txt",to_string((int)overallGCPArray[j]),true);
g_load_time_initializer.debug_dump("reads_jni.txt",to_string((int)readBasesArray[j]),true);
g_load_time_initializer.debug_dump("reads_jni.txt",to_string((int)readQualsArray[j]),true);
g_load_time_initializer.debug_dump("reads_jni.txt",to_string((int)insertionGOPArray[j]),true);
g_load_time_initializer.debug_dump("reads_jni.txt",to_string((int)deletionGOPArray[j]),true);
g_load_time_initializer.debug_dump("reads_jni.txt",to_string((int)overallGCPArray[j]),true);
}
#endif
@ -295,63 +266,115 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai
jbyte* haplotypeBasesArray = haplotypeBasesArrayVector[j].second;
tc_array[tc_idx].rslen = (int)readLength;
tc_array[tc_idx].haplen = (int)haplotypeLength;
tc_array[tc_idx].rs = (char*)readBasesArray;
tc_array[tc_idx].hap = (char*)haplotypeBasesArray;
//Can be avoided
for(unsigned k=0;k<readLength;++k)
{
tc_array[tc_idx].q[k] = (int)readQualsArray[k];
tc_array[tc_idx].i[k] = (int)insertionGOPArray[k];
tc_array[tc_idx].d[k] = (int)deletionGOPArray[k];
tc_array[tc_idx].c[k] = (int)overallGCPArray[k];
}
tc_array[tc_idx].rs = (char*)readBasesArray;
tc_array[tc_idx].q = (char*)readQualsArray;
tc_array[tc_idx].i = (char*)insertionGOPArray;
tc_array[tc_idx].d = (char*)deletionGOPArray;
tc_array[tc_idx].c = (char*)overallGCPArray;
++tc_idx;
}
RELEASE_BYTE_ARRAY_ELEMENTS(overallGCP, overallGCPArray, JNI_RELEASE_MODE); //order of GET-RELEASE is important
RELEASE_BYTE_ARRAY_ELEMENTS(deletionGOP, deletionGOPArray, JNI_RELEASE_MODE);
RELEASE_BYTE_ARRAY_ELEMENTS(insertionGOP, insertionGOPArray, JNI_RELEASE_MODE);
RELEASE_BYTE_ARRAY_ELEMENTS(readQuals, readQualsArray, JNI_RELEASE_MODE);
//Release readBases at end because it is used by compute_full_prob
//Release read arrays at end because they are used by compute_full_prob
//Maintain order in which GET_BYTE_ARRAY_ELEMENTS called
readBasesArrayVector[i].clear();
readBasesArrayVector[i].resize(1);
readBasesArrayVector[i].resize(5);
readBasesArrayVector[i][0] = make_pair(readBases, readBasesArray);
readBasesArrayVector[i][1] = make_pair(readQuals, readQualsArray);
readBasesArrayVector[i][2] = make_pair(insertionGOP, insertionGOPArray);
readBasesArrayVector[i][3] = make_pair(deletionGOP, deletionGOPArray);
readBasesArrayVector[i][4] = make_pair(overallGCP, overallGCPArray);
}
#ifdef DO_PROFILING
g_load_time_initializer.m_data_transfer_time += get_time();
#endif
jdouble* likelihoodDoubleArray = (jdouble*)GET_DOUBLE_ARRAY_ELEMENTS(likelihoodArray, &is_copy);
#ifdef ENABLE_ASSERTIONS
assert(likelihoodDoubleArray && "likelihoodArray is NULL");
assert(env->GetArrayLength(likelihoodArray) == numTestCases);
#endif
#ifdef DO_PROFILING
start_time = get_time();
#endif
#pragma omp parallel for schedule (dynamic,10) private(tc_idx) num_threads(maxNumThreadsToUse)
for(tc_idx=0;tc_idx<numTestCases;++tc_idx)
{
double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&(tc_array[tc_idx]));
double result = log10(result_avxd) - log10(ldexp(1.0, 1020));
float result_avxf = g_compute_full_prob_float(&(tc_array[tc_idx]), 0);
double result = 0;
if (result_avxf < MIN_ACCEPTED) {
double result_avxd = g_compute_full_prob_double(&(tc_array[tc_idx]), 0);
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
}
else
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
likelihoodDoubleArray[tc_idx] = result;
}
#ifdef DO_PROFILING
g_load_time_initializer.m_compute_time += get_time();
#endif
#ifdef DEBUG
for(tc_idx=0;tc_idx<numTestCases;++tc_idx)
{
debug_dump("return_values_jni.txt",to_string(likelihoodDoubleArray[tc_idx]),true);
g_load_time_initializer.debug_dump("return_values_jni.txt",to_string(likelihoodDoubleArray[tc_idx]),true);
}
#endif
RELEASE_DOUBLE_ARRAY_ELEMENTS(likelihoodArray, likelihoodDoubleArray, 0);
#ifdef DO_PROFILING
start_time = get_time();
#endif
RELEASE_DOUBLE_ARRAY_ELEMENTS(likelihoodArray, likelihoodDoubleArray, 0); //release mode 0, copy back results to Java memory
//Release read arrays first
for(int i=readBasesArrayVector.size()-1;i>=0;--i)//note the order - reverse of GET
{
for(int j=readBasesArrayVector.size()-1;j>=0;--j)
RELEASE_BYTE_ARRAY_ELEMENTS(readBasesArrayVector[i][j].first, readBasesArrayVector[i][j].second, JNI_RELEASE_MODE);
for(int j=readBasesArrayVector[i].size()-1;j>=0;--j)
RELEASE_BYTE_ARRAY_ELEMENTS(readBasesArrayVector[i][j].first, readBasesArrayVector[i][j].second, JNI_RO_RELEASE_MODE);
readBasesArrayVector[i].clear();
}
readBasesArrayVector.clear();
//Now release haplotype arrays
for(int j=numHaplotypes-1;j>=0;--j) //note the order - reverse of GET
RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBasesArrayVector[j].first, haplotypeBasesArrayVector[j].second, JNI_RELEASE_MODE);
haplotypeBasesArrayVector.clear();
#ifdef DO_PROFILING
g_load_time_initializer.m_data_transfer_time += get_time();
#endif
tc_array.clear();
#ifdef DO_PROFILING
g_load_time_initializer.m_sumNumReads += numReads;
g_load_time_initializer.m_sumSquareNumReads += numReads*numReads;
g_load_time_initializer.m_sumNumHaplotypes += numHaplotypes;
g_load_time_initializer.m_sumSquareNumHaplotypes += numHaplotypes*numHaplotypes;
g_load_time_initializer.m_sumNumTestcases += numTestCases;
g_load_time_initializer.m_sumSquareNumTestcases += numTestCases*numTestCases;
g_load_time_initializer.m_maxNumTestcases = numTestCases > g_load_time_initializer.m_maxNumTestcases ? numTestCases
: g_load_time_initializer.m_maxNumTestcases;
++(g_load_time_initializer.m_num_invocations);
#endif
#ifdef DEBUG
g_load_time_initializer.debug_close();
#endif
}
//Release haplotypes at the end of a region
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniFinalizeRegion
(JNIEnv * env, jobject thisObject)
{
vector<pair<jbyteArray, jbyte*> >& haplotypeBasesArrayVector = g_haplotypeBasesArrayVector;
//Now release haplotype arrays
for(int j=haplotypeBasesArrayVector.size()-1;j>=0;--j) //note the order - reverse of GET
{
RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBasesArrayVector[j].first, haplotypeBasesArrayVector[j].second, JNI_RO_RELEASE_MODE);
env->DeleteGlobalRef(haplotypeBasesArrayVector[j].first); //free the global reference
}
haplotypeBasesArrayVector.clear();
}
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniClose
(JNIEnv* env, jobject thisObject)
{
#ifdef DO_PROFILING
g_load_time_initializer.print_profiling();
#endif
#ifdef DEBUG
g_load_time_initializer.debug_close();
#endif
}

View File

@ -21,6 +21,34 @@ extern "C" {
#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToDeletion 4L
#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_deletionToDeletion
#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_deletionToDeletion 5L
#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug
#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug 0L
#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_verify
#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_verify 0L
#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug0_1
#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug0_1 0L
#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug1
#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug1 0L
#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug2
#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug2 0L
#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug3
#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug3 0L
/*
* Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
* Method: jniGlobalInit
* Signature: (Ljava/lang/Class;Ljava/lang/Class;)V
*/
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniGlobalInit
(JNIEnv *, jobject, jclass, jclass);
/*
* Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
* Method: jniClose
* Signature: ()V
*/
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniClose
(JNIEnv *, jobject);
/*
* Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
* Method: jniInitialize
@ -45,27 +73,34 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai
JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializePriorsAndUpdateCells
(JNIEnv *, jobject, jboolean, jint, jint, jbyteArray, jbyteArray, jbyteArray, jint);
/*
* Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
* Method: jniSubComputeReadLikelihoodGivenHaplotypeLog10
* Signature: (II[B[B[B[B[B[BI)D
*/
JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadLikelihoodGivenHaplotypeLog10
(JNIEnv *, jobject, jint, jint, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jint);
(JNIEnv *, jobject, jint, jint, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jint);
/*
* Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
* Method: jniGlobalInit
* Signature: (Ljava/lang/Class;Ljava/lang/Class;)V
* Method: jniInitializeHaplotypes
* Signature: (I[Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIHaplotypeDataHolderClass;)V
*/
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniGlobalInit
(JNIEnv *, jobject, jclass, jclass);
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeHaplotypes
(JNIEnv *, jobject, jint, jobjectArray);
/*
* Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
* Method: jniFinalizeRegion
* Signature: ()V
*/
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniFinalizeRegion
(JNIEnv *, jobject);
/*
* Class: org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
* Method: jniComputeLikelihoods
* Signature: ([Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIReadDataHolderClass;[Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIHaplotypeDataHolderClass;[D)V
* Signature: (II[Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIReadDataHolderClass;[Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIHaplotypeDataHolderClass;[DI)V
*/
JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniComputeLikelihoods
(JNIEnv *, jobject, jint, jint, jobjectArray, jobjectArray, jdoubleArray, jint);

View File

@ -1,123 +1,19 @@
//#define DEBUG 1
//#define DEBUG0_1 1
//#define DEBUG3 1
#define MM 0
#define GapM 1
#define MX 2
#define XX 3
#define MY 4
#define YY 5
#include <cstdio>
#include <cstring>
#include <cstdlib>
#include <cmath>
#include <omp.h>
#include <emmintrin.h>
#include "headers.h"
#include "template.h"
//#include "define-float.h"
//#include "shift_template.c"
//#include "pairhmm-template-kernel.cc"
#include "define-double.h"
#include "shift_template.c"
#include "pairhmm-template-kernel.cc"
#include "utils.h"
#include "LoadTimeInitializer.h"
using namespace std;
class LoadTimeInitializer
{
public:
LoadTimeInitializer() //will be called when library is loaded
{
ConvertChar::init();
}
};
LoadTimeInitializer g_load_time_initializer;
template<class NUMBER>
NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL)
{
int r, c;
int ROWS = tc->rslen + 1;
int COLS = tc->haplen + 1;
Context<NUMBER> ctx;
NUMBER M[MROWS][MCOLS];
NUMBER X[MROWS][MCOLS];
NUMBER Y[MROWS][MCOLS];
NUMBER p[MROWS][6];
p[0][MM] = ctx._(0.0);
p[0][GapM] = ctx._(0.0);
p[0][MX] = ctx._(0.0);
p[0][XX] = ctx._(0.0);
p[0][MY] = ctx._(0.0);
p[0][YY] = ctx._(0.0);
for (r = 1; r < ROWS; r++)
{
int _i = tc->i[r-1] & 127;
int _d = tc->d[r-1] & 127;
int _c = tc->c[r-1] & 127;
p[r][MM] = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127];
p[r][GapM] = ctx._(1.0) - ctx.ph2pr[_c];
p[r][MX] = ctx.ph2pr[_i];
p[r][XX] = ctx.ph2pr[_c];
p[r][MY] = ctx.ph2pr[_d];
p[r][YY] = ctx.ph2pr[_c];
//p[r][MY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
//p[r][YY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
}
for (c = 0; c < COLS; c++)
{
M[0][c] = ctx._(0.0);
X[0][c] = ctx._(0.0);
Y[0][c] = ctx.INITIAL_CONSTANT / (tc->haplen);
}
for (r = 1; r < ROWS; r++)
{
M[r][0] = ctx._(0.0);
X[r][0] = X[r-1][0] * p[r][XX];
Y[r][0] = ctx._(0.0);
}
NUMBER result = ctx._(0.0);
for (r = 1; r < ROWS; r++)
for (c = 1; c < COLS; c++)
{
char _rs = tc->rs[r-1];
char _hap = tc->hap[c-1];
int _q = tc->q[r-1] & 127;
NUMBER distm = ctx.ph2pr[_q];
if (_rs == _hap || _rs == 'N' || _hap == 'N')
distm = ctx._(1.0) - distm;
else
distm = distm/3;
M[r][c] = distm * (M[r-1][c-1] * p[r][MM] + X[r-1][c-1] * p[r][GapM] + Y[r-1][c-1] * p[r][GapM]);
X[r][c] = M[r-1][c] * p[r][MX] + X[r-1][c] * p[r][XX];
Y[r][c] = M[r][c-1] * p[r][MY] + Y[r][c-1] * p[r][YY];
}
for (c = 0; c < COLS; c++)
{
result += M[ROWS-1][c] + X[ROWS-1][c];
}
if (before_last_log != NULL)
*before_last_log = result;
return ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT;
}
#define BATCH_SIZE 10000
#define RUN_HYBRID
int main(int argc, char** argv)
{
if(argc < 2)
@ -128,111 +24,82 @@ int main(int argc, char** argv)
bool use_old_read_testcase = false;
if(argc >= 3 && string(argv[2]) == "1")
use_old_read_testcase = true;
unsigned chunk_size = 100;
if(argc >= 4)
chunk_size = strtol(argv[3],0,10);
testcase tc;
std::ifstream ifptr;
FILE* fptr = 0;
if(use_old_read_testcase)
{
FILE* fptr = fopen(argv[1],"r");
while(!feof(fptr))
{
if(read_testcase(&tc, fptr) >= 0)
{
double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&tc);
double result = log10(result_avxd) - log10(ldexp(1.0, 1020));
cout << std::scientific << compute_full_prob<double>(&tc) << " "<<result<<"\n";
delete tc.rs;
delete tc.hap;
}
}
fclose(fptr);
fptr = fopen(argv[1],"r");
assert(fptr);
}
else
{
std::ifstream ifptr;
std::vector<std::string> tokens;
ifptr.open(argv[1]);
assert(ifptr.is_open());
while(1)
{
tokens.clear();
if(read_mod_testcase(ifptr, &tc, false) < 0)
break;
//double result = 0;
double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&tc);
double result = log10(result_avxd) - log10(ldexp(1.0, 1020));
cout << std::scientific << compute_full_prob<double>(&tc) << " "<<result<<"\n";
delete tc.rs;
delete tc.hap;
}
ifptr.close();
}
return 0;
#if 0
float result[BATCH_SIZE], result_avxf;
double result_avxd;
struct timeval start, end;
long long aggregateTimeRead = 0L;
long long aggregateTimeCompute = 0L;
long long aggregateTimeWrite = 0L;
bool noMoreData = false;
int count =0;
while (!noMoreData)
vector<testcase> tc_vector;
tc_vector.clear();
testcase tc;
uint64_t total_time = 0;
while(1)
{
int read_count = BATCH_SIZE;
gettimeofday(&start, NULL);
for (int b=0;b<BATCH_SIZE;b++)
if (read_testcase(&tc[b], NULL)==-1)
{
read_count = b;
noMoreData = true;
break;
}
gettimeofday(&end, NULL);
aggregateTimeRead += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
gettimeofday(&start, NULL);
for (int b=0;b<read_count;b++)
int break_value = use_old_read_testcase ? read_testcase(&tc, fptr) : read_mod_testcase(ifptr,&tc,true);
if(break_value >= 0)
tc_vector.push_back(tc);
if(tc_vector.size() == BATCH_SIZE || (break_value < 0 && tc_vector.size() > 0))
{
result_avxf = compute_full_prob<float>(&tc[b]);
vector<double> results_vec;
results_vec.clear();
results_vec.resize(tc_vector.size());
get_time();
#pragma omp parallel for schedule(dynamic,chunk_size) num_threads(12)
for(unsigned i=0;i<tc_vector.size();++i)
{
testcase& tc = tc_vector[i];
float result_avxf = g_compute_full_prob_float(&tc, 0);
double result = 0;
if (result_avxf < MIN_ACCEPTED) {
double result_avxd = g_compute_full_prob_double(&tc, 0);
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
}
else
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
#ifdef RUN_HYBRID
#define MIN_ACCEPTED 1e-28f
if (result_avxf < MIN_ACCEPTED) {
count++;
result_avxd = compute_full_prob<double>(&tc[b]);
result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f));
results_vec[i] = result;
}
else
result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
#endif
#ifndef RUN_HYBRID
result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
#endif
total_time += get_time();
#pragma omp parallel for schedule(dynamic,chunk_size)
for(unsigned i=0;i<tc_vector.size();++i)
{
testcase& tc = tc_vector[i];
double baseline_result = compute_full_prob<double>(&tc);
baseline_result = log10(baseline_result) - log10(ldexp(1.0, 1020.0));
double abs_error = fabs(baseline_result-results_vec[i]);
double rel_error = (baseline_result != 0) ? fabs(abs_error/baseline_result) : 0;
if(abs_error > 1e-5 && rel_error > 1e-5)
cout << std::scientific << baseline_result << " "<<results_vec[i]<<"\n";
delete tc_vector[i].rs;
delete tc_vector[i].hap;
delete tc_vector[i].q;
delete tc_vector[i].i;
delete tc_vector[i].d;
delete tc_vector[i].c;
}
results_vec.clear();
tc_vector.clear();
}
gettimeofday(&end, NULL);
aggregateTimeCompute += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
gettimeofday(&start, NULL);
for (int b=0;b<read_count;b++)
printf("%E\n", result[b]);
gettimeofday(&end, NULL);
aggregateTimeWrite += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
if(break_value < 0)
break;
}
printf("AVX Read Time: %ld\n", aggregateTimeRead);
printf("AVX Compute Time: %ld\n", aggregateTimeCompute);
printf("AVX Write Time: %ld\n", aggregateTimeWrite);
printf("AVX Total Time: %ld\n", aggregateTimeRead + aggregateTimeCompute + aggregateTimeWrite);
printf("# Double called: %d\n", count);
return 0;
#endif
cout << "Total time "<< ((double)total_time)/1e9 << "\n";
if(use_old_read_testcase)
fclose(fptr);
else
ifptr.close();
return 0;
}

View File

@ -4,6 +4,10 @@
#include <assert.h>
#include <stdlib.h>
//#define DEBUG
#define MUSTAFA
#define KARTHIK
/*
template <class T>
string getBinaryStr (T val, int numBitsToWrite) {
@ -17,8 +21,10 @@ string getBinaryStr (T val, int numBitsToWrite) {
return oss.str() ;
}
*/
#ifdef MUSTAFA
void GEN_INTRINSIC(precompute_masks_, PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) {
void GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) {
const int maskBitCnt = MAIN_TYPE_SIZE ;
@ -34,14 +40,14 @@ void GEN_INTRINSIC(precompute_masks_, PRECISION)(const testcase& tc, int COLS, i
int mOffset = (col-1) % maskBitCnt ;
MASK_TYPE bitMask = ((MASK_TYPE)0x1) << (maskBitCnt-1-mOffset) ;
char hapChar = tc.hap[col-1] ;
char hapChar = ConvertChar::get(tc.hap[col-1]);
if (hapChar == AMBIG_CHAR) {
for (int ci=0; ci < NUM_DISTINCT_CHARS; ++ci)
maskArr[mIndex][ci] |= bitMask ;
}
maskArr[mIndex][ConvertChar::get(hapChar)] |= bitMask ;
maskArr[mIndex][hapChar] |= bitMask ;
// bit corresponding to col 1 will be the MSB of the mask 0
// bit corresponding to col 2 will be the MSB-1 of the mask 0
// ...
@ -52,7 +58,7 @@ void GEN_INTRINSIC(precompute_masks_, PRECISION)(const testcase& tc, int COLS, i
}
void GEN_INTRINSIC(init_masks_for_row_, PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess) {
void GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess) {
for (int ri=0; ri < numRowsToProcess; ++ri) {
rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ;
@ -63,6 +69,7 @@ void GEN_INTRINSIC(init_masks_for_row_, PRECISION)(const testcase& tc, char* rsA
}
}
#define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \
MASK_TYPE __bitMask = (((MASK_TYPE)0x1) << __shiftBy) - 1 ; \
MASK_TYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \
@ -71,33 +78,37 @@ void GEN_INTRINSIC(init_masks_for_row_, PRECISION)(const testcase& tc, char* rsA
}
void GEN_INTRINSIC(update_masks_for_cols_, PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt) {
void GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_, SIMD_TYPE), PRECISION)(int maskIndex, BITMASK_VEC& bitMaskVec, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, int maskBitCnt) {
for (int ei=0; ei < AVX_LENGTH/2; ++ei) {
SET_MASK_WORD(currMaskVecLow.masks[ei], maskArr[maskIndex][rsArr[ei]],
SET_MASK_WORD(bitMaskVec.getLowEntry(ei), maskArr[maskIndex][rsArr[ei]],
lastMaskShiftOut[ei], ei, maskBitCnt) ;
int ei2 = ei + AVX_LENGTH/2 ; // the second entry index
SET_MASK_WORD(currMaskVecHigh.masks[ei], maskArr[maskIndex][rsArr[ei2]],
SET_MASK_WORD(bitMaskVec.getHighEntry(ei), maskArr[maskIndex][rsArr[ei2]],
lastMaskShiftOut[ei2], ei2, maskBitCnt) ;
}
}
//void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen, const _256_TYPE& distmSel, int firstRowIndex, int lastRowIndex) {
inline void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen) {
//void GEN_INTRINSIC(computeDistVec, PRECISION) (BITMASK_VEC& bitMaskVec, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen, const _256_TYPE& distmSel, int firstRowIndex, int lastRowIndex) {
inline void GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec, SIMD_TYPE), PRECISION) (BITMASK_VEC& bitMaskVec, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen) {
//#define computeDistVec() {
_256_TYPE maskV ;
VEC_SSE_TO_AVX(currMaskVecLow.vecf, currMaskVecHigh.vecf, maskV) ;
#ifdef DEBUGG
long long *temp1 = (long long *)(&maskV);
double *temp2 = (double *)(&distm);
double *temp3 = (double *)(&_1_distm);
printf("***\n%lx\n%lx\n%f\n%f\n%f\n%f\n***\n", temp1[0], temp1[1], temp2[0], temp2[1], temp3[0], temp3[1]);
#endif
distmChosen = VEC_BLENDV(distm, _1_distm, maskV) ;
distmChosen = VEC_BLENDV(distm, _1_distm, bitMaskVec.getCombinedMask()) ;
/*COMPARE_VECS(distmChosen, distmSel, firstRowIndex, lastRowIndex) ;*/
VEC_SHIFT_LEFT_1BIT(currMaskVecLow.vec) ;
VEC_SHIFT_LEFT_1BIT(currMaskVecHigh.vec) ;
bitMaskVec.shift_left_1bit() ;
}
@ -120,8 +131,9 @@ struct HmmData {
} ;
*/
#endif // MUSTAFA
template<class NUMBER> void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context<NUMBER> ctx, testcase *tc, _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D)
template<class NUMBER> void GEN_INTRINSIC(GEN_INTRINSIC(initializeVectors, SIMD_TYPE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context<NUMBER> ctx, testcase *tc, _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D)
{
NUMBER zero = ctx._(0.0);
NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
@ -157,18 +169,14 @@ template<class NUMBER> void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS
*(ptr_p_GAPM+r-1) = ctx._(1.0) - ctx.ph2pr[_c];
*(ptr_p_MX+r-1) = ctx.ph2pr[_i];
*(ptr_p_XX+r-1) = ctx.ph2pr[_c];
*(ptr_p_MY+r-1) = ctx.ph2pr[_d];
*(ptr_p_YY+r-1) = ctx.ph2pr[_c];
#ifdef DEBUG3
debug_dump("transitions_jni.txt",to_string(*(ptr_p_MM+r-1) ),true);
debug_dump("transitions_jni.txt",to_string(*(ptr_p_GAPM+r-1)),true);
debug_dump("transitions_jni.txt",to_string(*(ptr_p_MX+r-1) ),true);
debug_dump("transitions_jni.txt",to_string(*(ptr_p_XX+r-1) ),true);
debug_dump("transitions_jni.txt",to_string(*(ptr_p_MY+r-1) ),true);
debug_dump("transitions_jni.txt",to_string(*(ptr_p_YY+r-1) ),true);
#endif
//*(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
//*(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
#ifdef KARTHIK
*(ptr_p_MY+r-1) = ctx.ph2pr[_d];
*(ptr_p_YY+r-1) = ctx.ph2pr[_c];
#else
*(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
*(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
#endif
}
NUMBER *ptr_distm1D = (NUMBER *)distm1D;
@ -176,14 +184,11 @@ template<class NUMBER> void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS
{
int _q = tc->q[r-1] & 127;
ptr_distm1D[r-1] = ctx.ph2pr[_q];
#ifdef DEBUG3
debug_dump("priors_jni.txt",to_string(ptr_distm1D[r-1]),true);
#endif
}
}
template<class NUMBER> inline void GEN_INTRINSIC(stripINITIALIZATION, PRECISION)(
template<class NUMBER> inline void GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION, SIMD_TYPE), PRECISION)(
int stripIdx, Context<NUMBER> ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY,
_256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm, _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM ,
_256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1,
@ -200,19 +205,18 @@ template<class NUMBER> inline void GEN_INTRINSIC(stripINITIALIZATION, PRECISION)
NUMBER zero = ctx._(0.0);
NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
UNION_TYPE packed1; packed1.d = VEC_SET1_VAL(1.0);
#define TRISTATE_CORRECTION_FACTOR 3.0
UNION_TYPE packed3; packed3.d = VEC_SET1_VAL(TRISTATE_CORRECTION_FACTOR);
UNION_TYPE packed3; packed3.d = VEC_SET1_VAL(3.0);
/* compare rs and N */
//rs = VEC_LDPOPCVT_CHAR((tc->irs+i*AVX_LENGTH));
//rsN.d = VEC_CMP_EQ(N_packed256, rs);
#ifndef MUSTAFA
rs = VEC_LDPOPCVT_CHAR((tc->irs+i*AVX_LENGTH));
rsN.d = VEC_CMP_EQ(N_packed256, rs);
#endif
distm = distm1D[i];
_1_distm = VEC_SUB(packed1.d, distm);
#ifndef DO_NOT_USE_TRISTATE_CORRECTION
distm = VEC_DIV(distm, packed3.d);
#endif
_1_distm = VEC_SUB(packed1.d, distm);
#ifdef KARTHIK
distm = VEC_DIV(distm, packed3.d);
#endif
/* initialize M_t_2, M_t_1, X_t_2, X_t_1, Y_t_2, Y_t_1 */
M_t_2.d = VEC_SET1_VAL(zero);
X_t_2.d = VEC_SET1_VAL(zero);
@ -234,7 +238,7 @@ template<class NUMBER> inline void GEN_INTRINSIC(stripINITIALIZATION, PRECISION)
inline _256_TYPE GEN_INTRINSIC(computeDISTM, PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256,
inline _256_TYPE GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM, SIMD_TYPE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256,
_256_TYPE distm, _256_TYPE _1_distm)
{
UNION_TYPE hapN, rshap;
@ -263,12 +267,21 @@ inline _256_TYPE GEN_INTRINSIC(computeDISTM, PRECISION)(int d, int COLS, testcas
}
inline void GEN_INTRINSIC(computeMXY, PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y,
inline void GEN_INTRINSIC(GEN_INTRINSIC(computeMXY, SIMD_TYPE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y,
UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1,
_256_TYPE pMM, _256_TYPE pGAPM, _256_TYPE pMX, _256_TYPE pXX, _256_TYPE pMY, _256_TYPE pYY, _256_TYPE distmSel)
{
/* Compute M_t <= distm * (p_MM*M_t_2 + p_GAPM*X_t_2 + p_GAPM*Y_t_2) */
M_t.d = VEC_MUL(VEC_ADD(VEC_ADD(VEC_MUL(M_t_2.d, pMM), VEC_MUL(X_t_2.d, pGAPM)), VEC_MUL(Y_t_2.d, pGAPM)), distmSel);
#ifdef DEBUG
double *temp1 = (double *)(&pGAPM);
double *temp2 = (double *)(&pMM);
double *temp3 = (double *)(&distmSel);
printf("%f\n%f\n%f\n%f\n%f\n%f\n", temp1[0], temp1[1], temp2[0], temp2[1], temp3[0], temp3[1]);
//printf("%f\n%f\n%f\n%f\n", X_t_2.f[0], X_t_2.f[1], Y_t_2.f[0], Y_t_2.f[1]);
printf("%f\n%f\n----------------------------------------------------------------------------\n", M_t.f[0], M_t.f[1]);
#endif
M_t_y = M_t;
/* Compute X_t */
@ -278,7 +291,7 @@ inline void GEN_INTRINSIC(computeMXY, PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_
Y_t.d = VEC_ADD(VEC_MUL(M_t_1_y.d, pMY) , VEC_MUL(Y_t_1.d, pYY));
}
template<class NUMBER> NUMBER GEN_INTRINSIC(compute_full_prob_avx, PRECISION) (testcase *tc, NUMBER *before_last_log = NULL)
template<class NUMBER> NUMBER GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_,SIMD_TYPE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL)
{
_256_TYPE p_MM [MAVX_COUNT], p_GAPM [MAVX_COUNT], p_MX [MAVX_COUNT];
_256_TYPE p_XX [MAVX_COUNT], p_MY [MAVX_COUNT], p_YY [MAVX_COUNT];
@ -306,105 +319,114 @@ template<class NUMBER> NUMBER GEN_INTRINSIC(compute_full_prob_avx, PRECISION) (t
int remainingRows = (ROWS-1) % AVX_LENGTH;
int strip_cnt = ((ROWS-1) / AVX_LENGTH) + (remainingRows!=0);
#ifdef MUSTAFA
const int maskBitCnt = MAIN_TYPE_SIZE ;
const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function
MASK_TYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ;
GEN_INTRINSIC(precompute_masks_, PRECISION)(*tc, COLS, numMaskVecs, maskArr) ;
GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(*tc, COLS, numMaskVecs, maskArr) ;
char rsArr[AVX_LENGTH] ;
MASK_TYPE lastMaskShiftOut[AVX_LENGTH] ;
GEN_INTRINSIC(initializeVectors, PRECISION)<NUMBER>(ROWS, COLS, shiftOutM, shiftOutX, shiftOutY,
#endif
GEN_INTRINSIC(GEN_INTRINSIC(initializeVectors,SIMD_TYPE), PRECISION)<NUMBER>(ROWS, COLS, shiftOutM, shiftOutX, shiftOutY,
ctx, tc, p_MM, p_GAPM, p_MX, p_XX, p_MY, p_YY, distm1D);
//for (int __ii=0; __ii < 10; ++__ii)
for (int i=0;i<strip_cnt-1;i++)
{
//STRIP_INITIALIZATION
GEN_INTRINSIC(stripINITIALIZATION, PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM ,
GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION,SIMD_TYPE), PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM ,
p_MX, p_XX , p_MY, p_YY, M_t_2, X_t_2, M_t_1, X_t_1, Y_t_2, Y_t_1, M_t_1_y, shiftOutX, shiftOutM);
GEN_INTRINSIC(init_masks_for_row_, PRECISION)(*tc, rsArr, lastMaskShiftOut,
i*AVX_LENGTH+1, AVX_LENGTH) ;
#ifdef MUSTAFA
GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(*tc, rsArr, lastMaskShiftOut, i*AVX_LENGTH+1, AVX_LENGTH) ;
#endif
// Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
MASK_VEC currMaskVecLow ; // corresponding to lower half
MASK_VEC currMaskVecHigh ; // corresponding to upper half
for (int d=1;d<COLS+AVX_LENGTH;d++)
BITMASK_VEC bitMaskVec ;
for (int begin_d=1;begin_d<COLS+AVX_LENGTH;begin_d+=MAIN_TYPE_SIZE)
{
if (d % MAIN_TYPE_SIZE == 1)
GEN_INTRINSIC(update_masks_for_cols_, PRECISION)((d-1)/MAIN_TYPE_SIZE, currMaskVecLow, currMaskVecHigh, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;
int numMaskBitsToProcess = std::min(MAIN_TYPE_SIZE, COLS+AVX_LENGTH-begin_d) ;
#ifdef MUSTAFA
GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_,SIMD_TYPE), PRECISION)((begin_d-1)/MAIN_TYPE_SIZE, bitMaskVec, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;
#endif
int ShiftIdx = d+AVX_LENGTH;
//distmSel = GEN_INTRINSIC(computeDISTM, PRECISION)(d, COLS, tc, hap, rs.d, rsN, N_packed256, distm, _1_distm);
// if (d % MAIN_TYPE_SIZE == 1)
for (int mbi=0; mbi < numMaskBitsToProcess; ++mbi) {
#ifdef MUSTAFA
GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec,SIMD_TYPE), PRECISION) (bitMaskVec, distm, _1_distm, distmChosen) ;
#else
distmChosen = GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM,SIMD_TYPE), PRECISION)(begin_d+mbi, COLS, tc, hap, rs.d, rsN, N_packed256, distm, _1_distm);
#endif
int ShiftIdx = begin_d + mbi + AVX_LENGTH;
//int firstRowIndex = (d < COLS) ? 0 : (d-COLS+1) ;
//int lastRowIndex = std::min(d-1, AVX_LENGTH-1) ;
//GEN_INTRINSIC(computeDistVec, PRECISION) (currMaskVecLow, currMaskVecHigh, distm, _1_distm, distmChosen, distmSel, firstRowIndex, lastRowIndex) ;
GEN_INTRINSIC(computeDistVec, PRECISION) (currMaskVecLow, currMaskVecHigh, distm, _1_distm, distmChosen) ;
GEN_INTRINSIC(computeMXY, PRECISION)(M_t, X_t, Y_t, M_t_y, M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, M_t_1_y, Y_t_1,
GEN_INTRINSIC(GEN_INTRINSIC(computeMXY,SIMD_TYPE), PRECISION)(M_t, X_t, Y_t, M_t_y, M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, M_t_1_y, Y_t_1,
pMM, pGAPM, pMX, pXX, pMY, pYY, distmChosen);
GEN_INTRINSIC(_vector_shift, PRECISION)(M_t, shiftOutM[ShiftIdx], shiftOutM[d]);
GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift, SIMD_TYPE), PRECISION)(M_t, shiftOutM[ShiftIdx], shiftOutM[begin_d+mbi]);
GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift, SIMD_TYPE), PRECISION)(X_t, shiftOutX[ShiftIdx], shiftOutX[begin_d+mbi]);
GEN_INTRINSIC(_vector_shift, PRECISION)(X_t, shiftOutX[ShiftIdx], shiftOutX[d]);
GEN_INTRINSIC(_vector_shift, PRECISION)(Y_t_1, shiftOutY[ShiftIdx], shiftOutY[d]);
M_t_2 = M_t_1; M_t_1 = M_t; X_t_2 = X_t_1; X_t_1 = X_t;
Y_t_2 = Y_t_1; Y_t_1 = Y_t; M_t_1_y = M_t_y;
GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift, SIMD_TYPE), PRECISION)(Y_t_1, shiftOutY[ShiftIdx], shiftOutY[begin_d+mbi]);
M_t_2 = M_t_1; M_t_1 = M_t; X_t_2 = X_t_1; X_t_1 = X_t;
Y_t_2 = Y_t_1; Y_t_1 = Y_t; M_t_1_y = M_t_y;
}
}
}
int i = strip_cnt-1;
{
//STRIP_INITIALIZATION
GEN_INTRINSIC(stripINITIALIZATION, PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM ,
GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION,SIMD_TYPE), PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM ,
p_MX, p_XX , p_MY, p_YY, M_t_2, X_t_2, M_t_1, X_t_1, Y_t_2, Y_t_1, M_t_1_y, shiftOutX, shiftOutM);
if (remainingRows==0) remainingRows=AVX_LENGTH;
GEN_INTRINSIC(init_masks_for_row_, PRECISION)(*tc, rsArr, lastMaskShiftOut,
i*AVX_LENGTH+1, remainingRows) ;
#ifdef MUSTAFA
GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(*tc, rsArr, lastMaskShiftOut, i*AVX_LENGTH+1, remainingRows) ;
#endif
_256_TYPE sumM, sumX;
sumM = VEC_SET1_VAL(zero);
sumX = VEC_SET1_VAL(zero);
// Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
MASK_VEC currMaskVecLow ; // corresponding to lower half
MASK_VEC currMaskVecHigh ; // corresponding to upper half
BITMASK_VEC bitMaskVec ;
for (int d=1;d<COLS+remainingRows-1;d++)
for (int begin_d=1;begin_d<COLS+remainingRows-1;begin_d+=MAIN_TYPE_SIZE)
{
if (d % MAIN_TYPE_SIZE == 1)
GEN_INTRINSIC(update_masks_for_cols_, PRECISION)((d-1)/MAIN_TYPE_SIZE, currMaskVecLow, currMaskVecHigh, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;
int numMaskBitsToProcess = std::min(MAIN_TYPE_SIZE, COLS+remainingRows-1-begin_d) ;
#ifdef MUSTAFA
GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_, SIMD_TYPE),PRECISION)((begin_d-1)/MAIN_TYPE_SIZE, bitMaskVec, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;
#endif
int ShiftIdx = d+AVX_LENGTH;
//distmSel = GEN_INTRINSIC(computeDISTM, PRECISION)(d, COLS, tc, hap, rs.d, rsN, N_packed256, distm, _1_distm);
GEN_INTRINSIC(computeDistVec, PRECISION) (currMaskVecLow, currMaskVecHigh, distm, _1_distm, distmChosen) ;
for (int mbi=0; mbi < numMaskBitsToProcess; ++mbi) {
GEN_INTRINSIC(computeMXY, PRECISION)(M_t, X_t, Y_t, M_t_y, M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, M_t_1_y, Y_t_1,
pMM, pGAPM, pMX, pXX, pMY, pYY, distmChosen);
#ifdef MUSTAFA
GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec, SIMD_TYPE), PRECISION) (bitMaskVec, distm, _1_distm, distmChosen) ;
#else
distmChosen = GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM,SIMD_TYPE), PRECISION)(begin_d+mbi, COLS, tc, hap, rs.d, rsN, N_packed256, distm, _1_distm);
#endif
int ShiftIdx = begin_d + mbi +AVX_LENGTH;
sumM = VEC_ADD(sumM, M_t.d);
GEN_INTRINSIC(_vector_shift_last, PRECISION)(M_t, shiftOutM[ShiftIdx]);
GEN_INTRINSIC(GEN_INTRINSIC(computeMXY, SIMD_TYPE), PRECISION)(M_t, X_t, Y_t, M_t_y, M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, M_t_1_y, Y_t_1,
pMM, pGAPM, pMX, pXX, pMY, pYY, distmChosen);
sumX = VEC_ADD(sumX, X_t.d);
GEN_INTRINSIC(_vector_shift_last, PRECISION)(X_t, shiftOutX[ShiftIdx]);
sumM = VEC_ADD(sumM, M_t.d);
GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION)(M_t, shiftOutM[ShiftIdx]);
GEN_INTRINSIC(_vector_shift_last, PRECISION)(Y_t_1, shiftOutY[ShiftIdx]);
sumX = VEC_ADD(sumX, X_t.d);
GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION)(X_t, shiftOutX[ShiftIdx]);
M_t_2 = M_t_1; M_t_1 = M_t; X_t_2 = X_t_1; X_t_1 = X_t;
Y_t_2 = Y_t_1; Y_t_1 = Y_t; M_t_1_y = M_t_y;
GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION)(Y_t_1, shiftOutY[ShiftIdx]);
}
M_t_2 = M_t_1; M_t_1 = M_t; X_t_2 = X_t_1; X_t_1 = X_t;
Y_t_2 = Y_t_1; Y_t_1 = Y_t; M_t_1_y = M_t_y;
}
}
UNION_TYPE sumMX;
sumMX.d = VEC_ADD(sumM, sumX);
result_avx2 = sumMX.f[remainingRows-1];

View File

@ -1,35 +1,57 @@
#include <stdio.h>
#include <immintrin.h>
#include <emmintrin.h>
#include <omp.h>
#include "headers.h"
#include "template.h"
#include "vector_defs.h"
#include "define-float.h"
#include "shift_template.c"
#include "pairhmm-template-kernel.cc"
#define SIMD_TYPE avx
#define SIMD_TYPE_AVX
#include "define-double.h"
#include "shift_template.c"
#include "pairhmm-template-kernel.cc"
#define BATCH_SIZE 10000
//#define RUN_HYBRID
#define RUN_HYBRID
//uint8_t ConvertChar::conversionTable[255] ;
double getCurrClk();
int thread_level_parallelism_enabled = false ;
double getCurrClk() {
struct timeval tv ;
gettimeofday(&tv, NULL);
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
void print128b_F(__m128 x)
{
float *p = (float *)(&x);
for (int i=3;i>=0;i--)
printf("%f ", *(p+i));
printf("\n");
}
void print128b_D(__m128d x)
{
double *p = (double *)(&x);
for (int i=1;i>=0;i--)
printf("%f ", *(p+i));
printf("\n");
}
int main()
{
testcase tc[BATCH_SIZE];
/*
IF_128f x;
x.f = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
IF_32 shiftIn, shiftOut;
shiftIn.f = 5.0f;
print128b_F(x.f);
GEN_INTRINSIC(_vector_shift, s)(x, shiftIn, shiftOut);
print128b_F(x.f);
IF_128d y;
y.f = _mm_set_pd(10.0, 11.0);
IF_64 shiftInd, shiftOutd;
shiftInd.f = 12.0;
print128b_D(y.f);
GEN_INTRINSIC(_vector_shift, d)(y, shiftInd, shiftOutd);
print128b_D(y.f);
exit(0);
*/
testcase* tc = new testcase[BATCH_SIZE];
float result[BATCH_SIZE], result_avxf;
double result_avxd;
//struct timeval start, end;
@ -40,12 +62,12 @@ int main()
// Need to call it once to initialize the static array
ConvertChar::init() ;
char* ompEnvVar = getenv("OMP_NUM_THREADS") ;
if (ompEnvVar != NULL && ompEnvVar != "" && ompEnvVar != "1" ) {
thread_level_parallelism_enabled = true ;
}
// char* ompEnvVar = getenv("OMP_NUM_THREADS") ;
// if (ompEnvVar != NULL && ompEnvVar != "" && ompEnvVar != "1" ) {
// thread_level_parallelism_enabled = true ;
// }
bool noMoreData = false;
int count =0;
@ -55,7 +77,7 @@ int main()
lastClk = getCurrClk() ;
for (int b=0;b<BATCH_SIZE;b++)
if (read_testcase(&tc[b], NULL)==-1)
if (read_testcase(&tc[b])==-1)
{
read_count = b;
noMoreData = true;
@ -68,16 +90,17 @@ int main()
//gettimeofday(&start, NULL);
lastClk = getCurrClk() ;
#pragma omp parallel for schedule(dynamic) if(thread_level_parallelism_enabled)
//#pragma omp parallel for schedule(dynamic) if(thread_level_parallelism_enabled)
for (int b=0;b<read_count;b++)
{
result_avxf = GEN_INTRINSIC(compute_full_prob_avx, s)<float>(&tc[b]);
result_avxf = GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_, SIMD_TYPE), s)<float>(&tc[b]);
#ifdef RUN_HYBRID
#define MIN_ACCEPTED 1e-28f
if (result_avxf < MIN_ACCEPTED) {
//printf("**************** RUNNING DOUBLE ******************\n");
count++;
result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&tc[b]);
result_avxd = GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_, SIMD_TYPE), d)<double>(&tc[b]);
result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f));
}
else
@ -95,14 +118,15 @@ int main()
//gettimeofday(&start, NULL);
lastClk = getCurrClk() ;
for (int b=0;b<read_count;b++)
printf("%E\n", result[b]);
//for (int b=0;b<read_count;b++)
//printf("%E\n", result[b]);
//gettimeofday(&end, NULL);
aggregateTimeWrite += (getCurrClk() - lastClk) ;
//((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
}
delete tc;
printf("AVX Read Time: %.2f\n", aggregateTimeRead);
printf("AVX Compute Time: %.2f\n", aggregateTimeCompute);
printf("AVX Write Time: %.2f\n", aggregateTimeWrite);

View File

@ -1,16 +1,25 @@
#!/bin/bash
rm -f *.txt
export GSA_ROOT_DIR=/home/karthikg/broad/gsa-unstable
rm -f *.txt *.log
GSA_ROOT_DIR=/home/karthikg/broad/gsa-unstable
#-Djava.library.path is needed if you are using JNI_LOGLESS_CACHING, else not needed
java -Djava.library.path=${GSA_ROOT_DIR}/PairHMM_JNI -jar ${GSA_ROOT_DIR}/dist/GenomeAnalysisTK.jar -T HaplotypeCaller \
-R /data/broad/samples/joint_variant_calling/broad_reference/Homo_sapiens_assembly19.fasta \
-I /data/simulated/sim1M_pairs_final.bam \
java -Djava.library.path=${GSA_ROOT_DIR}/PairHMM_JNI -jar $GSA_ROOT_DIR/dist/GenomeAnalysisTK.jar -T HaplotypeCaller \
-R /data/broad/samples/joint_variant_calling/broad_reference/human_g1k_v37_decoy.fasta \
-I /data/broad/samples/joint_variant_calling/NA12878_low_coverage_alignment/NA12878.chrom11.ILLUMINA.bwa.CEU.low_coverage.20121211.bam \
--dbsnp /data/broad/samples/joint_variant_calling/dbSNP/00-All.vcf \
-stand_call_conf 50.0 \
-stand_emit_conf 10.0 \
--pair_hmm_implementation JNI_LOGLESS_CACHING \
-o output.raw.snps.indels.vcf
#-XL unmapped \
#--pair_hmm_implementation JNI_LOGLESS_CACHING \
#-I /data/simulated/sim1M_pairs_final.bam \
#-I /data/broad/samples/joint_variant_calling/NA12878_low_coverage_alignment/NA12878.chrom11.ILLUMINA.bwa.CEU.low_coverage.20121211.bam \
#-I /data/broad/samples/joint_variant_calling/NA12878_high_coverage_alignment/NA12878.mapped.ILLUMINA.bwa.CEU.high_coverage_pcr_free.20130906.bam \
#-R /data/broad/samples/joint_variant_calling/broad_reference/Homo_sapiens_assembly18.fasta \
#-R /data/broad/samples/joint_variant_calling/broad_reference/Homo_sapiens_assembly19.fasta \
#-R /data/broad/samples/joint_variant_calling/broad_reference/ucsc.hg19.fasta \
#-R /opt/Genomics/ohsu/dnapipeline/humanrefgenome/human_g1k_v37.fasta \
#-R /data/broad/samples/joint_variant_calling/broad_reference/human_g1k_v37_decoy.fasta \
#--dbsnp /data/broad/samples/joint_variant_calling/dbSNP/00-All.vcf \
#--dbsnp /data/broad/samples/joint_variant_calling/dbSNP/dbsnp_138.hg19.vcf \

View File

@ -1,9 +1,8 @@
#ifdef PRECISION
#include "template.h"
#ifdef SIMD_TYPE_AVX
inline void GEN_INTRINSIC(_vector_shift, PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut)
inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift,SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut)
{
IF_128 xlow , xhigh;
@ -33,7 +32,8 @@ inline void GEN_INTRINSIC(_vector_shift, PRECISION) (UNION_TYPE &x, MAIN_TYPE sh
x.d = VEC_INSERT_VAL(x.d, xhigh.f, 1);
}
inline void GEN_INTRINSIC(_vector_shift_last, PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn)
inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn)
{
IF_128 xlow , xhigh;
@ -44,10 +44,6 @@ inline void GEN_INTRINSIC(_vector_shift_last, PRECISION) (UNION_TYPE &x, MAIN_TY
/* extract xlow[3] */
IF_128 shiftOutL128;
shiftOutL128.i = _mm_srli_si128(xlow.i, SHIFT_CONST1);
/* extract xhigh[3] */
// IF_MAIN_TYPE shiftOutH;
// shiftOutH.i = VEC_EXTRACT_UNIT(xhigh.i, SHIFT_CONST2);
// shiftOut = shiftOutH.f;
/* shift xlow */
xlow.i = _mm_slli_si128 (xlow.i, SHIFT_CONST3);
/* shift xhigh */
@ -63,6 +59,32 @@ inline void GEN_INTRINSIC(_vector_shift_last, PRECISION) (UNION_TYPE &x, MAIN_TY
x.d = VEC_INSERT_VAL(x.d, xhigh.f, 1);
}
#endif
#ifdef SIMD_TYPE_SSE
inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift, SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut)
{
IF_MAIN_TYPE tempIn, tempOut;
tempIn.f = shiftIn;
/* extratc H */
tempOut.i = VEC_EXTRACT_UNIT(x.i, SHIFT_CONST1);
shiftOut = tempOut.f;
/* shift */
x.i = _mm_slli_si128(x.i, SHIFT_CONST2);
/* insert L */
x.i = VEC_INSERT_UNIT(x.i , tempIn.i, SHIFT_CONST3);
}
inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn)
{
IF_MAIN_TYPE temp; temp.f = shiftIn;
/* shift */
x.i = _mm_slli_si128(x.i, SHIFT_CONST2);
/* insert L */
x.i = VEC_INSERT_UNIT(x.i , temp.i, SHIFT_CONST3);
}
#endif
#endif

View File

@ -0,0 +1,18 @@
#include "template.h"
#undef SIMD_TYPE
#undef SIMD_TYPE_AVX
#define SIMD_TYPE sse
#define SIMD_TYPE_SSE
#include "define-sse-float.h"
#include "shift_template.c"
#include "pairhmm-template-kernel.cc"
#include "define-sse-double.h"
#include "shift_template.c"
#include "pairhmm-template-kernel.cc"
template double compute_full_prob_ssed<double>(testcase* tc, double* nextlog);
template float compute_full_prob_sses<float>(testcase* tc, float* nextlog);

View File

@ -1,24 +1,14 @@
#ifndef TEMPLATES_H_
#define TEMPLATES_H_
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <stdint.h>
#include <assert.h>
#include <sys/time.h>
#include <immintrin.h>
#include <ctype.h>
#include <string>
#include <iostream>
#include <sstream>
#include <fstream>
#include <vector>
#include "headers.h"
#define MM 0
#define GapM 1
#define MX 2
#define XX 3
#define MY 4
#define YY 5
#define MROWS 500
#define MCOLS 1000
@ -35,12 +25,25 @@ typedef union __attribute__((aligned(32))) {
ALIGNED __m256i ALIGNED i;
} ALIGNED mix_F ALIGNED;
typedef union __attribute__((aligned(32))) {
ALIGNED __m128 ALIGNED d;
ALIGNED __m64 ALIGNED s[2];
ALIGNED float ALIGNED f[4];
ALIGNED __m128i ALIGNED i;
} ALIGNED mix_F128 ALIGNED;
typedef union ALIGNED {
__m128i vec ;
__m128 vecf ;
uint32_t masks[4] ;
} MaskVec_F ;
typedef union ALIGNED {
__m64 vec ;
__m64 vecf ;
uint32_t masks[2] ;
} MaskVec_F128 ;
typedef union ALIGNED
{
ALIGNED __m128i ALIGNED i;
@ -60,12 +63,25 @@ typedef union __attribute__((aligned(32))) {
ALIGNED __m256i ALIGNED i;
} ALIGNED mix_D ALIGNED;
typedef union __attribute__((aligned(32))) {
ALIGNED __m128d ALIGNED d;
ALIGNED __m64 ALIGNED s[2];
ALIGNED double ALIGNED f[2];
ALIGNED __m128i ALIGNED i;
} ALIGNED mix_D128 ALIGNED;
typedef union ALIGNED {
__m128i vec ;
__m128d vecf ;
uint64_t masks[2] ;
} MaskVec_D ;
typedef union ALIGNED {
__m64 vec ;
__m64 vecf ;
uint64_t masks[1] ;
} MaskVec_D128 ;
typedef union ALIGNED
{
ALIGNED __m128i ALIGNED i;
@ -134,30 +150,18 @@ typedef struct
{
int rslen, haplen;
/*int *q, *i, *d, *c;*/
int q[MROWS], i[MROWS], d[MROWS], c[MROWS];
/*int q[MROWS], i[MROWS], d[MROWS], c[MROWS];*/
char *q, *i, *d, *c;
char *hap, *rs;
int *ihap;
int *irs;
} testcase;
template<class T>
std::string to_string(T obj)
{
std::stringstream ss;
std::string ret_string;
ss.clear();
ss << std::scientific << obj;
ss >> ret_string;
ss.clear();
return ret_string;
}
void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true);
int normalize(char c);
int read_testcase(testcase *tc, FILE* ifp);
int read_mod_testcase(std::ifstream& fptr, testcase* tc, bool reformat=false);
int read_testcase(testcase *tc, FILE* ifp=0);
#define MIN_ACCEPTED 1e-28f
#define NUM_DISTINCT_CHARS 5
#define AMBIG_CHAR 4
@ -182,7 +186,7 @@ public:
return conversionTable[input] ;
}
} ;
};
#endif

View File

@ -1,7 +1,62 @@
#include "headers.h"
#include "template.h"
#include "utils.h"
#include "vector_defs.h"
uint8_t ConvertChar::conversionTable[255];
float (*g_compute_full_prob_float)(testcase *tc, float* before_last_log) = 0;
double (*g_compute_full_prob_double)(testcase *tc, double* before_last_log) = 0;
using namespace std;
bool is_avx_supported()
{
int ecx = 0, edx = 0, ebx = 0;
__asm__("cpuid"
: "=b" (ebx),
"=c" (ecx),
"=d" (edx)
: "a" (1)
);
return ((ecx >> 28)&1) == 1;
}
bool is_sse42_supported()
{
int ecx = 0, edx = 0, ebx = 0;
__asm__("cpuid"
: "=b" (ebx),
"=c" (ecx),
"=d" (edx)
: "a" (1)
);
return ((ecx >> 20)&1) == 1;
}
void initialize_function_pointers()
{
//if(false)
if(is_avx_supported())
{
cout << "Using AVX accelerated implementation of PairHMM\n";
g_compute_full_prob_float = compute_full_prob_avxs<float>;
g_compute_full_prob_double = compute_full_prob_avxd<double>;
}
else
if(is_sse42_supported())
{
cout << "Using SSE4.2 accelerated implementation of PairHMM\n";
g_compute_full_prob_float = compute_full_prob_sses<float>;
g_compute_full_prob_double = compute_full_prob_ssed<double>;
}
else
{
cout << "Using un-vectorized C++ implementation of PairHMM\n";
g_compute_full_prob_float = compute_full_prob<float>;
g_compute_full_prob_double = compute_full_prob<double>;
}
}
int normalize(char c)
{
return ((int) (c - 33));
@ -35,10 +90,10 @@ int read_testcase(testcase *tc, FILE* ifp)
tc->ihap = (int *) malloc(tc->haplen*sizeof(int));
tc->irs = (int *) malloc(tc->rslen*sizeof(int));
//tc->q = (int *) malloc(sizeof(int) * tc->rslen);
//tc->i = (int *) malloc(sizeof(int) * tc->rslen);
//tc->d = (int *) malloc(sizeof(int) * tc->rslen);
//tc->c = (int *) malloc(sizeof(int) * tc->rslen);
tc->q = (char *) malloc(sizeof(char) * tc->rslen);
tc->i = (char *) malloc(sizeof(char) * tc->rslen);
tc->d = (char *) malloc(sizeof(char) * tc->rslen);
tc->c = (char *) malloc(sizeof(char) * tc->rslen);
for (x = 0; x < tc->rslen; x++)
{
@ -144,18 +199,22 @@ int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat)
memcpy(tc->hap, tokens[0].c_str(), tokens[0].size());
tc->rs = new char[tokens[1].size()+2];
tc->rslen = tokens[1].size();
tc->q = new char[tc->rslen];
tc->i = new char[tc->rslen];
tc->d = new char[tc->rslen];
tc->c = new char[tc->rslen];
//cout << "Lengths "<<tc->haplen <<" "<<tc->rslen<<"\n";
memcpy(tc->rs, tokens[1].c_str(),tokens[1].size());
assert(tokens.size() == 2 + 4*(tc->rslen));
assert(tc->rslen < MROWS);
for(unsigned j=0;j<tc->rslen;++j)
tc->q[j] = convToInt(tokens[2+0*tc->rslen+j]);
tc->q[j] = (char)convToInt(tokens[2+0*tc->rslen+j]);
for(unsigned j=0;j<tc->rslen;++j)
tc->i[j] = convToInt(tokens[2+1*tc->rslen+j]);
tc->i[j] = (char)convToInt(tokens[2+1*tc->rslen+j]);
for(unsigned j=0;j<tc->rslen;++j)
tc->d[j] = convToInt(tokens[2+2*tc->rslen+j]);
tc->d[j] = (char)convToInt(tokens[2+2*tc->rslen+j]);
for(unsigned j=0;j<tc->rslen;++j)
tc->c[j] = convToInt(tokens[2+3*tc->rslen+j]);
tc->c[j] = (char)convToInt(tokens[2+3*tc->rslen+j]);
if(reformat)
{
@ -184,3 +243,20 @@ int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat)
return tokens.size();
}
double getCurrClk() {
struct timeval tv ;
gettimeofday(&tv, NULL);
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
}
uint64_t get_time(struct timespec* store_struct)
{
static struct timespec start_time;
struct timespec curr_time;
struct timespec* ptr = (store_struct == 0) ? &curr_time : store_struct;
clock_gettime(CLOCK_REALTIME, ptr);
uint64_t diff_time = (ptr->tv_sec-start_time.tv_sec)*1000000000+(ptr->tv_nsec-start_time.tv_nsec);
start_time = *ptr;
return diff_time;
}

View File

@ -0,0 +1,31 @@
#ifndef PAIRHMM_UTIL_H
#define PAIRHMM_UTIL_H
#include "template.h"
template<class T>
std::string to_string(T obj)
{
std::stringstream ss;
std::string ret_string;
ss.clear();
ss << std::scientific << obj;
ss >> ret_string;
ss.clear();
return ret_string;
}
void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true);
int read_mod_testcase(std::ifstream& fptr, testcase* tc, bool reformat=false);
bool is_avx_supported();
bool is_sse42_supported();
extern float (*g_compute_full_prob_float)(testcase *tc, float *before_last_log);
extern double (*g_compute_full_prob_double)(testcase *tc, double* before_last_log);
void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline);
template<class NUMBER>
NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log=0);
void initialize_function_pointers();
double getCurrClk();
uint64_t get_time(struct timespec* x=0);
#endif

View File

@ -0,0 +1,30 @@
#undef SIMD_TYPE
#undef SIMD_TYPE_AVX
#undef SIMD_TYPE_SSE
#define SIMD_TYPE avx
#define SIMD_TYPE_AVX
#include "define-float.h"
#include "vector_function_prototypes.h"
#include "define-double.h"
#include "vector_function_prototypes.h"
#undef SIMD_TYPE
#undef SIMD_TYPE_AVX
#define SIMD_TYPE sse
#define SIMD_TYPE_SSE
#include "define-sse-float.h"
#include "vector_function_prototypes.h"
#include "define-sse-double.h"
#include "vector_function_prototypes.h"
#undef SIMD_TYPE
#undef SIMD_TYPE_AVX
#undef SIMD_TYPE_SSE

View File

@ -0,0 +1,19 @@
inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift,SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut);
inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last,SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn);
inline void GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]);
inline void GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess);
inline void GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_,SIMD_TYPE), PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt);
inline void GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec,SIMD_TYPE), PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen);
template<class NUMBER> inline void GEN_INTRINSIC(GEN_INTRINSIC(initializeVectors,SIMD_TYPE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context<NUMBER> ctx, testcase *tc, _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D);
template<class NUMBER> inline void GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION,SIMD_TYPE), PRECISION)(
int stripIdx, Context<NUMBER> ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY,
_256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm, _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM ,
_256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1,
UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM);
inline _256_TYPE GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM,SIMD_TYPE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256,
_256_TYPE distm, _256_TYPE _1_distm);
inline void GEN_INTRINSIC(GEN_INTRINSIC(computeMXY,SIMD_TYPE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y,
UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1,
_256_TYPE pMM, _256_TYPE pGAPM, _256_TYPE pMX, _256_TYPE pXX, _256_TYPE pMY, _256_TYPE pYY, _256_TYPE distmSel);
template<class NUMBER> NUMBER GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_,SIMD_TYPE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL);

View File

@ -1086,7 +1086,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
referenceConfidenceModel.close();
//TODO remove the need to call close here for debugging, the likelihood output stream should be managed
//TODO (open & close) at the walker, not the engine.
//likelihoodCalculationEngine.close();
likelihoodCalculationEngine.close();
logger.info("Ran local assembly on " + result + " active regions");
}

View File

@ -89,4 +89,6 @@ public interface LikelihoodCalculationEngine {
*/
public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods(AssemblyResultSet assemblyResultSet,
Map<String, List<GATKSAMRecord>> perSampleReadList);
public void close();
}

View File

@ -165,8 +165,10 @@ public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculation
}
}
@Override
public void close() {
if ( likelihoodsStream != null ) likelihoodsStream.close();
pairHMMThreadLocal.get().close();
}
private void writeDebugLikelihoods(final GATKSAMRecord processedRead, final Haplotype haplotype, final double log10l){
@ -330,7 +332,12 @@ public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculation
}
// initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
pairHMMThreadLocal.get().initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
pairHMMThreadLocal.get().initialize(haplotypes, perSampleReadList, X_METRIC_LENGTH, Y_METRIC_LENGTH);
}
private void finalizePairHMM()
{
pairHMMThreadLocal.get().finalizeRegion();
}
@ -350,6 +357,9 @@ public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculation
map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE);
stratifiedReadMap.put(sampleEntry.getKey(), map);
}
//Used mostly by the JNI implementation(s) to free arrays
finalizePairHMM();
return stratifiedReadMap;
}

View File

@ -57,6 +57,7 @@ import org.broadinstitute.variant.variantcontext.Allele;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
/**
@ -67,12 +68,13 @@ import java.util.Map;
public class JNILoglessPairHMM extends LoglessPairHMM {
private static final boolean debug = false; //simulates ifdef
private static final boolean verify = debug || false; //simulates ifdef
private static final boolean debug0_1 = false; //simulates ifdef
private static final boolean debug1 = false; //simulates ifdef
private static final boolean debug2 = false;
private static final boolean debug3 = false;
private int numComputeLikelihoodCalls = 0;
//Used to copy references to byteArrays to JNI from reads
protected class JNIReadDataHolderClass
{
@ -90,6 +92,7 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
}
private native void jniGlobalInit(Class<?> readDataHolderClass, Class<?> haplotypeDataHolderClass);
private native void jniClose();
private static boolean isJNILoglessPairHMMLibraryLoaded = false;
@ -104,6 +107,13 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
}
}
@Override
public void close()
{
jniClose();
debugClose();
}
//Used to test parts of the compute kernel separately
private native void jniInitialize(final int readMaxLength, final int haplotypeMaxLength);
private native static void jniInitializeProbabilities(final double[][] transition, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP);
@ -126,7 +136,7 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
@Override
public void initialize(final int readMaxLength, final int haplotypeMaxLength)
{
if(debug)
if(verify)
super.initialize(readMaxLength, haplotypeMaxLength);
if(debug3)
{
@ -138,6 +148,42 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
jniInitialize(readMaxLength, haplotypeMaxLength);
}
private HashMap<Haplotype,Integer> haplotypeToHaplotypeListIdxMap = new HashMap<Haplotype,Integer>();
private native void jniInitializeHaplotypes(final int numHaplotypes, JNIHaplotypeDataHolderClass[] haplotypeDataArray);
//Used to transfer data to JNI
//Since the haplotypes are the same for all calls to computeLikelihoods within a region, transfer the haplotypes only once to the JNI per region
/**
* {@inheritDoc}
*/
@Override
public void initialize( final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList, final int readMaxLength, final int haplotypeMaxLength ) {
if(verify)
super.initialize(haplotypes, perSampleReadList, readMaxLength, haplotypeMaxLength);
int numHaplotypes = haplotypes.size();
JNIHaplotypeDataHolderClass[] haplotypeDataArray = new JNIHaplotypeDataHolderClass[numHaplotypes];
int idx = 0;
for(final Haplotype currHaplotype : haplotypes)
{
haplotypeDataArray[idx] = new JNIHaplotypeDataHolderClass();
haplotypeDataArray[idx].haplotypeBases = currHaplotype.getBases();
haplotypeToHaplotypeListIdxMap.put(currHaplotype, idx);
++idx;
}
jniInitializeHaplotypes(numHaplotypes, haplotypeDataArray);
}
private native void jniFinalizeRegion();
//Tell JNI to release arrays - really important if native code is directly accessing Java memory, if not
//accessing Java memory directly, still important to release memory from C++
/**
* {@inheritDoc}
*/
@Override
public void finalizeRegion()
{
jniFinalizeRegion();
}
//Real compute kernel
private native void jniComputeLikelihoods(int numReads, int numHaplotypes,
JNIReadDataHolderClass[] readDataArray, JNIHaplotypeDataHolderClass[] haplotypeDataArray,
@ -149,19 +195,20 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
public PerReadAlleleLikelihoodMap computeLikelihoods(final List<GATKSAMRecord> reads, final Map<Allele, Haplotype> alleleHaplotypeMap, final Map<GATKSAMRecord, byte[]> GCPArrayMap)
{
// (re)initialize the pairHMM only if necessary
final int readMaxLength = debug ? findMaxReadLength(reads) : 0;
final int haplotypeMaxLength = debug ? findMaxHaplotypeLength(alleleHaplotypeMap) : 0;
if(debug)
final int readMaxLength = verify ? findMaxReadLength(reads) : 0;
final int haplotypeMaxLength = verify ? findMaxHaplotypeLength(alleleHaplotypeMap) : 0;
if(verify)
{
if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength)
{ initialize(readMaxLength, haplotypeMaxLength); }
if ( ! initialized )
throw new IllegalStateException("Must call initialize before calling jniComputeLikelihoods in debug mode");
throw new IllegalStateException("Must call initialize before calling jniComputeLikelihoods in debug/verify mode");
}
int readListSize = reads.size();
int alleleHaplotypeMapSize = alleleHaplotypeMap.size();
int numHaplotypes = alleleHaplotypeMap.size();
int numTestcases = readListSize*numHaplotypes;
if(debug0_1)
System.out.println("Java numReads "+readListSize+" numHaplotypes "+alleleHaplotypeMapSize);
System.out.println("Java numReads "+readListSize+" numHaplotypes "+numHaplotypes);
JNIReadDataHolderClass[] readDataArray = new JNIReadDataHolderClass[readListSize];
int idx = 0;
for(GATKSAMRecord read : reads)
@ -188,39 +235,68 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
}
++idx;
}
JNIHaplotypeDataHolderClass[] haplotypeDataArray = new JNIHaplotypeDataHolderClass[alleleHaplotypeMapSize];
idx = 0;
for (Map.Entry<Allele, Haplotype> currEntry : alleleHaplotypeMap.entrySet()) //order is important - access in same order always
JNIHaplotypeDataHolderClass[] haplotypeDataArray = new JNIHaplotypeDataHolderClass[numHaplotypes];
if(verify)
{
haplotypeDataArray[idx] = new JNIHaplotypeDataHolderClass();
haplotypeDataArray[idx].haplotypeBases = currEntry.getValue().getBases();
if(debug0_1)
System.out.println("Java haplotype length "+haplotypeDataArray[idx].haplotypeBases.length);
if(debug3)
idx = 0;
for (Map.Entry<Allele, Haplotype> currEntry : alleleHaplotypeMap.entrySet()) //order is important - access in same order always
{
for(int i=0;i<haplotypeDataArray[idx].haplotypeBases.length;++i)
debugDump("haplotype_bases_java.txt",String.format("%d\n",(int)haplotypeDataArray[idx].haplotypeBases[i]),true);
haplotypeDataArray[idx] = new JNIHaplotypeDataHolderClass();
haplotypeDataArray[idx].haplotypeBases = currEntry.getValue().getBases();
if(debug0_1)
System.out.println("Java haplotype length "+haplotypeDataArray[idx].haplotypeBases.length);
if(debug3)
{
for(int i=0;i<haplotypeDataArray[idx].haplotypeBases.length;++i)
debugDump("haplotype_bases_java.txt",String.format("%d\n",(int)haplotypeDataArray[idx].haplotypeBases[i]),true);
}
++idx;
}
++idx;
}
double[] likelihoodArray = new double[readListSize*alleleHaplotypeMapSize]; //to store results
double[] likelihoodArray = new double[readListSize*numHaplotypes]; //to store results
//for(reads)
// for(haplotypes)
// compute_full_prob()
jniComputeLikelihoods(readListSize, alleleHaplotypeMapSize, readDataArray, haplotypeDataArray, likelihoodArray, 12);
jniComputeLikelihoods(readListSize, numHaplotypes, readDataArray, haplotypeDataArray, likelihoodArray, 12);
//final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
idx = 0;
int idxInsideHaplotypeList = 0;
int readIdx = 0;
for(GATKSAMRecord read : reads)
{
for (Map.Entry<Allele, Haplotype> currEntry : alleleHaplotypeMap.entrySet())//order is important - access in same order always
{
likelihoodMap.add(read, currEntry.getKey(), likelihoodArray[idx]);
//Since the order of haplotypes in the List<Haplotype> and alleleHaplotypeMap is different,
//get idx of current haplotype in the list and use this idx to get the right likelihoodValue
idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(currEntry.getValue());
likelihoodMap.add(read, currEntry.getKey(), likelihoodArray[readIdx + idxInsideHaplotypeList]);
++idx;
}
if(debug)
readIdx += numHaplotypes;
}
if(verify)
{
//re-order values in likelihoodArray
double[] tmpArray = new double[numHaplotypes];
idx = 0;
idxInsideHaplotypeList = 0;
readIdx = 0;
for(GATKSAMRecord read : reads)
{
for(int j=0;j<numHaplotypes;++j)
tmpArray[j] = likelihoodArray[readIdx+j];
for (Map.Entry<Allele, Haplotype> currEntry : alleleHaplotypeMap.entrySet())//order is important - access in same order always
{
idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(currEntry.getValue());
likelihoodArray[idx] = tmpArray[idxInsideHaplotypeList];
++idx;
}
readIdx += numHaplotypes;
}
//to compare values
likelihoodMap = super.computeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap);
//for floating point values, no exact equality
@ -246,23 +322,23 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
if(toDump)
{
idx = 0;
System.out.println("Dump : Java numReads "+readListSize+" numHaplotypes "+alleleHaplotypeMapSize);
System.out.println("Dump : Java numReads "+readListSize+" numHaplotypes "+numHaplotypes);
for(int i=0;i<readDataArray.length;++i)
{
for(int j=0;j<haplotypeDataArray.length;++j)
{
debugDump("debug_dump.log",new String(haplotypeDataArray[j].haplotypeBases)+" ",true);
debugDump("debug_dump.log",new String(readDataArray[i].readBases)+" ",true);
debugDump("debug_dump.txt",new String(haplotypeDataArray[j].haplotypeBases)+" ",true);
debugDump("debug_dump.txt",new String(readDataArray[i].readBases)+" ",true);
for(int k=0;k<readDataArray[i].readBases.length;++k)
debugDump("debug_dump.log",String.format("%d",(int)(readDataArray[i].readQuals[k]))+" ",true);
debugDump("debug_dump.txt",String.format("%d",(int)(readDataArray[i].readQuals[k]))+" ",true);
for(int k=0;k<readDataArray[i].readBases.length;++k)
debugDump("debug_dump.log",String.format("%d",(int)(readDataArray[i].insertionGOP[k]))+" ",true);
debugDump("debug_dump.txt",String.format("%d",(int)(readDataArray[i].insertionGOP[k]))+" ",true);
for(int k=0;k<readDataArray[i].readBases.length;++k)
debugDump("debug_dump.log",String.format("%d",(int)(readDataArray[i].deletionGOP[k]))+" ",true);
debugDump("debug_dump.txt",String.format("%d",(int)(readDataArray[i].deletionGOP[k]))+" ",true);
for(int k=0;k<readDataArray[i].readBases.length;++k)
debugDump("debug_dump.log",String.format("%d",(int)(readDataArray[i].overallGCP[k]))+" ",true);
debugDump("debug_dump.log","\n",true);
debugDump("debug_results.log",String.format("%e %e\n",mLikelihoodArray[idx],likelihoodArray[idx]),true);
debugDump("debug_dump.txt",String.format("%d",(int)(readDataArray[i].overallGCP[k]))+" ",true);
debugDump("debug_dump.txt","\n",true);
debugDump("debug_results.txt",String.format("%e %e\n",mLikelihoodArray[idx],likelihoodArray[idx]),true);
++idx;
}
}
@ -271,10 +347,12 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
}
++numComputeLikelihoodCalls;
//if(numComputeLikelihoodCalls == 5)
//jniClose();
//System.exit(0);
return likelihoodMap;
}
/**
* {@inheritDoc}
*/
@ -296,11 +374,11 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
//}
//System.out.println("#### END STACK TRACE ####");
//
if(debug1)
jniSubComputeReadLikelihoodGivenHaplotypeLog10(readBases.length, haplotypeBases.length,
readBases, haplotypeBases, readQuals,
insertionGOP, deletionGOP, overallGCP,
hapStartIndex);
if(debug1)
jniSubComputeReadLikelihoodGivenHaplotypeLog10(readBases.length, haplotypeBases.length,
readBases, haplotypeBases, readQuals,
insertionGOP, deletionGOP, overallGCP,
hapStartIndex);
boolean doInitialization = (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length);
if (doInitialization) {
@ -355,8 +433,8 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
++numCalls;
//if(numCalls == 100)
//{
//debugClose();
//System.exit(0);
//debugClose();
//System.exit(0);
//}
return Math.log10(finalSumProbabilities) - INITIAL_CONDITION_LOG10;
}

View File

@ -100,6 +100,26 @@ public abstract class PairHMM {
initialized = true;
}
/**
* Called at the end of PairHMM for a region - mostly used by the JNI implementations
*/
public void finalizeRegion()
{
;
}
/**
* Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths
* This function is used by the JNI implementations to transfer all data once to the native code
* @param haplotypes the list of haplotypes
* @param perSampleReadList map from sample name to list of reads
* @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM
* @param readMaxLength the max length of reads we want to use with this PairHMM
*/
public void initialize( final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList, final int readMaxLength, final int haplotypeMaxLength ) {
initialize(readMaxLength, haplotypeMaxLength);
}
protected int findMaxReadLength(final List<GATKSAMRecord> reads) {
int listMaxReadLength = 0;
for(GATKSAMRecord read : reads){
@ -280,4 +300,7 @@ public abstract class PairHMM {
return Math.min(haplotype1.length, haplotype2.length);
}
//Called at the end of all HC calls
public void close() { ; }
}