1. Whew, finally debugged the source of performance issues with PairHMM

JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
This commit is contained in:
Karthik Gururaj 2014-02-05 17:09:57 -08:00
parent 24f8aef344
commit acda6ca27b
6 changed files with 87 additions and 27 deletions

View File

@ -7,6 +7,10 @@ LoadTimeInitializer g_load_time_initializer;
LoadTimeInitializer::LoadTimeInitializer() //will be called when library is loaded
{
ConvertChar::init();
#ifndef DISABLE_FTZ
//Very important to get good performance - enable FTZ, converts denormals to 0
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
#endif
m_sumNumReads = 0;
m_sumSquareNumReads = 0;
m_sumNumHaplotypes = 0;

View File

@ -13,15 +13,18 @@ CXX=icc
LDFLAGS=-lm -lrt $(OMPLDFLAGS)
#USE_PAPI=1
PAPI_DIR=/home/karthikg/softwares/papi-5.3.0
ifdef USE_PAPI
ifeq ($(USE_PAPI),1)
COMMON_COMPILATION_FLAGS+=-I$(PAPI_DIR)/include
COMMON_COMPILATION_FLAGS+=-I$(PAPI_DIR)/include -DUSE_PAPI
LDFLAGS+=-L$(PAPI_DIR)/lib -lpapi
endif
endif
ifdef DISABLE_FTZ
COMMON_COMPILATION_FLAGS+=-DDISABLE_FTZ -no-ftz
endif
BIN=libVectorLoglessPairHMM.so pairhmm-template-main checker
#BIN=checker

View File

@ -1,6 +1,7 @@
#include "headers.h"
#include "template.h"
extern uint64_t exceptions_array[128];
#include "utils.h"
template<class NUMBER>
NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL)
{
@ -66,18 +67,22 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL)
distm = distm/3;
//feclearexcept(FE_ALL_EXCEPT);
//feclearexcept(FE_ALL_EXCEPT);
M[r][c] = distm * (M[r-1][c-1] * p[r][MM] + X[r-1][c-1] * p[r][GapM] + Y[r-1][c-1] * p[r][GapM]);
//M[r][c] = (M[r-1][c-1] * p[r][MM] + X[r-1][c-1] * p[r][GapM] + Y[r-1][c-1] * p[r][GapM]);
//STORE_FP_EXCEPTIONS(flagp, exceptions_array);
//STORE_FP_EXCEPTIONS(flagp, exceptions_array);
//feclearexcept(FE_ALL_EXCEPT);
//feclearexcept(FE_ALL_EXCEPT);
X[r][c] = M[r-1][c] * p[r][MX] + X[r-1][c] * p[r][XX];
//STORE_FP_EXCEPTIONS(flagp, exceptions_array);
//STORE_FP_EXCEPTIONS(flagp, exceptions_array);
//feclearexcept(FE_ALL_EXCEPT);
//feclearexcept(FE_ALL_EXCEPT);
Y[r][c] = M[r][c-1] * p[r][MY] + Y[r][c-1] * p[r][YY];
//STORE_FP_EXCEPTIONS(flagp, exceptions_array);
//STORE_FP_EXCEPTIONS(flagp, exceptions_array);
//CONVERT_AND_PRINT(M[r][c]);
//CONVERT_AND_PRINT(X[r][c]);
//CONVERT_AND_PRINT(Y[r][c]);
}
for (c = 0; c < COLS; c++)

View File

@ -27,14 +27,19 @@
#include <cmath>
#include <fenv.h>
#define STORE_FP_EXCEPTIONS(flagp, exceptions_array) \
fegetexceptflag(&flagp, FE_OVERFLOW | FE_UNDERFLOW | FE_DIVBYZERO | FE_INVALID | __FE_DENORM); \
extern uint64_t exceptions_array[128];
extern FILE* g_debug_fptr;
#define STORE_FP_EXCEPTIONS(flagp, exceptions_array) \
fegetexceptflag(&flagp, FE_ALL_EXCEPT | __FE_DENORM); \
exceptions_array[FE_INVALID] += ((flagp & FE_INVALID)); \
exceptions_array[__FE_DENORM] += ((flagp & __FE_DENORM) >> 1); \
exceptions_array[FE_DIVBYZERO] += ((flagp & FE_DIVBYZERO) >> 2); \
exceptions_array[FE_OVERFLOW] += ((flagp & FE_OVERFLOW) >> 3); \
exceptions_array[FE_UNDERFLOW] += ((flagp & FE_UNDERFLOW) >> 4); \
feclearexcept(FE_ALL_EXCEPT);
feclearexcept(FE_ALL_EXCEPT | __FE_DENORM);
#define CONVERT_AND_PRINT(X) \
g_converter.f = (X); \
fwrite(&(g_converter.i),4,1,g_debug_fptr); \
#endif

View File

@ -45,7 +45,7 @@ uint64_t get_machine_capabilities()
void initialize_function_pointers(uint64_t mask)
{
//mask = 0;
//mask = 0ull;
if(is_avx_supported() && (mask & (1<< AVX_CUSTOM_IDX)))
{
cout << "Using AVX accelerated implementation of PairHMM\n";
@ -287,16 +287,23 @@ uint64_t diff_time(struct timespec& prev_time)
}
//#define USE_PAPI
//#define COUNT_EXCEPTIONS
//#define CHECK_RESULTS
#define CHECK_UNDERFLOW 1
#ifdef USE_PAPI
#include "papi.h"
#define NUM_PAPI_COUNTERS 4
#endif
IF_32 g_converter;
FILE* g_debug_fptr = 0;
uint64_t exceptions_array[128];
void do_compute(char* filename)
{
memset(exceptions_array, 0, 128*sizeof(uint64_t));
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
//g_debug_fptr = fopen("/mnt/app_hdd/scratch/karthikg/dump.log","w");
//assert(g_debug_fptr);
for(unsigned i=0;i<128;++i)
exceptions_array[i] = 0ull;
//assert(feenableexcept(FE_DIVBYZERO | FE_INVALID) >= 0);
#ifdef USE_PAPI
PAPI_num_counters();
@ -310,11 +317,11 @@ void do_compute(char* filename)
int events[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
//assert(PAPI_event_name_to_code("ICACHE:IFETCH_STALL",&(events[2])) == PAPI_OK);
//assert(PAPI_event_name_to_code("MACHINE_CLEARS:e",&(events[3])) == PAPI_OK);
char* eventnames[NUM_PAPI_COUNTERS]= { "instructions", "cycles", "ifetch_stall", "store_misses" };
char* eventnames[NUM_PAPI_COUNTERS]= { "instructions", "cycles", "fp_assists", "idq_ms_cycles" };
assert(PAPI_event_name_to_code("ix86arch::INSTRUCTION_RETIRED",&(events[0])) == PAPI_OK);
assert(PAPI_event_name_to_code("UNHALTED_REFERENCE_CYCLES",&(events[1])) == PAPI_OK);
assert(PAPI_event_name_to_code("ICACHE:IFETCH_STALL", &(events[2])) == PAPI_OK);
assert(PAPI_event_name_to_code("perf::L1-DCACHE-STORE-MISSES", &(events[3])) == PAPI_OK);
assert(PAPI_event_name_to_code("FP_ASSIST:ANY", &(events[2])) == PAPI_OK);
assert(PAPI_event_name_to_code("IDQ:MS_UOPS_CYCLES", &(events[3])) == PAPI_OK);
long long values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
long long accum_values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
@ -353,6 +360,9 @@ void do_compute(char* filename)
testcase tc_in;
int break_value = 0;
uint64_t fp_single_exceptions_reexecute = 0;
uint64_t fp_single_exceptions_continue = 0;
uint64_t num_double_executions = 0;
while(1)
{
break_value = use_old_read_testcase ? read_testcase(&tc_in, fptr) :
@ -373,13 +383,38 @@ void do_compute(char* filename)
for(unsigned i=0;i<num_testcases;++i)
{
double result = 0;
#ifdef COUNT_EXCEPTIONS
fexcept_t flagp = 0;
feclearexcept(FE_ALL_EXCEPT | __FE_DENORM);
#endif
float result_avxf = g_compute_full_prob_float(&(tc_vector[i]), 0);
if (result_avxf < MIN_ACCEPTED) {
//CONVERT_AND_PRINT(result_avxf);
#ifdef COUNT_EXCEPTIONS
STORE_FP_EXCEPTIONS(flagp, exceptions_array);
bool fp_exception = ((flagp & (FE_UNDERFLOW|FE_OVERFLOW|FE_INVALID)) != 0);
#endif
#ifdef CHECK_UNDERFLOW
if (result_avxf < MIN_ACCEPTED)
#else
if(false)
#endif
{
#ifdef COUNT_EXCEPTIONS
if(fp_exception)
++fp_single_exceptions_reexecute;
#endif
double result_avxd = g_compute_full_prob_double(&(tc_vector[i]), 0);
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
++num_double_executions;
}
else
{
#ifdef COUNT_EXCEPTIONS
if(fp_exception)
++fp_single_exceptions_continue;
#endif
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
}
results_vec[i] = result;
}
#ifdef USE_PAPI
@ -392,7 +427,7 @@ void do_compute(char* filename)
accum_values[k] += values[k];
#endif
#if 0
#ifdef CHECK_RESULTS
#pragma omp parallel for schedule(dynamic,chunk_size)
for(unsigned i=0;i<num_testcases;++i)
{
@ -419,6 +454,8 @@ void do_compute(char* filename)
all_ok = false;
}
}
#else
all_ok = false;
#endif
for(unsigned i=0;i<num_testcases;++i)
{
@ -457,11 +494,16 @@ void do_compute(char* filename)
fclose(fptr);
else
ifptr.close();
//cout << "Exceptions "<<exceptions_array[FE_INVALID]<< " "
//<<exceptions_array[__FE_DENORM]<< " "
//<<exceptions_array[FE_DIVBYZERO]<< " "
//<<exceptions_array[FE_OVERFLOW]<< " "
//<<exceptions_array[FE_UNDERFLOW]<< " "
//<<exceptions_array[FE_INEXACT]<< "\n";
#ifdef COUNT_EXCEPTIONS
cout << "Exceptions "
<<"invalid : "<<exceptions_array[FE_INVALID]<< " "
<<"denormal : "<<exceptions_array[__FE_DENORM]<< " "
<<"div_by_0 : "<<exceptions_array[FE_DIVBYZERO]<< " "
<<"overflow : "<<exceptions_array[FE_OVERFLOW]<< " "
<<"underflow : "<<exceptions_array[FE_UNDERFLOW]<< "\n";
cout << "Single precision FP exceptions continuations "<<fp_single_exceptions_continue<<" re-executions "<<fp_single_exceptions_reexecute<<"\n";
#endif
cout << "Num double executions "<<num_double_executions<<"\n";
//fclose(g_debug_fptr);
}

View File

@ -39,6 +39,7 @@ enum ProcessorCapabilitiesEnum
uint64_t get_machine_capabilities();
void initialize_function_pointers(uint64_t mask=ENABLE_ALL_HARDWARE_FEATURES);
extern IF_32 g_converter;
void do_compute(char* filename);
#endif