2014-01-19 03:07:23 +08:00
|
|
|
#include "headers.h"
|
2014-01-17 11:53:50 +08:00
|
|
|
#include "template.h"
|
2014-01-21 00:03:42 +08:00
|
|
|
#include "utils.h"
|
|
|
|
|
#include "vector_defs.h"
|
2014-01-19 03:07:23 +08:00
|
|
|
|
2014-01-17 11:53:50 +08:00
|
|
|
//Out-of-line definition of ConvertChar's static lookup table (declared in a header).
//NOTE(review): sized 255 — a table indexed by a full unsigned 8-bit value needs 256
//entries to cover index 255; confirm against the declaration and its users.
uint8_t ConvertChar::conversionTable[255];
|
2014-01-19 03:07:23 +08:00
|
|
|
//Global dispatch pointer for the single-precision PairHMM kernel.
//Selected at runtime by initialize_function_pointers(); null until then.
float (*g_compute_full_prob_float)(testcase *tc, float* before_last_log) = 0;
|
|
|
|
|
//Global dispatch pointer for the double-precision PairHMM kernel.
//Selected at runtime by initialize_function_pointers(); null until then.
double (*g_compute_full_prob_double)(testcase *tc, double* before_last_log) = 0;
|
|
|
|
|
|
2014-01-17 11:53:50 +08:00
|
|
|
//NOTE(review): file-scope 'using namespace std' is discouraged; kept because the
//unqualified vector/string/cout/ifstream uses throughout this file depend on it.
using namespace std;
|
|
|
|
|
|
2014-01-19 03:07:23 +08:00
|
|
|
//Returns true when the CPU reports AVX support (CPUID leaf 1, ECX bit 28).
bool is_avx_supported()
{
    int eax = 0, ebx = 0, ecx = 0, edx = 0;
    //BUGFIX: cpuid overwrites EAX as well, so EAX must be declared as an
    //output; omitting it lets the compiler assume EAX still holds the
    //input value after the asm, which can miscompile surrounding code.
    __asm__("cpuid"
            : "=a" (eax),
              "=b" (ebx),
              "=c" (ecx),
              "=d" (edx)
            : "a" (1)
            );
    return ((ecx >> 28) & 1) == 1;
}
|
|
|
|
|
|
|
|
|
|
//Returns true when the CPU reports SSE4.2 support (CPUID leaf 1, ECX bit 20).
bool is_sse42_supported()
{
    int eax = 0, ebx = 0, ecx = 0, edx = 0;
    //BUGFIX: cpuid overwrites EAX as well, so EAX must be declared as an
    //output; omitting it lets the compiler assume EAX still holds the
    //input value after the asm, which can miscompile surrounding code.
    __asm__("cpuid"
            : "=a" (eax),
              "=b" (ebx),
              "=c" (ecx),
              "=d" (edx)
            : "a" (1)
            );
    return ((ecx >> 20) & 1) == 1;
}
|
|
|
|
|
|
2014-01-25 08:29:35 +08:00
|
|
|
//Builds a bitmask of the vector ISAs available on this machine:
//bit AVX_CUSTOM_IDX is set when AVX is present, bit SSE42_CUSTOM_IDX for SSE4.2.
uint64_t get_machine_capabilities()
{
    uint64_t machine_mask = 0ull;
    //Use 1ull so the shift is performed in 64 bits; the previous (1 << idx)
    //was a 32-bit shift and would be undefined for an index >= 31.
    if(is_avx_supported())
        machine_mask |= (1ull << AVX_CUSTOM_IDX);
    if(is_sse42_supported())
        machine_mask |= (1ull << SSE42_CUSTOM_IDX);
    return machine_mask;
}
|
|
|
|
|
|
|
|
|
|
void initialize_function_pointers(uint64_t mask)
|
|
|
|
|
{
|
1. Whew, finally debugged the source of performance issues with PairHMM
JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
2014-02-06 09:09:57 +08:00
|
|
|
//mask = 0ull;
|
2014-01-25 08:29:35 +08:00
|
|
|
if(is_avx_supported() && (mask & (1<< AVX_CUSTOM_IDX)))
|
2014-01-21 00:03:42 +08:00
|
|
|
{
|
|
|
|
|
cout << "Using AVX accelerated implementation of PairHMM\n";
|
|
|
|
|
g_compute_full_prob_float = compute_full_prob_avxs<float>;
|
|
|
|
|
g_compute_full_prob_double = compute_full_prob_avxd<double>;
|
|
|
|
|
}
|
|
|
|
|
else
|
2014-01-25 08:29:35 +08:00
|
|
|
if(is_sse42_supported() && (mask & (1<< SSE42_CUSTOM_IDX)))
|
2014-01-21 00:03:42 +08:00
|
|
|
{
|
|
|
|
|
cout << "Using SSE4.2 accelerated implementation of PairHMM\n";
|
|
|
|
|
g_compute_full_prob_float = compute_full_prob_sses<float>;
|
|
|
|
|
g_compute_full_prob_double = compute_full_prob_ssed<double>;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
cout << "Using un-vectorized C++ implementation of PairHMM\n";
|
|
|
|
|
g_compute_full_prob_float = compute_full_prob<float>;
|
|
|
|
|
g_compute_full_prob_double = compute_full_prob<double>;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2014-01-17 11:53:50 +08:00
|
|
|
//Decodes a Phred+33 ASCII quality character into its numeric value
//(e.g. '!' -> 0, 'I' -> 40).
int normalize(char c)
{
    return static_cast<int>(c) - 33;
}
|
|
|
|
|
|
|
|
|
|
//Reads one whitespace-separated test case line from 'ifp' (stdin when null):
//  <haplotype> <read> <q-quals> <i-quals> <d-quals> <c-quals>
//The four quality strings are Phred+33 encoded and must each be exactly as
//long as the read (checked by assert). On success fills 'tc' (hap/rs/q/i/d/c
//are malloc'd; the caller owns them) and returns 0; returns -1 on EOF or on
//a malformed line.
int read_testcase(testcase *tc, FILE* ifp)
{
    char *q, *i, *d, *c, *line = NULL;
    int _q, _i, _d, _c;
    int x;
    //BUGFIX: getline expects a size_t*; the previous code cast the address
    //of an 'int' to size_t*, which corrupts adjacent stack memory on LP64
    //targets where size_t is 8 bytes.
    size_t size = 0;
    ssize_t read;

    read = getline(&line, &size, ifp == 0 ? stdin : ifp);
    if (read == -1)
    {
        free(line);
        return -1;
    }

    //Each field is at most 'size' bytes long (the whole line fits in 'size').
    tc->hap = (char *) malloc(size);
    tc->rs = (char *) malloc(size);
    q = (char *) malloc(size);
    i = (char *) malloc(size);
    d = (char *) malloc(size);
    c = (char *) malloc(size);

    if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6)
    {
        //BUGFIX: malformed line — release everything allocated above
        //instead of leaking all seven buffers.
        free(tc->hap);
        free(tc->rs);
        free(q);
        free(i);
        free(d);
        free(c);
        free(line);
        return -1;
    }

    tc->haplen = strlen(tc->hap);
    tc->rslen = strlen(tc->rs);
    assert(strlen(q) == tc->rslen);
    assert(strlen(i) == tc->rslen);
    assert(strlen(d) == tc->rslen);
    assert(strlen(c) == tc->rslen);

    tc->q = (char *) malloc(sizeof(char) * tc->rslen);
    tc->i = (char *) malloc(sizeof(char) * tc->rslen);
    tc->d = (char *) malloc(sizeof(char) * tc->rslen);
    tc->c = (char *) malloc(sizeof(char) * tc->rslen);

    for (x = 0; x < tc->rslen; x++)
    {
        _q = normalize(q[x]);
        _i = normalize(i[x]);
        _d = normalize(d[x]);
        _c = normalize(c[x]);
        //Base quality is floored at 6, as in the original implementation
        //(rationale not documented here).
        tc->q[x] = (_q < 6) ? 6 : _q;
        tc->i[x] = _i;
        tc->d[x] = _d;
        tc->c[x] = _c;
    }

    free(q);
    free(i);
    free(d);
    free(c);
    free(line);

    return 0;
}
|
|
|
|
|
|
|
|
|
|
//Initial chunk/buffer size used by tokenize(); the assembled line buffer
//grows (doubles) as needed for longer lines.
unsigned MAX_LINE_LENGTH = 65536;
|
|
|
|
|
//Parses the leading integer in 's' (as istringstream's operator>> would).
//Returns 0 when 's' contains no parseable number.
int convToInt(std::string s)
{
    //Initialize so a failed extraction deterministically yields 0 — under
    //pre-C++11 semantics a failed operator>> left the target untouched,
    //so an uninitialized local returned indeterminate garbage.
    int value = 0;
    std::istringstream strin(s);
    strin >> value;
    return value;
}
|
|
|
|
|
|
|
|
|
|
void tokenize(std::ifstream& fptr, std::vector<std::string>& tokens)
|
|
|
|
|
{
|
|
|
|
|
int i = 0;
|
|
|
|
|
std::string tmp;
|
|
|
|
|
std::vector<std::string> myVec;
|
|
|
|
|
vector<char> line;
|
|
|
|
|
line.clear();
|
|
|
|
|
line.resize(MAX_LINE_LENGTH);
|
|
|
|
|
vector<char> tmpline;
|
|
|
|
|
tmpline.clear();
|
|
|
|
|
tmpline.resize(MAX_LINE_LENGTH);
|
|
|
|
|
myVec.clear();
|
|
|
|
|
|
|
|
|
|
while(!fptr.eof())
|
|
|
|
|
{
|
|
|
|
|
i = 0;
|
|
|
|
|
bool still_read_line = true;
|
|
|
|
|
unsigned line_position = 0;
|
|
|
|
|
while(still_read_line)
|
|
|
|
|
{
|
|
|
|
|
fptr.getline(&(tmpline[0]), MAX_LINE_LENGTH);
|
|
|
|
|
if(line_position + MAX_LINE_LENGTH > line.size())
|
|
|
|
|
line.resize(2*line.size());
|
|
|
|
|
for(unsigned i=0;i<MAX_LINE_LENGTH && tmpline[i] != '\0';++i,++line_position)
|
|
|
|
|
line[line_position] = tmpline[i];
|
|
|
|
|
if(fptr.eof() || !fptr.fail())
|
|
|
|
|
{
|
|
|
|
|
still_read_line = false;
|
|
|
|
|
line[line_position++] = '\0';
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
std::istringstream kap(&(line[0]));
|
|
|
|
|
|
|
|
|
|
while(!kap.eof())
|
|
|
|
|
{
|
|
|
|
|
kap >> std::skipws >> tmp;
|
|
|
|
|
if(tmp != "")
|
|
|
|
|
{
|
|
|
|
|
myVec.push_back(tmp);
|
|
|
|
|
++i;
|
|
|
|
|
//std::cout <<tmp <<"#";
|
|
|
|
|
}
|
|
|
|
|
tmp = "";
|
|
|
|
|
}
|
|
|
|
|
//std::cout << "\n";
|
|
|
|
|
if(myVec.size() > 0)
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
tokens.clear();
|
|
|
|
|
//std::cout << "Why "<<myVec.size()<<"\n";
|
|
|
|
|
tokens.resize(myVec.size());
|
|
|
|
|
for(i=0;i<(int)myVec.size();++i)
|
|
|
|
|
tokens[i] = myVec[i];
|
|
|
|
|
line.clear();
|
|
|
|
|
tmpline.clear();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//Reads one test case in the tokenized format: token 0 is the haplotype,
//token 1 is the read, followed by 4*rslen integer quality tokens (q, i, d, c
//in that order). Buffers in 'tc' are allocated with new[] (NOTE(review):
//read_testcase() uses malloc for the same fields — callers must free
//accordingly; confirm which deallocator the rest of the project uses).
//When 'reformat' is set, also appends the test case to
//"reformat/debug_dump.txt" in the Phred+33 single-line format that
//read_testcase() consumes. Returns the token count, or -1 at end-of-file.
int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat)
{
    //Tracks whether the debug dump file should be truncated (first call)
    //or appended to (subsequent calls).
    static bool first_call = true;
    vector<string> tokens;
    tokens.clear();
    tokenize(fptr, tokens);
    if(tokens.size() == 0)
        return -1;    //nothing left to read
    //+2 gives slack for a NUL terminator; the buffers are not NUL-terminated
    //by the memcpy itself.
    tc->hap = new char[tokens[0].size()+2];
    tc->haplen = tokens[0].size();
    memcpy(tc->hap, tokens[0].c_str(), tokens[0].size());
    tc->rs = new char[tokens[1].size()+2];
    tc->rslen = tokens[1].size();
    tc->q = new char[tc->rslen];
    tc->i = new char[tc->rslen];
    tc->d = new char[tc->rslen];
    tc->c = new char[tc->rslen];
    //cout << "Lengths "<<tc->haplen <<" "<<tc->rslen<<"\n";
    memcpy(tc->rs, tokens[1].c_str(),tokens[1].size());
    //Exactly four quality values per read base must follow the two sequences.
    assert(tokens.size() == 2 + 4*(tc->rslen));
    //assert(tc->rslen < MROWS);
    //The four quality arrays are stored as consecutive token groups.
    for(unsigned j=0;j<tc->rslen;++j)
        tc->q[j] = (char)convToInt(tokens[2+0*tc->rslen+j]);
    for(unsigned j=0;j<tc->rslen;++j)
        tc->i[j] = (char)convToInt(tokens[2+1*tc->rslen+j]);
    for(unsigned j=0;j<tc->rslen;++j)
        tc->d[j] = (char)convToInt(tokens[2+2*tc->rslen+j]);
    for(unsigned j=0;j<tc->rslen;++j)
        tc->c[j] = (char)convToInt(tokens[2+3*tc->rslen+j]);

    if(reformat)
    {
        //Re-emit the test case in the compact Phred+33 format
        //(quality value + 33 -> printable ASCII character).
        ofstream ofptr;
        ofptr.open("reformat/debug_dump.txt",first_call ? ios::out : ios::app);
        assert(ofptr.is_open());
        ofptr << tokens[0] << " ";
        ofptr << tokens[1] << " ";
        for(unsigned j=0;j<tc->rslen;++j)
            ofptr << ((char)(tc->q[j]+33));
        ofptr << " ";
        for(unsigned j=0;j<tc->rslen;++j)
            ofptr << ((char)(tc->i[j]+33));
        ofptr << " ";
        for(unsigned j=0;j<tc->rslen;++j)
            ofptr << ((char)(tc->d[j]+33));
        ofptr << " ";
        for(unsigned j=0;j<tc->rslen;++j)
            ofptr << ((char)(tc->c[j]+33));
        ofptr << " 0 false\n";

        ofptr.close();
        first_call = false;
    }

    return tokens.size();
}
|
2014-01-19 03:07:23 +08:00
|
|
|
|
2014-01-23 02:52:41 +08:00
|
|
|
//Returns the current wall-clock time in seconds (microsecond resolution).
double getCurrClk() {
    struct timeval now;
    gettimeofday(&now, NULL);
    double seconds = (double)now.tv_sec;
    seconds += (double)now.tv_usec / 1000000.0;
    return seconds;
}
|
2014-01-23 14:57:32 +08:00
|
|
|
|
|
|
|
|
//Returns nanoseconds elapsed since the previous call (tracked via internal
//static state; the first call measures from a zeroed timestamp). When
//'store_struct' is non-null the raw CLOCK_REALTIME sample is written there.
uint64_t get_time(struct timespec* store_struct)
{
    static struct timespec previous;
    struct timespec local;
    struct timespec* now = store_struct ? store_struct : &local;
    clock_gettime(CLOCK_REALTIME, now);
    const uint64_t elapsed =
        (now->tv_sec - previous.tv_sec)*1000000000 + (now->tv_nsec - previous.tv_nsec);
    previous = *now;
    return elapsed;
}
|
2014-01-31 04:08:06 +08:00
|
|
|
|
|
|
|
|
//Returns the nanoseconds elapsed between 'prev_time' and now (CLOCK_REALTIME).
uint64_t diff_time(struct timespec& prev_time)
{
    struct timespec current;
    clock_gettime(CLOCK_REALTIME, &current);
    const long long delta =
        (current.tv_sec - prev_time.tv_sec)*1000000000 + (current.tv_nsec - prev_time.tv_nsec);
    return (uint64_t)delta;
}
|
|
|
|
|
|
2014-02-05 08:27:29 +08:00
|
|
|
//#define USE_PAPI
|
1. Whew, finally debugged the source of performance issues with PairHMM
JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
2014-02-06 09:09:57 +08:00
|
|
|
//#define COUNT_EXCEPTIONS
|
|
|
|
|
//#define CHECK_RESULTS
|
|
|
|
|
#define CHECK_UNDERFLOW 1
|
2014-02-05 08:27:29 +08:00
|
|
|
#ifdef USE_PAPI
|
|
|
|
|
#include "papi.h"
|
|
|
|
|
#define NUM_PAPI_COUNTERS 4
|
|
|
|
|
#endif
|
2014-01-31 04:08:06 +08:00
|
|
|
|
1. Whew, finally debugged the source of performance issues with PairHMM
JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
2014-02-06 09:09:57 +08:00
|
|
|
//Global IF_32 instance (project-declared type; presumably an int/float
//bit-pattern converter — TODO confirm against its declaration).
IF_32 g_converter;
|
|
|
|
|
//Debug dump stream; null unless the (commented-out) fopen in do_compute()
//is re-enabled.
FILE* g_debug_fptr = 0;
|
2014-02-05 08:27:29 +08:00
|
|
|
//FP-exception counters, zeroed at the start of do_compute().
//NOTE(review): 128 slots presumably allow one per thread/flag combination —
//confirm the indexing scheme at the update sites.
uint64_t exceptions_array[128];
|
|
|
|
|
void do_compute(char* filename)
|
|
|
|
|
{
|
1. Whew, finally debugged the source of performance issues with PairHMM
JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
2014-02-06 09:09:57 +08:00
|
|
|
//g_debug_fptr = fopen("/mnt/app_hdd/scratch/karthikg/dump.log","w");
|
|
|
|
|
//assert(g_debug_fptr);
|
|
|
|
|
for(unsigned i=0;i<128;++i)
|
|
|
|
|
exceptions_array[i] = 0ull;
|
2014-02-05 08:27:29 +08:00
|
|
|
//assert(feenableexcept(FE_DIVBYZERO | FE_INVALID) >= 0);
|
|
|
|
|
#ifdef USE_PAPI
|
|
|
|
|
PAPI_num_counters();
|
|
|
|
|
//int events[NUM_PAPI_COUNTERS] = { PAPI_TOT_INS, PAPI_TOT_CYC, PAPI_L1_DCM, PAPI_L1_ICM, PAPI_L3_TCM, PAPI_TLB_DM, PAPI_TLB_IM };
|
|
|
|
|
//char* eventnames[NUM_PAPI_COUNTERS]= { "instructions", "cycles", "l1d_misses", "l1i_misses", "l3_misses", "dtlb_misses", "itlb_misses" };
|
|
|
|
|
//long long values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0, 0, 0, 0 };
|
|
|
|
|
//long long accum_values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0, 0, 0, 0 };
|
|
|
|
|
//int events[NUM_PAPI_COUNTERS] = { PAPI_TOT_INS, PAPI_TOT_CYC, PAPI_L1_ICM };
|
|
|
|
|
//char* eventnames[NUM_PAPI_COUNTERS]= { "instructions", "cycles", "l1i_misses"};
|
|
|
|
|
//assert(PAPI_event_name_to_code("PERF_COUNT_HW_STALLED_CYCLES_FRONTEND",&(events[2])) == PAPI_OK);
|
|
|
|
|
int events[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
|
|
|
|
|
//assert(PAPI_event_name_to_code("ICACHE:IFETCH_STALL",&(events[2])) == PAPI_OK);
|
|
|
|
|
//assert(PAPI_event_name_to_code("MACHINE_CLEARS:e",&(events[3])) == PAPI_OK);
|
1. Whew, finally debugged the source of performance issues with PairHMM
JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
2014-02-06 09:09:57 +08:00
|
|
|
char* eventnames[NUM_PAPI_COUNTERS]= { "instructions", "cycles", "fp_assists", "idq_ms_cycles" };
|
2014-02-05 08:27:29 +08:00
|
|
|
assert(PAPI_event_name_to_code("ix86arch::INSTRUCTION_RETIRED",&(events[0])) == PAPI_OK);
|
|
|
|
|
assert(PAPI_event_name_to_code("UNHALTED_REFERENCE_CYCLES",&(events[1])) == PAPI_OK);
|
1. Whew, finally debugged the source of performance issues with PairHMM
JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
2014-02-06 09:09:57 +08:00
|
|
|
assert(PAPI_event_name_to_code("FP_ASSIST:ANY", &(events[2])) == PAPI_OK);
|
|
|
|
|
assert(PAPI_event_name_to_code("IDQ:MS_UOPS_CYCLES", &(events[3])) == PAPI_OK);
|
2014-02-05 08:27:29 +08:00
|
|
|
long long values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
|
|
|
|
|
long long accum_values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
|
|
|
|
|
|
|
|
|
|
#endif
|
2014-02-07 01:32:56 +08:00
|
|
|
#define BATCH_SIZE 10000
|
2014-02-05 08:27:29 +08:00
|
|
|
bool use_old_read_testcase = true;
|
|
|
|
|
unsigned chunk_size = 100;
|
|
|
|
|
std::ifstream ifptr;
|
|
|
|
|
FILE* fptr = 0;
|
|
|
|
|
if(use_old_read_testcase)
|
|
|
|
|
{
|
|
|
|
|
fptr = fopen(filename,"r");
|
|
|
|
|
if(fptr == 0)
|
|
|
|
|
cerr << "Could not open file "<<filename<<"\n";
|
|
|
|
|
assert(fptr);
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
ifptr.open(filename);
|
|
|
|
|
assert(ifptr.is_open());
|
|
|
|
|
}
|
|
|
|
|
vector<testcase> tc_vector;
|
|
|
|
|
tc_vector.clear();
|
|
|
|
|
vector<double> results_vec;
|
|
|
|
|
results_vec.clear();
|
|
|
|
|
vector<double> baseline_results;
|
|
|
|
|
baseline_results.clear();
|
|
|
|
|
|
|
|
|
|
bool all_ok = true;
|
|
|
|
|
uint64_t total_time = 0;
|
|
|
|
|
uint64_t baseline_time = 0;
|
|
|
|
|
unsigned total_count = 0;
|
|
|
|
|
unsigned num_testcases = 0;
|
|
|
|
|
//unsigned curr_batch_size = rand()%BATCH_SIZE + 4; //min batch size
|
|
|
|
|
unsigned curr_batch_size = BATCH_SIZE;
|
|
|
|
|
|
|
|
|
|
testcase tc_in;
|
|
|
|
|
int break_value = 0;
|
1. Whew, finally debugged the source of performance issues with PairHMM
JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
2014-02-06 09:09:57 +08:00
|
|
|
uint64_t fp_single_exceptions_reexecute = 0;
|
|
|
|
|
uint64_t fp_single_exceptions_continue = 0;
|
|
|
|
|
uint64_t num_double_executions = 0;
|
2014-02-05 08:27:29 +08:00
|
|
|
while(1)
|
|
|
|
|
{
|
|
|
|
|
break_value = use_old_read_testcase ? read_testcase(&tc_in, fptr) :
|
|
|
|
|
read_mod_testcase(ifptr, &tc_in, true);
|
|
|
|
|
tc_vector.push_back(tc_in);
|
|
|
|
|
if(break_value >= 0)
|
|
|
|
|
++num_testcases;
|
|
|
|
|
if(num_testcases == curr_batch_size || (break_value < 0 && num_testcases > 0))
|
|
|
|
|
{
|
|
|
|
|
results_vec.resize(tc_vector.size());
|
|
|
|
|
baseline_results.resize(tc_vector.size());
|
|
|
|
|
|
|
|
|
|
get_time();
|
|
|
|
|
#ifdef USE_PAPI
|
|
|
|
|
assert(PAPI_start_counters(events, NUM_PAPI_COUNTERS) == PAPI_OK);
|
|
|
|
|
#endif
|
|
|
|
|
#pragma omp parallel for schedule(dynamic,chunk_size) num_threads(12)
|
|
|
|
|
for(unsigned i=0;i<num_testcases;++i)
|
|
|
|
|
{
|
|
|
|
|
double result = 0;
|
1. Whew, finally debugged the source of performance issues with PairHMM
JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
2014-02-06 09:09:57 +08:00
|
|
|
#ifdef COUNT_EXCEPTIONS
|
|
|
|
|
fexcept_t flagp = 0;
|
|
|
|
|
feclearexcept(FE_ALL_EXCEPT | __FE_DENORM);
|
|
|
|
|
#endif
|
2014-02-05 08:27:29 +08:00
|
|
|
float result_avxf = g_compute_full_prob_float(&(tc_vector[i]), 0);
|
1. Whew, finally debugged the source of performance issues with PairHMM
JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
2014-02-06 09:09:57 +08:00
|
|
|
//CONVERT_AND_PRINT(result_avxf);
|
|
|
|
|
#ifdef COUNT_EXCEPTIONS
|
|
|
|
|
STORE_FP_EXCEPTIONS(flagp, exceptions_array);
|
|
|
|
|
bool fp_exception = ((flagp & (FE_UNDERFLOW|FE_OVERFLOW|FE_INVALID)) != 0);
|
|
|
|
|
#endif
|
|
|
|
|
#ifdef CHECK_UNDERFLOW
|
|
|
|
|
if (result_avxf < MIN_ACCEPTED)
|
|
|
|
|
#else
|
2014-02-07 01:32:56 +08:00
|
|
|
if(false)
|
1. Whew, finally debugged the source of performance issues with PairHMM
JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
2014-02-06 09:09:57 +08:00
|
|
|
#endif
|
2014-02-07 01:32:56 +08:00
|
|
|
{
|
1. Whew, finally debugged the source of performance issues with PairHMM
JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
2014-02-06 09:09:57 +08:00
|
|
|
#ifdef COUNT_EXCEPTIONS
|
2014-02-07 01:32:56 +08:00
|
|
|
if(fp_exception)
|
|
|
|
|
++fp_single_exceptions_reexecute;
|
1. Whew, finally debugged the source of performance issues with PairHMM
JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
2014-02-06 09:09:57 +08:00
|
|
|
#endif
|
2014-02-07 01:32:56 +08:00
|
|
|
double result_avxd = g_compute_full_prob_double(&(tc_vector[i]), 0);
|
|
|
|
|
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
|
|
|
|
|
++num_double_executions;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
1. Whew, finally debugged the source of performance issues with PairHMM
JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
2014-02-06 09:09:57 +08:00
|
|
|
#ifdef COUNT_EXCEPTIONS
|
2014-02-07 01:32:56 +08:00
|
|
|
if(fp_exception)
|
|
|
|
|
++fp_single_exceptions_continue;
|
1. Whew, finally debugged the source of performance issues with PairHMM
JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
2014-02-06 09:09:57 +08:00
|
|
|
#endif
|
2014-02-07 01:32:56 +08:00
|
|
|
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
|
|
|
|
|
}
|
2014-02-05 08:27:29 +08:00
|
|
|
results_vec[i] = result;
|
|
|
|
|
}
|
|
|
|
|
#ifdef USE_PAPI
|
|
|
|
|
//assert(PAPI_accum_counters(values, NUM_PAPI_COUNTERS) == PAPI_OK);
|
|
|
|
|
assert(PAPI_stop_counters(values, NUM_PAPI_COUNTERS) == PAPI_OK);
|
|
|
|
|
#endif
|
|
|
|
|
total_time += get_time();
|
|
|
|
|
#ifdef USE_PAPI
|
|
|
|
|
for(unsigned k=0;k<NUM_PAPI_COUNTERS;++k)
|
|
|
|
|
accum_values[k] += values[k];
|
|
|
|
|
#endif
|
|
|
|
|
|
1. Whew, finally debugged the source of performance issues with PairHMM
JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
2014-02-06 09:09:57 +08:00
|
|
|
#ifdef CHECK_RESULTS
|
2014-02-05 08:27:29 +08:00
|
|
|
#pragma omp parallel for schedule(dynamic,chunk_size)
|
|
|
|
|
for(unsigned i=0;i<num_testcases;++i)
|
|
|
|
|
{
|
|
|
|
|
testcase& tc = tc_vector[i];
|
|
|
|
|
float result_avxf = compute_full_prob<float>(&tc);
|
|
|
|
|
double result = 0;
|
|
|
|
|
if (result_avxf < MIN_ACCEPTED) {
|
|
|
|
|
double result_avxd = compute_full_prob<double>(&tc);
|
|
|
|
|
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
|
|
|
|
|
baseline_results[i] = result;
|
|
|
|
|
}
|
|
|
|
|
baseline_time += get_time();
|
|
|
|
|
for(unsigned i=0;i<num_testcases;++i)
|
|
|
|
|
{
|
|
|
|
|
double baseline_result = baseline_results[i];
|
|
|
|
|
double abs_error = fabs(baseline_result-results_vec[i]);
|
|
|
|
|
double rel_error = (baseline_result != 0) ? fabs(abs_error/baseline_result) : 0;
|
|
|
|
|
if(abs_error > 1e-5 && rel_error > 1e-5)
|
|
|
|
|
{
|
|
|
|
|
cout << "Line "<<total_count+i<< " " << std::scientific << baseline_result << " "<<results_vec[i]<<"\n";
|
|
|
|
|
all_ok = false;
|
|
|
|
|
}
|
|
|
|
|
}
|
1. Whew, finally debugged the source of performance issues with PairHMM
JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
2014-02-06 09:09:57 +08:00
|
|
|
#else
|
|
|
|
|
all_ok = false;
|
2014-02-05 08:27:29 +08:00
|
|
|
#endif
|
|
|
|
|
for(unsigned i=0;i<num_testcases;++i)
|
|
|
|
|
{
|
2014-02-07 01:32:56 +08:00
|
|
|
free(tc_vector[i].rs);
|
|
|
|
|
free(tc_vector[i].hap);
|
|
|
|
|
free(tc_vector[i].q);
|
|
|
|
|
free(tc_vector[i].i);
|
|
|
|
|
free(tc_vector[i].d);
|
|
|
|
|
free(tc_vector[i].c);
|
2014-02-05 08:27:29 +08:00
|
|
|
}
|
|
|
|
|
total_count += num_testcases;
|
|
|
|
|
num_testcases = 0;
|
|
|
|
|
tc_vector.clear();
|
|
|
|
|
baseline_results.clear();
|
|
|
|
|
results_vec.clear();
|
|
|
|
|
//curr_batch_size = rand()%BATCH_SIZE + 4; //min batch size
|
|
|
|
|
curr_batch_size = BATCH_SIZE;
|
|
|
|
|
}
|
2014-02-07 01:32:56 +08:00
|
|
|
if(break_value < 0)
|
|
|
|
|
break;
|
2014-02-05 08:27:29 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
baseline_results.clear();
|
|
|
|
|
results_vec.clear();
|
|
|
|
|
tc_vector.clear();
|
|
|
|
|
if(all_ok)
|
|
|
|
|
cout << "All outputs acceptable\n";
|
|
|
|
|
#ifdef USE_PAPI
|
|
|
|
|
for(unsigned i=0;i<NUM_PAPI_COUNTERS;++i)
|
|
|
|
|
cout << eventnames[i] << " : "<<accum_values[i]<<"\n";
|
|
|
|
|
#endif
|
|
|
|
|
cout << "Total vector time "<< (total_time*1e-9) << " baseline time "<<baseline_time*1e-9<<"\n";
|
|
|
|
|
cout.flush();
|
|
|
|
|
fflush(stdout);
|
|
|
|
|
if(use_old_read_testcase)
|
|
|
|
|
fclose(fptr);
|
|
|
|
|
else
|
|
|
|
|
ifptr.close();
|
1. Whew, finally debugged the source of performance issues with PairHMM
JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
2014-02-06 09:09:57 +08:00
|
|
|
#ifdef COUNT_EXCEPTIONS
|
|
|
|
|
cout << "Exceptions "
|
|
|
|
|
<<"invalid : "<<exceptions_array[FE_INVALID]<< " "
|
|
|
|
|
<<"denormal : "<<exceptions_array[__FE_DENORM]<< " "
|
|
|
|
|
<<"div_by_0 : "<<exceptions_array[FE_DIVBYZERO]<< " "
|
|
|
|
|
<<"overflow : "<<exceptions_array[FE_OVERFLOW]<< " "
|
|
|
|
|
<<"underflow : "<<exceptions_array[FE_UNDERFLOW]<< "\n";
|
|
|
|
|
cout << "Single precision FP exceptions continuations "<<fp_single_exceptions_continue<<" re-executions "<<fp_single_exceptions_reexecute<<"\n";
|
|
|
|
|
#endif
|
|
|
|
|
cout << "Num double executions "<<num_double_executions<<"\n";
|
2014-02-05 08:27:29 +08:00
|
|
|
|
1. Whew, finally debugged the source of performance issues with PairHMM
JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
2014-02-06 09:09:57 +08:00
|
|
|
//fclose(g_debug_fptr);
|
2014-02-05 08:27:29 +08:00
|
|
|
}
|