2014-03-04 01:04:00 +08:00
|
|
|
/*Copyright (c) 2012 The Broad Institute
|
|
|
|
|
|
|
|
|
|
*Permission is hereby granted, free of charge, to any person
|
|
|
|
|
*obtaining a copy of this software and associated documentation
|
|
|
|
|
*files (the "Software"), to deal in the Software without
|
|
|
|
|
*restriction, including without limitation the rights to use,
|
|
|
|
|
*copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
|
*copies of the Software, and to permit persons to whom the
|
|
|
|
|
*Software is furnished to do so, subject to the following
|
|
|
|
|
*conditions:
|
|
|
|
|
|
|
|
|
|
*The above copyright notice and this permission notice shall be
|
|
|
|
|
*included in all copies or substantial portions of the Software.
|
|
|
|
|
|
|
|
|
|
*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
|
|
|
*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
|
|
|
*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
|
|
|
*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
|
|
|
*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
|
|
|
*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
|
|
|
|
*THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
2014-01-23 02:52:41 +08:00
|
|
|
#include "utils.h"
|
2014-03-18 02:42:19 +08:00
|
|
|
#include "LoadTimeInitializer.h"
|
2014-01-23 02:52:41 +08:00
|
|
|
using namespace std;
|
2014-02-07 06:35:32 +08:00
|
|
|
char* LoadTimeInitializerStatsNames[] =
|
|
|
|
|
{
|
|
|
|
|
"num_regions",
|
|
|
|
|
"num_reads",
|
|
|
|
|
"num_haplotypes",
|
|
|
|
|
"num_testcases",
|
|
|
|
|
"num_double_invocations",
|
|
|
|
|
"haplotype_length",
|
|
|
|
|
"readlength",
|
|
|
|
|
"product_read_length_haplotype_length",
|
|
|
|
|
"dummy"
|
|
|
|
|
};
|
|
|
|
|
|
2014-01-23 02:52:41 +08:00
|
|
|
LoadTimeInitializer g_load_time_initializer;
|
|
|
|
|
|
|
|
|
|
LoadTimeInitializer::LoadTimeInitializer() //will be called when library is loaded
|
|
|
|
|
{
|
2014-03-18 02:42:19 +08:00
|
|
|
#if (defined(__GNUC__) || defined(__GNUG__)) && !defined(__INTEL_COMPILER)
|
|
|
|
|
//compiles only with gcc >= 4.8
|
|
|
|
|
__builtin_cpu_init();
|
|
|
|
|
#endif
|
2014-01-23 02:52:41 +08:00
|
|
|
ConvertChar::init();
|
1. Whew, finally debugged the source of performance issues with PairHMM
JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
2014-02-06 09:09:57 +08:00
|
|
|
#ifndef DISABLE_FTZ
|
2014-02-07 03:01:33 +08:00
|
|
|
//Very important to get good performance on Intel processors
|
|
|
|
|
//Function: enabling FTZ converts denormals to 0 in hardware
|
|
|
|
|
//Denormals cause microcode to insert uops into the core causing big slowdown
|
1. Whew, finally debugged the source of performance issues with PairHMM
JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
2014-02-06 09:09:57 +08:00
|
|
|
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
|
2014-05-07 01:59:24 +08:00
|
|
|
//cout << "FTZ enabled - may decrease accuracy if denormal numbers encountered\n";
|
2014-02-07 01:32:56 +08:00
|
|
|
#else
|
|
|
|
|
cout << "FTZ is not set - may slow down performance if denormal numbers encountered\n";
|
1. Whew, finally debugged the source of performance issues with PairHMM
JNI. See copied text from email below.
2. This commit contains all the code used in profiling, detecting FP
exceptions, dumping intermediate results. All flagged off using ifdefs,
but it's there.
--------------Text from email
As we discussed before, it's the denormal numbers that are causing the
slowdown - the core executes some microcode uops (called FP assists)
when denormal numbers are detected for FP operations (even un-vectorized
code).
The C++ compiler by default enables flush to zero (FTZ) - when set, the
hardware simply converts denormal numbers to 0. The Java binary
(executable provided by Oracle, not the native library) seems to be
compiled without FTZ (sensible choice, they want to be conservative).
Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++
slows down the C++ sandbox performance to the JNI version (fortunately,
the reverse also holds :)).
Not sure how to show the overhead for these FP assists easily - measured
a couple of counters.
FP_ASSISTS:ANY - shows number of uops executed as part of the FP
assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is
disabled this value is around 203540557 (both C++ and JNI)
IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing
uops when the microcode sequencing engine was busy. When FTZ is enabled,
this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled
this value is around 4.31B cycles (both C++ and JNI). This number is
still small with respect to total cycles (~40B), but it only reflects
the cycles in the decode stage. The total overhead of the microcode
assist ops could be larger.
As suggested by Mustafa, I compared intermediate values (matrices M,X,Y)
and final output of compute_full_prob. The values produced by C++ and
Java are identical to the last bit (as long as both use FTZ or no-FTZ).
Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ,
there are differences for very small values (denormal numbers).
Examples:
Diff values 1.952970E-33 1.952967E-33
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
Diff values 1.135071E-32 1.135070E-32
For this test case (low coverage NA12878), all these values would be
recomputed using the double precision version. Enabling FTZ should be
fine.
-------------------End text from email
2014-02-06 09:09:57 +08:00
|
|
|
#endif
|
2014-02-07 06:35:32 +08:00
|
|
|
//Profiling: times for compute and transfer (either bytes copied or pointers copied)
|
2014-01-23 02:52:41 +08:00
|
|
|
m_compute_time = 0;
|
|
|
|
|
m_data_transfer_time = 0;
|
2014-01-28 06:32:44 +08:00
|
|
|
m_bytes_copied = 0;
|
2014-01-23 02:52:41 +08:00
|
|
|
|
2014-02-07 06:35:32 +08:00
|
|
|
//Initialize profiling counters
|
|
|
|
|
for(unsigned i=0;i<TOTAL_NUMBER_STATS;++i)
|
|
|
|
|
{
|
|
|
|
|
m_sum_stats[i] = 0;
|
|
|
|
|
m_sum_square_stats[i] = 0;
|
|
|
|
|
m_max_stats[i] = 0;
|
|
|
|
|
m_min_stats[i] = 0xFFFFFFFFFFFFFFFFull;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//for debug dump
|
2014-01-23 02:52:41 +08:00
|
|
|
m_filename_to_fptr.clear();
|
2014-02-07 06:35:32 +08:00
|
|
|
m_written_files_set.clear();
|
2014-01-23 02:52:41 +08:00
|
|
|
|
|
|
|
|
initialize_function_pointers();
|
2014-02-26 13:44:20 +08:00
|
|
|
|
2014-03-01 08:08:28 +08:00
|
|
|
//Initialize static members of class
|
|
|
|
|
Context<float>::initializeStaticMembers();
|
|
|
|
|
Context<double>::initializeStaticMembers();
|
|
|
|
|
|
2014-01-23 02:52:41 +08:00
|
|
|
cout.flush();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void LoadTimeInitializer::print_profiling()
|
|
|
|
|
{
|
2014-02-07 06:35:32 +08:00
|
|
|
double mean = 0;
|
|
|
|
|
double variance = 0;
|
|
|
|
|
uint64_t denominator = 1;
|
|
|
|
|
cout << "Time spent in compute_testcases "<<m_compute_time*1e-9<<"\n";
|
|
|
|
|
cout << "Time spent in data transfer (Java <--> C++) "<<m_data_transfer_time*1e-9<<"\n";
|
|
|
|
|
|
|
|
|
|
cout << "\nHC input stats\nstat_name,sum,sum_square,mean,variance,min,max\n";
|
|
|
|
|
for(unsigned i=0;i<TOTAL_NUMBER_STATS;++i)
|
|
|
|
|
{
|
|
|
|
|
cout << LoadTimeInitializerStatsNames[i];
|
|
|
|
|
cout << "," << m_sum_stats[i];
|
|
|
|
|
cout << "," << std::scientific << m_sum_square_stats[i];
|
|
|
|
|
denominator = 1;
|
|
|
|
|
switch(i)
|
|
|
|
|
{
|
|
|
|
|
case NUM_READS_IDX:
|
|
|
|
|
case NUM_HAPLOTYPES_IDX:
|
|
|
|
|
case NUM_TESTCASES_IDX:
|
|
|
|
|
denominator = m_sum_stats[NUM_REGIONS_IDX];
|
|
|
|
|
break;
|
|
|
|
|
case HAPLOTYPE_LENGTH_IDX:
|
2014-02-11 02:11:03 +08:00
|
|
|
denominator = m_sum_stats[NUM_HAPLOTYPES_IDX];
|
|
|
|
|
break;
|
2014-02-07 06:35:32 +08:00
|
|
|
case READ_LENGTH_IDX:
|
2014-02-11 02:11:03 +08:00
|
|
|
denominator = m_sum_stats[NUM_READS_IDX];
|
|
|
|
|
break;
|
2014-02-07 06:35:32 +08:00
|
|
|
case PRODUCT_READ_LENGTH_HAPLOTYPE_LENGTH_IDX:
|
|
|
|
|
denominator = m_sum_stats[NUM_TESTCASES_IDX];
|
|
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
denominator = 1;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
mean = ((double)m_sum_stats[i])/denominator;
|
|
|
|
|
cout << "," << std::scientific << mean;
|
|
|
|
|
variance = (m_sum_square_stats[i]/denominator) - (mean*mean); //E(X^2)-(E(X))^2
|
|
|
|
|
cout << "," << std::scientific << variance;
|
|
|
|
|
cout << "," << m_min_stats[i];
|
|
|
|
|
cout << "," << m_max_stats[i];
|
|
|
|
|
cout << "\n";
|
|
|
|
|
}
|
|
|
|
|
cout << "\n";
|
2014-01-23 02:52:41 +08:00
|
|
|
cout.flush();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void LoadTimeInitializer::debug_dump(string filename, string s, bool to_append, bool add_newline)
|
|
|
|
|
{
|
|
|
|
|
map<string, ofstream*>::iterator mI = m_filename_to_fptr.find(filename);
|
|
|
|
|
ofstream* fptr = 0;
|
|
|
|
|
if(mI == m_filename_to_fptr.end())
|
|
|
|
|
{
|
|
|
|
|
m_filename_to_fptr[filename] = new ofstream();
|
|
|
|
|
fptr = m_filename_to_fptr[filename];
|
2014-02-07 06:35:32 +08:00
|
|
|
//File never seen before
|
|
|
|
|
if(m_written_files_set.find(filename) == m_written_files_set.end())
|
|
|
|
|
{
|
|
|
|
|
to_append = false;
|
|
|
|
|
m_written_files_set.insert(filename);
|
|
|
|
|
}
|
2014-01-23 02:52:41 +08:00
|
|
|
fptr->open(filename.c_str(), to_append ? ios::app : ios::out);
|
|
|
|
|
assert(fptr->is_open());
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
fptr = (*mI).second;
|
|
|
|
|
//ofstream fptr;
|
|
|
|
|
//fptr.open(filename.c_str(), to_append ? ofstream::app : ofstream::out);
|
|
|
|
|
(*fptr) << s;
|
|
|
|
|
if(add_newline)
|
|
|
|
|
(*fptr) << "\n";
|
|
|
|
|
//fptr.close();
|
|
|
|
|
}
|
|
|
|
|
void LoadTimeInitializer::debug_close()
|
|
|
|
|
{
|
|
|
|
|
for(map<string,ofstream*>::iterator mB = m_filename_to_fptr.begin(), mE = m_filename_to_fptr.end();
|
|
|
|
|
mB != mE;mB++)
|
|
|
|
|
{
|
|
|
|
|
(*mB).second->close();
|
|
|
|
|
delete (*mB).second;
|
|
|
|
|
}
|
|
|
|
|
m_filename_to_fptr.clear();
|
|
|
|
|
}
|
|
|
|
|
|
2014-02-05 08:27:29 +08:00
|
|
|
void LoadTimeInitializer::dump_sandbox(testcase& tc, unsigned tc_idx, unsigned numReads, unsigned numHaplotypes)
|
2014-01-30 04:10:29 +08:00
|
|
|
{
|
2014-01-31 04:08:06 +08:00
|
|
|
unsigned haplotypeLength = tc.haplen;
|
|
|
|
|
unsigned readLength = tc.rslen;
|
2014-01-30 04:10:29 +08:00
|
|
|
ofstream& dumpFptr = m_sandbox_fptr;
|
|
|
|
|
for(unsigned k=0;k<haplotypeLength;++k)
|
2014-01-31 04:08:06 +08:00
|
|
|
dumpFptr<<(char)(tc.hap[k]);
|
2014-01-30 04:10:29 +08:00
|
|
|
dumpFptr<<" ";
|
|
|
|
|
for(unsigned k=0;k<readLength;++k)
|
|
|
|
|
dumpFptr<<(char)(tc.rs[k]);
|
|
|
|
|
dumpFptr<<" ";
|
|
|
|
|
for(unsigned k=0;k<readLength;++k)
|
|
|
|
|
dumpFptr<<(char)(tc.q[k]+33);
|
|
|
|
|
dumpFptr<<" ";
|
|
|
|
|
for(unsigned k=0;k<readLength;++k)
|
|
|
|
|
dumpFptr<<(char)(tc.i[k]+33);
|
|
|
|
|
dumpFptr<<" ";
|
|
|
|
|
for(unsigned k=0;k<readLength;++k)
|
|
|
|
|
dumpFptr<<(char)(tc.d[k]+33);
|
|
|
|
|
dumpFptr<<" ";
|
|
|
|
|
for(unsigned k=0;k<readLength;++k)
|
|
|
|
|
dumpFptr<<(char)(tc.c[k]+33);
|
2014-02-05 08:27:29 +08:00
|
|
|
if(tc_idx == 0) //new region
|
|
|
|
|
dumpFptr << " "<< numReads << " "<<numHaplotypes;
|
2014-01-30 04:10:29 +08:00
|
|
|
dumpFptr<<"\n";
|
|
|
|
|
}
|
2014-02-07 06:35:32 +08:00
|
|
|
|
|
|
|
|
void LoadTimeInitializer::update_stat(LoadTimeInitializerStatsEnum stat_idx, uint64_t value)
|
|
|
|
|
{
|
|
|
|
|
m_sum_stats[stat_idx] += value;
|
|
|
|
|
double v = value;
|
|
|
|
|
m_sum_square_stats[stat_idx] += (v*v);
|
|
|
|
|
m_max_stats[stat_idx] = std::max(m_max_stats[stat_idx], value);
|
|
|
|
|
m_min_stats[stat_idx] = std::min(m_min_stats[stat_idx], value);
|
|
|
|
|
}
|