Includes all debug code for obtaining profiling info
This commit is contained in:
parent
5c7427e48c
commit
6d4d776633
|
|
@ -86,11 +86,13 @@ void LoadTimeInitializer::debug_close()
|
|||
m_filename_to_fptr.clear();
|
||||
}
|
||||
|
||||
void LoadTimeInitializer::dump_sandbox(unsigned haplotypeLength, unsigned readLength, char* haplotypeBasesArray, testcase& tc)
|
||||
void LoadTimeInitializer::dump_sandbox(testcase& tc)
|
||||
{
|
||||
unsigned haplotypeLength = tc.haplen;
|
||||
unsigned readLength = tc.rslen;
|
||||
ofstream& dumpFptr = m_sandbox_fptr;
|
||||
for(unsigned k=0;k<haplotypeLength;++k)
|
||||
dumpFptr<<(char)(haplotypeBasesArray[k]);
|
||||
dumpFptr<<(char)(tc.hap[k]);
|
||||
dumpFptr<<" ";
|
||||
for(unsigned k=0;k<readLength;++k)
|
||||
dumpFptr<<(char)(tc.rs[k]);
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ class LoadTimeInitializer
|
|||
void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true);
|
||||
void debug_close();
|
||||
|
||||
void dump_sandbox(unsigned haplotypeLength, unsigned readLength, char* haplotypeBasesArray, testcase& tc);
|
||||
void dump_sandbox(testcase& tc);
|
||||
void open_sandbox() { m_sandbox_fptr.open("sandbox.txt", std::ios::app); }
|
||||
void close_sandbox() { m_sandbox_fptr.close(); }
|
||||
|
||||
|
|
|
|||
|
|
@ -54,7 +54,7 @@ pairhmm-template-main: pairhmm-template-main.o $(COMMON_OBJECTS)
|
|||
$(CXX) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)
|
||||
|
||||
libVectorLoglessPairHMM.so: $(LIBOBJECTS)
|
||||
$(CXX) $(OMPLFLAGS) -shared -o $@ $(LIBOBJECTS) ${LDFLAGS} -Wl,-Bstatic -limf -lsvml -lirng -Wl,-Bdynamic #-lintlc
|
||||
$(CXX) $(OMPLFLAGS) -shared -static-intel -o $@ $(LIBOBJECTS) ${LDFLAGS}
|
||||
|
||||
|
||||
$(OBJECTS): %.o: %.cc
|
||||
|
|
|
|||
|
|
@ -211,12 +211,29 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless
|
|||
jbyte* haplotypeBasesArray = haplotypeBasesArrayVector[j].second;
|
||||
tc_array[tc_idx].rslen = (int)readLength;
|
||||
tc_array[tc_idx].haplen = (int)haplotypeLength;
|
||||
#if 0
|
||||
tc_array[tc_idx].hap = (char*)haplotypeBasesArray;
|
||||
tc_array[tc_idx].rs = (char*)readBasesArray;
|
||||
tc_array[tc_idx].q = (char*)readQualsArray;
|
||||
tc_array[tc_idx].i = (char*)insertionGOPArray;
|
||||
tc_array[tc_idx].d = (char*)deletionGOPArray;
|
||||
tc_array[tc_idx].c = (char*)overallGCPArray;
|
||||
#endif
|
||||
//#define MEMCPY_HACK
|
||||
#ifdef MEMCPY_HACK
|
||||
tc_array[tc_idx].hap = new char[haplotypeLength];
|
||||
tc_array[tc_idx].rs = new char[readLength];
|
||||
tc_array[tc_idx].q = new char[readLength];
|
||||
tc_array[tc_idx].i = new char[readLength];
|
||||
tc_array[tc_idx].d = new char[readLength];
|
||||
tc_array[tc_idx].c = new char[readLength];
|
||||
memcpy(tc_array[tc_idx].hap, haplotypeBasesArray, haplotypeLength);
|
||||
memcpy(tc_array[tc_idx].rs, readBasesArray, readLength);
|
||||
memcpy(tc_array[tc_idx].q, readQualsArray, readLength);
|
||||
memcpy(tc_array[tc_idx].i, insertionGOPArray, readLength);
|
||||
memcpy(tc_array[tc_idx].d, deletionGOPArray, readLength);
|
||||
memcpy(tc_array[tc_idx].c, overallGCPArray, readLength);
|
||||
#endif
|
||||
#if 0
|
||||
tc_array[tc_idx].hap = (char*)all_arrays[0];
|
||||
tc_array[tc_idx].rs = (char*)all_arrays[1];
|
||||
|
|
@ -227,7 +244,7 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless
|
|||
#endif
|
||||
|
||||
#ifdef DUMP_TO_SANDBOX
|
||||
g_load_time_initializer.dump_sandbox(haplotypeLength, readLength, (char*)haplotypeBasesArray, tc_array[tc_idx]);
|
||||
g_load_time_initializer.dump_sandbox(tc_array[tc_idx]);
|
||||
#endif
|
||||
#ifdef DO_PROFILING
|
||||
g_load_time_initializer.m_sumProductReadLengthHaplotypeLength += (readLength*haplotypeLength);
|
||||
|
|
@ -262,7 +279,8 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless
|
|||
#endif
|
||||
#ifdef DO_PROFILING
|
||||
g_load_time_initializer.m_bytes_copied += (is_copy ? numTestCases*sizeof(double) : 0);
|
||||
start_time = get_time();
|
||||
struct timespec prev_time;
|
||||
clock_gettime(CLOCK_REALTIME, &prev_time);
|
||||
#endif
|
||||
#pragma omp parallel for schedule (dynamic,10) private(tc_idx) num_threads(maxNumThreadsToUse)
|
||||
for(tc_idx=0;tc_idx<numTestCases;++tc_idx)
|
||||
|
|
@ -291,7 +309,7 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless
|
|||
likelihoodDoubleArray[tc_idx] = result;
|
||||
}
|
||||
#ifdef DO_PROFILING
|
||||
g_load_time_initializer.m_compute_time += get_time();
|
||||
g_load_time_initializer.m_compute_time += diff_time(prev_time);
|
||||
#endif
|
||||
#ifdef DEBUG
|
||||
for(tc_idx=0;tc_idx<numTestCases;++tc_idx)
|
||||
|
|
@ -303,7 +321,19 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless
|
|||
start_time = get_time();
|
||||
#endif
|
||||
RELEASE_DOUBLE_ARRAY_ELEMENTS(likelihoodArray, likelihoodDoubleArray, 0); //release mode 0, copy back results to Java memory
|
||||
|
||||
|
||||
#ifdef MEMCPY_HACK
|
||||
for(tc_idx=0;tc_idx<numTestCases;++tc_idx)
|
||||
{
|
||||
delete tc_array[tc_idx].hap;
|
||||
delete tc_array[tc_idx].rs;
|
||||
delete tc_array[tc_idx].q;
|
||||
delete tc_array[tc_idx].i;
|
||||
delete tc_array[tc_idx].d;
|
||||
delete tc_array[tc_idx].c;
|
||||
}
|
||||
#endif
|
||||
|
||||
//Release read arrays first
|
||||
for(int i=readBasesArrayVector.size()-1;i>=0;--i)//note the order - reverse of GET
|
||||
{
|
||||
|
|
|
|||
|
|
@ -8,14 +8,13 @@
|
|||
|
||||
using namespace std;
|
||||
|
||||
|
||||
#define BATCH_SIZE 5
|
||||
#define RUN_HYBRID
|
||||
|
||||
|
||||
|
||||
vector<double> results_vec;
|
||||
vector<testcase> tc_vector;
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
#define BATCH_SIZE 5
|
||||
if(argc < 2)
|
||||
{
|
||||
cerr << "Needs path to input file as argument\n";
|
||||
|
|
@ -41,16 +40,12 @@ int main(int argc, char** argv)
|
|||
assert(ifptr.is_open());
|
||||
}
|
||||
|
||||
vector<testcase> tc_vector;
|
||||
tc_vector.clear();
|
||||
tc_vector.resize(BATCH_SIZE+4);
|
||||
vector<double> results_vec;
|
||||
results_vec.clear();
|
||||
results_vec.resize(tc_vector.size());
|
||||
vector<double> baseline_results;
|
||||
baseline_results.clear();
|
||||
baseline_results.resize(tc_vector.size());
|
||||
|
||||
|
||||
bool all_ok = true;
|
||||
uint64_t total_time = 0;
|
||||
uint64_t baseline_time = 0;
|
||||
|
|
@ -58,76 +53,92 @@ int main(int argc, char** argv)
|
|||
unsigned num_testcases = 0;
|
||||
//unsigned curr_batch_size = rand()%BATCH_SIZE + 4; //min batch size
|
||||
unsigned curr_batch_size = BATCH_SIZE;
|
||||
uint64_t product = 0;
|
||||
|
||||
testcase tc_in;
|
||||
int break_value = 0;
|
||||
tc_vector.clear();
|
||||
while(1)
|
||||
{
|
||||
int break_value = use_old_read_testcase ? read_testcase(&(tc_vector[num_testcases]), fptr) :
|
||||
read_mod_testcase(ifptr,&(tc_vector[num_testcases]),true);
|
||||
break_value = use_old_read_testcase ? read_testcase(&tc_in, fptr) :
|
||||
read_mod_testcase(ifptr, &tc_in, true);
|
||||
tc_vector.push_back(tc_in);
|
||||
if(break_value >= 0)
|
||||
++num_testcases;
|
||||
if(num_testcases == curr_batch_size || (break_value < 0 && num_testcases > 0))
|
||||
{
|
||||
get_time();
|
||||
#pragma omp parallel for schedule(dynamic,chunk_size) num_threads(12)
|
||||
for(unsigned i=0;i<num_testcases;++i)
|
||||
{
|
||||
testcase& tc = tc_vector[i];
|
||||
float result_avxf = g_compute_full_prob_float(&tc, 0);
|
||||
double result = 0;
|
||||
if (result_avxf < MIN_ACCEPTED) {
|
||||
double result_avxd = g_compute_full_prob_double(&tc, 0);
|
||||
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
|
||||
}
|
||||
else
|
||||
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
|
||||
results_vec[i] = result;
|
||||
}
|
||||
total_time += get_time();
|
||||
#pragma omp parallel for schedule(dynamic,chunk_size)
|
||||
for(unsigned i=0;i<num_testcases;++i)
|
||||
{
|
||||
testcase& tc = tc_vector[i];
|
||||
float result_avxf = compute_full_prob<float>(&tc);
|
||||
double result = 0;
|
||||
if (result_avxf < MIN_ACCEPTED) {
|
||||
double result_avxd = compute_full_prob<double>(&tc);
|
||||
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
|
||||
}
|
||||
else
|
||||
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
|
||||
baseline_results[i] = result;
|
||||
}
|
||||
baseline_time += get_time();
|
||||
for(unsigned i=0;i<num_testcases;++i)
|
||||
{
|
||||
double baseline_result = baseline_results[i];
|
||||
double abs_error = fabs(baseline_result-results_vec[i]);
|
||||
double rel_error = (baseline_result != 0) ? fabs(abs_error/baseline_result) : 0;
|
||||
if(abs_error > 1e-5 && rel_error > 1e-5)
|
||||
{
|
||||
cout << "Line "<<total_count+i<< " " << std::scientific << baseline_result << " "<<results_vec[i]<<"\n";
|
||||
all_ok = false;
|
||||
}
|
||||
delete tc_vector[i].rs;
|
||||
delete tc_vector[i].hap;
|
||||
delete tc_vector[i].q;
|
||||
delete tc_vector[i].i;
|
||||
delete tc_vector[i].d;
|
||||
delete tc_vector[i].c;
|
||||
}
|
||||
total_count += num_testcases;
|
||||
num_testcases = 0;
|
||||
//curr_batch_size = rand()%BATCH_SIZE + 4; //min batch size
|
||||
curr_batch_size = BATCH_SIZE;
|
||||
}
|
||||
if(break_value < 0)
|
||||
break;
|
||||
}
|
||||
if(num_testcases == curr_batch_size || (break_value < 0 && num_testcases > 0))
|
||||
{
|
||||
results_vec.resize(tc_vector.size());
|
||||
baseline_results.resize(tc_vector.size());
|
||||
|
||||
get_time();
|
||||
#pragma omp parallel for schedule(dynamic,chunk_size) num_threads(12)
|
||||
for(unsigned i=0;i<num_testcases;++i)
|
||||
{
|
||||
float result_avxf = g_compute_full_prob_float(&(tc_vector[i]), 0);
|
||||
double result = 0;
|
||||
if (result_avxf < MIN_ACCEPTED) {
|
||||
double result_avxd = g_compute_full_prob_double(&(tc_vector[i]), 0);
|
||||
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
|
||||
}
|
||||
else
|
||||
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
|
||||
results_vec[i] = result;
|
||||
product += (tc_vector[i].rslen*tc_vector[i].haplen);
|
||||
}
|
||||
total_time += get_time();
|
||||
#if 0
|
||||
#pragma omp parallel for schedule(dynamic,chunk_size)
|
||||
for(unsigned i=0;i<num_testcases;++i)
|
||||
{
|
||||
testcase& tc = tc_vector[i];
|
||||
float result_avxf = compute_full_prob<float>(&tc);
|
||||
double result = 0;
|
||||
if (result_avxf < MIN_ACCEPTED) {
|
||||
double result_avxd = compute_full_prob<double>(&tc);
|
||||
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
|
||||
}
|
||||
else
|
||||
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
|
||||
baseline_results[i] = result;
|
||||
}
|
||||
baseline_time += get_time();
|
||||
for(unsigned i=0;i<num_testcases;++i)
|
||||
{
|
||||
double baseline_result = baseline_results[i];
|
||||
double abs_error = fabs(baseline_result-results_vec[i]);
|
||||
double rel_error = (baseline_result != 0) ? fabs(abs_error/baseline_result) : 0;
|
||||
if(abs_error > 1e-5 && rel_error > 1e-5)
|
||||
{
|
||||
cout << "Line "<<total_count+i<< " " << std::scientific << baseline_result << " "<<results_vec[i]<<"\n";
|
||||
all_ok = false;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for(unsigned i=0;i<num_testcases;++i)
|
||||
{
|
||||
delete tc_vector[i].rs;
|
||||
delete tc_vector[i].hap;
|
||||
delete tc_vector[i].q;
|
||||
delete tc_vector[i].i;
|
||||
delete tc_vector[i].d;
|
||||
delete tc_vector[i].c;
|
||||
}
|
||||
total_count += num_testcases;
|
||||
num_testcases = 0;
|
||||
//curr_batch_size = rand()%BATCH_SIZE + 4; //min batch size
|
||||
curr_batch_size = BATCH_SIZE;
|
||||
}
|
||||
|
||||
baseline_results.clear();
|
||||
results_vec.clear();
|
||||
tc_vector.clear();
|
||||
if(all_ok)
|
||||
cout << "All outputs acceptable\n";
|
||||
cout << "Total vector time "<< ((double)total_time)/1e9 << " baseline time "<<baseline_time*1e-9<<"\n";
|
||||
cout << "Product "<<product<<"\n";
|
||||
cout.flush();
|
||||
fflush(stdout);
|
||||
if(use_old_read_testcase)
|
||||
|
|
|
|||
|
|
@ -278,3 +278,12 @@ uint64_t get_time(struct timespec* store_struct)
|
|||
start_time = *ptr;
|
||||
return diff_time;
|
||||
}
|
||||
|
||||
/**
 * Returns the time elapsed since prev_time, in nanoseconds.
 *
 * The current time is read from CLOCK_REALTIME, so prev_time must have been
 * sampled from the same clock for the difference to be meaningful.
 *
 * @param prev_time  Earlier timestamp to measure from (not modified).
 * @return Elapsed nanoseconds; 0 if the current reading is not later than
 *         prev_time (e.g. the real-time clock was stepped backwards).
 */
uint64_t diff_time(struct timespec& prev_time)
{
  struct timespec curr_time;
  clock_gettime(CLOCK_REALTIME, &curr_time);

  // Widen to 64 bits before multiplying: where time_t/long are 32-bit, the
  // seconds-to-nanoseconds product overflows after roughly two seconds.
  const int64_t nanos_per_sec = 1000000000LL;
  const int64_t delta_sec =
      static_cast<int64_t>(curr_time.tv_sec) - static_cast<int64_t>(prev_time.tv_sec);
  const int64_t delta_nsec =
      static_cast<int64_t>(curr_time.tv_nsec) - static_cast<int64_t>(prev_time.tv_nsec);
  const int64_t elapsed = delta_sec * nanos_per_sec + delta_nsec;

  // A negative interval would wrap to a huge unsigned value and corrupt any
  // running total the caller accumulates; report it as zero instead.
  return (elapsed > 0) ? static_cast<uint64_t>(elapsed) : 0;
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ template<class NUMBER>
|
|||
NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log=0);
|
||||
double getCurrClk();
|
||||
uint64_t get_time(struct timespec* x=0);
|
||||
uint64_t diff_time(struct timespec& prev_time);
|
||||
|
||||
//bit 0 is sse4.2, bit 1 is AVX
|
||||
enum ProcessorCapabilitiesEnum
|
||||
|
|
|
|||
Loading…
Reference in New Issue