Includes all debug code for obtaining profiling info

This commit is contained in:
Karthik Gururaj 2014-01-30 12:08:06 -08:00
parent 5c7427e48c
commit 6d4d776633
7 changed files with 127 additions and 74 deletions

View File

@ -86,11 +86,13 @@ void LoadTimeInitializer::debug_close()
m_filename_to_fptr.clear();
}
void LoadTimeInitializer::dump_sandbox(unsigned haplotypeLength, unsigned readLength, char* haplotypeBasesArray, testcase& tc)
void LoadTimeInitializer::dump_sandbox(testcase& tc)
{
unsigned haplotypeLength = tc.haplen;
unsigned readLength = tc.rslen;
ofstream& dumpFptr = m_sandbox_fptr;
for(unsigned k=0;k<haplotypeLength;++k)
dumpFptr<<(char)(haplotypeBasesArray[k]);
dumpFptr<<(char)(tc.hap[k]);
dumpFptr<<" ";
for(unsigned k=0;k<readLength;++k)
dumpFptr<<(char)(tc.rs[k]);

View File

@ -11,7 +11,7 @@ class LoadTimeInitializer
void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true);
void debug_close();
void dump_sandbox(unsigned haplotypeLength, unsigned readLength, char* haplotypeBasesArray, testcase& tc);
void dump_sandbox(testcase& tc);
// Opens the sandbox dump file "sandbox.txt" in append mode; pair with close_sandbox().
void open_sandbox() { m_sandbox_fptr.open("sandbox.txt", std::ios::app); }
// Closes the sandbox dump file opened by open_sandbox().
void close_sandbox() { m_sandbox_fptr.close(); }

View File

@ -54,7 +54,7 @@ pairhmm-template-main: pairhmm-template-main.o $(COMMON_OBJECTS)
$(CXX) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)
libVectorLoglessPairHMM.so: $(LIBOBJECTS)
$(CXX) $(OMPLFLAGS) -shared -o $@ $(LIBOBJECTS) ${LDFLAGS} -Wl,-Bstatic -limf -lsvml -lirng -Wl,-Bdynamic #-lintlc
$(CXX) $(OMPLFLAGS) -shared -static-intel -o $@ $(LIBOBJECTS) ${LDFLAGS}
$(OBJECTS): %.o: %.cc

View File

@ -211,12 +211,29 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless
jbyte* haplotypeBasesArray = haplotypeBasesArrayVector[j].second;
tc_array[tc_idx].rslen = (int)readLength;
tc_array[tc_idx].haplen = (int)haplotypeLength;
#if 0
tc_array[tc_idx].hap = (char*)haplotypeBasesArray;
tc_array[tc_idx].rs = (char*)readBasesArray;
tc_array[tc_idx].q = (char*)readQualsArray;
tc_array[tc_idx].i = (char*)insertionGOPArray;
tc_array[tc_idx].d = (char*)deletionGOPArray;
tc_array[tc_idx].c = (char*)overallGCPArray;
#endif
//#define MEMCPY_HACK
#ifdef MEMCPY_HACK
tc_array[tc_idx].hap = new char[haplotypeLength];
tc_array[tc_idx].rs = new char[readLength];
tc_array[tc_idx].q = new char[readLength];
tc_array[tc_idx].i = new char[readLength];
tc_array[tc_idx].d = new char[readLength];
tc_array[tc_idx].c = new char[readLength];
memcpy(tc_array[tc_idx].hap, haplotypeBasesArray, haplotypeLength);
memcpy(tc_array[tc_idx].rs, readBasesArray, readLength);
memcpy(tc_array[tc_idx].q, readQualsArray, readLength);
memcpy(tc_array[tc_idx].i, insertionGOPArray, readLength);
memcpy(tc_array[tc_idx].d, deletionGOPArray, readLength);
memcpy(tc_array[tc_idx].c, overallGCPArray, readLength);
#endif
#if 0
tc_array[tc_idx].hap = (char*)all_arrays[0];
tc_array[tc_idx].rs = (char*)all_arrays[1];
@ -227,7 +244,7 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless
#endif
#ifdef DUMP_TO_SANDBOX
g_load_time_initializer.dump_sandbox(haplotypeLength, readLength, (char*)haplotypeBasesArray, tc_array[tc_idx]);
g_load_time_initializer.dump_sandbox(tc_array[tc_idx]);
#endif
#ifdef DO_PROFILING
g_load_time_initializer.m_sumProductReadLengthHaplotypeLength += (readLength*haplotypeLength);
@ -262,7 +279,8 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless
#endif
#ifdef DO_PROFILING
g_load_time_initializer.m_bytes_copied += (is_copy ? numTestCases*sizeof(double) : 0);
start_time = get_time();
struct timespec prev_time;
clock_gettime(CLOCK_REALTIME, &prev_time);
#endif
#pragma omp parallel for schedule (dynamic,10) private(tc_idx) num_threads(maxNumThreadsToUse)
for(tc_idx=0;tc_idx<numTestCases;++tc_idx)
@ -291,7 +309,7 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless
likelihoodDoubleArray[tc_idx] = result;
}
#ifdef DO_PROFILING
g_load_time_initializer.m_compute_time += get_time();
g_load_time_initializer.m_compute_time += diff_time(prev_time);
#endif
#ifdef DEBUG
for(tc_idx=0;tc_idx<numTestCases;++tc_idx)
@ -303,7 +321,19 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless
start_time = get_time();
#endif
RELEASE_DOUBLE_ARRAY_ELEMENTS(likelihoodArray, likelihoodDoubleArray, 0); //release mode 0, copy back results to Java memory
#ifdef MEMCPY_HACK
for(tc_idx=0;tc_idx<numTestCases;++tc_idx)
{
delete tc_array[tc_idx].hap;
delete tc_array[tc_idx].rs;
delete tc_array[tc_idx].q;
delete tc_array[tc_idx].i;
delete tc_array[tc_idx].d;
delete tc_array[tc_idx].c;
}
#endif
//Release read arrays first
for(int i=readBasesArrayVector.size()-1;i>=0;--i)//note the order - reverse of GET
{

View File

@ -8,14 +8,13 @@
using namespace std;
#define BATCH_SIZE 5
#define RUN_HYBRID
vector<double> results_vec;
vector<testcase> tc_vector;
int main(int argc, char** argv)
{
#define BATCH_SIZE 5
if(argc < 2)
{
cerr << "Needs path to input file as argument\n";
@ -41,16 +40,12 @@ int main(int argc, char** argv)
assert(ifptr.is_open());
}
vector<testcase> tc_vector;
tc_vector.clear();
tc_vector.resize(BATCH_SIZE+4);
vector<double> results_vec;
results_vec.clear();
results_vec.resize(tc_vector.size());
vector<double> baseline_results;
baseline_results.clear();
baseline_results.resize(tc_vector.size());
bool all_ok = true;
uint64_t total_time = 0;
uint64_t baseline_time = 0;
@ -58,76 +53,92 @@ int main(int argc, char** argv)
unsigned num_testcases = 0;
//unsigned curr_batch_size = rand()%BATCH_SIZE + 4; //min batch size
unsigned curr_batch_size = BATCH_SIZE;
uint64_t product = 0;
testcase tc_in;
int break_value = 0;
tc_vector.clear();
while(1)
{
int break_value = use_old_read_testcase ? read_testcase(&(tc_vector[num_testcases]), fptr) :
read_mod_testcase(ifptr,&(tc_vector[num_testcases]),true);
break_value = use_old_read_testcase ? read_testcase(&tc_in, fptr) :
read_mod_testcase(ifptr, &tc_in, true);
tc_vector.push_back(tc_in);
if(break_value >= 0)
++num_testcases;
if(num_testcases == curr_batch_size || (break_value < 0 && num_testcases > 0))
{
get_time();
#pragma omp parallel for schedule(dynamic,chunk_size) num_threads(12)
for(unsigned i=0;i<num_testcases;++i)
{
testcase& tc = tc_vector[i];
float result_avxf = g_compute_full_prob_float(&tc, 0);
double result = 0;
if (result_avxf < MIN_ACCEPTED) {
double result_avxd = g_compute_full_prob_double(&tc, 0);
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
}
else
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
results_vec[i] = result;
}
total_time += get_time();
#pragma omp parallel for schedule(dynamic,chunk_size)
for(unsigned i=0;i<num_testcases;++i)
{
testcase& tc = tc_vector[i];
float result_avxf = compute_full_prob<float>(&tc);
double result = 0;
if (result_avxf < MIN_ACCEPTED) {
double result_avxd = compute_full_prob<double>(&tc);
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
}
else
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
baseline_results[i] = result;
}
baseline_time += get_time();
for(unsigned i=0;i<num_testcases;++i)
{
double baseline_result = baseline_results[i];
double abs_error = fabs(baseline_result-results_vec[i]);
double rel_error = (baseline_result != 0) ? fabs(abs_error/baseline_result) : 0;
if(abs_error > 1e-5 && rel_error > 1e-5)
{
cout << "Line "<<total_count+i<< " " << std::scientific << baseline_result << " "<<results_vec[i]<<"\n";
all_ok = false;
}
delete tc_vector[i].rs;
delete tc_vector[i].hap;
delete tc_vector[i].q;
delete tc_vector[i].i;
delete tc_vector[i].d;
delete tc_vector[i].c;
}
total_count += num_testcases;
num_testcases = 0;
//curr_batch_size = rand()%BATCH_SIZE + 4; //min batch size
curr_batch_size = BATCH_SIZE;
}
if(break_value < 0)
break;
}
if(num_testcases == curr_batch_size || (break_value < 0 && num_testcases > 0))
{
results_vec.resize(tc_vector.size());
baseline_results.resize(tc_vector.size());
get_time();
#pragma omp parallel for schedule(dynamic,chunk_size) num_threads(12)
for(unsigned i=0;i<num_testcases;++i)
{
float result_avxf = g_compute_full_prob_float(&(tc_vector[i]), 0);
double result = 0;
if (result_avxf < MIN_ACCEPTED) {
double result_avxd = g_compute_full_prob_double(&(tc_vector[i]), 0);
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
}
else
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
results_vec[i] = result;
product += (tc_vector[i].rslen*tc_vector[i].haplen);
}
total_time += get_time();
#if 0
#pragma omp parallel for schedule(dynamic,chunk_size)
for(unsigned i=0;i<num_testcases;++i)
{
testcase& tc = tc_vector[i];
float result_avxf = compute_full_prob<float>(&tc);
double result = 0;
if (result_avxf < MIN_ACCEPTED) {
double result_avxd = compute_full_prob<double>(&tc);
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
}
else
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
baseline_results[i] = result;
}
baseline_time += get_time();
for(unsigned i=0;i<num_testcases;++i)
{
double baseline_result = baseline_results[i];
double abs_error = fabs(baseline_result-results_vec[i]);
double rel_error = (baseline_result != 0) ? fabs(abs_error/baseline_result) : 0;
if(abs_error > 1e-5 && rel_error > 1e-5)
{
cout << "Line "<<total_count+i<< " " << std::scientific << baseline_result << " "<<results_vec[i]<<"\n";
all_ok = false;
}
}
#endif
for(unsigned i=0;i<num_testcases;++i)
{
delete tc_vector[i].rs;
delete tc_vector[i].hap;
delete tc_vector[i].q;
delete tc_vector[i].i;
delete tc_vector[i].d;
delete tc_vector[i].c;
}
total_count += num_testcases;
num_testcases = 0;
//curr_batch_size = rand()%BATCH_SIZE + 4; //min batch size
curr_batch_size = BATCH_SIZE;
}
baseline_results.clear();
results_vec.clear();
tc_vector.clear();
if(all_ok)
cout << "All outputs acceptable\n";
cout << "Total vector time "<< ((double)total_time)/1e9 << " baseline time "<<baseline_time*1e-9<<"\n";
cout << "Product "<<product<<"\n";
cout.flush();
fflush(stdout);
if(use_old_read_testcase)

View File

@ -278,3 +278,12 @@ uint64_t get_time(struct timespec* store_struct)
start_time = *ptr;
return diff_time;
}
// Returns the elapsed time in nanoseconds between prev_time and now.
// NOTE(review): uses CLOCK_REALTIME to match the callers that fill prev_time
// the same way; a wall-clock step (NTP adjustment) can skew the interval —
// CLOCK_MONOTONIC would be preferable if all callers were switched together.
// prev_time: an earlier CLOCK_REALTIME timestamp (not modified).
uint64_t diff_time(struct timespec& prev_time)
{
  struct timespec curr_time;
  clock_gettime(CLOCK_REALTIME, &curr_time);
  // Widen to uint64_t BEFORE multiplying: time_t * int(1000000000) overflows
  // on platforms with 32-bit time_t/int for intervals over ~2.1 seconds.
  return ((uint64_t)(curr_time.tv_sec - prev_time.tv_sec))*1000000000ULL
         + (uint64_t)(curr_time.tv_nsec - prev_time.tv_nsec);
}

View File

@ -27,6 +27,7 @@ template<class NUMBER>
NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log=0);
double getCurrClk();
uint64_t get_time(struct timespec* x=0);
uint64_t diff_time(struct timespec& prev_time);
//bit 0 is sse4.2, bit 1 is AVX
enum ProcessorCapabilitiesEnum