Includes all debug code for obtaining profiling info

This commit is contained in:
Karthik Gururaj 2014-01-30 12:08:06 -08:00
parent 5c7427e48c
commit 6d4d776633
7 changed files with 127 additions and 74 deletions

View File

@ -86,11 +86,13 @@ void LoadTimeInitializer::debug_close()
m_filename_to_fptr.clear();
}
void LoadTimeInitializer::dump_sandbox(unsigned haplotypeLength, unsigned readLength, char* haplotypeBasesArray, testcase& tc)
void LoadTimeInitializer::dump_sandbox(testcase& tc)
{
unsigned haplotypeLength = tc.haplen;
unsigned readLength = tc.rslen;
ofstream& dumpFptr = m_sandbox_fptr;
for(unsigned k=0;k<haplotypeLength;++k)
dumpFptr<<(char)(haplotypeBasesArray[k]);
dumpFptr<<(char)(tc.hap[k]);
dumpFptr<<" ";
for(unsigned k=0;k<readLength;++k)
dumpFptr<<(char)(tc.rs[k]);

View File

@ -11,7 +11,7 @@ class LoadTimeInitializer
void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true);
void debug_close();
void dump_sandbox(unsigned haplotypeLength, unsigned readLength, char* haplotypeBasesArray, testcase& tc);
void dump_sandbox(testcase& tc);
// Opens the sandbox dump file "sandbox.txt" in append mode; pair with close_sandbox().
void open_sandbox() { m_sandbox_fptr.open("sandbox.txt", std::ios::app); }
// Closes the sandbox dump file opened by open_sandbox().
void close_sandbox() { m_sandbox_fptr.close(); }

View File

@ -54,7 +54,7 @@ pairhmm-template-main: pairhmm-template-main.o $(COMMON_OBJECTS)
$(CXX) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)
libVectorLoglessPairHMM.so: $(LIBOBJECTS)
$(CXX) $(OMPLFLAGS) -shared -o $@ $(LIBOBJECTS) ${LDFLAGS} -Wl,-Bstatic -limf -lsvml -lirng -Wl,-Bdynamic #-lintlc
$(CXX) $(OMPLFLAGS) -shared -static-intel -o $@ $(LIBOBJECTS) ${LDFLAGS}
$(OBJECTS): %.o: %.cc

View File

@ -211,12 +211,29 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless
jbyte* haplotypeBasesArray = haplotypeBasesArrayVector[j].second;
tc_array[tc_idx].rslen = (int)readLength;
tc_array[tc_idx].haplen = (int)haplotypeLength;
#if 0
tc_array[tc_idx].hap = (char*)haplotypeBasesArray;
tc_array[tc_idx].rs = (char*)readBasesArray;
tc_array[tc_idx].q = (char*)readQualsArray;
tc_array[tc_idx].i = (char*)insertionGOPArray;
tc_array[tc_idx].d = (char*)deletionGOPArray;
tc_array[tc_idx].c = (char*)overallGCPArray;
#endif
//#define MEMCPY_HACK
#ifdef MEMCPY_HACK
tc_array[tc_idx].hap = new char[haplotypeLength];
tc_array[tc_idx].rs = new char[readLength];
tc_array[tc_idx].q = new char[readLength];
tc_array[tc_idx].i = new char[readLength];
tc_array[tc_idx].d = new char[readLength];
tc_array[tc_idx].c = new char[readLength];
memcpy(tc_array[tc_idx].hap, haplotypeBasesArray, haplotypeLength);
memcpy(tc_array[tc_idx].rs, readBasesArray, readLength);
memcpy(tc_array[tc_idx].q, readQualsArray, readLength);
memcpy(tc_array[tc_idx].i, insertionGOPArray, readLength);
memcpy(tc_array[tc_idx].d, deletionGOPArray, readLength);
memcpy(tc_array[tc_idx].c, overallGCPArray, readLength);
#endif
#if 0
tc_array[tc_idx].hap = (char*)all_arrays[0];
tc_array[tc_idx].rs = (char*)all_arrays[1];
@ -227,7 +244,7 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless
#endif
#ifdef DUMP_TO_SANDBOX
g_load_time_initializer.dump_sandbox(haplotypeLength, readLength, (char*)haplotypeBasesArray, tc_array[tc_idx]);
g_load_time_initializer.dump_sandbox(tc_array[tc_idx]);
#endif
#ifdef DO_PROFILING
g_load_time_initializer.m_sumProductReadLengthHaplotypeLength += (readLength*haplotypeLength);
@ -262,7 +279,8 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless
#endif
#ifdef DO_PROFILING
g_load_time_initializer.m_bytes_copied += (is_copy ? numTestCases*sizeof(double) : 0);
start_time = get_time();
struct timespec prev_time;
clock_gettime(CLOCK_REALTIME, &prev_time);
#endif
#pragma omp parallel for schedule (dynamic,10) private(tc_idx) num_threads(maxNumThreadsToUse)
for(tc_idx=0;tc_idx<numTestCases;++tc_idx)
@ -291,7 +309,7 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless
likelihoodDoubleArray[tc_idx] = result;
}
#ifdef DO_PROFILING
g_load_time_initializer.m_compute_time += get_time();
g_load_time_initializer.m_compute_time += diff_time(prev_time);
#endif
#ifdef DEBUG
for(tc_idx=0;tc_idx<numTestCases;++tc_idx)
@ -303,7 +321,19 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless
start_time = get_time();
#endif
RELEASE_DOUBLE_ARRAY_ELEMENTS(likelihoodArray, likelihoodDoubleArray, 0); //release mode 0, copy back results to Java memory
#ifdef MEMCPY_HACK
for(tc_idx=0;tc_idx<numTestCases;++tc_idx)
{
delete tc_array[tc_idx].hap;
delete tc_array[tc_idx].rs;
delete tc_array[tc_idx].q;
delete tc_array[tc_idx].i;
delete tc_array[tc_idx].d;
delete tc_array[tc_idx].c;
}
#endif
//Release read arrays first
for(int i=readBasesArrayVector.size()-1;i>=0;--i)//note the order - reverse of GET
{

View File

@ -8,14 +8,13 @@
using namespace std;
#define BATCH_SIZE 5
#define RUN_HYBRID
vector<double> results_vec;
vector<testcase> tc_vector;
int main(int argc, char** argv)
{
#define BATCH_SIZE 5
if(argc < 2)
{
cerr << "Needs path to input file as argument\n";
@ -41,16 +40,12 @@ int main(int argc, char** argv)
assert(ifptr.is_open());
}
vector<testcase> tc_vector;
tc_vector.clear();
tc_vector.resize(BATCH_SIZE+4);
vector<double> results_vec;
results_vec.clear();
results_vec.resize(tc_vector.size());
vector<double> baseline_results;
baseline_results.clear();
baseline_results.resize(tc_vector.size());
bool all_ok = true;
uint64_t total_time = 0;
uint64_t baseline_time = 0;
@ -58,76 +53,92 @@ int main(int argc, char** argv)
unsigned num_testcases = 0;
//unsigned curr_batch_size = rand()%BATCH_SIZE + 4; //min batch size
unsigned curr_batch_size = BATCH_SIZE;
uint64_t product = 0;
testcase tc_in;
int break_value = 0;
tc_vector.clear();
while(1)
{
int break_value = use_old_read_testcase ? read_testcase(&(tc_vector[num_testcases]), fptr) :
read_mod_testcase(ifptr,&(tc_vector[num_testcases]),true);
break_value = use_old_read_testcase ? read_testcase(&tc_in, fptr) :
read_mod_testcase(ifptr, &tc_in, true);
tc_vector.push_back(tc_in);
if(break_value >= 0)
++num_testcases;
if(num_testcases == curr_batch_size || (break_value < 0 && num_testcases > 0))
{
get_time();
#pragma omp parallel for schedule(dynamic,chunk_size) num_threads(12)
for(unsigned i=0;i<num_testcases;++i)
{
testcase& tc = tc_vector[i];
float result_avxf = g_compute_full_prob_float(&tc, 0);
double result = 0;
if (result_avxf < MIN_ACCEPTED) {
double result_avxd = g_compute_full_prob_double(&tc, 0);
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
}
else
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
results_vec[i] = result;
}
total_time += get_time();
#pragma omp parallel for schedule(dynamic,chunk_size)
for(unsigned i=0;i<num_testcases;++i)
{
testcase& tc = tc_vector[i];
float result_avxf = compute_full_prob<float>(&tc);
double result = 0;
if (result_avxf < MIN_ACCEPTED) {
double result_avxd = compute_full_prob<double>(&tc);
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
}
else
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
baseline_results[i] = result;
}
baseline_time += get_time();
for(unsigned i=0;i<num_testcases;++i)
{
double baseline_result = baseline_results[i];
double abs_error = fabs(baseline_result-results_vec[i]);
double rel_error = (baseline_result != 0) ? fabs(abs_error/baseline_result) : 0;
if(abs_error > 1e-5 && rel_error > 1e-5)
{
cout << "Line "<<total_count+i<< " " << std::scientific << baseline_result << " "<<results_vec[i]<<"\n";
all_ok = false;
}
delete tc_vector[i].rs;
delete tc_vector[i].hap;
delete tc_vector[i].q;
delete tc_vector[i].i;
delete tc_vector[i].d;
delete tc_vector[i].c;
}
total_count += num_testcases;
num_testcases = 0;
//curr_batch_size = rand()%BATCH_SIZE + 4; //min batch size
curr_batch_size = BATCH_SIZE;
}
if(break_value < 0)
break;
}
if(num_testcases == curr_batch_size || (break_value < 0 && num_testcases > 0))
{
results_vec.resize(tc_vector.size());
baseline_results.resize(tc_vector.size());
get_time();
#pragma omp parallel for schedule(dynamic,chunk_size) num_threads(12)
for(unsigned i=0;i<num_testcases;++i)
{
float result_avxf = g_compute_full_prob_float(&(tc_vector[i]), 0);
double result = 0;
if (result_avxf < MIN_ACCEPTED) {
double result_avxd = g_compute_full_prob_double(&(tc_vector[i]), 0);
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
}
else
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
results_vec[i] = result;
product += (tc_vector[i].rslen*tc_vector[i].haplen);
}
total_time += get_time();
#if 0
#pragma omp parallel for schedule(dynamic,chunk_size)
for(unsigned i=0;i<num_testcases;++i)
{
testcase& tc = tc_vector[i];
float result_avxf = compute_full_prob<float>(&tc);
double result = 0;
if (result_avxf < MIN_ACCEPTED) {
double result_avxd = compute_full_prob<double>(&tc);
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
}
else
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
baseline_results[i] = result;
}
baseline_time += get_time();
for(unsigned i=0;i<num_testcases;++i)
{
double baseline_result = baseline_results[i];
double abs_error = fabs(baseline_result-results_vec[i]);
double rel_error = (baseline_result != 0) ? fabs(abs_error/baseline_result) : 0;
if(abs_error > 1e-5 && rel_error > 1e-5)
{
cout << "Line "<<total_count+i<< " " << std::scientific << baseline_result << " "<<results_vec[i]<<"\n";
all_ok = false;
}
}
#endif
for(unsigned i=0;i<num_testcases;++i)
{
delete tc_vector[i].rs;
delete tc_vector[i].hap;
delete tc_vector[i].q;
delete tc_vector[i].i;
delete tc_vector[i].d;
delete tc_vector[i].c;
}
total_count += num_testcases;
num_testcases = 0;
//curr_batch_size = rand()%BATCH_SIZE + 4; //min batch size
curr_batch_size = BATCH_SIZE;
}
baseline_results.clear();
results_vec.clear();
tc_vector.clear();
if(all_ok)
cout << "All outputs acceptable\n";
cout << "Total vector time "<< ((double)total_time)/1e9 << " baseline time "<<baseline_time*1e-9<<"\n";
cout << "Product "<<product<<"\n";
cout.flush();
fflush(stdout);
if(use_old_read_testcase)

View File

@ -278,3 +278,12 @@ uint64_t get_time(struct timespec* store_struct)
start_time = *ptr;
return diff_time;
}
// Returns the elapsed time in nanoseconds between prev_time and now.
// NOTE(review): uses CLOCK_REALTIME to match the callers that fill prev_time
// the same way; a wall-clock step (NTP adjustment) can skew the interval —
// CLOCK_MONOTONIC would be preferable if all callers were switched together.
// prev_time: an earlier CLOCK_REALTIME timestamp (not modified).
uint64_t diff_time(struct timespec& prev_time)
{
  struct timespec curr_time;
  clock_gettime(CLOCK_REALTIME, &curr_time);
  // Widen to uint64_t BEFORE multiplying: time_t * int(1000000000) overflows
  // on platforms with 32-bit time_t/int for intervals over ~2.1 seconds.
  return ((uint64_t)(curr_time.tv_sec - prev_time.tv_sec))*1000000000ULL
         + (uint64_t)(curr_time.tv_nsec - prev_time.tv_nsec);
}

View File

@ -27,6 +27,7 @@ template<class NUMBER>
NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log=0);
double getCurrClk();
uint64_t get_time(struct timespec* x=0);
uint64_t diff_time(struct timespec& prev_time);
//bit 0 is sse4.2, bit 1 is AVX
enum ProcessorCapabilitiesEnum