diff --git a/public/c++/VectorPairHMM/LoadTimeInitializer.cc b/public/c++/VectorPairHMM/LoadTimeInitializer.cc index b8ff2ddf8..fd2d1df09 100644 --- a/public/c++/VectorPairHMM/LoadTimeInitializer.cc +++ b/public/c++/VectorPairHMM/LoadTimeInitializer.cc @@ -1,6 +1,19 @@ #include "LoadTimeInitializer.h" #include "utils.h" using namespace std; +char* LoadTimeInitializerStatsNames[] = +{ + "num_regions", + "num_reads", + "num_haplotypes", + "num_testcases", + "num_double_invocations", + "haplotype_length", + "readlength", + "product_read_length_haplotype_length", + "dummy" +}; + LoadTimeInitializer g_load_time_initializer; @@ -16,25 +29,23 @@ LoadTimeInitializer::LoadTimeInitializer() //will be called when library is loa #else cout << "FTZ is not set - may slow down performance if denormal numbers encountered\n"; #endif - m_sumNumReads = 0; - m_sumSquareNumReads = 0; - m_sumNumHaplotypes = 0; - m_sumSquareNumHaplotypes = 0; - m_sumNumTestcases = 0; - m_sumNumDoubleTestcases = 0; - m_sumSquareNumTestcases = 0; - m_sumReadLengths = 0; - m_sumHaplotypeLengths = 0; - m_sumProductReadLengthHaplotypeLength = 0; - m_sumSquareProductReadLengthHaplotypeLength = 0; - m_maxNumTestcases = 0; - m_num_invocations = 0; - + //Profiling: times for compute and transfer (either bytes copied or pointers copied) m_compute_time = 0; m_data_transfer_time = 0; m_bytes_copied = 0; + //Initialize profiling counters + for(unsigned i=0;i C++) "<open(filename.c_str(), to_append ? ios::app : ios::out); assert(fptr->is_open()); } @@ -121,3 +157,12 @@ void LoadTimeInitializer::dump_sandbox(testcase& tc, unsigned tc_idx, unsigned n dumpFptr << " "<< numReads << " "< #include "template.h" + +enum LoadTimeInitializerStatsEnum +{ + NUM_REGIONS_IDX=0, + NUM_READS_IDX, + NUM_HAPLOTYPES_IDX, + NUM_TESTCASES_IDX, + NUM_DOUBLE_INVOCATIONS_IDX, + HAPLOTYPE_LENGTH_IDX, + READ_LENGTH_IDX, + PRODUCT_READ_LENGTH_HAPLOTYPE_LENGTH_IDX, + TOTAL_NUMBER_STATS +}; +extern char* LoadTimeInitializerStatsNames[]; + class LoadTimeInitializer { public: @@ -21,20 +36,8 @@ class LoadTimeInitializer jfieldID m_deletionGOPFID; jfieldID m_overallGCPFID; jfieldID m_haplotypeBasesFID; - //used to compute avg, variance of #testcases - double m_sumNumReads; - double m_sumSquareNumReads; - double m_sumNumHaplotypes; - double m_sumSquareNumHaplotypes; - double m_sumNumTestcases; - double m_sumSquareNumTestcases; - uint64_t m_sumNumDoubleTestcases; - uint64_t m_sumReadLengths; - uint64_t m_sumHaplotypeLengths; - uint64_t m_sumProductReadLengthHaplotypeLength; - double m_sumSquareProductReadLengthHaplotypeLength; - unsigned m_maxNumTestcases; - unsigned m_num_invocations; + //profiling - update stats + void update_stat(LoadTimeInitializerStatsEnum stat_idx, uint64_t value); //timing in nanoseconds uint64_t m_compute_time; uint64_t m_data_transfer_time; @@ -42,7 +45,13 @@ class LoadTimeInitializer uint64_t m_bytes_copied; private: std::map m_filename_to_fptr; + std::set m_written_files_set; std::ofstream m_sandbox_fptr; + //used to compute various stats + uint64_t m_sum_stats[TOTAL_NUMBER_STATS]; + double m_sum_square_stats[TOTAL_NUMBER_STATS]; + uint64_t m_min_stats[TOTAL_NUMBER_STATS]; + uint64_t m_max_stats[TOTAL_NUMBER_STATS]; }; extern LoadTimeInitializer g_load_time_initializer; diff --git a/public/c++/VectorPairHMM/baseline.cc b/public/c++/VectorPairHMM/baseline.cc index eb233d5c3..268f32f00 100644 --- a/public/c++/VectorPairHMM/baseline.cc +++ b/public/c++/VectorPairHMM/baseline.cc @@ -3,7 +3,7 @@ #include "utils.h" template -NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL) +NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log) { int r, c; int ROWS = tc->rslen + 1; diff --git a/public/c++/VectorPairHMM/headers.h b/public/c++/VectorPairHMM/headers.h index 9e4600136..48bd4d836 100644 --- a/public/c++/VectorPairHMM/headers.h +++ b/public/c++/VectorPairHMM/headers.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/public/c++/VectorPairHMM/jni_common.h b/public/c++/VectorPairHMM/jni_common.h index 4c63a0411..1cffea1cc 100644 --- a/public/c++/VectorPairHMM/jni_common.h +++ b/public/c++/VectorPairHMM/jni_common.h @@ -5,8 +5,8 @@ /*#define ENABLE_ASSERTIONS 1*/ #define DO_PROFILING 1 /*#define DEBUG 1*/ -//#define DEBUG0_1 1 -//#define DEBUG3 1 +/*#define DEBUG0_1 1*/ +/*#define DEBUG3 1*/ /*#define DUMP_TO_SANDBOX 1*/ diff --git a/public/c++/VectorPairHMM/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.cc b/public/c++/VectorPairHMM/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.cc index c39d2ec3f..0f5219dd4 100644 --- a/public/c++/VectorPairHMM/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.cc +++ b/public/c++/VectorPairHMM/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.cc @@ -92,45 +92,20 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless g_load_time_initializer.debug_dump("haplotype_bases_jni.txt",to_string((int)haplotypeBasesArray[k]),true); #endif #ifdef DO_PROFILING - g_load_time_initializer.m_sumHaplotypeLengths += haplotypeBasesLength; + g_load_time_initializer.update_stat(HAPLOTYPE_LENGTH_IDX, haplotypeBasesLength); g_load_time_initializer.m_bytes_copied += (is_copy ? haplotypeBasesLength : 0); #endif } } -//JNI function to invoke compute_full_prob_avx -//readDataArray - array of JNIReadDataHolderClass objects which contain the readBases, readQuals etc -//haplotypeDataArray - array of JNIHaplotypeDataHolderClass objects which contain the haplotypeBases -//likelihoodArray - array of doubles to return results back to Java. Memory allocated by Java prior to JNI call -//maxNumThreadsToUse - Max number of threads that OpenMP can use for the HMM computation -JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniComputeLikelihoods - (JNIEnv* env, jobject thisObject, jint numReads, jint numHaplotypes, - jobjectArray readDataArray, jobjectArray haplotypeDataArray, jdoubleArray likelihoodArray, jint maxNumThreadsToUse) +inline JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniInitializeTestcasesVector + (JNIEnv* env, jint numReads, jint numHaplotypes, jobjectArray& readDataArray, + vector > >& readBasesArrayVector, vector& tc_array) { -#ifdef DEBUG0_1 - cout << "JNI numReads "< >& haplotypeBasesArrayVector = g_haplotypeBasesArrayVector; - jboolean is_copy = JNI_FALSE; - - unsigned numTestCases = numReads*numHaplotypes; - //vector to store results - vector tc_array; - tc_array.clear(); - tc_array.resize(numTestCases); unsigned tc_idx = 0; - //Store arrays for release later - vector > > readBasesArrayVector; - readBasesArrayVector.clear(); - readBasesArrayVector.resize(numReads); -#ifdef DO_PROFILING - start_time = get_time(); -#endif -#ifdef DUMP_TO_SANDBOX - g_load_time_initializer.open_sandbox(); -#endif for(unsigned i=0;i& tc_array, unsigned numTestCases, double* likelihoodDoubleArray, + unsigned maxNumThreadsToUse) +{ +#pragma omp parallel for schedule (dynamic,10000) num_threads(maxNumThreadsToUse) + for(unsigned tc_idx=0;tc_idx > >& readBasesArrayVector) +{ + //Release read arrays first + for(int i=readBasesArrayVector.size()-1;i>=0;--i)//note the order - reverse of GET + { + for(int j=readBasesArrayVector[i].size()-1;j>=0;--j) + RELEASE_BYTE_ARRAY_ELEMENTS(readBasesArrayVector[i][j].first, readBasesArrayVector[i][j].second, JNI_RO_RELEASE_MODE); + readBasesArrayVector[i].clear(); + } + readBasesArrayVector.clear(); +} + +//JNI function to invoke compute_full_prob_avx +//readDataArray - array of JNIReadDataHolderClass objects which contain the readBases, readQuals etc +//haplotypeDataArray - array of JNIHaplotypeDataHolderClass objects which contain the haplotypeBases +//likelihoodArray - array of doubles to return results back to Java. Memory allocated by Java prior to JNI call +//maxNumThreadsToUse - Max number of threads that OpenMP can use for the HMM computation +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniComputeLikelihoods + (JNIEnv* env, jobject thisObject, jint numReads, jint numHaplotypes, + jobjectArray readDataArray, jobjectArray haplotypeDataArray, jdoubleArray likelihoodArray, jint maxNumThreadsToUse) +{ +#ifdef DEBUG0_1 + cout << "JNI numReads "< tc_array; + tc_array.clear(); + tc_array.resize(numTestCases); + //Store read arrays for release later + vector > > readBasesArrayVector; + readBasesArrayVector.clear(); + readBasesArrayVector.resize(numReads); +#ifdef DUMP_TO_SANDBOX + g_load_time_initializer.open_sandbox(); +#endif +#ifdef DO_PROFILING + get_time(&start_time); +#endif + //Copy byte array references from Java memory into vector of testcase structs + Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniInitializeTestcasesVector(env, + numReads, numHaplotypes, readDataArray, readBasesArrayVector, tc_array); +#ifdef DO_PROFILING + g_load_time_initializer.m_data_transfer_time += diff_time(start_time); #endif jdouble* likelihoodDoubleArray = (jdouble*)GET_DOUBLE_ARRAY_ELEMENTS(likelihoodArray, &is_copy); @@ -230,65 +270,29 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless #endif #ifdef DO_PROFILING g_load_time_initializer.m_bytes_copied += (is_copy ? numTestCases*sizeof(double) : 0); - struct timespec prev_time; - clock_gettime(CLOCK_REALTIME, &prev_time); + get_time(&start_time); #endif -#pragma omp parallel for schedule (dynamic,10) private(tc_idx) num_threads(maxNumThreadsToUse) - for(tc_idx=0;tc_idx=0;--i)//note the order - reverse of GET - { - for(int j=readBasesArrayVector[i].size()-1;j>=0;--j) - RELEASE_BYTE_ARRAY_ELEMENTS(readBasesArrayVector[i][j].first, readBasesArrayVector[i][j].second, JNI_RO_RELEASE_MODE); - readBasesArrayVector[i].clear(); - } - readBasesArrayVector.clear(); + RELEASE_DOUBLE_ARRAY_ELEMENTS(likelihoodArray, likelihoodDoubleArray, 0); //release mode 0, copy back results to Java memory (if copy made) + Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniReleaseReadArrays(env, readBasesArrayVector); #ifdef DO_PROFILING - g_load_time_initializer.m_data_transfer_time += get_time(); + g_load_time_initializer.m_data_transfer_time += diff_time(start_time); + g_load_time_initializer.update_stat(NUM_REGIONS_IDX, 1); + g_load_time_initializer.update_stat(NUM_READS_IDX, numReads); + g_load_time_initializer.update_stat(NUM_HAPLOTYPES_IDX, numHaplotypes); + g_load_time_initializer.update_stat(NUM_TESTCASES_IDX, numTestCases); #endif tc_array.clear(); -#ifdef DO_PROFILING - g_load_time_initializer.m_sumNumReads += numReads; - g_load_time_initializer.m_sumSquareNumReads += numReads*numReads; - g_load_time_initializer.m_sumNumHaplotypes += numHaplotypes; - g_load_time_initializer.m_sumSquareNumHaplotypes += numHaplotypes*numHaplotypes; - g_load_time_initializer.m_sumNumTestcases += numTestCases; - g_load_time_initializer.m_sumSquareNumTestcases += numTestCases*numTestCases; - g_load_time_initializer.m_maxNumTestcases = numTestCases > g_load_time_initializer.m_maxNumTestcases ? numTestCases - : g_load_time_initializer.m_maxNumTestcases; - ++(g_load_time_initializer.m_num_invocations); -#endif -#ifdef DEBUG - g_load_time_initializer.debug_close(); -#endif #ifdef DUMP_TO_SANDBOX g_load_time_initializer.close_sandbox(); #endif diff --git a/public/c++/VectorPairHMM/pairhmm-1-base.cc b/public/c++/VectorPairHMM/pairhmm-1-base.cc index a552aecca..8b686d5ea 100644 --- a/public/c++/VectorPairHMM/pairhmm-1-base.cc +++ b/public/c++/VectorPairHMM/pairhmm-1-base.cc @@ -14,8 +14,6 @@ int main(int argc, char** argv) cerr << "Needs path to input file as argument\n"; exit(0); } - do_compute(argv[1]); - return 0; bool use_old_read_testcase = false; if(argc >= 3 && string(argv[2]) == "1") use_old_read_testcase = true; diff --git a/public/c++/VectorPairHMM/utils.cc b/public/c++/VectorPairHMM/utils.cc index b13c84459..9974c5ace 100644 --- a/public/c++/VectorPairHMM/utils.cc +++ b/public/c++/VectorPairHMM/utils.cc @@ -2,6 +2,7 @@ #include "template.h" #include "utils.h" #include "vector_defs.h" +#include "LoadTimeInitializer.h" uint8_t ConvertChar::conversionTable[255]; float (*g_compute_full_prob_float)(testcase *tc, float* before_last_log) = 0; @@ -271,15 +272,9 @@ double getCurrClk() { return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; } -uint64_t get_time(struct timespec* store_struct) +void get_time(struct timespec* store_struct) { - static struct timespec start_time; - struct timespec curr_time; - struct timespec* ptr = (store_struct == 0) ? &curr_time : store_struct; - clock_gettime(CLOCK_REALTIME, ptr); - uint64_t diff_time = (ptr->tv_sec-start_time.tv_sec)*1000000000+(ptr->tv_nsec-start_time.tv_nsec); - start_time = *ptr; - return diff_time; + clock_gettime(CLOCK_REALTIME, store_struct); } uint64_t diff_time(struct timespec& prev_time) @@ -289,6 +284,7 @@ uint64_t diff_time(struct timespec& prev_time) return (uint64_t)((curr_time.tv_sec-prev_time.tv_sec)*1000000000+(curr_time.tv_nsec-prev_time.tv_nsec)); } +//#define DUMP_COMPUTE_VALUES 1 #define CHECK_VALUES 1 #define BATCH_SIZE 10000 #define RUN_HYBRID @@ -329,7 +325,8 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size) baseline_results_vec.clear(); results_vec.resize(tc_vector.size()); baseline_results_vec.resize(tc_vector.size()); - get_time(); + struct timespec start_time; + get_time(&start_time); #pragma omp parallel for schedule(dynamic,chunk_size) num_threads(12) for(unsigned i=0;i NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log=0); double getCurrClk(); -uint64_t get_time(struct timespec* x=0); +void get_time(struct timespec* x); uint64_t diff_time(struct timespec& prev_time); //bit 0 is sse4.2, bit 1 is AVX