diff --git a/public/c++/VectorPairHMM/LoadTimeInitializer.cc b/public/c++/VectorPairHMM/LoadTimeInitializer.cc index 7a4e46161..b8ff2ddf8 100644 --- a/public/c++/VectorPairHMM/LoadTimeInitializer.cc +++ b/public/c++/VectorPairHMM/LoadTimeInitializer.cc @@ -8,7 +8,9 @@ LoadTimeInitializer::LoadTimeInitializer() //will be called when library is loa { ConvertChar::init(); #ifndef DISABLE_FTZ - //Very important to get good performance - enable FTZ, converts denormals to 0 + //Very important to get good performance on Intel processors + //Function: enabling FTZ converts denormals to 0 in hardware + //Denormals cause microcode to insert uops into the core causing big slowdown _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); cout << "FTZ enabled - may decrease accuracy if denormal numbers encountered\n"; #else diff --git a/public/c++/VectorPairHMM/Makefile b/public/c++/VectorPairHMM/Makefile index 0b564d033..269cecbd2 100644 --- a/public/c++/VectorPairHMM/Makefile +++ b/public/c++/VectorPairHMM/Makefile @@ -12,6 +12,9 @@ CC=icc CXX=icc LDFLAGS=-lm -lrt $(OMPLDFLAGS) +ifdef DISABLE_FTZ + COMMON_COMPILATION_FLAGS+=-DDISABLE_FTZ -no-ftz +endif PAPI_DIR=/home/karthikg/softwares/papi-5.3.0 ifdef USE_PAPI diff --git a/public/c++/VectorPairHMM/pairhmm-1-base.cc b/public/c++/VectorPairHMM/pairhmm-1-base.cc index 2c80f2a2e..a552aecca 100644 --- a/public/c++/VectorPairHMM/pairhmm-1-base.cc +++ b/public/c++/VectorPairHMM/pairhmm-1-base.cc @@ -5,7 +5,6 @@ #include "utils.h" #include "LoadTimeInitializer.h" using namespace std; -#define RUN_HYBRID int main(int argc, char** argv) { @@ -20,129 +19,11 @@ int main(int argc, char** argv) bool use_old_read_testcase = false; if(argc >= 3 && string(argv[2]) == "1") use_old_read_testcase = true; - unsigned chunk_size = 100; + unsigned chunk_size = 10000; if(argc >= 4) chunk_size = strtol(argv[3],0,10); - std::ifstream ifptr; - FILE* fptr = 0; - if(use_old_read_testcase) - { - fptr = fopen(argv[1],"r"); - assert(fptr); - } - else - { - ifptr.open(argv[1]); - assert(ifptr.is_open()); - } - vector results_vec; - vector tc_vector; - tc_vector.clear(); - tc_vector.resize(BATCH_SIZE+4); - results_vec.clear(); - vector baseline_results; - baseline_results.clear(); - - bool all_ok = true; - uint64_t total_time = 0; - uint64_t baseline_time = 0; - unsigned total_count = 0; - unsigned num_testcases = 0; - //unsigned curr_batch_size = rand()%BATCH_SIZE + 4; //min batch size - unsigned curr_batch_size = BATCH_SIZE; - uint64_t product = 0; - - testcase tc_in; - int break_value = 0; - tc_vector.clear(); - while(1) - { - break_value = use_old_read_testcase ? read_testcase(&tc_in, fptr) : - read_mod_testcase(ifptr, &tc_in, true); - tc_vector.push_back(tc_in); - if(break_value >= 0) - ++num_testcases; - if(break_value < 0) - break; - } - if(num_testcases == curr_batch_size || (break_value < 0 && num_testcases > 0)) - { - results_vec.resize(tc_vector.size()); - baseline_results.resize(tc_vector.size()); - - get_time(); -#pragma omp parallel for schedule(dynamic,chunk_size) num_threads(12) - for(unsigned i=0;i(&tc); - double result = 0; - if (result_avxf < MIN_ACCEPTED) { - double result_avxd = compute_full_prob(&tc); - result = log10(result_avxd) - log10(ldexp(1.0, 1020.0)); - } - else - result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f))); - baseline_results[i] = result; - } - baseline_time += get_time(); - for(unsigned i=0;i 1e-5 && rel_error > 1e-5) - { - cout << "Line "<haplen; x++) //tc->ihap[x] = tc->hap[x]; - free(q); free(i); @@ -290,56 +289,16 @@ uint64_t diff_time(struct timespec& prev_time) return (uint64_t)((curr_time.tv_sec-prev_time.tv_sec)*1000000000+(curr_time.tv_nsec-prev_time.tv_nsec)); } -//#define USE_PAPI -//#define COUNT_EXCEPTIONS -//#define CHECK_RESULTS -#define CHECK_UNDERFLOW 1 -#ifdef USE_PAPI -#include "papi.h" -#define NUM_PAPI_COUNTERS 4 -#endif - -IF_32 g_converter; -FILE* g_debug_fptr = 0; -uint64_t exceptions_array[128]; -void do_compute(char* filename) +#define CHECK_VALUES 1 +#define BATCH_SIZE 10000 +#define RUN_HYBRID +void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size) { - //g_debug_fptr = fopen("/mnt/app_hdd/scratch/karthikg/dump.log","w"); - //assert(g_debug_fptr); - for(unsigned i=0;i<128;++i) - exceptions_array[i] = 0ull; - //assert(feenableexcept(FE_DIVBYZERO | FE_INVALID) >= 0); -#ifdef USE_PAPI - PAPI_num_counters(); - //int events[NUM_PAPI_COUNTERS] = { PAPI_TOT_INS, PAPI_TOT_CYC, PAPI_L1_DCM, PAPI_L1_ICM, PAPI_L3_TCM, PAPI_TLB_DM, PAPI_TLB_IM }; - //char* eventnames[NUM_PAPI_COUNTERS]= { "instructions", "cycles", "l1d_misses", "l1i_misses", "l3_misses", "dtlb_misses", "itlb_misses" }; - //long long values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0, 0, 0, 0 }; - //long long accum_values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0, 0, 0, 0 }; - //int events[NUM_PAPI_COUNTERS] = { PAPI_TOT_INS, PAPI_TOT_CYC, PAPI_L1_ICM }; - //char* eventnames[NUM_PAPI_COUNTERS]= { "instructions", "cycles", "l1i_misses"}; - //assert(PAPI_event_name_to_code("PERF_COUNT_HW_STALLED_CYCLES_FRONTEND",&(events[2])) == PAPI_OK); - int events[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 }; - //assert(PAPI_event_name_to_code("ICACHE:IFETCH_STALL",&(events[2])) == PAPI_OK); - //assert(PAPI_event_name_to_code("MACHINE_CLEARS:e",&(events[3])) == PAPI_OK); - char* eventnames[NUM_PAPI_COUNTERS]= { "instructions", "cycles", "fp_assists", "idq_ms_cycles" }; - assert(PAPI_event_name_to_code("ix86arch::INSTRUCTION_RETIRED",&(events[0])) == PAPI_OK); - assert(PAPI_event_name_to_code("UNHALTED_REFERENCE_CYCLES",&(events[1])) == PAPI_OK); - assert(PAPI_event_name_to_code("FP_ASSIST:ANY", &(events[2])) == PAPI_OK); - assert(PAPI_event_name_to_code("IDQ:MS_UOPS_CYCLES", &(events[3])) == PAPI_OK); - long long values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 }; - long long accum_values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 }; - -#endif -#define BATCH_SIZE 10000 - bool use_old_read_testcase = true; - unsigned chunk_size = 100; - std::ifstream ifptr; FILE* fptr = 0; + ifstream ifptr; if(use_old_read_testcase) { fptr = fopen(filename,"r"); - if(fptr == 0) - cerr << "Could not open file "< tc_vector; tc_vector.clear(); - vector results_vec; - results_vec.clear(); - vector baseline_results; - baseline_results.clear(); - + testcase tc; + uint64_t vector_compute_time = 0; + uint64_t baseline_compute_time = 0; + uint64_t num_double_calls = 0; bool all_ok = true; - uint64_t total_time = 0; - uint64_t baseline_time = 0; - unsigned total_count = 0; - unsigned num_testcases = 0; - //unsigned curr_batch_size = rand()%BATCH_SIZE + 4; //min batch size - unsigned curr_batch_size = BATCH_SIZE; - - testcase tc_in; - int break_value = 0; - uint64_t fp_single_exceptions_reexecute = 0; - uint64_t fp_single_exceptions_continue = 0; - uint64_t num_double_executions = 0; +#ifndef CHECK_VALUES + all_ok = false; +#endif while(1) { - break_value = use_old_read_testcase ? read_testcase(&tc_in, fptr) : - read_mod_testcase(ifptr, &tc_in, true); - tc_vector.push_back(tc_in); + int break_value = use_old_read_testcase ? read_testcase(&tc, fptr) : read_mod_testcase(ifptr,&tc,true); if(break_value >= 0) - ++num_testcases; - if(num_testcases == curr_batch_size || (break_value < 0 && num_testcases > 0)) + tc_vector.push_back(tc); + if(tc_vector.size() == BATCH_SIZE || (break_value < 0 && tc_vector.size() > 0)) { + vector results_vec; + vector baseline_results_vec; + results_vec.clear(); + baseline_results_vec.clear(); results_vec.resize(tc_vector.size()); - baseline_results.resize(tc_vector.size()); - + baseline_results_vec.resize(tc_vector.size()); get_time(); -#ifdef USE_PAPI - assert(PAPI_start_counters(events, NUM_PAPI_COUNTERS) == PAPI_OK); -#endif #pragma omp parallel for schedule(dynamic,chunk_size) num_threads(12) - for(unsigned i=0;i(&tc); - double result = 0; - if (result_avxf < MIN_ACCEPTED) { - double result_avxd = compute_full_prob(&tc); - result = log10(result_avxd) - log10(ldexp(1.0, 1020.0)); - } - else - result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f))); - baseline_results[i] = result; + testcase& tc = tc_vector[i]; + double baseline_result = compute_full_prob(&tc); + baseline_result = log10(baseline_result) - log10(ldexp(1.0, 1020.0)); + baseline_results_vec[i] = baseline_result; } - baseline_time += get_time(); - for(unsigned i=0;i 1e-5 && rel_error > 1e-5) + double baseline_result = baseline_results_vec[i]; + double abs_error = fabs(baseline_result-results_vec[i]); + double rel_error = (baseline_result != 0) ? fabs(abs_error/baseline_result) : 0; + if(abs_error > 1e-5 && rel_error > 1e-5) { - cout << "Line "<