468 lines
13 KiB
C++
468 lines
13 KiB
C++
#include "headers.h"
|
|
#include "template.h"
|
|
#include "utils.h"
|
|
#include "vector_defs.h"
|
|
|
|
// Out-of-class definition (storage) for the static table ConvertChar::conversionTable;
// the ConvertChar class itself is declared outside this file.
// NOTE(review): the table has 255 entries, so index 255 would be out of range if it
// is ever indexed by an arbitrary uint8_t - confirm against the declaration and users.
uint8_t ConvertChar::conversionTable[255];
|
|
// Pointer to the single-precision kernel used by do_compute(). It starts out null and
// is assigned by initialize_function_pointers() according to the detected CPU features.
float (*g_compute_full_prob_float)(testcase *tc, float* before_last_log) = 0;
|
|
// Pointer to the double-precision kernel; do_compute() calls it when the
// single-precision result falls below MIN_ACCEPTED. It starts out null and is
// assigned by initialize_function_pointers().
double (*g_compute_full_prob_double)(testcase *tc, double* before_last_log) = 0;
|
|
|
|
using namespace std;
|
|
|
|
/**
 * Reports whether AVX code can safely be executed on this machine.
 *
 * Three conditions must hold:
 *   1. CPUID leaf 1 reports AVX (ECX bit 28),
 *   2. CPUID leaf 1 reports OSXSAVE (ECX bit 27), i.e. XGETBV is usable,
 *   3. XCR0 bits 1 and 2 are set, i.e. the OS saves XMM and YMM state.
 * Checking only the AVX bit can report support on a system whose OS does not
 * preserve the YMM registers, which ends in an illegal-instruction fault.
 *
 * @return true when AVX is usable; always false on non-x86 targets.
 */
bool is_avx_supported()
{
#if defined(__i386__) || defined(__x86_64__)
    // cpuid overwrites EAX, EBX, ECX and EDX, so every one of them has to be
    // declared as an output; EAX selects the leaf and ECX the sub-leaf.
    unsigned eax = 1, ebx = 0, ecx = 0, edx = 0;
    __asm__ __volatile__("cpuid"
            : "+a" (eax), "=b" (ebx), "+c" (ecx), "=d" (edx));
    (void) eax;
    (void) ebx;
    (void) edx;

    const bool cpu_has_avx = ((ecx >> 28) & 1u) == 1u;
    const bool os_uses_xsave = ((ecx >> 27) & 1u) == 1u;
    if(!cpu_has_avx || !os_uses_xsave)
        return false;

    // xgetbv with ECX = 0 reads XCR0; emitted as raw bytes so that old
    // assemblers without the mnemonic still accept it.
    unsigned xcr0_lo = 0, xcr0_hi = 0;
    __asm__ __volatile__(".byte 0x0f, 0x01, 0xd0"
            : "=a" (xcr0_lo), "=d" (xcr0_hi)
            : "c" (0));
    (void) xcr0_hi;

    // bit 1: SSE (XMM) state, bit 2: AVX (YMM) state
    return (xcr0_lo & 0x6u) == 0x6u;
#else
    return false;
#endif
}
|
|
|
|
/**
 * Reports whether the CPU advertises SSE4.2 (CPUID leaf 1, ECX bit 20).
 *
 * @return true when the SSE4.2 feature bit is set; always false on non-x86
 *         targets.
 */
bool is_sse42_supported()
{
#if defined(__i386__) || defined(__x86_64__)
    // cpuid overwrites EAX, EBX, ECX and EDX, so every one of them has to be
    // declared as an output; EAX selects the leaf and ECX the sub-leaf.
    unsigned eax = 1, ebx = 0, ecx = 0, edx = 0;
    __asm__ __volatile__("cpuid"
            : "+a" (eax), "=b" (ebx), "+c" (ecx), "=d" (edx));
    (void) eax;
    (void) ebx;
    (void) edx;
    return ((ecx >> 20) & 1u) == 1u;
#else
    return false;
#endif
}
|
|
|
|
uint64_t get_machine_capabilities()
|
|
{
|
|
uint64_t machine_mask = 0ull;
|
|
if(is_avx_supported())
|
|
machine_mask |= (1 << AVX_CUSTOM_IDX);
|
|
if(is_sse42_supported())
|
|
machine_mask |= (1 << SSE42_CUSTOM_IDX);
|
|
return machine_mask;
|
|
}
|
|
|
|
void initialize_function_pointers(uint64_t mask)
|
|
{
|
|
//mask = 0;
|
|
if(is_avx_supported() && (mask & (1<< AVX_CUSTOM_IDX)))
|
|
{
|
|
cout << "Using AVX accelerated implementation of PairHMM\n";
|
|
g_compute_full_prob_float = compute_full_prob_avxs<float>;
|
|
g_compute_full_prob_double = compute_full_prob_avxd<double>;
|
|
}
|
|
else
|
|
if(is_sse42_supported() && (mask & (1<< SSE42_CUSTOM_IDX)))
|
|
{
|
|
cout << "Using SSE4.2 accelerated implementation of PairHMM\n";
|
|
g_compute_full_prob_float = compute_full_prob_sses<float>;
|
|
g_compute_full_prob_double = compute_full_prob_ssed<double>;
|
|
}
|
|
else
|
|
{
|
|
cout << "Using un-vectorized C++ implementation of PairHMM\n";
|
|
g_compute_full_prob_float = compute_full_prob<float>;
|
|
g_compute_full_prob_double = compute_full_prob<double>;
|
|
}
|
|
}
|
|
|
|
/**
 * Converts an encoded character into its numeric value by subtracting 33
 * (presumably the Phred+33 quality offset - confirm against the input format).
 *
 * @param c encoded character.
 * @return the character code minus 33; negative for codes below 33.
 */
int normalize(char c)
{
    const int ascii_offset = 33;
    const int code = static_cast<int>(c);
    return code - ascii_offset;
}
|
|
|
|
/**
 * Reads one test case from a text stream.
 *
 * A record is a single line holding six whitespace-separated fields:
 * haplotype, read sequence and four per-base strings (q, i, d, c) which must
 * have the same length as the read sequence. Every per-base character is
 * converted with normalize(); q values below 6 are raised to 6.
 *
 * On success tc->hap, tc->rs, tc->q, tc->i, tc->d and tc->c point to buffers
 * obtained from malloc() and owned by the caller.
 *
 * @param tc  test case to fill in.
 * @param ifp input stream; a null pointer selects stdin.
 * @return 0 on success, -1 on end of input, read error, allocation failure or
 *         a line that does not hold six fields. On failure nothing is left
 *         allocated and tc->hap / tc->rs are null.
 */
int read_testcase(testcase *tc, FILE* ifp)
{
    char *line = NULL;
    // getline() writes a full size_t through this pointer, so it must be a
    // real size_t (an int reinterpreted as size_t* overflows on LP64).
    size_t capacity = 0;

    const ssize_t read = getline(&line, &capacity, ifp == 0 ? stdin : ifp);
    if (read == -1)
    {
        free(line);   // getline() may have allocated a buffer even on failure
        return -1;
    }

    // Every field is shorter than the line, so `capacity` bytes are enough for
    // each of them, terminator included.
    tc->hap = (char *) malloc(capacity);
    tc->rs = (char *) malloc(capacity);
    char *q = (char *) malloc(capacity);
    char *i = (char *) malloc(capacity);
    char *d = (char *) malloc(capacity);
    char *c = (char *) malloc(capacity);

    const bool buffers_ok = tc->hap && tc->rs && q && i && d && c;
    if (!buffers_ok ||
        sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6)
    {
        free(tc->hap);
        free(tc->rs);
        tc->hap = NULL;
        tc->rs = NULL;
        free(q);
        free(i);
        free(d);
        free(c);
        free(line);
        return -1;
    }

    tc->haplen = strlen(tc->hap);
    tc->rslen = strlen(tc->rs);
    assert(strlen(q) == tc->rslen);
    assert(strlen(i) == tc->rslen);
    assert(strlen(d) == tc->rslen);
    assert(strlen(c) == tc->rslen);

    tc->q = (char *) malloc(sizeof(char) * tc->rslen);
    tc->i = (char *) malloc(sizeof(char) * tc->rslen);
    tc->d = (char *) malloc(sizeof(char) * tc->rslen);
    tc->c = (char *) malloc(sizeof(char) * tc->rslen);

    for (int x = 0; x < tc->rslen; x++)
    {
        const int q_val = normalize(q[x]);
        tc->q[x] = (q_val < 6) ? 6 : q_val;   // q is clamped to a minimum of 6
        tc->i[x] = normalize(i[x]);
        tc->d[x] = normalize(d[x]);
        tc->c[x] = normalize(c[x]);
    }

    free(q);
    free(i);
    free(d);
    free(c);
    free(line);

    return 0;
}
|
|
|
|
// Size in bytes of the buffers used for line-oriented input in this file.
// NOTE(review): this is a mutable global with external linkage - confirm whether any
// other translation unit relies on it before making it const or file-local.
unsigned MAX_LINE_LENGTH = 65536;
|
|
/**
 * Parses the leading integer of a string.
 *
 * @param s text to parse; leading whitespace is skipped and parsing stops at
 *          the first character that cannot belong to the number.
 * @return the parsed value, or 0 when the string holds no number (including
 *         an empty or all-whitespace string).
 */
int convToInt(std::string s)
{
    // Start from 0: when the stream is empty the extraction never writes to
    // the target, and returning an uninitialized int is undefined behaviour.
    int value = 0;
    std::istringstream strin(s);
    strin >> value;
    return value;
}
|
|
|
|
/**
 * Reads the next line that contains at least one token and splits it on
 * whitespace.
 *
 * Blank and whitespace-only lines are skipped. Lines may be arbitrarily long:
 * a fixed-size buffer is no longer involved, so an over-long line cannot leave
 * the stream stuck in a failed state and spin forever. A stream that is
 * already in a failed state simply yields no tokens.
 *
 * @param fptr   input stream, positioned at the start of a line.
 * @param tokens receives the tokens of that line; left empty when the end of
 *               the stream is reached without finding any token.
 */
void tokenize(std::ifstream& fptr, std::vector<std::string>& tokens)
{
    tokens.clear();

    std::string line;
    // Test tokens.empty() first so that no further line is consumed once a
    // non-blank line has been found.
    while(tokens.empty() && std::getline(fptr, line))
    {
        // Everything after an embedded NUL is ignored, as it was when the line
        // was handled as a C string.
        const std::string::size_type nul_pos = line.find('\0');
        if(nul_pos != std::string::npos)
            line.erase(nul_pos);

        std::istringstream fields(line);
        std::string word;
        while(fields >> word)
            tokens.push_back(word);
    }
}
|
|
|
|
/**
 * Reads one test case in the numeric ("mod") text format.
 *
 * A record is one line of whitespace-separated tokens: the haplotype, the
 * read sequence and then 4 * rslen integers - all q values, then all i, all d
 * and all c values, where rslen is the length of the read sequence.
 *
 * On success tc->hap, tc->rs, tc->q, tc->i, tc->d and tc->c point to arrays
 * obtained from new[] and owned by the caller. tc->hap and tc->rs are
 * NUL-terminated.
 *
 * @param fptr     input stream.
 * @param tc       test case to fill in.
 * @param reformat when true the record is also written, with every value
 *                 re-encoded as (value + 33), to reformat/debug_dump.txt; the
 *                 file is truncated on the first such call of the process and
 *                 appended to afterwards.
 * @return the number of tokens in the record, or -1 when no further record
 *         exists or (in builds without assertions) the record is malformed.
 */
int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat)
{
    static bool first_call = true;

    vector<string> tokens;
    tokenize(fptr, tokens);
    if(tokens.size() == 0)
        return -1;

    // Validate before touching tokens[1] or any of the numeric tokens. Debug
    // builds still abort on a malformed record; release builds report it
    // instead of indexing past the end of the vector.
    const bool well_formed = tokens.size() >= 2 &&
                             tokens.size() == 2 + 4*tokens[1].size();
    assert(well_formed);
    if(!well_formed)
        return -1;

    const string& hap = tokens[0];
    const string& rs = tokens[1];
    const size_t rslen = rs.size();

    // The two spare bytes are zeroed so that both sequences are terminated.
    tc->hap = new char[hap.size()+2];
    memset(tc->hap, 0, hap.size()+2);
    memcpy(tc->hap, hap.c_str(), hap.size());
    tc->haplen = hap.size();

    tc->rs = new char[rs.size()+2];
    memset(tc->rs, 0, rs.size()+2);
    memcpy(tc->rs, rs.c_str(), rs.size());
    tc->rslen = rslen;

    tc->q = new char[rslen];
    tc->i = new char[rslen];
    tc->d = new char[rslen];
    tc->c = new char[rslen];

    // The numeric tokens come in four consecutive groups of rslen values.
    char* groups[4] = { tc->q, tc->i, tc->d, tc->c };
    for(unsigned g=0;g<4;++g)
        for(size_t j=0;j<rslen;++j)
            groups[g][j] = (char)convToInt(tokens[2+g*rslen+j]);

    if(reformat)
    {
        ofstream ofptr;
        ofptr.open("reformat/debug_dump.txt",first_call ? ios::out : ios::app);
        assert(ofptr.is_open());
        ofptr << tokens[0] << " ";
        ofptr << tokens[1] << " ";
        for(unsigned g=0;g<4;++g)
        {
            for(size_t j=0;j<rslen;++j)
                ofptr << ((char)(groups[g][j]+33));
            if(g < 3)
                ofptr << " ";
        }
        ofptr << " 0 false\n";

        ofptr.close();
        first_call = false;
    }

    return (int)tokens.size();
}
|
|
|
|
/**
 * Returns the current wall-clock time as reported by gettimeofday().
 *
 * @return seconds since the Epoch, with the microsecond part as the fraction.
 */
double getCurrClk() {
    struct timeval now;
    gettimeofday(&now, NULL);
    const double whole_seconds = (double)now.tv_sec;
    const double fraction = (double)now.tv_usec / 1000000.0;
    return whole_seconds + fraction;
}
|
|
|
|
/**
 * Returns the time elapsed since the previous call and restarts the interval.
 *
 * The reference point lives in a function-local static, so it is shared by
 * all callers. It starts out zero-initialized, which makes the very first
 * call return the time since the Epoch rather than a meaningful interval.
 *
 * @param store_struct when non-null, receives the current CLOCK_REALTIME
 *                     reading.
 * @return nanoseconds between the previous call and this one.
 */
uint64_t get_time(struct timespec* store_struct)
{
    static struct timespec start_time;
    struct timespec local_now;

    // Write into the caller's struct when one is given, else into a local one.
    struct timespec* now = store_struct;
    if(now == 0)
        now = &local_now;
    clock_gettime(CLOCK_REALTIME, now);

    const uint64_t elapsed_ns = (now->tv_sec-start_time.tv_sec)*1000000000+(now->tv_nsec-start_time.tv_nsec);
    start_time = *now;   // the next interval starts here
    return elapsed_ns;
}
|
|
|
|
/**
 * Returns the time elapsed since an earlier CLOCK_REALTIME reading.
 *
 * @param prev_time the earlier reading; it is not modified.
 * @return nanoseconds between prev_time and now.
 */
uint64_t diff_time(struct timespec& prev_time)
{
    struct timespec now;
    clock_gettime(CLOCK_REALTIME, &now);
    const uint64_t elapsed_ns = (uint64_t)((now.tv_sec-prev_time.tv_sec)*1000000000+(now.tv_nsec-prev_time.tv_nsec));
    return elapsed_ns;
}
|
|
|
|
//#define USE_PAPI
|
|
#ifdef USE_PAPI
|
|
#include "papi.h"
|
|
#define NUM_PAPI_COUNTERS 4
|
|
#endif
|
|
|
|
// Table of 128 counters that do_compute() zeroes on entry. The only other use visible
// in this file is commented-out reporting code that indexes it with FE_* flags.
// NOTE(review): presumably filled by a floating-point exception handler elsewhere - confirm.
uint64_t exceptions_array[128];
|
|
void do_compute(char* filename)
|
|
{
|
|
memset(exceptions_array, 0, 128*sizeof(uint64_t));
|
|
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
|
|
//assert(feenableexcept(FE_DIVBYZERO | FE_INVALID) >= 0);
|
|
#ifdef USE_PAPI
|
|
PAPI_num_counters();
|
|
//int events[NUM_PAPI_COUNTERS] = { PAPI_TOT_INS, PAPI_TOT_CYC, PAPI_L1_DCM, PAPI_L1_ICM, PAPI_L3_TCM, PAPI_TLB_DM, PAPI_TLB_IM };
|
|
//char* eventnames[NUM_PAPI_COUNTERS]= { "instructions", "cycles", "l1d_misses", "l1i_misses", "l3_misses", "dtlb_misses", "itlb_misses" };
|
|
//long long values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0, 0, 0, 0 };
|
|
//long long accum_values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0, 0, 0, 0 };
|
|
//int events[NUM_PAPI_COUNTERS] = { PAPI_TOT_INS, PAPI_TOT_CYC, PAPI_L1_ICM };
|
|
//char* eventnames[NUM_PAPI_COUNTERS]= { "instructions", "cycles", "l1i_misses"};
|
|
//assert(PAPI_event_name_to_code("PERF_COUNT_HW_STALLED_CYCLES_FRONTEND",&(events[2])) == PAPI_OK);
|
|
int events[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
|
|
//assert(PAPI_event_name_to_code("ICACHE:IFETCH_STALL",&(events[2])) == PAPI_OK);
|
|
//assert(PAPI_event_name_to_code("MACHINE_CLEARS:e",&(events[3])) == PAPI_OK);
|
|
char* eventnames[NUM_PAPI_COUNTERS]= { "instructions", "cycles", "ifetch_stall", "store_misses" };
|
|
assert(PAPI_event_name_to_code("ix86arch::INSTRUCTION_RETIRED",&(events[0])) == PAPI_OK);
|
|
assert(PAPI_event_name_to_code("UNHALTED_REFERENCE_CYCLES",&(events[1])) == PAPI_OK);
|
|
assert(PAPI_event_name_to_code("ICACHE:IFETCH_STALL", &(events[2])) == PAPI_OK);
|
|
assert(PAPI_event_name_to_code("perf::L1-DCACHE-STORE-MISSES", &(events[3])) == PAPI_OK);
|
|
long long values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
|
|
long long accum_values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
|
|
|
|
#endif
|
|
#define BATCH_SIZE 100000
|
|
bool use_old_read_testcase = true;
|
|
unsigned chunk_size = 100;
|
|
std::ifstream ifptr;
|
|
FILE* fptr = 0;
|
|
if(use_old_read_testcase)
|
|
{
|
|
fptr = fopen(filename,"r");
|
|
if(fptr == 0)
|
|
cerr << "Could not open file "<<filename<<"\n";
|
|
assert(fptr);
|
|
}
|
|
else
|
|
{
|
|
ifptr.open(filename);
|
|
assert(ifptr.is_open());
|
|
}
|
|
vector<testcase> tc_vector;
|
|
tc_vector.clear();
|
|
vector<double> results_vec;
|
|
results_vec.clear();
|
|
vector<double> baseline_results;
|
|
baseline_results.clear();
|
|
|
|
bool all_ok = true;
|
|
uint64_t total_time = 0;
|
|
uint64_t baseline_time = 0;
|
|
unsigned total_count = 0;
|
|
unsigned num_testcases = 0;
|
|
//unsigned curr_batch_size = rand()%BATCH_SIZE + 4; //min batch size
|
|
unsigned curr_batch_size = BATCH_SIZE;
|
|
|
|
testcase tc_in;
|
|
int break_value = 0;
|
|
while(1)
|
|
{
|
|
break_value = use_old_read_testcase ? read_testcase(&tc_in, fptr) :
|
|
read_mod_testcase(ifptr, &tc_in, true);
|
|
tc_vector.push_back(tc_in);
|
|
if(break_value >= 0)
|
|
++num_testcases;
|
|
if(num_testcases == curr_batch_size || (break_value < 0 && num_testcases > 0))
|
|
{
|
|
results_vec.resize(tc_vector.size());
|
|
baseline_results.resize(tc_vector.size());
|
|
|
|
get_time();
|
|
#ifdef USE_PAPI
|
|
assert(PAPI_start_counters(events, NUM_PAPI_COUNTERS) == PAPI_OK);
|
|
#endif
|
|
#pragma omp parallel for schedule(dynamic,chunk_size) num_threads(12)
|
|
for(unsigned i=0;i<num_testcases;++i)
|
|
{
|
|
double result = 0;
|
|
float result_avxf = g_compute_full_prob_float(&(tc_vector[i]), 0);
|
|
if (result_avxf < MIN_ACCEPTED) {
|
|
double result_avxd = g_compute_full_prob_double(&(tc_vector[i]), 0);
|
|
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
|
|
}
|
|
else
|
|
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
|
|
results_vec[i] = result;
|
|
}
|
|
#ifdef USE_PAPI
|
|
//assert(PAPI_accum_counters(values, NUM_PAPI_COUNTERS) == PAPI_OK);
|
|
assert(PAPI_stop_counters(values, NUM_PAPI_COUNTERS) == PAPI_OK);
|
|
#endif
|
|
total_time += get_time();
|
|
#ifdef USE_PAPI
|
|
for(unsigned k=0;k<NUM_PAPI_COUNTERS;++k)
|
|
accum_values[k] += values[k];
|
|
#endif
|
|
|
|
#if 0
|
|
#pragma omp parallel for schedule(dynamic,chunk_size)
|
|
for(unsigned i=0;i<num_testcases;++i)
|
|
{
|
|
testcase& tc = tc_vector[i];
|
|
float result_avxf = compute_full_prob<float>(&tc);
|
|
double result = 0;
|
|
if (result_avxf < MIN_ACCEPTED) {
|
|
double result_avxd = compute_full_prob<double>(&tc);
|
|
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
|
|
}
|
|
else
|
|
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
|
|
baseline_results[i] = result;
|
|
}
|
|
baseline_time += get_time();
|
|
for(unsigned i=0;i<num_testcases;++i)
|
|
{
|
|
double baseline_result = baseline_results[i];
|
|
double abs_error = fabs(baseline_result-results_vec[i]);
|
|
double rel_error = (baseline_result != 0) ? fabs(abs_error/baseline_result) : 0;
|
|
if(abs_error > 1e-5 && rel_error > 1e-5)
|
|
{
|
|
cout << "Line "<<total_count+i<< " " << std::scientific << baseline_result << " "<<results_vec[i]<<"\n";
|
|
all_ok = false;
|
|
}
|
|
}
|
|
#endif
|
|
for(unsigned i=0;i<num_testcases;++i)
|
|
{
|
|
delete tc_vector[i].rs;
|
|
delete tc_vector[i].hap;
|
|
delete tc_vector[i].q;
|
|
delete tc_vector[i].i;
|
|
delete tc_vector[i].d;
|
|
delete tc_vector[i].c;
|
|
}
|
|
total_count += num_testcases;
|
|
num_testcases = 0;
|
|
tc_vector.clear();
|
|
baseline_results.clear();
|
|
results_vec.clear();
|
|
//curr_batch_size = rand()%BATCH_SIZE + 4; //min batch size
|
|
curr_batch_size = BATCH_SIZE;
|
|
if(break_value < 0)
|
|
break;
|
|
}
|
|
}
|
|
|
|
baseline_results.clear();
|
|
results_vec.clear();
|
|
tc_vector.clear();
|
|
if(all_ok)
|
|
cout << "All outputs acceptable\n";
|
|
#ifdef USE_PAPI
|
|
for(unsigned i=0;i<NUM_PAPI_COUNTERS;++i)
|
|
cout << eventnames[i] << " : "<<accum_values[i]<<"\n";
|
|
#endif
|
|
cout << "Total vector time "<< (total_time*1e-9) << " baseline time "<<baseline_time*1e-9<<"\n";
|
|
cout.flush();
|
|
fflush(stdout);
|
|
if(use_old_read_testcase)
|
|
fclose(fptr);
|
|
else
|
|
ifptr.close();
|
|
//cout << "Exceptions "<<exceptions_array[FE_INVALID]<< " "
|
|
//<<exceptions_array[__FE_DENORM]<< " "
|
|
//<<exceptions_array[FE_DIVBYZERO]<< " "
|
|
//<<exceptions_array[FE_OVERFLOW]<< " "
|
|
//<<exceptions_array[FE_UNDERFLOW]<< " "
|
|
//<<exceptions_array[FE_INEXACT]<< "\n";
|
|
|
|
}
|