Now has PAPI values

This commit is contained in:
Karthik Gururaj 2014-02-25 21:44:20 -08:00 committed by Intel Repocontact
parent e32e9e6af6
commit 15fe244e4b
9 changed files with 229 additions and 82 deletions

View File

@ -1043,7 +1043,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
referenceConfidenceModel.close();
//TODO remove the need to call close here for debugging, the likelihood output stream should be managed
//TODO (open & close) at the walker, not the engine.
likelihoodCalculationEngine.close();
likelihoodCalculationEngine.close();
logger.info("Ran local assembly on " + result + " active regions");
}

View File

@ -75,8 +75,9 @@ import java.io.IOException;
*/
public class DebugJNILoglessPairHMM extends LoglessPairHMM {
private static final boolean dumpSandboxOnly = false; //simulates ifdef
private static final boolean debug = false; //simulates ifdef
private static final boolean verify = debug || true; //simulates ifdef
private static final boolean verify = !dumpSandboxOnly && (debug || true); //simulates ifdef
private static final boolean debug0_1 = false; //simulates ifdef
private static final boolean debug1 = false; //simulates ifdef
private static final boolean debug2 = false;
@ -134,9 +135,11 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM {
public void initialize( final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList,
final int readMaxLength, final int haplotypeMaxLength ) {
if(verify)
{
super.initialize(haplotypes, perSampleReadList, readMaxLength, haplotypeMaxLength);
jniPairHMM.initialize(haplotypes, perSampleReadList, readMaxLength, haplotypeMaxLength);
haplotypeToHaplotypeListIdxMap = jniPairHMM.getHaplotypeToHaplotypeListIdxMap();
jniPairHMM.initialize(haplotypes, perSampleReadList, readMaxLength, haplotypeMaxLength);
haplotypeToHaplotypeListIdxMap = jniPairHMM.getHaplotypeToHaplotypeListIdxMap();
}
}
/**
@ -145,7 +148,8 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM {
@Override
public void finalizeRegion()
{
jniPairHMM.finalizeRegion();
if(!dumpSandboxOnly)
jniPairHMM.finalizeRegion();
}
/**
@ -204,46 +208,61 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM {
++idx;
}
}
jniPairHMM.computeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap);
double[] likelihoodArray = jniPairHMM.getLikelihoodArray();
//to compare values
final PerReadAlleleLikelihoodMap likelihoodMap = super.computeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap);
double[] likelihoodArray = null;
PerReadAlleleLikelihoodMap likelihoodMap = null;
if(verify)
{
//re-order values in likelihoodArray
double[] tmpArray = new double[numHaplotypes];
idx = 0;
int idxInsideHaplotypeList = 0;
int readIdx = 0;
for(GATKSAMRecord read : reads)
jniPairHMM.computeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap);
likelihoodArray = jniPairHMM.getLikelihoodArray();
//to compare values
likelihoodMap = super.computeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap);
}
else
{
likelihoodMap = new PerReadAlleleLikelihoodMap();
likelihoodArray = new double[numTestcases];
for(int i=0;i<numTestcases;++i)
likelihoodArray[i] = -0.5;
}
if(verify || dumpSandboxOnly)
{
boolean toDump = dumpSandboxOnly ? true : false;
if(verify)
{
for(int j=0;j<numHaplotypes;++j)
tmpArray[j] = likelihoodArray[readIdx+j];
for (Map.Entry<Allele, Haplotype> currEntry : alleleHaplotypeMap.entrySet())//order is important - access in same order always
//re-order values in likelihoodArray
double[] tmpArray = new double[numHaplotypes];
idx = 0;
int idxInsideHaplotypeList = 0;
int readIdx = 0;
for(GATKSAMRecord read : reads)
{
idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(currEntry.getValue());
likelihoodArray[idx] = tmpArray[idxInsideHaplotypeList];
++idx;
for(int j=0;j<numHaplotypes;++j)
tmpArray[j] = likelihoodArray[readIdx+j];
for (Map.Entry<Allele, Haplotype> currEntry : alleleHaplotypeMap.entrySet())//order is important - access in same order always
{
idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(currEntry.getValue());
likelihoodArray[idx] = tmpArray[idxInsideHaplotypeList];
++idx;
}
readIdx += numHaplotypes;
}
readIdx += numHaplotypes;
}
//for floating point values, no exact equality
//check whether numbers are close in terms of abs_error or relative_error
//For very large values, relative_error is relevant
//For very small values, abs_error is relevant
boolean toDump = false;
for(int i=0;i<likelihoodArray.length;++i)
{
double abs_error = Math.abs(likelihoodArray[i] - mLikelihoodArray[i]);
double relative_error = 0;
if(mLikelihoodArray[i] == 0)
relative_error = 0;
else
relative_error = Math.abs(abs_error/mLikelihoodArray[i]);
if(abs_error > 1e-5 && relative_error > 1e-5)
//for floating point values, no exact equality
//check whether numbers are close in terms of abs_error or relative_error
//For very large values, relative_error is relevant
//For very small values, abs_error is relevant
for(int i=0;i<likelihoodArray.length;++i)
{
toDump = true;
break;
double abs_error = Math.abs(likelihoodArray[i] - mLikelihoodArray[i]);
double relative_error = 0;
if(mLikelihoodArray[i] == 0)
relative_error = 0;
else
relative_error = Math.abs(abs_error/mLikelihoodArray[i]);
if(abs_error > 1e-6 && relative_error > 1e-6)
{
toDump = true;
break;
}
}
}
//if numbers are not close, then dump out the data that produced the inconsistency
@ -251,24 +270,40 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM {
{
idx = 0;
System.out.println("Dump : Java numReads "+readListSize+" numHaplotypes "+numHaplotypes);
boolean firstLine = true;
for(GATKSAMRecord read : reads)
{
byte [] overallGCP = GCPArrayMap.get(read);
byte[] tmpByteArray = new byte[read.getReadBases().length];
for (Map.Entry<Allele, Haplotype> currEntry : alleleHaplotypeMap.entrySet()) //order is important - access in same order always
{
byte[] haplotypeBases = currEntry.getValue().getBases();
debugDump("debug_dump.txt",new String(haplotypeBases)+" ",true);
debugDump("debug_dump.txt",new String(read.getReadBases())+" ",true);
for(int k=0;k<read.getReadBases().length;++k)
debugDump("debug_dump.txt",String.format("%d",(int)(read.getBaseQualities()[k]))+" ",true);
tmpByteArray[k] = (byte)((int)((read.getBaseQualities())[k]) + 33);
debugDump("debug_dump.txt",new String(tmpByteArray)+" ",true);
for(int k=0;k<read.getReadBases().length;++k)
debugDump("debug_dump.txt",String.format("%d",(int)(read.getBaseInsertionQualities()[k]))+" ",true);
tmpByteArray[k] = (byte)((int)((read.getBaseInsertionQualities())[k]) + 33);
debugDump("debug_dump.txt",new String(tmpByteArray)+" ",true);
for(int k=0;k<read.getReadBases().length;++k)
debugDump("debug_dump.txt",String.format("%d",(int)(read.getBaseDeletionQualities()[k]))+" ",true);
tmpByteArray[k] = (byte)((int)((read.getBaseDeletionQualities())[k]) + 33);
debugDump("debug_dump.txt",new String(tmpByteArray)+" ",true);
for(int k=0;k<read.getReadBases().length;++k)
debugDump("debug_dump.txt",String.format("%d",(int)(overallGCP[k]))+" ",true);
debugDump("debug_dump.txt","\n",true);
debugDump("debug_results.txt",String.format("%e %e\n",mLikelihoodArray[idx],likelihoodArray[idx]),true);
tmpByteArray[k] = (byte)((int)(overallGCP[k]) + 33);
debugDump("debug_dump.txt",new String(tmpByteArray),true);
if(firstLine)
{
debugDump("debug_dump.txt",String.format(" %d %d\n",readListSize, numHaplotypes), true);
firstLine = false;
}
else
debugDump("debug_dump.txt","\n",true);
if(verify)
debugDump("debug_results.txt",String.format("%e %e\n",mLikelihoodArray[idx],likelihoodArray[idx]),true);
else
if(dumpSandboxOnly)
likelihoodMap.add(read, currEntry.getKey(), likelihoodArray[idx]);
++idx;
}
}

View File

@ -12,3 +12,5 @@ reformat
subdir_checkout.sh
avx/
sse/
triplicate.sh

View File

@ -47,7 +47,13 @@ LoadTimeInitializer::LoadTimeInitializer() //will be called when library is loa
m_filename_to_fptr.clear();
m_written_files_set.clear();
//Common buffer - 8MB
unsigned size = 1024*1024;
m_buffer = new uint64_t[size];
m_buffer_size = size*sizeof(uint64_t);
initialize_function_pointers();
cout.flush();
}

View File

@ -22,6 +22,7 @@ class LoadTimeInitializer
{
public:
LoadTimeInitializer(); //will be called when library is loaded
~LoadTimeInitializer() { delete m_buffer; }
void print_profiling();
void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true);
void debug_close();
@ -43,6 +44,8 @@ class LoadTimeInitializer
uint64_t m_data_transfer_time;
//bytes copied
uint64_t m_bytes_copied;
unsigned get_buffer_size() { return m_buffer_size; }
char* get_buffer() { return (char*)m_buffer; }
private:
std::map<std::string, std::ofstream*> m_filename_to_fptr;
std::set<std::string> m_written_files_set;
@ -52,6 +55,8 @@ class LoadTimeInitializer
double m_sum_square_stats[TOTAL_NUMBER_STATS];
uint64_t m_min_stats[TOTAL_NUMBER_STATS];
uint64_t m_max_stats[TOTAL_NUMBER_STATS];
unsigned m_buffer_size;
uint64_t* m_buffer;
};
extern LoadTimeInitializer g_load_time_initializer;

View File

@ -73,7 +73,9 @@ JNIEXPORT void JNICALL Java_Sandbox_doEverythingNative
(JNIEnv* env, jobject thisObject, jstring fileNameString)
{
const char* fileName = env->GetStringUTFChars(fileNameString, 0);
do_compute((char*)fileName);
char local_array[800];
strncpy(local_array, fileName, 200);
env->ReleaseStringUTFChars(fileNameString, fileName);
do_compute(local_array, true, 10000, false);
}

View File

@ -1,6 +1,8 @@
#include "headers.h"
#include "template.h"
#include "utils.h"
#include "LoadTimeInitializer.h"
using namespace std;
template<class NUMBER>
NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log)
@ -10,12 +12,28 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log)
int COLS = tc->haplen + 1;
Context<NUMBER> ctx;
//#define USE_STACK_ALLOCATION 1
#ifdef USE_STACK_ALLOCATION
NUMBER M[ROWS][COLS];
NUMBER X[ROWS][COLS];
NUMBER Y[ROWS][COLS];
NUMBER p[ROWS][6];
#else
//allocate on heap in way that simulates a 2D array. Having a 2D array instead of
//a straightforward array of pointers ensures that all data lies 'close' in memory, increasing
//the chance of being stored together in the cache. Also, prefetchers can learn memory access
//patterns for 2D arrays, not possible for array of pointers
//bool locally_allocated = false;
//NUMBER* common_buffer = 0;
NUMBER* common_buffer = new NUMBER[3*ROWS*COLS + ROWS*6];
//unsigned curr_size = sizeof(NUMBER)*(3*ROWS*COLS + ROWS*6);
//if(true)
//{
//common_buffer = new NUMBER[3*ROWS*COLS + ROWS*6];
//locally_allocated = true;
//}
//else
//common_buffer = (NUMBER*)(g_load_time_initializer.get_buffer());
//pointers to within the allocated buffer
NUMBER** common_pointer_buffer = new NUMBER*[4*ROWS];
NUMBER* ptr = common_buffer;
@ -25,14 +43,11 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log)
for(;i<4*ROWS;++i, ptr+=6)
common_pointer_buffer[i] = ptr;
//NUMBER M[ROWS][COLS];
//NUMBER X[ROWS][COLS];
//NUMBER Y[ROWS][COLS];
//NUMBER p[ROWS][6];
NUMBER** M = common_pointer_buffer;
NUMBER** X = M + ROWS;
NUMBER** Y = X + ROWS;
NUMBER** p = Y + ROWS;
#endif
p[0][MM] = ctx._(0.0);
@ -111,8 +126,11 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log)
if (before_last_log != NULL)
*before_last_log = result;
#ifndef USE_STACK_ALLOCATION
delete common_pointer_buffer;
//if(locally_allocated)
delete common_buffer;
#endif
return result;
//return ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT;

View File

@ -191,25 +191,32 @@ inline JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_Vector
}
}
//#define DO_WARMUP
//#define DO_REPEAT_PROFILING
//Do compute over vector of testcase structs
inline void compute_testcases(vector<testcase>& tc_array, unsigned numTestCases, double* likelihoodDoubleArray,
unsigned maxNumThreadsToUse)
{
#pragma omp parallel for schedule (dynamic,10000) num_threads(maxNumThreadsToUse)
for(unsigned tc_idx=0;tc_idx<numTestCases;++tc_idx)
{
float result_avxf = g_compute_full_prob_float(&(tc_array[tc_idx]), 0);
double result = 0;
if (result_avxf < MIN_ACCEPTED) {
double result_avxd = g_compute_full_prob_double(&(tc_array[tc_idx]), 0);
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
#ifdef DO_PROFILING
g_load_time_initializer.update_stat(NUM_DOUBLE_INVOCATIONS_IDX, 1);
#ifdef DO_REPEAT_PROFILING
for(unsigned i=0;i<10;++i)
#endif
{
#pragma omp parallel for schedule (dynamic,10000) num_threads(maxNumThreadsToUse)
for(unsigned tc_idx=0;tc_idx<numTestCases;++tc_idx)
{
float result_avxf = g_compute_full_prob_float(&(tc_array[tc_idx]), 0);
double result = 0;
if (result_avxf < MIN_ACCEPTED) {
double result_avxd = g_compute_full_prob_double(&(tc_array[tc_idx]), 0);
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
#ifdef DO_PROFILING
g_load_time_initializer.update_stat(NUM_DOUBLE_INVOCATIONS_IDX, 1);
#endif
}
else
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
likelihoodDoubleArray[tc_idx] = result;
}
else
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
likelihoodDoubleArray[tc_idx] = result;
}
}
@ -227,6 +234,10 @@ inline JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_Vector
readBasesArrayVector.clear();
}
#ifdef DO_WARMUP
uint64_t g_sum = 0;
#endif
//JNI function to invoke compute_full_prob_avx
//readDataArray - array of JNIReadDataHolderClass objects which contain the readBases, readQuals etc
//haplotypeDataArray - array of JNIHaplotypeDataHolderClass objects which contain the haplotypeBases
@ -268,6 +279,24 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless
assert(likelihoodDoubleArray && "likelihoodArray is NULL");
assert(env->GetArrayLength(likelihoodArray) == numTestCases);
#endif
#ifdef DO_WARMUP
vector<pair<jbyteArray, jbyte*> >& haplotypeBasesArrayVector = g_haplotypeBasesArrayVector;
for(unsigned i=0;i<haplotypeBasesArrayVector.size();++i)
{
unsigned curr_size = env->GetArrayLength(haplotypeBasesArrayVector[i].first);
for(unsigned j=0;j<curr_size;++j)
g_sum += ((uint64_t)((haplotypeBasesArrayVector[i].second)[j]));
}
for(unsigned i=0;i<readBasesArrayVector.size();++i)
{
for(unsigned j=0;j<readBasesArrayVector[i].size();++j)
{
unsigned curr_size = env->GetArrayLength(readBasesArrayVector[i][j].first);
for(unsigned k=0;k<curr_size;++k)
g_sum += ((uint64_t)((readBasesArrayVector[i][j].second)[k]));
}
}
#endif
#ifdef DO_PROFILING
g_load_time_initializer.m_bytes_copied += (is_copy ? numTestCases*sizeof(double) : 0);
get_time(&start_time);

View File

@ -61,6 +61,7 @@ uint64_t get_machine_capabilities()
void initialize_function_pointers(uint64_t mask)
{
//mask = 0ull;
//mask = (1 << SSE41_CUSTOM_IDX);
if(is_avx_supported() && (mask & (1<< AVX_CUSTOM_IDX)))
{
cout << "Using AVX accelerated implementation of PairHMM\n";
@ -286,6 +287,13 @@ double getCurrClk() {
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
}
inline unsigned long long rdtsc(void)
{
unsigned hi, lo;
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}
void get_time(struct timespec* store_struct)
{
clock_gettime(CLOCK_REALTIME, store_struct);
@ -298,7 +306,13 @@ uint64_t diff_time(struct timespec& prev_time)
return (uint64_t)((curr_time.tv_sec-prev_time.tv_sec)*1000000000+(curr_time.tv_nsec-prev_time.tv_nsec));
}
//#define DUMP_COMPUTE_VALUES 1
#define DUMP_COMPUTE_VALUES 1
//#define DO_REPEATS
#ifdef USE_PAPI
#include "papi.h"
#define NUM_PAPI_COUNTERS 4
#endif
#define BATCH_SIZE 10000
#define RUN_HYBRID
void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size, bool do_check)
@ -321,7 +335,22 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
uint64_t vector_compute_time = 0;
uint64_t baseline_compute_time = 0;
uint64_t num_double_calls = 0;
unsigned num_testcases = 0;
bool all_ok = do_check ? true : false;
#ifdef USE_PAPI
uint32_t all_mask = (0);
uint32_t no_usr_mask = (1 << 16); //bit 16 user mode, bit 17 kernel mode
uint32_t no_kernel_mask = (1 << 17); //bit 16 user mode, bit 17 kernel mode
PAPI_num_counters();
int events[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
char* eventnames[NUM_PAPI_COUNTERS]= { "cycles", "itlb_walk_cycles", "dtlb_load_walk_cycles", "dtlb_store_walk_cycles" };
assert(PAPI_event_name_to_code("UNHALTED_REFERENCE_CYCLES:u=1:k=1",&(events[0])) == PAPI_OK);
assert(PAPI_event_name_to_code("ITLB_MISSES:WALK_DURATION", &(events[1])) == PAPI_OK);
assert(PAPI_event_name_to_code("DTLB_LOAD_MISSES:WALK_DURATION", &(events[2])) == PAPI_OK);
assert(PAPI_event_name_to_code("DTLB_STORE_MISSES:WALK_DURATION", &(events[3])) == PAPI_OK);
long long values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
long long accum_values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
#endif
while(1)
{
int break_value = use_old_read_testcase ? read_testcase(&tc, fptr) : read_mod_testcase(ifptr,&tc,true);
@ -336,26 +365,42 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
results_vec.resize(tc_vector.size());
baseline_results_vec.resize(tc_vector.size());
struct timespec start_time;
#ifdef USE_PAPI
assert(PAPI_start_counters(events, NUM_PAPI_COUNTERS) == PAPI_OK);
#endif
get_time(&start_time);
#pragma omp parallel for schedule(dynamic,chunk_size) num_threads(12)
for(unsigned i=0;i<tc_vector.size();++i)
{
testcase& tc = tc_vector[i];
float result_avxf = g_compute_full_prob_float(&tc, 0);
double result = 0;
if (result_avxf < MIN_ACCEPTED) {
double result_avxd = g_compute_full_prob_double(&tc, 0);
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
++num_double_calls;
}
else
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
#ifdef DUMP_COMPUTE_VALUES
g_load_time_initializer.debug_dump("return_values_vector.txt",to_string(result),true);
#ifdef DO_REPEATS
for(unsigned z=0;z<10;++z)
#endif
results_vec[i] = result;
{
for(unsigned i=0;i<tc_vector.size();++i)
{
testcase& tc = tc_vector[i];
float result_avxf = g_compute_full_prob_float(&tc, 0);
double result = 0;
if (result_avxf < MIN_ACCEPTED) {
double result_avxd = g_compute_full_prob_double(&tc, 0);
result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
++num_double_calls;
}
else
result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
#ifdef DUMP_COMPUTE_VALUES
g_load_time_initializer.debug_dump("return_values_vector.txt",to_string(result),true);
#endif
results_vec[i] = result;
}
}
#ifdef USE_PAPI
assert(PAPI_stop_counters(values, NUM_PAPI_COUNTERS) == PAPI_OK);
#endif
vector_compute_time += diff_time(start_time);
#ifdef USE_PAPI
for(unsigned k=0;k<NUM_PAPI_COUNTERS;++k)
accum_values[k] += values[k];
#endif
num_testcases += tc_vector.size();
if(do_check)
{
get_time(&start_time);
@ -401,10 +446,15 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
if(all_ok)
{
cout << "All output values within acceptable error\n";
cout << "Baseline compute time "<<baseline_compute_time*1e-9<<"\n";
cout << "Baseline double precision compute time "<<baseline_compute_time*1e-9<<"\n";
}
cout << "Num double invocations "<<num_double_calls<<"\n";
cout << "Num testcase "<<num_testcases<< " num double invocations "<<num_double_calls<<"\n";
cout << "Vector compute time "<< vector_compute_time*1e-9 << "\n";
#ifdef USE_PAPI
for(unsigned i=0;i<NUM_PAPI_COUNTERS;++i)
cout << eventnames[i] << " : "<<accum_values[i]<<"\n";
#endif
if(use_old_read_testcase)
fclose(fptr);
else