gatk-3.8/public/c++/VectorPairHMM/baseline.cc

#include "headers.h"
#include "template.h"
#include "utils.h"
#include "LoadTimeInitializer.h"

using namespace std;
template<class NUMBER>
NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log)
{
  int r, c;
  int ROWS = tc->rslen + 1;
  int COLS = tc->haplen + 1;

  Context<NUMBER> ctx;
  //#define USE_STACK_ALLOCATION 1
#ifdef USE_STACK_ALLOCATION
  NUMBER M[ROWS][COLS];
  NUMBER X[ROWS][COLS];
  NUMBER Y[ROWS][COLS];
  NUMBER p[ROWS][6];
#else
  //allocate on heap in way that simulates a 2D array. Having a 2D array instead of
  //a straightforward array of pointers ensures that all data lies 'close' in memory, increasing
  //the chance of being stored together in the cache. Also, prefetchers can learn memory access
  //patterns for 2D arrays, not possible for array of pointers
  //bool locally_allocated = false;
  //NUMBER* common_buffer = 0;
  NUMBER* common_buffer = new NUMBER[3*ROWS*COLS + ROWS*6];
  //unsigned curr_size = sizeof(NUMBER)*(3*ROWS*COLS + ROWS*6);
  //if(true)
  //{
  //common_buffer = new NUMBER[3*ROWS*COLS + ROWS*6];
  //locally_allocated = true;
  //}
  //else
  //common_buffer = (NUMBER*)(g_load_time_initializer.get_buffer()); 
  //pointers to within the allocated buffer
  NUMBER** common_pointer_buffer = new NUMBER*[4*ROWS];
  NUMBER* ptr = common_buffer;
  unsigned i = 0;
  for(i=0;i<3*ROWS;++i, ptr+=COLS)
    common_pointer_buffer[i] = ptr;
  for(;i<4*ROWS;++i, ptr+=6)
    common_pointer_buffer[i] = ptr;

  NUMBER** M = common_pointer_buffer;
  NUMBER** X = M + ROWS;
  NUMBER** Y = X + ROWS;
  NUMBER** p = Y + ROWS;
#endif


  p[0][MM] = ctx._(0.0);
  p[0][GapM] = ctx._(0.0);
  p[0][MX] = ctx._(0.0);
  p[0][XX] = ctx._(0.0);
  p[0][MY] = ctx._(0.0);
  p[0][YY] = ctx._(0.0);

  for (r = 1; r < ROWS; r++)
  {
    int _i = tc->i[r-1] & 127;
    int _d = tc->d[r-1] & 127;
    int _c = tc->c[r-1] & 127;
    p[r][MM] = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127];
    p[r][GapM] = ctx._(1.0) - ctx.ph2pr[_c];
    p[r][MX] = ctx.ph2pr[_i];
    p[r][XX] = ctx.ph2pr[_c];
    p[r][MY] = ctx.ph2pr[_d];
    p[r][YY] = ctx.ph2pr[_c];
    //p[r][MY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
    //p[r][YY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
  }
  for (c = 0; c < COLS; c++)
  {
    M[0][c] = ctx._(0.0);
    X[0][c] = ctx._(0.0);
    Y[0][c] = ctx.INITIAL_CONSTANT / (tc->haplen);
  }

  for (r = 1; r < ROWS; r++)
  {
    M[r][0] = ctx._(0.0);
    X[r][0] = X[r-1][0] * p[r][XX];
    Y[r][0] = ctx._(0.0);
  }

  NUMBER result = ctx._(0.0);

  for (r = 1; r < ROWS; r++)
    for (c = 1; c < COLS; c++)
    {
      fexcept_t flagp;
      char _rs = tc->rs[r-1];
      char _hap = tc->hap[c-1];
      int _q = tc->q[r-1] & 127;
      NUMBER distm = ctx.ph2pr[_q];
      if (_rs == _hap || _rs == 'N' || _hap == 'N')
        distm = ctx._(1.0) - distm;
      else
        distm = distm/3;


      //feclearexcept(FE_ALL_EXCEPT);
      M[r][c] = distm * (M[r-1][c-1] * p[r][MM] + X[r-1][c-1] * p[r][GapM] + Y[r-1][c-1] * p[r][GapM]);
      //STORE_FP_EXCEPTIONS(flagp, exceptions_array);

      //feclearexcept(FE_ALL_EXCEPT);
      X[r][c] = M[r-1][c] * p[r][MX] + X[r-1][c] * p[r][XX];
      //STORE_FP_EXCEPTIONS(flagp, exceptions_array);

      //feclearexcept(FE_ALL_EXCEPT);
      Y[r][c] = M[r][c-1] * p[r][MY] + Y[r][c-1] * p[r][YY];
      //STORE_FP_EXCEPTIONS(flagp, exceptions_array);

      //CONVERT_AND_PRINT(M[r][c]);
      //CONVERT_AND_PRINT(X[r][c]);
      //CONVERT_AND_PRINT(Y[r][c]);

    }
  for (c = 0; c < COLS; c++)
  {
    result += M[ROWS-1][c] + X[ROWS-1][c];
  }

  if (before_last_log != NULL)
    *before_last_log = result;

#ifndef USE_STACK_ALLOCATION
  delete common_pointer_buffer;
  //if(locally_allocated)
  delete common_buffer;
#endif

  return result;
  //return ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT;
}

template double compute_full_prob<double>(testcase* tc, double* nextbuf);
template float compute_full_prob<float>(testcase* tc, float* nextbuf);
Added support for dynamic selection between AVX and un-vectorized C++, still to include SSE code from Mohammad. Debug flags turned on in this commit. 2014-01-19 03:07:23 +08:00			`#include "headers.h"`
			`#include "template.h"`
1. Whew, finally debugged the source of performance issues with PairHMM JNI. See copied text from email below. 2. This commit contains all the code used in profiling, detecting FP exceptions, dumping intermediate results. All flagged off using ifdefs, but it's there. --------------Text from email As we discussed before, it's the denormal numbers that are causing the slowdown - the core executes some microcode uops (called FP assists) when denormal numbers are detected for FP operations (even un-vectorized code). The C++ compiler by default enables flush to zero (FTZ) - when set, the hardware simply converts denormal numbers to 0. The Java binary (executable provided by Oracle, not the native library) seems to be compiled without FTZ (sensible choice, they want to be conservative). Hence, the JNI invocation sees a large slowdown. Disabling FTZ in C++ slows down the C++ sandbox performance to the JNI version (fortunately, the reverse also holds :)). Not sure how to show the overhead for these FP assists easily - measured a couple of counters. FP_ASSISTS:ANY - shows number of uops executed as part of the FP assists. When FTZ is enabled, this is 0 (both C++ and JNI), when FTZ is disabled this value is around 203540557 (both C++ and JNI) IDQ:MS_UOPS_CYCLES - shows the number of cycles the decoder was issuing uops when the microcode sequencing engine was busy. When FTZ is enabled, this is around 1.77M cycles (both C++ and JNI), when FTZ is disabled this value is around 4.31B cycles (both C++ and JNI). This number is still small with respect to total cycles (~40B), but it only reflects the cycles in the decode stage. The total overhead of the microcode assist ops could be larger. As suggested by Mustafa, I compared intermediate values (matrices M,X,Y) and final output of compute_full_prob. The values produced by C++ and Java are identical to the last bit (as long as both use FTZ or no-FTZ). Comparing the outputs of compute_full_prob for the cases no-FTZ and FTZ, there are differences for very small values (denormal numbers). Examples: Diff values 1.952970E-33 1.952967E-33 Diff values 1.135071E-32 1.135070E-32 Diff values 1.135071E-32 1.135070E-32 Diff values 1.135071E-32 1.135070E-32 For this test case (low coverage NA12878), all these values would be recomputed using the double precision version. Enabling FTZ should be fine. -------------------End text from email 2014-02-06 09:09:57 +08:00			`#include "utils.h"`
Now has PAPI values 2014-02-26 13:44:20 +08:00			`#include "LoadTimeInitializer.h"`

Matrices in baseline C++ (no vector) implementation of PairHMM are now allocated on heap using "new". Stack allocation led to program crashes for large matrix sizes. 2014-02-08 15:22:05 +08:00			`using namespace std;`
Added support for dynamic selection between AVX and un-vectorized C++, still to include SSE code from Mohammad. Debug flags turned on in this commit. 2014-01-19 03:07:23 +08:00			`template<class NUMBER>`
1. Split main JNI function into initializeTestcases, compute_testcases and releaseReads 2. FTZ enabled 3. Cleaner profiling code 2014-02-07 06:35:32 +08:00			`NUMBER compute_full_prob(testcase tc, NUMBER before_last_log)`
Added support for dynamic selection between AVX and un-vectorized C++, still to include SSE code from Mohammad. Debug flags turned on in this commit. 2014-01-19 03:07:23 +08:00			`{`
Matrices in baseline C++ (no vector) implementation of PairHMM are now allocated on heap using "new". Stack allocation led to program crashes for large matrix sizes. 2014-02-08 15:22:05 +08:00			`int r, c;`
			`int ROWS = tc->rslen + 1;`
			`int COLS = tc->haplen + 1;`

			`Context<NUMBER> ctx;`
Now has PAPI values 2014-02-26 13:44:20 +08:00			`//#define USE_STACK_ALLOCATION 1`
			`#ifdef USE_STACK_ALLOCATION`
			`NUMBER M[ROWS][COLS];`
			`NUMBER X[ROWS][COLS];`
			`NUMBER Y[ROWS][COLS];`
			`NUMBER p[ROWS][6];`
			`#else`
Matrices in baseline C++ (no vector) implementation of PairHMM are now allocated on heap using "new". Stack allocation led to program crashes for large matrix sizes. 2014-02-08 15:22:05 +08:00			`//allocate on heap in way that simulates a 2D array. Having a 2D array instead of`
			`//a straightforward array of pointers ensures that all data lies 'close' in memory, increasing`
			`//the chance of being stored together in the cache. Also, prefetchers can learn memory access`
			`//patterns for 2D arrays, not possible for array of pointers`
Now has PAPI values 2014-02-26 13:44:20 +08:00			`//bool locally_allocated = false;`
			`//NUMBER* common_buffer = 0;`
Matrices in baseline C++ (no vector) implementation of PairHMM are now allocated on heap using "new". Stack allocation led to program crashes for large matrix sizes. 2014-02-08 15:22:05 +08:00			`NUMBER* common_buffer = new NUMBER[3ROWSCOLS + ROWS*6];`
Now has PAPI values 2014-02-26 13:44:20 +08:00			`//unsigned curr_size = sizeof(NUMBER)(3ROWSCOLS + ROWS6);`
			`//if(true)`
			`//{`
			`//common_buffer = new NUMBER[3ROWSCOLS + ROWS*6];`
			`//locally_allocated = true;`
			`//}`
			`//else`
			`//common_buffer = (NUMBER*)(g_load_time_initializer.get_buffer());`
Matrices in baseline C++ (no vector) implementation of PairHMM are now allocated on heap using "new". Stack allocation led to program crashes for large matrix sizes. 2014-02-08 15:22:05 +08:00			`//pointers to within the allocated buffer`
			`NUMBER** common_pointer_buffer = new NUMBER[4ROWS];`
			`NUMBER* ptr = common_buffer;`
			`unsigned i = 0;`
			`for(i=0;i<3*ROWS;++i, ptr+=COLS)`
			`common_pointer_buffer[i] = ptr;`
			`for(;i<4*ROWS;++i, ptr+=6)`
			`common_pointer_buffer[i] = ptr;`

			`NUMBER** M = common_pointer_buffer;`
			`NUMBER** X = M + ROWS;`
			`NUMBER** Y = X + ROWS;`
			`NUMBER** p = Y + ROWS;`
Now has PAPI values 2014-02-26 13:44:20 +08:00			`#endif`
Matrices in baseline C++ (no vector) implementation of PairHMM are now allocated on heap using "new". Stack allocation led to program crashes for large matrix sizes. 2014-02-08 15:22:05 +08:00

			`p[0][MM] = ctx._(0.0);`
			`p[0][GapM] = ctx._(0.0);`
			`p[0][MX] = ctx._(0.0);`
			`p[0][XX] = ctx._(0.0);`
			`p[0][MY] = ctx._(0.0);`
			`p[0][YY] = ctx._(0.0);`

			`for (r = 1; r < ROWS; r++)`
			`{`
			`int _i = tc->i[r-1] & 127;`
			`int _d = tc->d[r-1] & 127;`
			`int _c = tc->c[r-1] & 127;`
			`p[r][MM] = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127];`
			`p[r][GapM] = ctx._(1.0) - ctx.ph2pr[_c];`
			`p[r][MX] = ctx.ph2pr[_i];`
			`p[r][XX] = ctx.ph2pr[_c];`
			`p[r][MY] = ctx.ph2pr[_d];`
			`p[r][YY] = ctx.ph2pr[_c];`
			`//p[r][MY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];`
			`//p[r][YY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];`
			`}`
			`for (c = 0; c < COLS; c++)`
			`{`
			`M[0][c] = ctx._(0.0);`
			`X[0][c] = ctx._(0.0);`
			`Y[0][c] = ctx.INITIAL_CONSTANT / (tc->haplen);`
			`}`

			`for (r = 1; r < ROWS; r++)`
			`{`
			`M[r][0] = ctx._(0.0);`
			`X[r][0] = X[r-1][0] * p[r][XX];`
			`Y[r][0] = ctx._(0.0);`
			`}`

			`NUMBER result = ctx._(0.0);`

			`for (r = 1; r < ROWS; r++)`
			`for (c = 1; c < COLS; c++)`
			`{`
			`fexcept_t flagp;`
			`char _rs = tc->rs[r-1];`
			`char _hap = tc->hap[c-1];`
			`int _q = tc->q[r-1] & 127;`
			`NUMBER distm = ctx.ph2pr[_q];`
			`if (_rs == _hap \|\| _rs == 'N' \|\| _hap == 'N')`
			`distm = ctx._(1.0) - distm;`
			`else`
			`distm = distm/3;`


			`//feclearexcept(FE_ALL_EXCEPT);`
			`M[r][c] = distm * (M[r-1][c-1] * p[r][MM] + X[r-1][c-1] * p[r][GapM] + Y[r-1][c-1] * p[r][GapM]);`
			`//STORE_FP_EXCEPTIONS(flagp, exceptions_array);`

			`//feclearexcept(FE_ALL_EXCEPT);`
			`X[r][c] = M[r-1][c] * p[r][MX] + X[r-1][c] * p[r][XX];`
			`//STORE_FP_EXCEPTIONS(flagp, exceptions_array);`

			`//feclearexcept(FE_ALL_EXCEPT);`
			`Y[r][c] = M[r][c-1] * p[r][MY] + Y[r][c-1] * p[r][YY];`
			`//STORE_FP_EXCEPTIONS(flagp, exceptions_array);`

			`//CONVERT_AND_PRINT(M[r][c]);`
			`//CONVERT_AND_PRINT(X[r][c]);`
			`//CONVERT_AND_PRINT(Y[r][c]);`

			`}`
			`for (c = 0; c < COLS; c++)`
			`{`
			`result += M[ROWS-1][c] + X[ROWS-1][c];`
			`}`

			`if (before_last_log != NULL)`
			`*before_last_log = result;`

Now has PAPI values 2014-02-26 13:44:20 +08:00			`#ifndef USE_STACK_ALLOCATION`
Matrices in baseline C++ (no vector) implementation of PairHMM are now allocated on heap using "new". Stack allocation led to program crashes for large matrix sizes. 2014-02-08 15:22:05 +08:00			`delete common_pointer_buffer;`
Now has PAPI values 2014-02-26 13:44:20 +08:00			`//if(locally_allocated)`
Matrices in baseline C++ (no vector) implementation of PairHMM are now allocated on heap using "new". Stack allocation led to program crashes for large matrix sizes. 2014-02-08 15:22:05 +08:00			`delete common_buffer;`
Now has PAPI values 2014-02-26 13:44:20 +08:00			`#endif`
Matrices in baseline C++ (no vector) implementation of PairHMM are now allocated on heap using "new". Stack allocation led to program crashes for large matrix sizes. 2014-02-08 15:22:05 +08:00
			`return result;`
			`//return ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT;`
Added support for dynamic selection between AVX and un-vectorized C++, still to include SSE code from Mohammad. Debug flags turned on in this commit. 2014-01-19 03:07:23 +08:00			`}`

			`template double compute_full_prob<double>(testcase* tc, double* nextbuf);`
			`template float compute_full_prob<float>(testcase* tc, float* nextbuf);`