// gatk-3.8/hmm_mask.cc

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>


// /usr/intel/pkgs/icc/13.0.0e/bin/icc -o ed -O3 ed.cpp -xAVX  -openmp -openmp-link static


#include <iostream>

#include <malloc.h>
#include <assert.h>
#include <sys/time.h>
#include <omp.h>
#include <stdlib.h>
#include <math.h>


#include <immintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>


#include <stdint.h>
#include <algorithm>
#include <vector>
#include <set>
#include <map>
#include <memory.h>
#include "template.h"

using namespace std ;

#define CHECK_MASK_CORRECTNESS

/**
 * Render the low-order bits of an integer as a string of '0'/'1' characters.
 *
 * @tparam T               integral type of the value to print.
 * @param  val             value whose bits are rendered.
 * @param  numBitsToWrite  number of low-order bits to emit; the most
 *                         significant of those bits comes first.
 * @return a string of exactly numBitsToWrite characters, or an empty string
 *         when numBitsToWrite <= 0.  Positions at or above bit 64 are
 *         written as '0'.
 */
template <class T>
std::string getBinaryStr (T val, int numBitsToWrite) {

  std::string bits ;
  if (numBitsToWrite <= 0)
    return bits ;

  // Widen once and shift the 64-bit copy.  Building the probe mask from
  // ((T) 1) << (numBitsToWrite-1) is undefined as soon as the count reaches
  // the width of the (promoted) type T.
  const uint64_t wide = (uint64_t) val ;
  const int wideBitCnt = 64 ;

  bits.reserve(numBitsToWrite) ;
  for (int i = numBitsToWrite-1; i >= 0; --i) {
    const bool isSet = (i < wideBitCnt) && (((wide >> i) & 0x1) != 0) ;
    bits.push_back(isSet ? '1' : '0') ;
  }
  return bits ;
}

// Return the character code of c lowered by 33 (the ASCII code of '!').
// read_testcase() applies this to every character of the q/i/d/c strings.
// NOTE(review): presumably this undoes a Phred+33 text encoding -- confirm
// against the input format.
int normalize(char c)
{
	const int asciiOffset = 33;
	const int code = static_cast<int>(c);
	return code - asciiOffset;
}


// Out-of-class definition (storage) of the static table
// ConvertChar::conversionTable.  The class itself is not defined in this
// file -- presumably it is declared in template.h.
uint8_t ConvertChar::conversionTable[255] ;

/**
 * Read one test case from a text line and store it in *tc.
 *
 * The line must hold six whitespace-separated tokens: the haplotype, the
 * read sequence and four per-base strings (q, i, d, c).  Each character of
 * the per-base strings is passed through normalize(); q values below 6 are
 * raised to 6.
 *
 * @param tc   destination.  On success tc->hap, tc->rs, tc->ihap and
 *             tc->irs point to malloc'ed buffers owned by the caller (release
 *             them with free()).  On failure *tc is left untouched.
 * @param ifp  input stream; a null pointer selects stdin.
 * @return 0 on success; -1 on end of input, on a line that does not contain
 *         six tokens, or when an allocation fails.
 */
int read_testcase(testcase *tc, FILE* ifp)
{
	char *line = NULL;
	// getline() needs a genuine size_t: handing it the address of an int
	// lets it write past the variable on targets where size_t is wider.
	size_t lineCap = 0;

	const ssize_t lineLen = getline(&line, &lineCap, ifp == 0 ? stdin : ifp);
	if (lineLen == -1)
	{
		free(line);	// getline() may allocate even when it fails
		return -1;
	}

	// No token can be longer than the line itself, so lineLen + 1 bytes per
	// buffer are always enough for the unbounded "%s" conversions below.
	const size_t bufLen = (size_t) lineLen + 1;
	char *hap = (char *) malloc(bufLen);
	char *rs = (char *) malloc(bufLen);
	// The per-base strings are indexed up to the read length even when they
	// are shorter than the read; zero-filling keeps those reads well defined.
	char *qStr = (char *) calloc(bufLen, 1);
	char *iStr = (char *) calloc(bufLen, 1);
	char *dStr = (char *) calloc(bufLen, 1);
	char *cStr = (char *) calloc(bufLen, 1);
	int *ihap = NULL;
	int *irs = NULL;
	int status = -1;

	if (hap && rs && qStr && iStr && dStr && cStr &&
	    sscanf(line, "%s %s %s %s %s %s\n", hap, rs, qStr, iStr, dStr, cStr) == 6)
	{
		const int haplen = (int) strlen(hap);
		const int rslen = (int) strlen(rs);
		assert(rslen < MROWS);

		ihap = (int *) malloc(haplen*sizeof(int));
		irs = (int *) malloc(rslen*sizeof(int));

		if (ihap && irs)
		{
			for (int x = 0; x < rslen; x++)
			{
				const int _q = normalize(qStr[x]);
				tc->q[x] = (_q < 6) ? 6 : _q;
				tc->i[x] = normalize(iStr[x]);
				tc->d[x] = normalize(dStr[x]);
				tc->c[x] = normalize(cStr[x]);
				irs[x] = rs[x];
			}
			for (int x = 0; x < haplen; x++)
				ihap[x] = hap[x];

			// Hand ownership to the caller only once everything succeeded.
			tc->hap = hap;
			tc->rs = rs;
			tc->haplen = haplen;
			tc->rslen = rslen;
			tc->ihap = ihap;
			tc->irs = irs;
			status = 0;
		}
	}

	if (status != 0)
	{
		free(hap);
		free(rs);
		free(ihap);
		free(irs);
	}

	free(qStr);
	free(iStr);
	free(dStr);
	free(cStr);
	free(line);

	return status;
}


// Attribute that requests 32-byte alignment for the type it is attached to.
#define ALIGN __attribute__ ((aligned(32)))


// 128-bit value that can be viewed either as an SSE register type or as an
// array of scalar lanes.  All members alias the same bytes; the members used
// by the code are selected through the VTYPE / VTYPEI / VENTRY / VENTRYI
// macros.
typedef union ALIGN {
  //__m256i vi;
  __m128 vf ;      // packed single-precision view
  __m128i vi ;     // packed integer view
  __m128d vd;      // packed double-precision view

  uint8_t b[16];   // byte lanes
  //uint16_t hw[8] ;
  uint32_t w[4] ;  // 32-bit lanes
  uint64_t dw[2] ; // 64-bit lanes

  float f[4] ;     // float lanes
  //double d[2] ;

} v128 ;

// 256-bit value that can be viewed either as an AVX register type or as an
// array of scalar lanes.  All members alias the same bytes.
typedef union ALIGN {
  __m256i vi;      // packed integer view
  __m256 vf ;      // packed single-precision view
  __m256d vd;      // packed double-precision view
  //__m128i vi128[2] ;

  uint8_t b[32];   // byte lanes
  //uint16_t hw[16] ;
  uint32_t w[8] ;  // 32-bit lanes
  uint64_t dw[4] ; // 64-bit lanes

  float f[8] ;     // float lanes
  double d[4] ;    // double lanes

} v256 ;


// Size of the second dimension of the mask array: one mask row per
// character class (the debug output labels them A, C, T, G, N).
#define NUM_DISTINCT_CHARS 5
// Index of the mask row that precompute_masks_avx() initialises to all ones.
#define AMBIG_CHAR 4

// Number of lanes per vector and vector width in bits; their quotient
// (VEC_LEN / VEC_ENTRY_CNT = 32) is the number of bits in one mask word.
#define VEC_ENTRY_CNT 8
#define VEC_LEN 256
// Names of the union members used for the float vector, the integer vector,
// a float lane and an integer lane respectively.
#define VTYPE vf
#define VTYPEI vi
#define VENTRY f
#define VENTRYI w
// Scalar type of one mask word and the value with all of its bits set.
#define MTYPE uint32_t
#define MASK_ALL_ONES 0xFFFFFFFF


// Union types used for full-width (AVX) and half-width (SSE) vectors.
#define VECTOR v256
#define VECTOR_SSE v128

// Assign an all-zero 256-bit float vector to __vec.
#define SET_VEC_ZERO(__vec)			\
  __vec= _mm256_setzero_ps()

// Assign a 256-bit integer vector with every bit set to __vec.
#define SET_VEC_ONES(__vec)			\
  __vec = _mm256_set1_epi32(0xFFFFFFFF)

// Bitwise OR of two 256-bit float vectors.
#define VEC_OR(__v1, __v2)			\
  _mm256_or_ps(__v1, __v2)

// Lane-wise single-precision addition.
#define VEC_ADD(__v1, __v2)			\
  _mm256_add_ps(__v1, __v2)

// Lane-wise single-precision multiplication.
#define VEC_MUL(__v1, __v2)			\
  _mm256_mul_ps(__v1, __v2)

// Per-lane select via _mm256_blendv_ps: a lane is taken from __v2 when the
// most significant bit of the matching __maskV lane is set, else from __v1.
#define VEC_BLENDV(__v1, __v2, __maskV)		\
  _mm256_blendv_ps(__v1, __v2, __maskV)

// Build a 256-bit vector in __vdst from two 128-bit halves (__vsLow becomes
// the lower half, __vsHigh the upper half).  NOTE: expands to two statements
// and is not wrapped in a block, so it must not be used as the sole body of
// an unbraced if/else/loop.
#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)	\
  __vdst = _mm256_castps128_ps256(__vsLow) ;		\
  __vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ;

// Shift every 32-bit lane of the 128-bit integer vector __vs left by one bit.
#define VECS_SHIFT_LEFT_1BIT(__vs)		\
  __vs = _mm_slli_epi32(__vs, 1)


// Shift the 128-bit value stored in __vs right by __cnt bits as one quantity:
// both 64-bit halves are shifted and the bits leaving the high half are
// carried into the top of the low half.
//   __vs   an lvalue of a union type exposing `vi` (__m128i) and `dw[2]`
//          (uint64_t), such as v128.
//   __cnt  shift distance; must satisfy 0 < __cnt < 64.
// The previous definition could not be expanded at all (empty initialiser,
// `cnt` instead of `__cnt`, misspelt intrinsic name).
#define VECS_SHIFT_RIGHT_WHOLE(__vs, __cnt) {				\
    uint64_t __shiftWord = ((((uint64_t) 0x1) << (__cnt)) - 1 ) & (__vs).dw[1] ; \
    __shiftWord <<= (64-(__cnt)) ;					\
    (__vs).vi = _mm_srli_epi64((__vs).vi, (__cnt)) ;			\
    (__vs).dw[0] |= __shiftWord ;					\
  }


// In-place variant of the mask-word shift: __mask is shifted right by
// __shiftBy bits, the bits carried over from the previous word
// (__lastShiftOut) are OR-ed into its top, and the bits that dropped off the
// bottom are stored, top-aligned, in __lastShiftOut for the next word.
//   __mask, __lastShiftOut  MTYPE lvalues (read and written)
//   __shiftBy               0 <= __shiftBy < __maskBitCnt
//   __maskBitCnt            number of bits in MTYPE
// A shift distance of 0 carries nothing; it is special-cased because
// shifting a word by its full width is undefined.
#define GET_MASK_WORD_OLD(__mask, __lastShiftOut, __shiftBy, __maskBitCnt){ \
    MTYPE __bitMask = (((MTYPE)0x1) << (__shiftBy)) - 1 ;		\
    MTYPE __nextShiftOut = ((__shiftBy) == 0) ? (MTYPE) 0 :		\
      (MTYPE) (((__mask) & __bitMask) << ((__maskBitCnt) - (__shiftBy))) ; \
    __mask >>= (__shiftBy) ;						\
    __mask |= (__lastShiftOut) ;					\
    __lastShiftOut = __nextShiftOut ;					\
  }


// Produce the mask word for one vector lane: __dstMask receives __srcMask
// shifted right by __shiftBy bits with the bits carried over from the
// previous word (__lastShiftOut) OR-ed into its top; the bits that dropped
// off the bottom are stored, top-aligned, in __lastShiftOut for the next word.
//   __dstMask, __lastShiftOut  MTYPE lvalues (written)
//   __srcMask                  MTYPE value (evaluated more than once)
//   __shiftBy                  0 <= __shiftBy < __maskBitCnt
//   __maskBitCnt               number of bits in MTYPE
// Lane 0 uses a shift distance of 0; that case is special-cased because
// shifting a word by its full width is undefined.
#define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \
  MTYPE __bitMask = (((MTYPE)0x1) << (__shiftBy)) - 1 ;			\
  MTYPE __nextShiftOut = ((__shiftBy) == 0) ? (MTYPE) 0 :		\
    (MTYPE) (((__srcMask) & __bitMask) << ((__maskBitCnt) - (__shiftBy))) ; \
  __dstMask = ((__srcMask) >> (__shiftBy)) | (__lastShiftOut) ;		\
  __lastShiftOut = __nextShiftOut ;					\
}


void precompute_masks_avx(const testcase& tc, int COLS, int numMaskVecs,
			  MTYPE (*maskArr)[NUM_DISTINCT_CHARS]) {

  const int maskBitCnt = VEC_LEN / VEC_ENTRY_CNT ;

  for (int vi=0; vi < numMaskVecs; ++vi) {
    for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) {
      maskArr[vi][rs] = 0 ;
    }
    maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ;
  }

  for (int col=1; col < COLS; ++col) {
    int mIndex = (col-1) / maskBitCnt ;
    int mOffset = (col-1) % maskBitCnt ;
    MTYPE bitMask = ((MTYPE)0x1) << (maskBitCnt-1-mOffset) ;

    char hapChar = tc.hap[col-1] ;

    if (hapChar == AMBIG_CHAR) {
      maskArr[mIndex][0] |= bitMask ;
      maskArr[mIndex][1] |= bitMask ;
      maskArr[mIndex][2] |= bitMask ;
      maskArr[mIndex][3] |= bitMask ;
    }

    //cout << hapChar << " " << mIndex << " " << getBinaryStr<MTYPE>(bitMask, 32)
    // << endl ;
    //cout << getBinaryStr<MTYPE>(maskArr[0][hapChar],32) << endl ;
    //exit(0) ;

    maskArr[mIndex][ConvertChar::get(hapChar)] |= bitMask ;


    // bit corresponding to col 1 will be the MSB of the mask 0
    // bit corresponding to col 2 will be the MSB-1 of the mask 0
    // ...
    // bit corresponding to col 32 will be the LSB of the mask 0
    // bit corresponding to col 33 will be the MSB of the mask 1
    // ...
  }

}


/**
 * Validate the precomputed match masks of one test case.
 *
 * The read is processed in stripes of VEC_ENTRY_CNT rows.  Within a stripe
 * the cells are visited one anti-diagonal at a time: lane `ri` holds the mask
 * of its read character shifted right by `ri` bits, so on diagonal `diag` the
 * most significant bit of lane `ri` belongs to column `diag - ri`.  That bit
 * selects, through VEC_BLENDV, between a 1.0 ("match") and a 0.0
 * ("mismatch") test value.  With CHECK_MASK_CORRECTNESS defined, the selected
 * value is compared with a direct character comparison; on the first
 * disagreement a report is printed and the process exits with a failure
 * status.
 *
 * @param tc          test case to validate (rs/hap strings and their lengths).
 * @param tcID        identifier used in error messages.
 * @param printDebug  enables extra output when compiled with DEBUG.
 */
void test_mask_computations (testcase& tc, int tcID, bool printDebug=false) {

  int ROWS = tc.rslen + 1 ;
  int COLS = tc.haplen + 1 ;

  // only for testing
  VECTOR mismatchData, matchData ;
  for (int ei=0; ei < VEC_ENTRY_CNT; ++ei) {
    matchData.VENTRY[ei] = 1.0 ;
    mismatchData.VENTRY[ei] = 0.0 ;
  }

  const int maskBitCnt = VEC_LEN / VEC_ENTRY_CNT ;
  const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function

  MTYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ;
  precompute_masks_avx(tc, COLS, numMaskVecs, maskArr) ;

#ifdef DEBUG
  if (printDebug) {
    std::cout << "The first 32 hap chars are: " ;
    // never read past the end of a haplotype shorter than 32 characters
    for (int i=0; i < 32 && i < tc.haplen; ++i) {
      std::cout << tc.hap[i] ;
    }
    std::cout << std::endl ;

    std::cout << "Masks computed for A, C, T, G, N are: "  << std::endl ;
    std::cout << getBinaryStr<MTYPE>(maskArr[0][0], 32)  << std::endl ;
    std::cout << getBinaryStr<MTYPE>(maskArr[0][1], 32)  << std::endl ;
    std::cout << getBinaryStr<MTYPE>(maskArr[0][2], 32)  << std::endl ;
    std::cout << getBinaryStr<MTYPE>(maskArr[0][3], 32)  << std::endl ;
    std::cout << getBinaryStr<MTYPE>(maskArr[0][4], 32)  << std::endl ;
  }
#endif // #ifdef DEBUG

  int beginRowIndex = 1 ;
  while (beginRowIndex < ROWS) {

    int numRowsToProcess = std::min(VEC_ENTRY_CNT, ROWS - beginRowIndex) ;

    // Class index of the read character handled by each lane.  Lanes beyond
    // the last row of a partial stripe are given AMBIG_CHAR so that they
    // still index a valid mask row (they were left uninitialised before).
    uint8_t rsArr[VEC_ENTRY_CNT] ;
    for (int ri=0; ri < numRowsToProcess; ++ri) {
      rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ;
    }
    for (int ri=numRowsToProcess; ri < VEC_ENTRY_CNT; ++ri) {
      rsArr[ri] = AMBIG_CHAR ;
    }

    // Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
    VECTOR_SSE currMaskVecLow ; // corresponding to entries 0-3
    VECTOR_SSE currMaskVecHigh ; // corresponding to entries 4-7

    MTYPE lastMaskShiftOut[VEC_ENTRY_CNT] ;
    for (int ei=0; ei < VEC_ENTRY_CNT; ++ei)
      lastMaskShiftOut[ei] = 0 ;

    int col = 1 ;
    int diag = 1 ;
    for (int maskIndex=0; maskIndex < numMaskVecs; ++maskIndex) {
      // set up the mask vectors for the next maskBitCnt columns

      // For AVX, maskBitCnt = 32 (so, the operation below is amortized over 32 cols)
      for (int ei=0; ei < VEC_ENTRY_CNT/2; ++ei) {
        SET_MASK_WORD(currMaskVecLow.VENTRYI[ei], maskArr[maskIndex][rsArr[ei]],
                      lastMaskShiftOut[ei], ei, maskBitCnt) ;

        int ei2 = ei + VEC_ENTRY_CNT/2 ; // the second entry index
        SET_MASK_WORD(currMaskVecHigh.VENTRYI[ei], maskArr[maskIndex][rsArr[ei2]],
                      lastMaskShiftOut[ei2], ei2, maskBitCnt) ;
      }

#ifdef DEBUG
      if (printDebug && maskIndex == 0) {
        std::cout << "The masks for entry 1: " << std::endl
                  << getBinaryStr<MTYPE>(maskArr[0][rsArr[1]], 32) << std::endl
                  << getBinaryStr<MTYPE>(currMaskVecLow.VENTRYI[1], 32) << std::endl ;

      }
#endif // #ifdef DEBUG

      // iterate over mask bit indices and columns
      for (int mbi=0; mbi < maskBitCnt && diag < COLS + ROWS -2; ++mbi, ++diag) {

        VECTOR maskV ;
        VEC_SSE_TO_AVX(currMaskVecLow.VTYPE, currMaskVecHigh.VTYPE, maskV.VTYPE) ;

        VECTOR testData ;
        testData.VTYPE = VEC_BLENDV(mismatchData.VTYPE, matchData.VTYPE,
                                    maskV.VTYPE) ;

        // move the bit of the next diagonal into the MSB of every lane
        VECS_SHIFT_LEFT_1BIT(currMaskVecLow.VTYPEI) ;
        VECS_SHIFT_LEFT_1BIT(currMaskVecHigh.VTYPEI) ;

#ifdef DEBUG
        if (printDebug && maskIndex == 0) {
          std::cout << "The mask for entry 1, mbi=" << mbi << ": "
                    << getBinaryStr<MTYPE>(maskV.VENTRYI[1], 32) << std::endl ;

        }
#endif // #ifdef DEBUG

#ifdef CHECK_MASK_CORRECTNESS

        // Lane ri sits on column diag - ri.  Valid columns are 1 .. COLS-1,
        // so once the diagonal has left the matrix on the right the first
        // lane still inside it is diag - (COLS-1).  (Starting at diag - COLS
        // made the check look at column COLS, i.e. at the string terminator.)
        int firstRowIndex = (diag < COLS) ? 0 : (diag - COLS + 1) ;
        int lastRowIndex = std::min(col-1, numRowsToProcess-1) ;

        for (int ri=firstRowIndex; ri <= lastRowIndex; ++ri) {
          int currRow = beginRowIndex + ri ;
          int currCol = diag - ri ;

          char hapChar = tc.hap[currCol-1] ;
          char rsChar = tc.rs[currRow-1] ;

          bool match = (hapChar == rsChar || hapChar == 'N' || rsChar == 'N') ;

          // the lane holds the bit pattern of 1.0f (non-zero) or 0.0f (zero)
          if ((bool) testData.VENTRYI[ri] != match) {
            std::cout << "Error: Incorrect mask for tc " << tcID << ", diag = " << diag
                      << " (" << currRow  << ", " << currCol << ")" << std::endl ;

            std::cout << "The chars are: " << hapChar << " and " << rsChar << std::endl ;
            std::cout << "The selected value is: " << testData.VENTRYI[ri] << std::endl ;

            // a failed validation must not look like a successful run
            exit(EXIT_FAILURE) ;
          }
        }
#endif // #ifdef CHECK_MASK_CORRECTNESS

        if (diag < COLS)
          ++col ;

      } // mbi
    } // maskIndex

    beginRowIndex += VEC_ENTRY_CNT ;
  } // end of stripe

  //cout << "Finished validating entry " << endl ;
}

#ifdef HMM_MASK_MAIN
int main () {

  #define BATCH_SIZE 10000

  ConvertChar::init() ;

  testcase* tcBatch = new testcase[BATCH_SIZE] ;

  int numBatches = 0 ;
  int numRead = 0 ;

  const int DEBUG_TC = -1 ;

  FILE* ifp = stdin ;

  do {
    numRead = 0 ;

    while (numRead < BATCH_SIZE) {
      if (read_testcase(tcBatch+numRead, ifp) < 0)
	break ;

      ++numRead ;
    }

    if (numRead == 0)
      break ;

    for (int ti=0; ti < numRead; ++ti) {

      int tcID = numBatches * BATCH_SIZE + ti ;
      test_mask_computations(tcBatch[ti], tcID, tcID == DEBUG_TC) ;
    }

    ++numBatches ;
  } while (numRead == BATCH_SIZE) ;

  fclose(ifp) ;

  delete[] tcBatch ;

  return 0 ;
}
#endif