#ifdef PRECISION #include #include #include /* template string getBinaryStr (T val, int numBitsToWrite) { ostringstream oss ; uint64_t mask = ((T) 0x1) << (numBitsToWrite-1) ; for (int i=numBitsToWrite-1; i >= 0; --i) { oss << ((val & mask) >> i) ; mask >>= 1 ; } return oss.str() ; } */ void GEN_INTRINSIC(precompute_masks_, PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) { const int maskBitCnt = MAIN_TYPE_SIZE ; for (int vi=0; vi < numMaskVecs; ++vi) { for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) { maskArr[vi][rs] = 0 ; } maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ; } for (int col=1; col < COLS; ++col) { int mIndex = (col-1) / maskBitCnt ; int mOffset = (col-1) % maskBitCnt ; MASK_TYPE bitMask = ((MASK_TYPE)0x1) << (maskBitCnt-1-mOffset) ; char hapChar = tc.hap[col-1] ; if (hapChar == AMBIG_CHAR) { for (int ci=0; ci < NUM_DISTINCT_CHARS; ++ci) maskArr[mIndex][ci] |= bitMask ; } maskArr[mIndex][ConvertChar::get(hapChar)] |= bitMask ; // bit corresponding to col 1 will be the MSB of the mask 0 // bit corresponding to col 2 will be the MSB-1 of the mask 0 // ... // bit corresponding to col 32 will be the LSB of the mask 0 // bit corresponding to col 33 will be the MSB of the mask 1 // ... 
} } void GEN_INTRINSIC(init_masks_for_row_, PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess) { for (int ri=0; ri < numRowsToProcess; ++ri) { rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ; } for (int ei=0; ei < AVX_LENGTH; ++ei) { lastMaskShiftOut[ei] = 0 ; } } #define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \ MASK_TYPE __bitMask = (((MASK_TYPE)0x1) << __shiftBy) - 1 ; \ MASK_TYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \ __dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ; \ __lastShiftOut = __nextShiftOut ; \ } void GEN_INTRINSIC(update_masks_for_cols_, PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt) { for (int ei=0; ei < AVX_LENGTH/2; ++ei) { SET_MASK_WORD(currMaskVecLow.masks[ei], maskArr[maskIndex][rsArr[ei]], lastMaskShiftOut[ei], ei, maskBitCnt) ; int ei2 = ei + AVX_LENGTH/2 ; // the second entry index SET_MASK_WORD(currMaskVecHigh.masks[ei], maskArr[maskIndex][rsArr[ei2]], lastMaskShiftOut[ei2], ei2, maskBitCnt) ; } } //void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen, const _256_TYPE& distmSel, int firstRowIndex, int lastRowIndex) { inline void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen) { //#define computeDistVec() { _256_TYPE maskV ; VEC_SSE_TO_AVX(currMaskVecLow.vecf, currMaskVecHigh.vecf, maskV) ; distmChosen = VEC_BLENDV(distm, _1_distm, maskV) ; /*COMPARE_VECS(distmChosen, distmSel, firstRowIndex, lastRowIndex) ;*/ VEC_SHIFT_LEFT_1BIT(currMaskVecLow.vec) ; VEC_SHIFT_LEFT_1BIT(currMaskVecHigh.vec) ; } /* template struct HmmData { int ROWS ; int 
COLS ;
    NUMBER shiftOutM[MROWS+MCOLS+AVX_LENGTH], shiftOutX[MROWS+MCOLS+AVX_LENGTH], shiftOutY[MROWS+MCOLS+AVX_LENGTH] ;
    Context ctx ;
    testcase* tc ;
    _256_TYPE p_MM[MAVX_COUNT], p_GAPM[MAVX_COUNT], p_MX[MAVX_COUNT], p_XX[MAVX_COUNT], p_MY[MAVX_COUNT], p_YY[MAVX_COUNT], distm1D[MAVX_COUNT] ;
    _256_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY ;
    UNION_TYPE M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y ;
    UNION_TYPE rs , rsN ;
    _256_TYPE distmSel;
    _256_TYPE distm, _1_distm;
} ;
*/

/*
 * initializeVectors: fills the per-row probability tables used by the kernel.
 *
 * From the visible code, for each row r the transition tables receive
 *   p_MM   = 1 - ph2pr[(_i + _d) & 127]     p_GAPM = 1 - ph2pr[_c]
 *   p_MX   = ph2pr[_i]                      p_XX   = ph2pr[_c]
 *   p_MY   = ph2pr[_d]                      p_YY   = ph2pr[_c]
 * where _d and _c are tc->d[r-1] and tc->c[r-1] masked to 7 bits (_i is
 * presumably tc->i[r-1] masked the same way — its definition is not fully
 * visible), and distm1D[r-1] receives ph2pr[tc->q[r-1] & 127] for
 * r = 1 .. ROWS-1. The tables are written element-wise through NUMBER pointers.
 *
 * NOTE(review): the template parameter list and the template argument of
 * Context are not visible in this copy of the file.
 */
template void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context ctx, testcase *tc, _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D)
{
    NUMBER zero = ctx._(0.0);
    NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
    // NOTE(review): text appears to be missing from this copy between "s=0;s"
    // and "i[r-1]" — the bound and body of this loop, the ptr_p_* declarations
    // and the header of the per-row loop are not visible. Restore from the
    // upstream source before building.
    for (int s=0;si[r-1] & 127;
        int _d = tc->d[r-1] & 127;
        int _c = tc->c[r-1] & 127;
        // Transition probabilities for row r, stored at element r-1.
        *(ptr_p_MM+r-1) = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127];
        *(ptr_p_GAPM+r-1) = ctx._(1.0) - ctx.ph2pr[_c];
        *(ptr_p_MX+r-1) = ctx.ph2pr[_i];
        *(ptr_p_XX+r-1) = ctx.ph2pr[_c];
        *(ptr_p_MY+r-1) = ctx.ph2pr[_d];
        *(ptr_p_YY+r-1) = ctx.ph2pr[_c];
#ifdef DEBUG3
        debug_dump("transitions_jni.txt",to_string(*(ptr_p_MM+r-1) ),true);
        debug_dump("transitions_jni.txt",to_string(*(ptr_p_GAPM+r-1)),true);
        debug_dump("transitions_jni.txt",to_string(*(ptr_p_MX+r-1) ),true);
        debug_dump("transitions_jni.txt",to_string(*(ptr_p_XX+r-1) ),true);
        debug_dump("transitions_jni.txt",to_string(*(ptr_p_MY+r-1) ),true);
        debug_dump("transitions_jni.txt",to_string(*(ptr_p_YY+r-1) ),true);
#endif
        //*(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
        //*(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
    }

    // Per-row prior: ph2pr of the 7-bit masked tc->q value of row r.
    NUMBER *ptr_distm1D = (NUMBER *)distm1D;
    for (int r = 1; r < ROWS; r++)
    {
        int _q = tc->q[r-1] & 127;
        ptr_distm1D[r-1] = ctx.ph2pr[_q];
#ifdef DEBUG3
        debug_dump("priors_jni.txt",to_string(ptr_distm1D[r-1]),true);
#endif
    }
}

/*
 * stripINITIALIZATION: loads the state for strip `stripIdx` into the
 * caller's working vectors.
 *
 *  - pGAPM, pMM, pMX, pXX, pMY, pYY are copied from element stripIdx of the
 *    corresponding p_* arrays.
 *  - distm is taken from distm1D[stripIdx]; _1_distm = 1 - distm (computed
 *    before the division below); unless DO_NOT_USE_TRISTATE_CORRECTION is
 *    defined, distm is then divided by TRISTATE_CORRECTION_FACTOR (3.0).
 *  - M_t_2 and X_t_2 are set to zero. For the first strip M_t_1, X_t_1 and
 *    Y_t_1 are zero and Y_t_2 is VEC_SET_LSE(init_Y); for later strips X_t_1
 *    and M_t_1 are VEC_SET_LSE of shiftOutX[AVX_LENGTH] / shiftOutM[AVX_LENGTH]
 *    and both Y vectors are zero. (VEC_SET_LSE presumably sets only the
 *    lowest element — confirm against the macro definition.)
 *  - M_t_1_y is a copy of M_t_1.
 *
 * rs, rsN and N_packed256 are not used by the active code (their use is
 * commented out below).
 *
 * NOTE(review): the template parameter list and the template argument of
 * Context are not visible in this copy of the file.
 */
template inline void GEN_INTRINSIC(stripINITIALIZATION, PRECISION)(
        int stripIdx, Context ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY, _256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm, _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM , _256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM)
{
    int i = stripIdx;
    // Transition vectors for this strip.
    pGAPM = p_GAPM[i];
    pMM = p_MM[i];
    pMX = p_MX[i];
    pXX = p_XX[i];
    pMY = p_MY[i];
    pYY = p_YY[i];

    NUMBER zero = ctx._(0.0);
    NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
    UNION_TYPE packed1;
    packed1.d = VEC_SET1_VAL(1.0);
#define TRISTATE_CORRECTION_FACTOR 3.0
    UNION_TYPE packed3;
    packed3.d = VEC_SET1_VAL(TRISTATE_CORRECTION_FACTOR);

    /* compare rs and N */
    //rs = VEC_LDPOPCVT_CHAR((tc->irs+i*AVX_LENGTH));
    //rsN.d = VEC_CMP_EQ(N_packed256, rs);

    // _1_distm is derived from the undivided distm; the correction factor is
    // applied to distm only afterwards.
    distm = distm1D[i];
    _1_distm = VEC_SUB(packed1.d, distm);
#ifndef DO_NOT_USE_TRISTATE_CORRECTION
    distm = VEC_DIV(distm, packed3.d);
#endif

    /* initialize M_t_2, M_t_1, X_t_2, X_t_1, Y_t_2, Y_t_1 */
    M_t_2.d = VEC_SET1_VAL(zero);
    X_t_2.d = VEC_SET1_VAL(zero);

    if (i==0) {
        // First strip: start from zeros, seeding Y_t_2 with init_Y.
        M_t_1.d = VEC_SET1_VAL(zero);
        X_t_1.d = VEC_SET1_VAL(zero);
        Y_t_2.d = VEC_SET_LSE(init_Y);
        Y_t_1.d = VEC_SET1_VAL(zero);
    }
    else {
        // Later strips: seed X_t_1 / M_t_1 from the values stored at index
        // AVX_LENGTH of the shift-out buffers.
        X_t_1.d = VEC_SET_LSE(shiftOutX[AVX_LENGTH]);
        M_t_1.d = VEC_SET_LSE(shiftOutM[AVX_LENGTH]);
        Y_t_2.d = VEC_SET1_VAL(zero);
        Y_t_1.d = VEC_SET1_VAL(zero);
    }
    M_t_1_y = M_t_1;
}

// NOTE(review): the parameter list of the definition below continues on the
// following line of the file.
inline _256_TYPE GEN_INTRINSIC(computeDISTM, PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap,
_256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256, _256_TYPE distm, _256_TYPE _1_distm) { UNION_TYPE hapN, rshap; _256_TYPE cond; IF_32 shiftInHap; int *hap_ptr = tc->ihap; shiftInHap.i = (d NUMBER GEN_INTRINSIC(compute_full_prob_avx, PRECISION) (testcase *tc, NUMBER *before_last_log = NULL) { _256_TYPE p_MM [MAVX_COUNT], p_GAPM [MAVX_COUNT], p_MX [MAVX_COUNT]; _256_TYPE p_XX [MAVX_COUNT], p_MY [MAVX_COUNT], p_YY [MAVX_COUNT]; _256_TYPE distm1D[MAVX_COUNT]; NUMBER shiftOutM[MROWS+MCOLS+AVX_LENGTH], shiftOutX[MROWS+MCOLS+AVX_LENGTH], shiftOutY[MROWS+MCOLS+AVX_LENGTH]; UNION_TYPE M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y; _256_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY; struct timeval start, end; NUMBER result_avx2; Context ctx; UNION_TYPE rs , rsN; HAP_TYPE hap; _256_TYPE distmSel, distmChosen ; _256_TYPE distm, _1_distm; int r, c; int ROWS = tc->rslen + 1; int COLS = tc->haplen + 1; int AVX_COUNT = (ROWS+7)/8; NUMBER zero = ctx._(0.0); UNION_TYPE packed1; packed1.d = VEC_SET1_VAL(1.0); _256_TYPE N_packed256 = VEC_POPCVT_CHAR('N'); NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); int remainingRows = (ROWS-1) % AVX_LENGTH; int strip_cnt = ((ROWS-1) / AVX_LENGTH) + (remainingRows!=0); const int maskBitCnt = MAIN_TYPE_SIZE ; const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function MASK_TYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ; GEN_INTRINSIC(precompute_masks_, PRECISION)(*tc, COLS, numMaskVecs, maskArr) ; char rsArr[AVX_LENGTH] ; MASK_TYPE lastMaskShiftOut[AVX_LENGTH] ; GEN_INTRINSIC(initializeVectors, PRECISION)(ROWS, COLS, shiftOutM, shiftOutX, shiftOutY, ctx, tc, p_MM, p_GAPM, p_MX, p_XX, p_MY, p_YY, distm1D); //for (int __ii=0; __ii < 10; ++__ii) for (int i=0;i