Pulled Mohammad's changes for creating variable sized arrays

Merge branch 'master' of /home/mghodrat/PairHMM/shared-repository into intel_pairhmm Conflicts: PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.cc
2014-01-26 19:40:43 -08:00 · 2014-01-26 19:40:43 -08:00 · a14a11c0cf
parent 018e9e2c5f e7598dde8b
commit a14a11c0cf
16 changed files with 875 additions and 1048 deletions
--- a/PairHMM_JNI/avx_function_instantiations.cc
+++ b/PairHMM_JNI/avx_function_instantiations.cc
@ -1,10 +1,10 @@
 #include "template.h"

-#undef SIMD_TYPE
-#undef SIMD_TYPE_SSE
+#undef SIMD_ENGINE
+#undef SIMD_ENGINE_SSE

-#define SIMD_TYPE avx
-#define SIMD_TYPE_AVX
+#define SIMD_ENGINE avx
+#define SIMD_ENGINE_AVX

 #include "define-float.h"
 #include "shift_template.c"
--- a/PairHMM_JNI/baseline.cc
+++ b/PairHMM_JNI/baseline.cc
@ -10,10 +10,10 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL)

 	Context<NUMBER> ctx;

-	NUMBER M[MROWS][MCOLS];
-	NUMBER X[MROWS][MCOLS];
-	NUMBER Y[MROWS][MCOLS];
-	NUMBER p[MROWS][6];
+	NUMBER M[ROWS][COLS];
+	NUMBER X[ROWS][COLS];
+	NUMBER Y[ROWS][COLS];
+	NUMBER p[ROWS][6];

 	p[0][MM] = ctx._(0.0);
 	p[0][GapM] = ctx._(0.0);
--- a/PairHMM_JNI/define-double.h
+++ b/PairHMM_JNI/define-double.h
@ -1,53 +1,51 @@
 #include <iostream>

 #ifdef PRECISION
-        #undef PRECISION
-        #undef MAIN_TYPE
-        #undef MAIN_TYPE_SIZE
-        #undef UNION_TYPE
-        #undef IF_128
-        #undef IF_MAIN_TYPE
-        #undef SHIFT_CONST1
-        #undef SHIFT_CONST2
-        #undef SHIFT_CONST3
-        #undef _128_TYPE
-	#undef _256_TYPE
-	#undef AVX_LENGTH
-	#undef MAVX_COUNT
-	#undef HAP_TYPE
-        #undef MASK_TYPE
-        #undef MASK_ALL_ONES
+#undef PRECISION
+#undef MAIN_TYPE
+#undef MAIN_TYPE_SIZE
+#undef UNION_TYPE
+#undef IF_128
+#undef IF_MAIN_TYPE
+#undef SHIFT_CONST1
+#undef SHIFT_CONST2
+#undef SHIFT_CONST3
+#undef _128_TYPE
+#undef SIMD_TYPE
+#undef AVX_LENGTH
+#undef HAP_TYPE
+#undef MASK_TYPE
+#undef MASK_ALL_ONES

-        #undef SET_VEC_ZERO(__vec)
-        #undef VEC_OR(__v1, __v2)
-        #undef VEC_ADD(__v1, __v2)
-        #undef VEC_SUB(__v1, __v2)
-        #undef VEC_MUL(__v1, __v2)
-        #undef VEC_DIV(__v1, __v2)
-        #undef VEC_BLEND(__v1, __v2, __mask)
-        #undef VEC_BLENDV(__v1, __v2, __maskV)
-        #undef VEC_CAST_256_128(__v1)
-        #undef VEC_EXTRACT_128(__v1, __im)
-        #undef VEC_EXTRACT_UNIT(__v1, __im)
-        #undef VEC_SET1_VAL128(__val)
-        #undef VEC_MOVE(__v1, __val)
-        #undef VEC_CAST_128_256(__v1)
-        #undef VEC_INSERT_VAL(__v1, __val, __pos)
-        #undef VEC_CVT_128_256(__v1)
-        #undef VEC_SET1_VAL(__val)
-        #undef VEC_POPCVT_CHAR(__ch)
-        #undef VEC_LDPOPCVT_CHAR(__addr)
-        #undef VEC_CMP_EQ(__v1, __v2)
-        #undef VEC_SET_LSE(__val)
-        #undef SHIFT_HAP(__v1, __val)
-        #undef print256b(__v1)
-        #undef MASK_VEC
-        #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
-        #undef VEC_SHIFT_LEFT_1BIT(__vs)
-        #undef MASK_ALL_ONES
-        #undef COMPARE_VECS(__v1, __v2)
-        #undef _256_INT_TYPE
-        #undef BITMASK_VEC
+#undef SET_VEC_ZERO(__vec)
+#undef VEC_OR(__v1, __v2)
+#undef VEC_ADD(__v1, __v2)
+#undef VEC_SUB(__v1, __v2)
+#undef VEC_MUL(__v1, __v2)
+#undef VEC_DIV(__v1, __v2)
+#undef VEC_BLEND(__v1, __v2, __mask)
+#undef VEC_BLENDV(__v1, __v2, __maskV)
+#undef VEC_CAST_256_128(__v1)
+#undef VEC_EXTRACT_128(__v1, __im)
+#undef VEC_EXTRACT_UNIT(__v1, __im)
+#undef VEC_SET1_VAL128(__val)
+#undef VEC_MOVE(__v1, __val)
+#undef VEC_CAST_128_256(__v1)
+#undef VEC_INSERT_VAL(__v1, __val, __pos)
+#undef VEC_CVT_128_256(__v1)
+#undef VEC_SET1_VAL(__val)
+#undef VEC_POPCVT_CHAR(__ch)
+#undef VEC_LDPOPCVT_CHAR(__addr)
+#undef VEC_CMP_EQ(__v1, __v2)
+#undef VEC_SET_LSE(__val)
+#undef SHIFT_HAP(__v1, __val)
+#undef MASK_VEC
+#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
+#undef VEC_SHIFT_LEFT_1BIT(__vs)
+#undef MASK_ALL_ONES
+#undef COMPARE_VECS(__v1, __v2)
+#undef _256_INT_TYPE
+#undef BITMASK_VEC
 #endif

 #define PRECISION d
@ -60,128 +58,122 @@
 #define SHIFT_CONST2 1
 #define SHIFT_CONST3 8
 #define _128_TYPE __m128d
-#define _256_TYPE __m256d
+#define SIMD_TYPE __m256d
 #define _256_INT_TYPE __m256i
 #define AVX_LENGTH 4
-#define MAVX_COUNT  (MROWS+7)/AVX_LENGTH
 #define HAP_TYPE __m128i
 #define MASK_TYPE uint64_t
 #define MASK_ALL_ONES 0xFFFFFFFFFFFFFFFF
 #define MASK_VEC MaskVec_D

 #define SET_VEC_ZERO(__vec)                     \
-  __vec= _mm256_setzero_pd()
+    __vec= _mm256_setzero_pd()

 #define VEC_OR(__v1, __v2)                      \
-  _mm256_or_pd(__v1, __v2)
+    _mm256_or_pd(__v1, __v2)

 #define VEC_ADD(__v1, __v2)                     \
-  _mm256_add_pd(__v1, __v2)
+    _mm256_add_pd(__v1, __v2)

 #define VEC_SUB(__v1, __v2)                     \
-  _mm256_sub_pd(__v1, __v2)
+    _mm256_sub_pd(__v1, __v2)

 #define VEC_MUL(__v1, __v2)                     \
-  _mm256_mul_pd(__v1, __v2)
+    _mm256_mul_pd(__v1, __v2)

-#define VEC_DIV(__v1, __v2)			\
-  _mm256_div_pd(__v1, __v2)
+#define VEC_DIV(__v1, __v2)                     \
+    _mm256_div_pd(__v1, __v2)

 #define VEC_BLEND(__v1, __v2, __mask)           \
-  _mm256_blend_pd(__v1, __v2, __mask)
+    _mm256_blend_pd(__v1, __v2, __mask)

 #define VEC_BLENDV(__v1, __v2, __maskV)         \
-  _mm256_blendv_pd(__v1, __v2, __maskV)
+    _mm256_blendv_pd(__v1, __v2, __maskV)

-#define VEC_CAST_256_128(__v1)			\
-  _mm256_castpd256_pd128 (__v1)
+#define VEC_CAST_256_128(__v1)                  \
+    _mm256_castpd256_pd128 (__v1)

-#define VEC_EXTRACT_128(__v1, __im)		\
-  _mm256_extractf128_pd (__v1, __im)
+#define VEC_EXTRACT_128(__v1, __im)             \
+    _mm256_extractf128_pd (__v1, __im)

-#define VEC_EXTRACT_UNIT(__v1, __im)		\
-  _mm_extract_epi64(__v1, __im)
+#define VEC_EXTRACT_UNIT(__v1, __im)            \
+    _mm_extract_epi64(__v1, __im)

-#define VEC_SET1_VAL128(__val)	 		\
-  _mm_set1_pd(__val)
+#define VEC_SET1_VAL128(__val)                  \
+    _mm_set1_pd(__val)

-#define VEC_MOVE(__v1, __val)			\
-  _mm_move_sd(__v1, __val)
+#define VEC_MOVE(__v1, __val)                   \
+    _mm_move_sd(__v1, __val)

-#define VEC_CAST_128_256(__v1)			\
-  _mm256_castpd128_pd256(__v1)		
+#define VEC_CAST_128_256(__v1)                  \
+    _mm256_castpd128_pd256(__v1)

-#define VEC_INSERT_VAL(__v1, __val, __pos)	\
-  _mm256_insertf128_pd(__v1, __val, __pos)
+#define VEC_INSERT_VAL(__v1, __val, __pos)      \
+    _mm256_insertf128_pd(__v1, __val, __pos)

-#define VEC_CVT_128_256(__v1)			\
-  _mm256_cvtepi32_pd(__v1)
+#define VEC_CVT_128_256(__v1)                   \
+    _mm256_cvtepi32_pd(__v1)

-#define VEC_SET1_VAL(__val)			\
-  _mm256_set1_pd(__val)
+#define VEC_SET1_VAL(__val)                     \
+    _mm256_set1_pd(__val)

-#define VEC_POPCVT_CHAR(__ch)			\
-  _mm256_cvtepi32_pd(_mm_set1_epi32(__ch))
+#define VEC_POPCVT_CHAR(__ch)                   \
+    _mm256_cvtepi32_pd(_mm_set1_epi32(__ch))

 #define VEC_LDPOPCVT_CHAR(__addr)               \
-  _mm256_cvtepi32_pd(_mm_load_si128((__m128i const *)__addr))
+    _mm256_cvtepi32_pd(_mm_load_si128((__m128i const *)__addr))

-#define VEC_CMP_EQ(__v1, __v2)			\
-  _mm256_cmp_pd(__v1, __v2, _CMP_EQ_OQ)
+#define VEC_CMP_EQ(__v1, __v2)                  \
+    _mm256_cmp_pd(__v1, __v2, _CMP_EQ_OQ)

 #define VEC_SET_LSE(__val)                      \
-  _mm256_set_pd(zero, zero, zero, __val);
+    _mm256_set_pd(zero, zero, zero, __val);

-#define SHIFT_HAP(__v1, __val)			\
-  __v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0)
+#define SHIFT_HAP(__v1, __val)                  \
+    __v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0)

-#define print256b(__v1)				\
-  print256bDP(__v1)
+#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)       \
+    __vdst = _mm256_castpd128_pd256(__vsLow) ;            \
+__vdst = _mm256_insertf128_pd(__vdst, __vsHigh, 1) ;

-#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)	\
-  __vdst = _mm256_castpd128_pd256(__vsLow) ;		\
-  __vdst = _mm256_insertf128_pd(__vdst, __vsHigh, 1) ;
-
-#define VEC_SHIFT_LEFT_1BIT(__vs)		\
-  __vs = _mm_slli_epi64(__vs, 1) 
+#define VEC_SHIFT_LEFT_1BIT(__vs)               \
+    __vs = _mm_slli_epi64(__vs, 1)


-#define COMPARE_VECS(__v1, __v2, __first, __last) {			\
-    double* ptr1 = (double*) (&__v1) ;					\
-    double* ptr2 = (double*) (&__v2) ;					\
-    for (int ei=__first; ei <= __last; ++ei) {				\
-      if (ptr1[ei] != ptr2[ei]) {					\
-	std::cout << "Double Mismatch at " << ei << ": "		\
-		  << ptr1[ei] << " vs. " << ptr2[ei] << std::endl ;	\
-	exit(0) ;							\
-      }									\
-    }									\
-  }
+#define COMPARE_VECS(__v1, __v2, __first, __last) {                     \
+    double* ptr1 = (double*) (&__v1) ;                                  \
+    double* ptr2 = (double*) (&__v2) ;                                  \
+    for (int ei=__first; ei <= __last; ++ei) {                          \
+        if (ptr1[ei] != ptr2[ei]) {                                       \
+            std::cout << "Double Mismatch at " << ei << ": "                \
+            << ptr1[ei] << " vs. " << ptr2[ei] << std::endl ;     \
+            exit(0) ;                                                       \
+        }                                                                 \
+    }                                                                   \
+}

 class BitMaskVec_double {

-  MASK_VEC low_, high_ ;
-  _256_TYPE combined_ ;
+    MASK_VEC low_, high_ ;
+    SIMD_TYPE combined_ ;

-public:
-  
-  inline MASK_TYPE& getLowEntry(int index) {
-    return low_.masks[index] ;
-  }
-  inline MASK_TYPE& getHighEntry(int index) {
-    return high_.masks[index] ;
-  }
-  
-  inline const _256_TYPE& getCombinedMask() {
-    VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ;
+    public:
+    inline MASK_TYPE& getLowEntry(int index) {
+        return low_.masks[index] ;
+    }
+    inline MASK_TYPE& getHighEntry(int index) {
+        return high_.masks[index] ;
+    }

-    return combined_ ;
-  }
-  
-  inline void shift_left_1bit() {
-    VEC_SHIFT_LEFT_1BIT(low_.vec) ;
-    VEC_SHIFT_LEFT_1BIT(high_.vec) ;
-  }
+    inline const SIMD_TYPE& getCombinedMask() {
+        VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ;
+        return combined_ ;
+    }
+
+    inline void shift_left_1bit() {
+        VEC_SHIFT_LEFT_1BIT(low_.vec) ;
+        VEC_SHIFT_LEFT_1BIT(high_.vec) ;
+    }

 } ;

--- a/PairHMM_JNI/define-float.h
+++ b/PairHMM_JNI/define-float.h
@ -1,53 +1,51 @@
 #include <iostream>

 #ifdef PRECISION
-	#undef PRECISION
-        #undef MAIN_TYPE
-        #undef MAIN_TYPE_SIZE
-        #undef UNION_TYPE
-        #undef IF_128
-        #undef IF_MAIN_TYPE
-        #undef SHIFT_CONST1
-        #undef SHIFT_CONST2
-        #undef SHIFT_CONST3
-        #undef _128_TYPE
-        #undef _256_TYPE
-        #undef AVX_LENGTH
-        #undef MAVX_COUNT
-	#undef HAP_TYPE
-        #undef MASK_TYPE
-        #undef MASK_ALL_ONES
+#undef PRECISION
+#undef MAIN_TYPE
+#undef MAIN_TYPE_SIZE
+#undef UNION_TYPE
+#undef IF_128
+#undef IF_MAIN_TYPE
+#undef SHIFT_CONST1
+#undef SHIFT_CONST2
+#undef SHIFT_CONST3
+#undef _128_TYPE
+#undef SIMD_TYPE
+#undef AVX_LENGTH
+#undef HAP_TYPE
+#undef MASK_TYPE
+#undef MASK_ALL_ONES

-	#undef SET_VEC_ZERO(__vec)
-	#undef VEC_OR(__v1, __v2)
-	#undef VEC_ADD(__v1, __v2)
-        #undef VEC_SUB(__v1, __v2)
-	#undef VEC_MUL(__v1, __v2)
-	#undef VEC_DIV(__v1, __v2)
-	#undef VEC_BLEND(__v1, __v2, __mask)
-	#undef VEC_BLENDV(__v1, __v2, __maskV)
-        #undef VEC_CAST_256_128(__v1)
-        #undef VEC_EXTRACT_128(__v1, __im)
-        #undef VEC_EXTRACT_UNIT(__v1, __im)
-        #undef VEC_SET1_VAL128(__val)
-        #undef VEC_MOVE(__v1, __val)
-        #undef VEC_CAST_128_256(__v1)
-        #undef VEC_INSERT_VAL(__v1, __val, __pos)
-	#undef VEC_CVT_128_256(__v1)	
-	#undef VEC_SET1_VAL(__val)
-	#undef VEC_POPCVT_CHAR(__ch)
-	#undef VEC_LDPOPCVT_CHAR(__addr)
-	#undef VEC_CMP_EQ(__v1, __v2)
-	#undef VEC_SET_LSE(__val)
-	#undef SHIFT_HAP(__v1, __val)
-	#undef print256b(__v1)
-        #undef MASK_VEC
-        #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
-        #undef VEC_SHIFT_LEFT_1BIT(__vs)
-        #undef MASK_ALL_ONES
-        #undef COMPARE_VECS(__v1, __v2)
-        #undef _256_INT_TYPE
-        #undef BITMASK_VEC
+#undef SET_VEC_ZERO(__vec)
+#undef VEC_OR(__v1, __v2)
+#undef VEC_ADD(__v1, __v2)
+#undef VEC_SUB(__v1, __v2)
+#undef VEC_MUL(__v1, __v2)
+#undef VEC_DIV(__v1, __v2)
+#undef VEC_BLEND(__v1, __v2, __mask)
+#undef VEC_BLENDV(__v1, __v2, __maskV)
+#undef VEC_CAST_256_128(__v1)
+#undef VEC_EXTRACT_128(__v1, __im)
+#undef VEC_EXTRACT_UNIT(__v1, __im)
+#undef VEC_SET1_VAL128(__val)
+#undef VEC_MOVE(__v1, __val)
+#undef VEC_CAST_128_256(__v1)
+#undef VEC_INSERT_VAL(__v1, __val, __pos)
+#undef VEC_CVT_128_256(__v1)
+#undef VEC_SET1_VAL(__val)
+#undef VEC_POPCVT_CHAR(__ch)
+#undef VEC_LDPOPCVT_CHAR(__addr)
+#undef VEC_CMP_EQ(__v1, __v2)
+#undef VEC_SET_LSE(__val)
+#undef SHIFT_HAP(__v1, __val)
+#undef MASK_VEC
+#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
+#undef VEC_SHIFT_LEFT_1BIT(__vs)
+#undef MASK_ALL_ONES
+#undef COMPARE_VECS(__v1, __v2)
+#undef _256_INT_TYPE
+#undef BITMASK_VEC
 #endif

 #define PRECISION s
@ -61,127 +59,122 @@
 #define SHIFT_CONST2 3
 #define SHIFT_CONST3 4
 #define _128_TYPE __m128
-#define _256_TYPE __m256
+#define SIMD_TYPE __m256
 #define _256_INT_TYPE __m256i
 #define AVX_LENGTH 8
-#define MAVX_COUNT  (MROWS+7)/AVX_LENGTH
 #define HAP_TYPE UNION_TYPE
 #define MASK_TYPE uint32_t
 #define MASK_ALL_ONES 0xFFFFFFFF
 #define MASK_VEC MaskVec_F

 #define SET_VEC_ZERO(__vec)                     \
-  __vec= _mm256_setzero_ps()
+    __vec= _mm256_setzero_ps()

 #define VEC_OR(__v1, __v2)                      \
-  _mm256_or_ps(__v1, __v2)
+    _mm256_or_ps(__v1, __v2)

 #define VEC_ADD(__v1, __v2)                     \
-  _mm256_add_ps(__v1, __v2)
+    _mm256_add_ps(__v1, __v2)

 #define VEC_SUB(__v1, __v2)                     \
-  _mm256_sub_ps(__v1, __v2)
+    _mm256_sub_ps(__v1, __v2)

 #define VEC_MUL(__v1, __v2)                     \
-  _mm256_mul_ps(__v1, __v2)
+    _mm256_mul_ps(__v1, __v2)

-#define VEC_DIV(__v1, __v2)			\
-  _mm256_div_ps(__v1, __v2)
+#define VEC_DIV(__v1, __v2)                     \
+    _mm256_div_ps(__v1, __v2)

 #define VEC_BLEND(__v1, __v2, __mask)           \
-  _mm256_blend_ps(__v1, __v2, __mask)
+    _mm256_blend_ps(__v1, __v2, __mask)

 #define VEC_BLENDV(__v1, __v2, __maskV)         \
-  _mm256_blendv_ps(__v1, __v2, __maskV)
+    _mm256_blendv_ps(__v1, __v2, __maskV)

-#define VEC_CAST_256_128(__v1)			\
-  _mm256_castps256_ps128 (__v1)
+#define VEC_CAST_256_128(__v1)                  \
+    _mm256_castps256_ps128 (__v1)

 #define VEC_EXTRACT_128(__v1, __im)             \
-  _mm256_extractf128_ps (__v1, __im)
+    _mm256_extractf128_ps (__v1, __im)

-#define VEC_EXTRACT_UNIT(__v1, __im)		\
-  _mm_extract_epi32(__v1, __im)
+#define VEC_EXTRACT_UNIT(__v1, __im)            \
+    _mm_extract_epi32(__v1, __im)

-#define VEC_SET1_VAL128(__val)			\
-  _mm_set1_ps(__val)
+#define VEC_SET1_VAL128(__val)                  \
+    _mm_set1_ps(__val)

-#define VEC_MOVE(__v1, __val)			\
-  _mm_move_ss(__v1, __val)
+#define VEC_MOVE(__v1, __val)                   \
+    _mm_move_ss(__v1, __val)

 #define VEC_CAST_128_256(__v1)                  \
-  _mm256_castps128_ps256(__v1)
+    _mm256_castps128_ps256(__v1)

 #define VEC_INSERT_VAL(__v1, __val, __pos)      \
-  _mm256_insertf128_ps(__v1, __val, __pos)
+    _mm256_insertf128_ps(__v1, __val, __pos)

 #define VEC_CVT_128_256(__v1)                   \
-  _mm256_cvtepi32_ps(__v1.i)
+    _mm256_cvtepi32_ps(__v1.i)

-#define VEC_SET1_VAL(__val)                 	\
-  _mm256_set1_ps(__val)
+#define VEC_SET1_VAL(__val)                     \
+    _mm256_set1_ps(__val)

 #define VEC_POPCVT_CHAR(__ch)                   \
-  _mm256_cvtepi32_ps(_mm256_set1_epi32(__ch))
+    _mm256_cvtepi32_ps(_mm256_set1_epi32(__ch))

-#define VEC_LDPOPCVT_CHAR(__addr)		\
-   _mm256_cvtepi32_ps(_mm256_loadu_si256((__m256i const *)__addr))
+#define VEC_LDPOPCVT_CHAR(__addr)               \
+    _mm256_cvtepi32_ps(_mm256_loadu_si256((__m256i const *)__addr))

 #define VEC_CMP_EQ(__v1, __v2)                  \
-  _mm256_cmp_ps(__v1, __v2, _CMP_EQ_OQ)
+    _mm256_cmp_ps(__v1, __v2, _CMP_EQ_OQ)

-#define VEC_SET_LSE(__val)			\
-  _mm256_set_ps(zero, zero, zero, zero, zero, zero, zero, __val);
+#define VEC_SET_LSE(__val)                      \
+    _mm256_set_ps(zero, zero, zero, zero, zero, zero, zero, __val);

-#define SHIFT_HAP(__v1, __val)			\
-  _vector_shift_lastavxs(__v1, __val.f);
+#define SHIFT_HAP(__v1, __val)                  \
+    _vector_shift_lastavxs(__v1, __val.f);

-#define print256b(__v1)                         \
-  print256bFP(__v1)
-  
-#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)	\
-  __vdst = _mm256_castps128_ps256(__vsLow) ;		\
-  __vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ;
+#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)       \
+    __vdst = _mm256_castps128_ps256(__vsLow) ;            \
+__vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ;

-#define VEC_SHIFT_LEFT_1BIT(__vs)		\
-  __vs = _mm_slli_epi32(__vs, 1) 
+#define VEC_SHIFT_LEFT_1BIT(__vs)               \
+    __vs = _mm_slli_epi32(__vs, 1)

-#define COMPARE_VECS(__v1, __v2, __first, __last) {			\
-    float* ptr1 = (float*) (&__v1) ;					\
-    float* ptr2 = (float*) (&__v2) ;					\
-    for (int ei=__first; ei <= __last; ++ei) {				\
-      if (ptr1[ei] != ptr2[ei]) {					\
-	std::cout << "Float Mismatch at " << ei << ": "			\
-		  << ptr1[ei] << " vs. " << ptr2[ei] << std::endl ;	\
-	exit(0) ;							\
-      }									\
-    }									\
-  }
+#define COMPARE_VECS(__v1, __v2, __first, __last) {                     \
+    float* ptr1 = (float*) (&__v1) ;                                    \
+    float* ptr2 = (float*) (&__v2) ;                                    \
+    for (int ei=__first; ei <= __last; ++ei) {                          \
+        if (ptr1[ei] != ptr2[ei]) {                                       \
+            std::cout << "Float Mismatch at " << ei << ": "                 \
+            << ptr1[ei] << " vs. " << ptr2[ei] << std::endl ;     \
+            exit(0) ;                                                       \
+        }                                                                 \
+    }                                                                   \
+}

 class BitMaskVec_float {

-  MASK_VEC low_, high_ ;
-  _256_TYPE combined_ ;
+    MASK_VEC low_, high_ ;
+    SIMD_TYPE combined_ ;

-public:
-  
-  inline MASK_TYPE& getLowEntry(int index) {
-    return low_.masks[index] ;
-  }
-  inline MASK_TYPE& getHighEntry(int index) {
-    return high_.masks[index] ;
-  }
-  
-  inline const _256_TYPE& getCombinedMask() {
-    VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ;
+    public:

-    return combined_ ;
-  }
-  
-  inline void shift_left_1bit() {
-    VEC_SHIFT_LEFT_1BIT(low_.vec) ;
-    VEC_SHIFT_LEFT_1BIT(high_.vec) ;
-  }
+    inline MASK_TYPE& getLowEntry(int index) {
+        return low_.masks[index] ;
+    }
+    inline MASK_TYPE& getHighEntry(int index) {
+        return high_.masks[index] ;
+    }
+
+    inline const SIMD_TYPE& getCombinedMask() {
+        VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ;
+        return combined_ ;
+    }
+
+    inline void shift_left_1bit() {
+        VEC_SHIFT_LEFT_1BIT(low_.vec) ;
+        VEC_SHIFT_LEFT_1BIT(high_.vec) ;
+    }

 } ;

--- a/PairHMM_JNI/define-sse-double.h
+++ b/PairHMM_JNI/define-sse-double.h
@ -1,53 +1,51 @@
 #ifdef PRECISION
-        #undef PRECISION
-        #undef MAIN_TYPE
-        #undef MAIN_TYPE_SIZE
-        #undef UNION_TYPE
-        #undef IF_128
-        #undef IF_MAIN_TYPE
-        #undef SHIFT_CONST1
-        #undef SHIFT_CONST2
-        #undef SHIFT_CONST3
-        #undef _128_TYPE
-        #undef _256_TYPE
-        #undef AVX_LENGTH
-        #undef MAVX_COUNT
-        #undef HAP_TYPE
-        #undef MASK_TYPE
-        #undef MASK_ALL_ONES
+#undef PRECISION
+#undef MAIN_TYPE
+#undef MAIN_TYPE_SIZE
+#undef UNION_TYPE
+#undef IF_128
+#undef IF_MAIN_TYPE
+#undef SHIFT_CONST1
+#undef SHIFT_CONST2
+#undef SHIFT_CONST3
+#undef _128_TYPE
+#undef SIMD_TYPE
+#undef AVX_LENGTH
+#undef HAP_TYPE
+#undef MASK_TYPE
+#undef MASK_ALL_ONES

-	#undef VEC_EXTRACT_UNIT(__v1, __im)
-	#undef VEC_INSERT_UNIT(__v1,__ins,__im)
-        #undef SET_VEC_ZERO(__vec)
-        #undef VEC_OR(__v1, __v2)
-        #undef VEC_ADD(__v1, __v2)
-        #undef VEC_SUB(__v1, __v2)
-        #undef VEC_MUL(__v1, __v2)
-	#undef VEC_DIV(__v1, __v2)
-        #undef VEC_BLEND(__v1, __v2, __mask)
-        #undef VEC_BLENDV(__v1, __v2, __maskV)
-        #undef VEC_CAST_256_128(__v1)
-        #undef VEC_EXTRACT_128(__v1, __im)
-        #undef VEC_EXTRACT_UNIT(__v1, __im)
-        #undef VEC_SET1_VAL128(__val)
-        #undef VEC_MOVE(__v1, __val)
-        #undef VEC_CAST_128_256(__v1)
-        #undef VEC_INSERT_VAL(__v1, __val, __pos)
-        #undef VEC_CVT_128_256(__v1)
-        #undef VEC_SET1_VAL(__val)
-        #undef VEC_POPCVT_CHAR(__ch)
-        #undef VEC_LDPOPCVT_CHAR(__addr)
-        #undef VEC_CMP_EQ(__v1, __v2)
-        #undef VEC_SET_LSE(__val)
-        #undef SHIFT_HAP(__v1, __val)
-        #undef print256b(__v1)
-        #undef MASK_VEC
-        #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
-        #undef VEC_SHIFT_LEFT_1BIT(__vs)
-        #undef MASK_ALL_ONES
-        #undef COMPARE_VECS(__v1, __v2)
-        #undef _256_INT_TYPE
-        #undef BITMASK_VEC
+#undef VEC_EXTRACT_UNIT(__v1, __im)
+#undef VEC_INSERT_UNIT(__v1,__ins,__im)
+#undef SET_VEC_ZERO(__vec)
+#undef VEC_OR(__v1, __v2)
+#undef VEC_ADD(__v1, __v2)
+#undef VEC_SUB(__v1, __v2)
+#undef VEC_MUL(__v1, __v2)
+#undef VEC_DIV(__v1, __v2)
+#undef VEC_BLEND(__v1, __v2, __mask)
+#undef VEC_BLENDV(__v1, __v2, __maskV)
+#undef VEC_CAST_256_128(__v1)
+#undef VEC_EXTRACT_128(__v1, __im)
+#undef VEC_EXTRACT_UNIT(__v1, __im)
+#undef VEC_SET1_VAL128(__val)
+#undef VEC_MOVE(__v1, __val)
+#undef VEC_CAST_128_256(__v1)
+#undef VEC_INSERT_VAL(__v1, __val, __pos)
+#undef VEC_CVT_128_256(__v1)
+#undef VEC_SET1_VAL(__val)
+#undef VEC_POPCVT_CHAR(__ch)
+#undef VEC_LDPOPCVT_CHAR(__addr)
+#undef VEC_CMP_EQ(__v1, __v2)
+#undef VEC_SET_LSE(__val)
+#undef SHIFT_HAP(__v1, __val)
+#undef MASK_VEC
+#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
+#undef VEC_SHIFT_LEFT_1BIT(__vs)
+#undef MASK_ALL_ONES
+#undef COMPARE_VECS(__v1, __v2)
+#undef _256_INT_TYPE
+#undef BITMASK_VEC
 #endif

 #define SSE
@ -62,90 +60,87 @@
 #define SHIFT_CONST2 8
 #define SHIFT_CONST3 0
 #define _128_TYPE __m128d
-#define _256_TYPE __m128d
+#define SIMD_TYPE __m128d
 #define _256_INT_TYPE __m128i
 #define AVX_LENGTH 2
-#define MAVX_COUNT  (MROWS+3)/AVX_LENGTH
 #define HAP_TYPE __m128i
 #define MASK_TYPE uint64_t
 #define MASK_ALL_ONES 0xFFFFFFFFFFFFFFFFL
 #define MASK_VEC MaskVec_D

 #define VEC_EXTRACT_UNIT(__v1, __im)            \
-  _mm_extract_epi64(__v1, __im)
+    _mm_extract_epi64(__v1, __im)

-#define VEC_INSERT_UNIT(__v1,__ins,__im)	\
-   _mm_insert_epi64(__v1,__ins,__im)
+#define VEC_INSERT_UNIT(__v1,__ins,__im)        \
+    _mm_insert_epi64(__v1,__ins,__im)

 #define VEC_OR(__v1, __v2)                      \
-  _mm_or_pd(__v1, __v2)
+    _mm_or_pd(__v1, __v2)

 #define VEC_ADD(__v1, __v2)                     \
-  _mm_add_pd(__v1, __v2)
+    _mm_add_pd(__v1, __v2)

 #define VEC_SUB(__v1, __v2)                     \
-  _mm_sub_pd(__v1, __v2)
+    _mm_sub_pd(__v1, __v2)

 #define VEC_MUL(__v1, __v2)                     \
-  _mm_mul_pd(__v1, __v2)
+    _mm_mul_pd(__v1, __v2)

 #define VEC_DIV(__v1, __v2)                     \
-  _mm_div_pd(__v1, __v2)
+    _mm_div_pd(__v1, __v2)

 #define VEC_CMP_EQ(__v1, __v2)                  \
-  _mm_cmpeq_pd(__v1, __v2)
+    _mm_cmpeq_pd(__v1, __v2)

 #define VEC_BLEND(__v1, __v2, __mask)           \
-  _mm_blend_pd(__v1, __v2, __mask)
+    _mm_blend_pd(__v1, __v2, __mask)

 #define VEC_BLENDV(__v1, __v2, __maskV)         \
-  _mm_blendv_pd(__v1, __v2, __maskV)
+    _mm_blendv_pd(__v1, __v2, __maskV)

 #define SHIFT_HAP(__v1, __val)                  \
-  __v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0)
+    __v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0)

 #define VEC_CVT_128_256(__v1)                   \
-  _mm_cvtepi32_pd(__v1)
+    _mm_cvtepi32_pd(__v1)
+
+#define VEC_SET1_VAL(__val)                     \
+    _mm_set1_pd(__val)

-#define VEC_SET1_VAL(__val)			\
-  _mm_set1_pd(__val)
-   
 #define VEC_POPCVT_CHAR(__ch)                   \
-  _mm_cvtepi32_pd(_mm_set1_epi32(__ch)) 
+    _mm_cvtepi32_pd(_mm_set1_epi32(__ch))

 #define VEC_SET_LSE(__val)                      \
-  _mm_set_pd(zero, __val);  
+    _mm_set_pd(zero, __val);

 #define VEC_LDPOPCVT_CHAR(__addr)               \
-   _mm_cvtepi32_pd(_mm_loadu_si128((__m128i const *)__addr)) 
+    _mm_cvtepi32_pd(_mm_loadu_si128((__m128i const *)__addr))

 #define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)       \
-   __vdst = _mm_castsi128_pd(_mm_set_epi64(__vsHigh, __vsLow))
+    __vdst = _mm_castsi128_pd(_mm_set_epi64(__vsHigh, __vsLow))

 #define VEC_SHIFT_LEFT_1BIT(__vs)               \
-  __vs = _mm_slli_epi64(__vs, 1)
+    __vs = _mm_slli_epi64(__vs, 1)


 class BitMaskVec_sse_double {

-  MASK_VEC combined_ ;
+    MASK_VEC combined_ ;
+    public:
+    inline MASK_TYPE& getLowEntry(int index) {
+        return combined_.masks[index] ;
+    }
+    inline MASK_TYPE& getHighEntry(int index) {
+        return combined_.masks[AVX_LENGTH/2+index] ;
+    }

-public:
-  
-  inline MASK_TYPE& getLowEntry(int index) {
-    return combined_.masks[index] ;
-  }
-  inline MASK_TYPE& getHighEntry(int index) {
-    return combined_.masks[AVX_LENGTH/2+index] ;
-  }
-  
-  inline const _256_TYPE& getCombinedMask() {
-    return combined_.vecf ;
-  }
-  
-  inline void shift_left_1bit() {
-    VEC_SHIFT_LEFT_1BIT(combined_.vec) ;
-  }
+    inline const SIMD_TYPE& getCombinedMask() {
+        return combined_.vecf ;
+    }
+
+    inline void shift_left_1bit() {
+        VEC_SHIFT_LEFT_1BIT(combined_.vec) ;
+    }

 } ;

--- a/PairHMM_JNI/define-sse-float.h
+++ b/PairHMM_JNI/define-sse-float.h
@ -1,53 +1,51 @@
 #ifdef PRECISION
-        #undef PRECISION
-        #undef MAIN_TYPE
-        #undef MAIN_TYPE_SIZE
-        #undef UNION_TYPE
-        #undef IF_128
-        #undef IF_MAIN_TYPE
-        #undef SHIFT_CONST1
-        #undef SHIFT_CONST2
-        #undef SHIFT_CONST3
-        #undef _128_TYPE
-        #undef _256_TYPE
-        #undef AVX_LENGTH
-        #undef MAVX_COUNT
-        #undef HAP_TYPE
-        #undef MASK_TYPE
-        #undef MASK_ALL_ONES
+#undef PRECISION
+#undef MAIN_TYPE
+#undef MAIN_TYPE_SIZE
+#undef UNION_TYPE
+#undef IF_128
+#undef IF_MAIN_TYPE
+#undef SHIFT_CONST1
+#undef SHIFT_CONST2
+#undef SHIFT_CONST3
+#undef _128_TYPE
+#undef SIMD_TYPE
+#undef AVX_LENGTH
+#undef HAP_TYPE
+#undef MASK_TYPE
+#undef MASK_ALL_ONES

-	#undef VEC_EXTRACT_UNIT(__v1, __im)
-	#undef VEC_INSERT_UNIT(__v1,__ins,__im)
-        #undef SET_VEC_ZERO(__vec)
-        #undef VEC_OR(__v1, __v2)
-        #undef VEC_ADD(__v1, __v2)
-        #undef VEC_SUB(__v1, __v2)
-        #undef VEC_MUL(__v1, __v2)
-	#undef VEC_DIV(__v1, __v2)
-        #undef VEC_BLEND(__v1, __v2, __mask)
-        #undef VEC_BLENDV(__v1, __v2, __maskV)
-        #undef VEC_CAST_256_128(__v1)
-        #undef VEC_EXTRACT_128(__v1, __im)
-        #undef VEC_EXTRACT_UNIT(__v1, __im)
-        #undef VEC_SET1_VAL128(__val)
-        #undef VEC_MOVE(__v1, __val)
-        #undef VEC_CAST_128_256(__v1)
-        #undef VEC_INSERT_VAL(__v1, __val, __pos)
-        #undef VEC_CVT_128_256(__v1)
-        #undef VEC_SET1_VAL(__val)
-        #undef VEC_POPCVT_CHAR(__ch)
-        #undef VEC_LDPOPCVT_CHAR(__addr)
-        #undef VEC_CMP_EQ(__v1, __v2)
-        #undef VEC_SET_LSE(__val)
-        #undef SHIFT_HAP(__v1, __val)
-        #undef print256b(__v1)
-        #undef MASK_VEC
-        #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
-        #undef VEC_SHIFT_LEFT_1BIT(__vs)
-        #undef MASK_ALL_ONES
-        #undef COMPARE_VECS(__v1, __v2)
-        #undef _256_INT_TYPE
-        #undef BITMASK_VEC
+#undef VEC_EXTRACT_UNIT(__v1, __im)
+#undef VEC_INSERT_UNIT(__v1,__ins,__im)
+#undef SET_VEC_ZERO(__vec)
+#undef VEC_OR(__v1, __v2)
+#undef VEC_ADD(__v1, __v2)
+#undef VEC_SUB(__v1, __v2)
+#undef VEC_MUL(__v1, __v2)
+#undef VEC_DIV(__v1, __v2)
+#undef VEC_BLEND(__v1, __v2, __mask)
+#undef VEC_BLENDV(__v1, __v2, __maskV)
+#undef VEC_CAST_256_128(__v1)
+#undef VEC_EXTRACT_128(__v1, __im)
+#undef VEC_EXTRACT_UNIT(__v1, __im)
+#undef VEC_SET1_VAL128(__val)
+#undef VEC_MOVE(__v1, __val)
+#undef VEC_CAST_128_256(__v1)
+#undef VEC_INSERT_VAL(__v1, __val, __pos)
+#undef VEC_CVT_128_256(__v1)
+#undef VEC_SET1_VAL(__val)
+#undef VEC_POPCVT_CHAR(__ch)
+#undef VEC_LDPOPCVT_CHAR(__addr)
+#undef VEC_CMP_EQ(__v1, __v2)
+#undef VEC_SET_LSE(__val)
+#undef SHIFT_HAP(__v1, __val)
+#undef MASK_VEC
+#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
+#undef VEC_SHIFT_LEFT_1BIT(__vs)
+#undef MASK_ALL_ONES
+#undef COMPARE_VECS(__v1, __v2)
+#undef _256_INT_TYPE
+#undef BITMASK_VEC
 #endif

 #define SSE
@ -62,89 +60,88 @@
 #define SHIFT_CONST2 4
 #define SHIFT_CONST3 0
 #define _128_TYPE __m128
-#define _256_TYPE __m128
+#define SIMD_TYPE __m128
 #define _256_INT_TYPE __m128i
 #define AVX_LENGTH 4
-#define MAVX_COUNT  (MROWS+3)/AVX_LENGTH
+//#define MAVX_COUNT  (MROWS+3)/AVX_LENGTH
 #define HAP_TYPE UNION_TYPE
 #define MASK_TYPE uint32_t
 #define MASK_ALL_ONES 0xFFFFFFFF
 #define MASK_VEC MaskVec_F

 #define VEC_EXTRACT_UNIT(__v1, __im)            \
-  _mm_extract_epi32(__v1, __im)
+    _mm_extract_epi32(__v1, __im)

-#define VEC_INSERT_UNIT(__v1,__ins,__im)	\
-   _mm_insert_epi32(__v1,__ins,__im)
+#define VEC_INSERT_UNIT(__v1,__ins,__im)        \
+    _mm_insert_epi32(__v1,__ins,__im)

 #define VEC_OR(__v1, __v2)                      \
-  _mm_or_ps(__v1, __v2)
+    _mm_or_ps(__v1, __v2)

 #define VEC_ADD(__v1, __v2)                     \
-  _mm_add_ps(__v1, __v2)
+    _mm_add_ps(__v1, __v2)

 #define VEC_SUB(__v1, __v2)                     \
-  _mm_sub_ps(__v1, __v2)
+    _mm_sub_ps(__v1, __v2)

 #define VEC_MUL(__v1, __v2)                     \
-  _mm_mul_ps(__v1, __v2)
+    _mm_mul_ps(__v1, __v2)

 #define VEC_DIV(__v1, __v2)                     \
-  _mm_div_ps(__v1, __v2)
+    _mm_div_ps(__v1, __v2)

 #define VEC_CMP_EQ(__v1, __v2)                  \
-  _mm_cmpeq_ps(__v1, __v2)
+    _mm_cmpeq_ps(__v1, __v2)

 #define VEC_BLEND(__v1, __v2, __mask)           \
-  _mm_blend_ps(__v1, __v2, __mask)
+    _mm_blend_ps(__v1, __v2, __mask)

 #define VEC_BLENDV(__v1, __v2, __maskV)         \
-  _mm_blendv_ps(__v1, __v2, __maskV)
+    _mm_blendv_ps(__v1, __v2, __maskV)

 #define SHIFT_HAP(__v1, __val)                  \
-  _vector_shift_lastsses(__v1, __val.f)
+    _vector_shift_lastsses(__v1, __val.f)

 #define VEC_CVT_128_256(__v1)                   \
-  _mm_cvtepi32_ps(__v1.i)
+    _mm_cvtepi32_ps(__v1.i)
+
+#define VEC_SET1_VAL(__val)                     \
+    _mm_set1_ps(__val)

-#define VEC_SET1_VAL(__val)			\
-  _mm_set1_ps(__val)
-   
 #define VEC_POPCVT_CHAR(__ch)                   \
-  _mm_cvtepi32_ps(_mm_set1_epi32(__ch)) 
+    _mm_cvtepi32_ps(_mm_set1_epi32(__ch))

 #define VEC_SET_LSE(__val)                      \
-  _mm_set_ps(zero, zero, zero, __val);  
+    _mm_set_ps(zero, zero, zero, __val);

 #define VEC_LDPOPCVT_CHAR(__addr)               \
-   _mm_cvtepi32_ps(_mm_loadu_si128((__m128i const *)__addr)) 
+    _mm_cvtepi32_ps(_mm_loadu_si128((__m128i const *)__addr))

 #define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)       \
-   __vdst = _mm_cvtpi32x2_ps(__vsLow, __vsHigh)
+    __vdst = _mm_cvtpi32x2_ps(__vsLow, __vsHigh)

 #define VEC_SHIFT_LEFT_1BIT(__vs)               \
-  __vs = _mm_slli_epi32(__vs, 1)
+    __vs = _mm_slli_epi32(__vs, 1)

 class BitMaskVec_sse_float {

-  MASK_VEC combined_ ;
+    MASK_VEC combined_ ;

-public:
-  
-  inline MASK_TYPE& getLowEntry(int index) {
-    return combined_.masks[index] ;
-  }
-  inline MASK_TYPE& getHighEntry(int index) {
-    return combined_.masks[AVX_LENGTH/2+index] ;
-  }
-  
-  inline const _256_TYPE& getCombinedMask() {
-    return combined_.vecf ;
-  }
-  
-  inline void shift_left_1bit() {
-    VEC_SHIFT_LEFT_1BIT(combined_.vec) ;
-  }
+    public:
+    inline MASK_TYPE& getLowEntry(int index) {
+        return combined_.masks[index] ;
+    }
+    inline MASK_TYPE& getHighEntry(int index) {
+        return combined_.masks[AVX_LENGTH/2+index] ;
+    }
+
+    inline const SIMD_TYPE& getCombinedMask() {
+        return combined_.vecf ;
+    }
+
+    inline void shift_left_1bit() {
+        VEC_SHIFT_LEFT_1BIT(combined_.vec) ;
+    }

 } ;

--- a/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.cc
+++ b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.cc
@ -68,7 +68,7 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless
    haplotypeBasesLength = env->GetArrayLength(haplotypeBasesGlobalRef);
 #ifdef ENABLE_ASSERTIONS
    assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); 
-    assert(haplotypeBasesLength < MCOLS);
+    //assert(haplotypeBasesLength < MCOLS);
 #endif
 #ifdef DEBUG0_1
    cout << "JNI haplotype length "<<haplotypeBasesLength<<"\n";
@ -145,7 +145,7 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLogless
    assert(insertionGOPArray && "insertionGOP array not initialized in JNI");
    assert(deletionGOPArray && "deletionGOP array not initialized in JNI");
    assert(overallGCPArray && "overallGCP array not initialized in JNI");
-    assert(readLength < MROWS); 
+    //assert(readLength < MROWS); 
    assert(readLength == env->GetArrayLength(readQuals));
    assert(readLength == env->GetArrayLength(insertionGOP));
    assert(readLength == env->GetArrayLength(deletionGOP));
--- a/PairHMM_JNI/pairhmm-template-kernel.cc
+++ b/PairHMM_JNI/pairhmm-template-kernel.cc
@ -4,435 +4,338 @@
 #include <assert.h>
 #include <stdlib.h>

-//#define DEBUG
-#define MUSTAFA
-#define KARTHIK

-/*
-template <class T>
-string getBinaryStr (T val, int numBitsToWrite) {
-  
-  ostringstream oss ;
-  uint64_t mask = ((T) 0x1) << (numBitsToWrite-1) ;
-  for (int i=numBitsToWrite-1; i >= 0; --i) {
-    oss << ((val & mask) >> i) ;
-    mask >>= 1 ;
-  }
-  return oss.str() ;
-}
-*/
-#ifdef MUSTAFA
+void CONCAT(CONCAT(precompute_masks_,SIMD_ENGINE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) {

+    const int maskBitCnt = MAIN_TYPE_SIZE ;

-void GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) {
-  
-  const int maskBitCnt = MAIN_TYPE_SIZE ;
-
-  for (int vi=0; vi < numMaskVecs; ++vi) {
-    for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) {
-      maskArr[vi][rs] = 0 ;
+    for (int vi=0; vi < numMaskVecs; ++vi) {
+        for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) {
+            maskArr[vi][rs] = 0 ;
+        }
+        maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ;
    }
-    maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ;
-  }
- 
-  for (int col=1; col < COLS; ++col) {
-    int mIndex = (col-1) / maskBitCnt ;
-    int mOffset = (col-1) % maskBitCnt ;
-    MASK_TYPE bitMask = ((MASK_TYPE)0x1) << (maskBitCnt-1-mOffset) ;

-    char hapChar = ConvertChar::get(tc.hap[col-1]);
+    for (int col=1; col < COLS; ++col) {
+        int mIndex = (col-1) / maskBitCnt ;
+        int mOffset = (col-1) % maskBitCnt ;
+        MASK_TYPE bitMask = ((MASK_TYPE)0x1) << (maskBitCnt-1-mOffset) ;

-    if (hapChar == AMBIG_CHAR) {
-      for (int ci=0; ci < NUM_DISTINCT_CHARS; ++ci) 
-	maskArr[mIndex][ci] |= bitMask ;
-    } 
+        char hapChar = ConvertChar::get(tc.hap[col-1]);

-    maskArr[mIndex][hapChar] |= bitMask ;
-    // bit corresponding to col 1 will be the MSB of the mask 0
-    // bit corresponding to col 2 will be the MSB-1 of the mask 0
-    // ...
-    // bit corresponding to col 32 will be the LSB of the mask 0
-    // bit corresponding to col 33 will be the MSB of the mask 1
-    // ...
-  }
+        if (hapChar == AMBIG_CHAR) {
+            for (int ci=0; ci < NUM_DISTINCT_CHARS; ++ci)
+                maskArr[mIndex][ci] |= bitMask ;
+        }
+
+        maskArr[mIndex][hapChar] |= bitMask ;
+        // bit corresponding to col 1 will be the MSB of the mask 0
+        // bit corresponding to col 2 will be the MSB-1 of the mask 0
+        // ...
+        // bit corresponding to col 32 will be the LSB of the mask 0
+        // bit corresponding to col 33 will be the MSB of the mask 1
+        // ...
+    }

 }

-void GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess) {
+void CONCAT(CONCAT(init_masks_for_row_,SIMD_ENGINE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess) {

-  for (int ri=0; ri < numRowsToProcess; ++ri) {
-    rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ;
-  }
+    for (int ri=0; ri < numRowsToProcess; ++ri) {
+        rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ;
+    }

-  for (int ei=0; ei < AVX_LENGTH; ++ei) {
-    lastMaskShiftOut[ei] = 0 ;
-  }
+    for (int ei=0; ei < AVX_LENGTH; ++ei) {
+        lastMaskShiftOut[ei] = 0 ;
+    }
 }

-
 #define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \
-  MASK_TYPE __bitMask = (((MASK_TYPE)0x1) << __shiftBy) - 1 ;			\
-  MASK_TYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \
-  __dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ;		\
-  __lastShiftOut = __nextShiftOut ;					\
+    MASK_TYPE __bitMask = (((MASK_TYPE)0x1) << __shiftBy) - 1 ;            \
+    MASK_TYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \
+    __dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ;        \
+    __lastShiftOut = __nextShiftOut ;                    \
 }


-void GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_, SIMD_TYPE), PRECISION)(int maskIndex, BITMASK_VEC& bitMaskVec, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, int maskBitCnt) {
+void CONCAT(CONCAT(update_masks_for_cols_,SIMD_ENGINE), PRECISION)(int maskIndex, BITMASK_VEC& bitMaskVec, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, int maskBitCnt) {

-  for (int ei=0; ei < AVX_LENGTH/2; ++ei) {
-    SET_MASK_WORD(bitMaskVec.getLowEntry(ei), maskArr[maskIndex][rsArr[ei]], 
-		  lastMaskShiftOut[ei], ei, maskBitCnt) ;
-    
-    int ei2 = ei + AVX_LENGTH/2 ; // the second entry index
-    SET_MASK_WORD(bitMaskVec.getHighEntry(ei), maskArr[maskIndex][rsArr[ei2]], 
-		  lastMaskShiftOut[ei2], ei2, maskBitCnt) ;
-  }
+    for (int ei=0; ei < AVX_LENGTH/2; ++ei) {
+        SET_MASK_WORD(bitMaskVec.getLowEntry(ei), maskArr[maskIndex][rsArr[ei]],
+                lastMaskShiftOut[ei], ei, maskBitCnt) ;
+
+        int ei2 = ei + AVX_LENGTH/2 ; // the second entry index
+        SET_MASK_WORD(bitMaskVec.getHighEntry(ei), maskArr[maskIndex][rsArr[ei2]],
+                lastMaskShiftOut[ei2], ei2, maskBitCnt) ;
+    }

 }


-//void GEN_INTRINSIC(computeDistVec, PRECISION) (BITMASK_VEC& bitMaskVec, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen, const _256_TYPE& distmSel, int firstRowIndex, int lastRowIndex) {
+inline void CONCAT(CONCAT(computeDistVec,SIMD_ENGINE), PRECISION) (BITMASK_VEC& bitMaskVec, SIMD_TYPE& distm, SIMD_TYPE& _1_distm, SIMD_TYPE& distmChosen) {

-inline void GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec, SIMD_TYPE), PRECISION) (BITMASK_VEC& bitMaskVec, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen) {
-  //#define computeDistVec() {					      
+    distmChosen = VEC_BLENDV(distm, _1_distm, bitMaskVec.getCombinedMask()) ;

-#ifdef DEBUGG
-        long long *temp1 = (long long *)(&maskV);
-        double *temp2 = (double *)(&distm);
-        double *temp3 = (double *)(&_1_distm);
-        printf("***\n%lx\n%lx\n%f\n%f\n%f\n%f\n***\n", temp1[0], temp1[1], temp2[0], temp2[1], temp3[0], temp3[1]);
-#endif
+    bitMaskVec.shift_left_1bit() ;
+}

-  distmChosen = VEC_BLENDV(distm, _1_distm, bitMaskVec.getCombinedMask()) ;
-
-  /*COMPARE_VECS(distmChosen, distmSel, firstRowIndex, lastRowIndex) ;*/
-
-  bitMaskVec.shift_left_1bit() ;
-}								     
-
-
-/*
-template<class NUMBER>
-struct HmmData {
-  int ROWS ;
-  int COLS ;
-  
-  NUMBER shiftOutM[MROWS+MCOLS+AVX_LENGTH], shiftOutX[MROWS+MCOLS+AVX_LENGTH], shiftOutY[MROWS+MCOLS+AVX_LENGTH] ;
-  Context<NUMBER> ctx ;
-  testcase* tc ;
-  _256_TYPE p_MM[MAVX_COUNT], p_GAPM[MAVX_COUNT], p_MX[MAVX_COUNT], p_XX[MAVX_COUNT], p_MY[MAVX_COUNT], p_YY[MAVX_COUNT], distm1D[MAVX_COUNT] ;
-  _256_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY ;
-
-  UNION_TYPE M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y ;
-  UNION_TYPE rs , rsN ;
-  _256_TYPE distmSel;
-  _256_TYPE distm, _1_distm;
-
-} ;
-*/
-#endif // MUSTAFA
-
-template<class NUMBER> void GEN_INTRINSIC(GEN_INTRINSIC(initializeVectors, SIMD_TYPE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context<NUMBER> ctx, testcase *tc,  _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D)
+template<class NUMBER> void CONCAT(CONCAT(initializeVectors,SIMD_ENGINE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context<NUMBER> ctx, testcase *tc,  SIMD_TYPE *p_MM, SIMD_TYPE *p_GAPM, SIMD_TYPE *p_MX, SIMD_TYPE *p_XX, SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, SIMD_TYPE *distm1D)
 {
-	NUMBER zero = ctx._(0.0);
-        NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
-        for (int s=0;s<ROWS+COLS+AVX_LENGTH;s++)
+    NUMBER zero = ctx._(0.0);
+    NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
+    for (int s=0;s<ROWS+COLS+AVX_LENGTH;s++)
+    {
+        shiftOutM[s] = zero;
+        shiftOutX[s] = zero;
+        shiftOutY[s] = init_Y;
+    }
+
+    NUMBER *ptr_p_MM = (NUMBER *)p_MM;
+    NUMBER *ptr_p_XX = (NUMBER *)p_XX;
+    NUMBER *ptr_p_YY = (NUMBER *)p_YY;
+    NUMBER *ptr_p_MX = (NUMBER *)p_MX;
+    NUMBER *ptr_p_MY = (NUMBER *)p_MY;
+    NUMBER *ptr_p_GAPM = (NUMBER *)p_GAPM;
+
+    *ptr_p_MM = ctx._(0.0);
+    *ptr_p_XX = ctx._(0.0);
+    *ptr_p_YY = ctx._(0.0);
+    *ptr_p_MX = ctx._(0.0);
+    *ptr_p_MY = ctx._(0.0);
+    *ptr_p_GAPM = ctx._(0.0);
+
+    for (int r = 1; r < ROWS; r++)
+    {
+        int _i = tc->i[r-1] & 127;
+        int _d = tc->d[r-1] & 127;
+        int _c = tc->c[r-1] & 127;
+
+        *(ptr_p_MM+r-1) = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127];
+        *(ptr_p_GAPM+r-1) = ctx._(1.0) - ctx.ph2pr[_c];
+        *(ptr_p_MX+r-1) = ctx.ph2pr[_i];
+        *(ptr_p_XX+r-1) = ctx.ph2pr[_c];
+        *(ptr_p_MY+r-1) = ctx.ph2pr[_d];
+        *(ptr_p_YY+r-1) = ctx.ph2pr[_c];
+    }
+
+    NUMBER *ptr_distm1D = (NUMBER *)distm1D;
+    for (int r = 1; r < ROWS; r++)
+    {
+        int _q = tc->q[r-1] & 127;
+        ptr_distm1D[r-1] = ctx.ph2pr[_q];
+    }
+}
+
+
+template<class NUMBER> inline void CONCAT(CONCAT(stripeINITIALIZATION,SIMD_ENGINE), PRECISION)(
+        int stripeIdx, Context<NUMBER> ctx, testcase *tc, SIMD_TYPE &pGAPM, SIMD_TYPE &pMM, SIMD_TYPE &pMX, SIMD_TYPE &pXX, SIMD_TYPE &pMY, SIMD_TYPE &pYY,
+        SIMD_TYPE &rs, UNION_TYPE &rsN, SIMD_TYPE &distm, SIMD_TYPE &_1_distm,  SIMD_TYPE *distm1D, SIMD_TYPE N_packed256, SIMD_TYPE *p_MM , SIMD_TYPE *p_GAPM ,
+        SIMD_TYPE *p_MX, SIMD_TYPE *p_XX , SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1,
+        UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM)
+{
+    int i = stripeIdx;
+    pGAPM = p_GAPM[i];
+    pMM   = p_MM[i];
+    pMX   = p_MX[i];
+    pXX   = p_XX[i];
+    pMY   = p_MY[i];
+    pYY   = p_YY[i];
+
+    NUMBER zero = ctx._(0.0);
+    NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
+    UNION_TYPE packed1;  packed1.d = VEC_SET1_VAL(1.0);
+    UNION_TYPE packed3;  packed3.d = VEC_SET1_VAL(3.0);
+
+    distm = distm1D[i];
+    _1_distm = VEC_SUB(packed1.d, distm);
+
+    distm = VEC_DIV(distm, packed3.d);
+
+    /* initialize M_t_2, M_t_1, X_t_2, X_t_1, Y_t_2, Y_t_1 */
+    M_t_2.d = VEC_SET1_VAL(zero);
+    X_t_2.d = VEC_SET1_VAL(zero);
+
+    if (i==0) {
+        M_t_1.d = VEC_SET1_VAL(zero);
+        X_t_1.d = VEC_SET1_VAL(zero);
+        Y_t_2.d = VEC_SET_LSE(init_Y);
+        Y_t_1.d = VEC_SET1_VAL(zero);
+    }
+    else {
+        X_t_1.d = VEC_SET_LSE(shiftOutX[AVX_LENGTH]);
+        M_t_1.d = VEC_SET_LSE(shiftOutM[AVX_LENGTH]);
+        Y_t_2.d = VEC_SET1_VAL(zero);
+        Y_t_1.d = VEC_SET1_VAL(zero);
+    }
+    M_t_1_y = M_t_1;
+}
+
+inline SIMD_TYPE CONCAT(CONCAT(computeDISTM,SIMD_ENGINE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, SIMD_TYPE rs, UNION_TYPE rsN, SIMD_TYPE N_packed256,
+        SIMD_TYPE distm, SIMD_TYPE _1_distm)
+{
+    UNION_TYPE hapN, rshap;
+    SIMD_TYPE  cond;
+    IF_32 shiftInHap;
+
+    int *hap_ptr = tc->ihap;
+
+    shiftInHap.i = (d<COLS) ? hap_ptr[d-1] : hap_ptr[COLS-1];
+
+    /* shift hap */
+    SHIFT_HAP(hap, shiftInHap);
+    SIMD_TYPE hapF = VEC_CVT_128_256(hap);
+
+    rshap.d = VEC_CMP_EQ(rs, hapF);
+    hapN.d  = VEC_CMP_EQ(N_packed256, hapF);
+
+    /* OR rsN, rshap, hapN */
+    cond =  VEC_OR(rsN.d, rshap.d);
+    cond =  VEC_OR(cond, hapN.d);
+
+    /* distm1D = (cond) ? 1-distm1D : distm1D;  */
+    SIMD_TYPE distmSel = VEC_BLENDV(distm, _1_distm, cond);
+
+    return distmSel;
+}
+
+
+inline void CONCAT(CONCAT(computeMXY,SIMD_ENGINE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y,
+        UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1,
+        SIMD_TYPE pMM, SIMD_TYPE pGAPM, SIMD_TYPE pMX, SIMD_TYPE pXX, SIMD_TYPE pMY, SIMD_TYPE pYY, SIMD_TYPE distmSel)
+{
+    /* Compute M_t <= distm * (p_MM*M_t_2 + p_GAPM*X_t_2 + p_GAPM*Y_t_2) */
+    M_t.d = VEC_MUL(VEC_ADD(VEC_ADD(VEC_MUL(M_t_2.d, pMM), VEC_MUL(X_t_2.d, pGAPM)), VEC_MUL(Y_t_2.d, pGAPM)), distmSel);
+
+    M_t_y = M_t;
+
+    /* Compute X_t */
+    X_t.d = VEC_ADD(VEC_MUL(M_t_1.d, pMX) , VEC_MUL(X_t_1.d, pXX));
+
+    /* Compute Y_t */
+    Y_t.d = VEC_ADD(VEC_MUL(M_t_1_y.d, pMY) , VEC_MUL(Y_t_1.d, pYY));
+}
+
+template<class NUMBER> NUMBER CONCAT(CONCAT(compute_full_prob_,SIMD_ENGINE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL)
+{
+    int ROWS = tc->rslen + 1;
+    int COLS = tc->haplen + 1;
+    int MAVX_COUNT = (ROWS+AVX_LENGTH-1)/AVX_LENGTH;
+
+    SIMD_TYPE p_MM   [MAVX_COUNT], p_GAPM [MAVX_COUNT], p_MX   [MAVX_COUNT];
+    SIMD_TYPE p_XX   [MAVX_COUNT], p_MY   [MAVX_COUNT], p_YY   [MAVX_COUNT];
+    SIMD_TYPE distm1D[MAVX_COUNT];
+    NUMBER shiftOutM[ROWS+COLS+AVX_LENGTH], shiftOutX[ROWS+COLS+AVX_LENGTH], shiftOutY[ROWS+COLS+AVX_LENGTH];
+    UNION_TYPE  M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y;
+    SIMD_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY;
+
+    struct timeval start, end;
+    NUMBER result_avx2;
+    Context<NUMBER> ctx;
+    UNION_TYPE rs , rsN;
+    HAP_TYPE hap;
+    SIMD_TYPE distmSel, distmChosen ;
+    SIMD_TYPE distm, _1_distm;
+
+    int r, c;
+    NUMBER zero = ctx._(0.0);
+    UNION_TYPE packed1;  packed1.d = VEC_SET1_VAL(1.0);
+    SIMD_TYPE N_packed256 = VEC_POPCVT_CHAR('N');
+    NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
+    int remainingRows = (ROWS-1) % AVX_LENGTH;
+    int stripe_cnt = ((ROWS-1) / AVX_LENGTH) + (remainingRows!=0);
+
+    const int maskBitCnt = MAIN_TYPE_SIZE ;
+    const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function
+
+    MASK_TYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ;
+    CONCAT(CONCAT(precompute_masks_,SIMD_ENGINE), PRECISION)(*tc, COLS, numMaskVecs, maskArr) ;
+
+    char rsArr[AVX_LENGTH] ;
+    MASK_TYPE lastMaskShiftOut[AVX_LENGTH] ;
+    CONCAT(CONCAT(initializeVectors,SIMD_ENGINE), PRECISION)<NUMBER>(ROWS, COLS, shiftOutM, shiftOutX, shiftOutY,
+            ctx, tc, p_MM, p_GAPM, p_MX, p_XX, p_MY, p_YY, distm1D);
+
+    for (int i=0;i<stripe_cnt-1;i++)
+    {
+        //STRIPE_INITIALIZATION
+        CONCAT(CONCAT(stripeINITIALIZATION,SIMD_ENGINE), PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM ,
+                p_MX, p_XX , p_MY, p_YY, M_t_2, X_t_2, M_t_1, X_t_1, Y_t_2, Y_t_1, M_t_1_y, shiftOutX, shiftOutM);
+        CONCAT(CONCAT(init_masks_for_row_,SIMD_ENGINE), PRECISION)(*tc, rsArr, lastMaskShiftOut, i*AVX_LENGTH+1, AVX_LENGTH) ;
+        // Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
+
+        BITMASK_VEC bitMaskVec ;
+
+        for (int begin_d=1;begin_d<COLS+AVX_LENGTH;begin_d+=MAIN_TYPE_SIZE)
        {
-                shiftOutM[s] = zero;
-                shiftOutX[s] = zero;
-                shiftOutY[s] = init_Y;
+            int numMaskBitsToProcess = std::min(MAIN_TYPE_SIZE, COLS+AVX_LENGTH-begin_d) ;
+            CONCAT(CONCAT(update_masks_for_cols_,SIMD_ENGINE), PRECISION)((begin_d-1)/MAIN_TYPE_SIZE, bitMaskVec, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;
+
+            for (int mbi=0; mbi < numMaskBitsToProcess; ++mbi) {
+                CONCAT(CONCAT(computeDistVec,SIMD_ENGINE), PRECISION) (bitMaskVec, distm, _1_distm, distmChosen) ;
+                int ShiftIdx = begin_d + mbi + AVX_LENGTH;
+
+                CONCAT(CONCAT(computeMXY,SIMD_ENGINE), PRECISION)(M_t, X_t, Y_t, M_t_y, M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, M_t_1_y, Y_t_1,
+                        pMM, pGAPM, pMX, pXX, pMY, pYY, distmChosen);
+
+                CONCAT(CONCAT(_vector_shift,SIMD_ENGINE), PRECISION)(M_t, shiftOutM[ShiftIdx], shiftOutM[begin_d+mbi]);
+
+                CONCAT(CONCAT(_vector_shift,SIMD_ENGINE), PRECISION)(X_t, shiftOutX[ShiftIdx], shiftOutX[begin_d+mbi]);
+
+                CONCAT(CONCAT(_vector_shift,SIMD_ENGINE), PRECISION)(Y_t_1, shiftOutY[ShiftIdx], shiftOutY[begin_d+mbi]);
+
+                M_t_2 = M_t_1; M_t_1 = M_t; X_t_2 = X_t_1; X_t_1 = X_t;
+                Y_t_2 = Y_t_1; Y_t_1 = Y_t; M_t_1_y = M_t_y;
+            }
        }
+    }

-        NUMBER *ptr_p_MM = (NUMBER *)p_MM;
-        NUMBER *ptr_p_XX = (NUMBER *)p_XX;
-        NUMBER *ptr_p_YY = (NUMBER *)p_YY;
-        NUMBER *ptr_p_MX = (NUMBER *)p_MX;
-        NUMBER *ptr_p_MY = (NUMBER *)p_MY;
-        NUMBER *ptr_p_GAPM = (NUMBER *)p_GAPM;
+    int i = stripe_cnt-1;
+    {
+        //STRIPE_INITIALIZATION
+        CONCAT(CONCAT(stripeINITIALIZATION,SIMD_ENGINE), PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM ,
+                p_MX, p_XX , p_MY, p_YY, M_t_2, X_t_2, M_t_1, X_t_1, Y_t_2, Y_t_1, M_t_1_y, shiftOutX, shiftOutM);

-        *ptr_p_MM = ctx._(0.0);
-        *ptr_p_XX = ctx._(0.0);
-        *ptr_p_YY = ctx._(0.0);
-        *ptr_p_MX = ctx._(0.0);
-        *ptr_p_MY = ctx._(0.0);
-        *ptr_p_GAPM = ctx._(0.0);
+        if (remainingRows==0) remainingRows=AVX_LENGTH;
+        CONCAT(CONCAT(init_masks_for_row_,SIMD_ENGINE), PRECISION)(*tc, rsArr, lastMaskShiftOut, i*AVX_LENGTH+1, remainingRows) ;

+        SIMD_TYPE sumM, sumX;
+        sumM = VEC_SET1_VAL(zero);
+        sumX = VEC_SET1_VAL(zero);

-        for (int r = 1; r < ROWS; r++)
+        // Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
+        BITMASK_VEC bitMaskVec ;
+
+        for (int begin_d=1;begin_d<COLS+remainingRows-1;begin_d+=MAIN_TYPE_SIZE)
        {
-                int _i = tc->i[r-1] & 127;
-                int _d = tc->d[r-1] & 127;
-                int _c = tc->c[r-1] & 127;
+            int numMaskBitsToProcess = std::min(MAIN_TYPE_SIZE, COLS+remainingRows-1-begin_d) ;
+            CONCAT(CONCAT(update_masks_for_cols_,SIMD_ENGINE),PRECISION)((begin_d-1)/MAIN_TYPE_SIZE, bitMaskVec, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;

-                *(ptr_p_MM+r-1) = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127];
-                *(ptr_p_GAPM+r-1) = ctx._(1.0) - ctx.ph2pr[_c];
-                *(ptr_p_MX+r-1) = ctx.ph2pr[_i];
-                *(ptr_p_XX+r-1) = ctx.ph2pr[_c];
-		#ifdef KARTHIK
-	                *(ptr_p_MY+r-1) = ctx.ph2pr[_d];
-        	        *(ptr_p_YY+r-1) = ctx.ph2pr[_c];			
-		#else
- 	               *(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
-        	       *(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
-		#endif
-	
+            for (int mbi=0; mbi < numMaskBitsToProcess; ++mbi) {
+
+                CONCAT(CONCAT(computeDistVec,SIMD_ENGINE), PRECISION) (bitMaskVec, distm, _1_distm, distmChosen) ;
+                int ShiftIdx = begin_d + mbi +AVX_LENGTH;
+
+                CONCAT(CONCAT(computeMXY,SIMD_ENGINE), PRECISION)(M_t, X_t, Y_t, M_t_y, M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, M_t_1_y, Y_t_1,
+                        pMM, pGAPM, pMX, pXX, pMY, pYY, distmChosen);
+
+                sumM  = VEC_ADD(sumM, M_t.d);
+                CONCAT(CONCAT(_vector_shift_last,SIMD_ENGINE), PRECISION)(M_t, shiftOutM[ShiftIdx]);
+
+                sumX  = VEC_ADD(sumX, X_t.d);
+                CONCAT(CONCAT(_vector_shift_last,SIMD_ENGINE), PRECISION)(X_t, shiftOutX[ShiftIdx]);
+
+                CONCAT(CONCAT(_vector_shift_last,SIMD_ENGINE), PRECISION)(Y_t_1, shiftOutY[ShiftIdx]);
+
+                M_t_2 = M_t_1; M_t_1 = M_t; X_t_2 = X_t_1; X_t_1 = X_t;
+                Y_t_2 = Y_t_1; Y_t_1 = Y_t; M_t_1_y = M_t_y;
+
+            }
        }
-
-        NUMBER *ptr_distm1D = (NUMBER *)distm1D;
-        for (int r = 1; r < ROWS; r++)
-        {
-                int _q = tc->q[r-1] & 127;
-                ptr_distm1D[r-1] = ctx.ph2pr[_q];
-        }
-}
-
-
-template<class NUMBER> inline void GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION, SIMD_TYPE), PRECISION)(
-			int stripIdx, Context<NUMBER> ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY,
-			_256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm,  _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM , 
-			_256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, 
-			UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM)	 
-{
-	int i = stripIdx;
-        pGAPM = p_GAPM[i];                                               
-        pMM   = p_MM[i];                                                 
-        pMX   = p_MX[i];                                                 
-        pXX   = p_XX[i];                                                 
-        pMY   = p_MY[i];                                                 
-        pYY   = p_YY[i];               
-
-	NUMBER zero = ctx._(0.0);        
-	NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
-	UNION_TYPE packed1;  packed1.d = VEC_SET1_VAL(1.0);
-	UNION_TYPE packed3;  packed3.d = VEC_SET1_VAL(3.0);	
-        /* compare rs and N */                                           
-#ifndef MUSTAFA
-        rs = VEC_LDPOPCVT_CHAR((tc->irs+i*AVX_LENGTH));        
-        rsN.d = VEC_CMP_EQ(N_packed256, rs);                             
-#endif                                                                         
-        distm = distm1D[i];                                    
-        _1_distm = VEC_SUB(packed1.d, distm);                  
-
-	#ifdef KARTHIK
-		distm = VEC_DIV(distm, packed3.d);
-	#endif                                                                         
-        /* initialize M_t_2, M_t_1, X_t_2, X_t_1, Y_t_2, Y_t_1 */        
-        M_t_2.d = VEC_SET1_VAL(zero);                                    
-        X_t_2.d = VEC_SET1_VAL(zero);                                    
-                                                                         
-        if (i==0) {                                                      
-                M_t_1.d = VEC_SET1_VAL(zero);                            
-                X_t_1.d = VEC_SET1_VAL(zero);                            
-                Y_t_2.d = VEC_SET_LSE(init_Y);                           
-                Y_t_1.d = VEC_SET1_VAL(zero);                            
-        }                                                                
-        else {                                                           
-                X_t_1.d = VEC_SET_LSE(shiftOutX[AVX_LENGTH]);            
-                M_t_1.d = VEC_SET_LSE(shiftOutM[AVX_LENGTH]);            
-                Y_t_2.d = VEC_SET1_VAL(zero);                            
-                Y_t_1.d = VEC_SET1_VAL(zero);                            
-        }                                                                
-        M_t_1_y = M_t_1;
-}        
-
-
-
-inline _256_TYPE GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM, SIMD_TYPE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256, 
-						_256_TYPE distm, _256_TYPE _1_distm)
-{
-        UNION_TYPE hapN, rshap;
-        _256_TYPE  cond;
-	IF_32 shiftInHap;
-
-	int *hap_ptr = tc->ihap;
-
-        shiftInHap.i = (d<COLS) ? hap_ptr[d-1] : hap_ptr[COLS-1];              
-
-        /* shift hap */                                                        
-        SHIFT_HAP(hap, shiftInHap);                                            
-        _256_TYPE hapF = VEC_CVT_128_256(hap);                                 
-
-        rshap.d = VEC_CMP_EQ(rs, hapF);                                        
-        hapN.d  = VEC_CMP_EQ(N_packed256, hapF);                               
-
-        /* OR rsN, rshap, hapN */                                              
-        cond =  VEC_OR(rsN.d, rshap.d);                                        
-        cond =  VEC_OR(cond, hapN.d);                                          
-        
-        /* distm1D = (cond) ? 1-distm1D : distm1D;  */                         
-        _256_TYPE distmSel = VEC_BLENDV(distm, _1_distm, cond);
-        
-        return distmSel;
-}        
-
-
-inline void GEN_INTRINSIC(GEN_INTRINSIC(computeMXY, SIMD_TYPE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, 
-			UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1, 
-			_256_TYPE pMM, _256_TYPE pGAPM, _256_TYPE pMX, _256_TYPE pXX, _256_TYPE pMY, _256_TYPE pYY, _256_TYPE distmSel)
-{
-	/* Compute M_t <= distm * (p_MM*M_t_2 + p_GAPM*X_t_2 + p_GAPM*Y_t_2) */
-	M_t.d = VEC_MUL(VEC_ADD(VEC_ADD(VEC_MUL(M_t_2.d, pMM), VEC_MUL(X_t_2.d, pGAPM)), VEC_MUL(Y_t_2.d, pGAPM)), distmSel);
-
-#ifdef DEBUG
-	double *temp1 = (double *)(&pGAPM);
-	double *temp2 = (double *)(&pMM);
-	double *temp3 = (double *)(&distmSel);
-	printf("%f\n%f\n%f\n%f\n%f\n%f\n", temp1[0], temp1[1], temp2[0], temp2[1], temp3[0], temp3[1]);
-	//printf("%f\n%f\n%f\n%f\n", X_t_2.f[0], X_t_2.f[1], Y_t_2.f[0], Y_t_2.f[1]);
-	printf("%f\n%f\n----------------------------------------------------------------------------\n", M_t.f[0], M_t.f[1]);
-#endif
-	M_t_y = M_t;
-
-	/* Compute X_t */
-	X_t.d = VEC_ADD(VEC_MUL(M_t_1.d, pMX) , VEC_MUL(X_t_1.d, pXX));
-
-	/* Compute Y_t */
-	Y_t.d = VEC_ADD(VEC_MUL(M_t_1_y.d, pMY) , VEC_MUL(Y_t_1.d, pYY));
-}
-
-template<class NUMBER> NUMBER GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_,SIMD_TYPE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL)
-{
-        _256_TYPE p_MM   [MAVX_COUNT], p_GAPM [MAVX_COUNT], p_MX   [MAVX_COUNT];
-	_256_TYPE p_XX   [MAVX_COUNT], p_MY   [MAVX_COUNT], p_YY   [MAVX_COUNT];
-	_256_TYPE distm1D[MAVX_COUNT];
-	NUMBER shiftOutM[MROWS+MCOLS+AVX_LENGTH], shiftOutX[MROWS+MCOLS+AVX_LENGTH], shiftOutY[MROWS+MCOLS+AVX_LENGTH];
-	UNION_TYPE  M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y;
-	_256_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY;
-
-        struct timeval start, end;
-        NUMBER result_avx2;
-        Context<NUMBER> ctx;
-        UNION_TYPE rs , rsN;
-        HAP_TYPE hap;
-        _256_TYPE distmSel, distmChosen ;
-	_256_TYPE distm, _1_distm;
-
-        int r, c;
-        int ROWS = tc->rslen + 1;
-        int COLS = tc->haplen + 1;
-        int AVX_COUNT = (ROWS+7)/8;
-        NUMBER zero = ctx._(0.0);
-        UNION_TYPE packed1;  packed1.d = VEC_SET1_VAL(1.0);
-	_256_TYPE N_packed256 = VEC_POPCVT_CHAR('N');
-        NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
-        int remainingRows = (ROWS-1) % AVX_LENGTH;
-        int strip_cnt = ((ROWS-1) / AVX_LENGTH) + (remainingRows!=0);
-
-#ifdef MUSTAFA
-	const int maskBitCnt = MAIN_TYPE_SIZE ;
-	const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function
-
-	MASK_TYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ;
-	GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(*tc, COLS, numMaskVecs, maskArr) ;
-
-	char rsArr[AVX_LENGTH] ;
-	MASK_TYPE lastMaskShiftOut[AVX_LENGTH] ;
-#endif
-	GEN_INTRINSIC(GEN_INTRINSIC(initializeVectors,SIMD_TYPE), PRECISION)<NUMBER>(ROWS, COLS, shiftOutM, shiftOutX, shiftOutY, 
-							    ctx, tc, p_MM, p_GAPM, p_MX, p_XX, p_MY, p_YY, distm1D);
-
-	//for (int __ii=0; __ii < 10; ++__ii)
-        for (int i=0;i<strip_cnt-1;i++)
-        {
-		//STRIP_INITIALIZATION
-		GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION,SIMD_TYPE), PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM ,
-							      p_MX, p_XX , p_MY, p_YY, M_t_2, X_t_2, M_t_1, X_t_1, Y_t_2, Y_t_1, M_t_1_y, shiftOutX, shiftOutM);
-#ifdef MUSTAFA
-		GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(*tc, rsArr, lastMaskShiftOut, i*AVX_LENGTH+1, AVX_LENGTH) ;
-#endif
-		// Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
-
-		BITMASK_VEC bitMaskVec ;
-
-                for (int begin_d=1;begin_d<COLS+AVX_LENGTH;begin_d+=MAIN_TYPE_SIZE)
-                {
-		  int numMaskBitsToProcess = std::min(MAIN_TYPE_SIZE, COLS+AVX_LENGTH-begin_d) ;
-#ifdef MUSTAFA
-		  GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_,SIMD_TYPE), PRECISION)((begin_d-1)/MAIN_TYPE_SIZE, bitMaskVec, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;		  
-#endif
-
-		  //		        if (d % MAIN_TYPE_SIZE == 1)
-		  
-		  for (int mbi=0; mbi < numMaskBitsToProcess; ++mbi) {
-#ifdef MUSTAFA
-		    GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec,SIMD_TYPE), PRECISION) (bitMaskVec, distm, _1_distm, distmChosen) ;
-#else
-		    distmChosen = GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM,SIMD_TYPE), PRECISION)(begin_d+mbi, COLS, tc, hap, rs.d, rsN, N_packed256, distm, _1_distm);
-#endif
-		    int ShiftIdx = begin_d + mbi + AVX_LENGTH;
-
-		    GEN_INTRINSIC(GEN_INTRINSIC(computeMXY,SIMD_TYPE), PRECISION)(M_t, X_t, Y_t, M_t_y, M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, M_t_1_y, Y_t_1, 
-								pMM, pGAPM, pMX, pXX, pMY, pYY, distmChosen);
-
-		    GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift, SIMD_TYPE), PRECISION)(M_t, shiftOutM[ShiftIdx], shiftOutM[begin_d+mbi]);
-		    
-		    GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift, SIMD_TYPE), PRECISION)(X_t, shiftOutX[ShiftIdx], shiftOutX[begin_d+mbi]);
-
-		    GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift, SIMD_TYPE), PRECISION)(Y_t_1, shiftOutY[ShiftIdx], shiftOutY[begin_d+mbi]);
-
-		    M_t_2 = M_t_1; M_t_1 = M_t; X_t_2 = X_t_1; X_t_1 = X_t;
-		    Y_t_2 = Y_t_1; Y_t_1 = Y_t; M_t_1_y = M_t_y;
-		  }
-		}
-	}
-	
-        int i = strip_cnt-1;
-        {
-		//STRIP_INITIALIZATION
-		GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION,SIMD_TYPE), PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM ,
-                        p_MX, p_XX , p_MY, p_YY, M_t_2, X_t_2, M_t_1, X_t_1, Y_t_2, Y_t_1, M_t_1_y, shiftOutX, shiftOutM);
-
-                if (remainingRows==0) remainingRows=AVX_LENGTH;
-#ifdef MUSTAFA
-		GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(*tc, rsArr, lastMaskShiftOut, i*AVX_LENGTH+1, remainingRows) ;
-#endif
-                _256_TYPE sumM, sumX;
-                sumM = VEC_SET1_VAL(zero);
-                sumX = VEC_SET1_VAL(zero);
-
-		// Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
-		BITMASK_VEC bitMaskVec ;
-
-                for (int begin_d=1;begin_d<COLS+remainingRows-1;begin_d+=MAIN_TYPE_SIZE)
-                {
-
-		  int numMaskBitsToProcess = std::min(MAIN_TYPE_SIZE, COLS+remainingRows-1-begin_d) ;
-#ifdef MUSTAFA
-		  GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_, SIMD_TYPE),PRECISION)((begin_d-1)/MAIN_TYPE_SIZE, bitMaskVec, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;
-#endif
-
-		  for (int mbi=0; mbi < numMaskBitsToProcess; ++mbi) {
-
-#ifdef MUSTAFA
-		    GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec, SIMD_TYPE), PRECISION) (bitMaskVec, distm, _1_distm, distmChosen) ;
-#else
-		    distmChosen = GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM,SIMD_TYPE), PRECISION)(begin_d+mbi, COLS, tc, hap, rs.d, rsN, N_packed256, distm, _1_distm);
-#endif
-		    int ShiftIdx = begin_d + mbi +AVX_LENGTH;
-
-		    GEN_INTRINSIC(GEN_INTRINSIC(computeMXY, SIMD_TYPE), PRECISION)(M_t, X_t, Y_t, M_t_y, M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, M_t_1_y, Y_t_1,
-										   pMM, pGAPM, pMX, pXX, pMY, pYY, distmChosen);
-
-		    sumM  = VEC_ADD(sumM, M_t.d);
-		    GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION)(M_t, shiftOutM[ShiftIdx]);
-
-		    sumX  = VEC_ADD(sumX, X_t.d);
-		    GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION)(X_t, shiftOutX[ShiftIdx]);
-
-		    GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION)(Y_t_1, shiftOutY[ShiftIdx]);
-
-		    M_t_2 = M_t_1; M_t_1 = M_t; X_t_2 = X_t_1; X_t_1 = X_t;
-		    Y_t_2 = Y_t_1; Y_t_1 = Y_t; M_t_1_y = M_t_y;
-
-		  }
-		}
-                UNION_TYPE sumMX;
-                sumMX.d = VEC_ADD(sumM, sumX);
-                result_avx2 = sumMX.f[remainingRows-1];		
-	}
-	//printf("result_avx2: %f\n", result_avx2);
-	return result_avx2;
+        UNION_TYPE sumMX;
+        sumMX.d = VEC_ADD(sumM, sumX);
+        result_avx2 = sumMX.f[remainingRows-1];
+    }
+    return result_avx2;
 }

 #endif
--- a/PairHMM_JNI/pairhmm-template-main.cc
+++ b/PairHMM_JNI/pairhmm-template-main.cc
@ -1,10 +1,9 @@
 #include "headers.h"
-
 #include "template.h"
 #include "vector_defs.h"

-#define SIMD_TYPE avx
-#define SIMD_TYPE_AVX
+#define SIMD_ENGINE avx
+#define SIMD_ENGINE_AVX


 #define BATCH_SIZE  10000
@ -13,127 +12,77 @@
 double getCurrClk();
 int thread_level_parallelism_enabled = false ;

-void print128b_F(__m128 x)
-{	
-	float *p = (float *)(&x);
-	for (int i=3;i>=0;i--)
-		printf("%f ", *(p+i));
-	printf("\n");
-}
-
-void print128b_D(__m128d x)
-{
-        double *p = (double *)(&x);
-        for (int i=1;i>=0;i--)
-                printf("%f ", *(p+i));
-        printf("\n");
-}

 int main()
 {
-/*
-	IF_128f x;
-	x.f = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
-	IF_32 shiftIn, shiftOut;
-	shiftIn.f = 5.0f;
-	print128b_F(x.f);
-	GEN_INTRINSIC(_vector_shift, s)(x, shiftIn, shiftOut);
-	print128b_F(x.f);
+    testcase* tc = new testcase[BATCH_SIZE];
+    float result[BATCH_SIZE], result_avxf;
+    double result_avxd;
+    double lastClk = 0.0 ;
+    double aggregateTimeRead = 0.0;
+    double aggregateTimeCompute = 0.0;
+    double aggregateTimeWrite = 0.0;

-        IF_128d y;
-        y.f = _mm_set_pd(10.0, 11.0);
-        IF_64 shiftInd, shiftOutd;
-        shiftInd.f = 12.0;
-        print128b_D(y.f);
-        GEN_INTRINSIC(_vector_shift, d)(y, shiftInd, shiftOutd);
-        print128b_D(y.f);
+    // Need to call it once to initialize the static array
+    ConvertChar::init() ;

-	exit(0);
-*/
+    //      char* ompEnvVar = getenv("OMP_NUM_THREADS") ;
+    //      if (ompEnvVar != NULL && ompEnvVar != "" && ompEnvVar != "1" ) {
+    //        thread_level_parallelism_enabled = true ;
+    //      }

-        testcase* tc = new testcase[BATCH_SIZE];
-        float result[BATCH_SIZE], result_avxf;
-        double result_avxd;
-        //struct timeval start, end;
-	double lastClk = 0.0 ;
-        double aggregateTimeRead = 0.0;
-        double aggregateTimeCompute = 0.0;
-        double aggregateTimeWrite = 0.0;
+    bool noMoreData = false;
+    int count =0;
+    while (!noMoreData)
+    {
+        int read_count = BATCH_SIZE;

-	// Need to call it once to initialize the static array
-	ConvertChar::init() ;
-	
+        lastClk = getCurrClk() ;
+        for (int b=0;b<BATCH_SIZE;b++)
+            if (read_testcase(&tc[b])==-1)
+            {
+                read_count = b;
+                noMoreData = true;
+                break;
+            }
+        aggregateTimeRead += (getCurrClk() - lastClk) ;
+        lastClk = getCurrClk() ;

-//	char* ompEnvVar = getenv("OMP_NUM_THREADS") ;
-//	if (ompEnvVar != NULL && ompEnvVar != "" && ompEnvVar != "1" ) {
-//	  thread_level_parallelism_enabled = true ;
-//	}
-
-        bool noMoreData = false;
-        int count =0;
-        while (!noMoreData)
+        //#pragma omp parallel for schedule(dynamic) if(thread_level_parallelism_enabled)
+        for (int b=0;b<read_count;b++)
        {
-                int read_count = BATCH_SIZE;
- 
-		lastClk = getCurrClk() ;
-                for (int b=0;b<BATCH_SIZE;b++)
-                        if (read_testcase(&tc[b])==-1)
-                        {
-                                read_count = b;
-                                noMoreData = true;
-                                break;
-                        }
-                //gettimeofday(&end, NULL);
-                aggregateTimeRead += (getCurrClk() - lastClk) ;
-		//((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
+            result_avxf = CONCAT(CONCAT(compute_full_prob_,SIMD_ENGINE), s)<float>(&tc[b]);

-                //gettimeofday(&start, NULL);
-		lastClk = getCurrClk() ;
-
-//#pragma omp parallel for schedule(dynamic) if(thread_level_parallelism_enabled)
-                for (int b=0;b<read_count;b++)
-                {
-                        result_avxf = GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_, SIMD_TYPE), s)<float>(&tc[b]);
-
-                        #ifdef RUN_HYBRID
-                                #define MIN_ACCEPTED 1e-28f
-                                if (result_avxf < MIN_ACCEPTED) {
-				      //printf("**************** RUNNING DOUBLE ******************\n");
-                                      count++;
-                                      result_avxd = GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_, SIMD_TYPE), d)<double>(&tc[b]);
-                                      result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f));
-                                }
-                                else
-                                      result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
-                        #endif
-
-                        #ifndef RUN_HYBRID
-                                result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
-                        #endif
-
-                }
-                //gettimeofday(&end, NULL);
-                aggregateTimeCompute += (getCurrClk() - lastClk) ;
-		//((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
-
-                //gettimeofday(&start, NULL);
-		lastClk = getCurrClk() ;
-		//for (int b=0;b<read_count;b++)
-		//printf("%E\n", result[b]);
-                //gettimeofday(&end, NULL);
-                aggregateTimeWrite += (getCurrClk() - lastClk) ;
-		//((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
+#ifdef RUN_HYBRID
+#define MIN_ACCEPTED 1e-28f
+            if (result_avxf < MIN_ACCEPTED) {
+                count++;
+                result_avxd = CONCAT(CONCAT(compute_full_prob_,SIMD_ENGINE), d)<double>(&tc[b]);
+                result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f));
+            }
+            else
+                result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
+#endif

+#ifndef RUN_HYBRID
+            result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
+#endif
        }
+        aggregateTimeCompute += (getCurrClk() - lastClk) ;
+        lastClk = getCurrClk() ;
+        for (int b=0;b<read_count;b++)
+            printf("%E\n", result[b]);
+        aggregateTimeWrite += (getCurrClk() - lastClk) ;
+    }

-	delete tc;
-        printf("AVX Read Time: %.2f\n", aggregateTimeRead);
-        printf("AVX Compute Time: %.2f\n", aggregateTimeCompute);
-        printf("AVX Write Time: %.2f\n", aggregateTimeWrite);
-        printf("AVX Total Time: %.2f\n", aggregateTimeRead + aggregateTimeCompute + aggregateTimeWrite);
-        printf("# Double called: %d\n", count);
+    delete tc;
+    printf("AVX Read Time: %.2f\n", aggregateTimeRead);
+    printf("AVX Compute Time: %.2f\n", aggregateTimeCompute);
+    printf("AVX Write Time: %.2f\n", aggregateTimeWrite);
+    printf("AVX Total Time: %.2f\n", aggregateTimeRead + aggregateTimeCompute + aggregateTimeWrite);
+    printf("# Double called: %d\n", count);

-        return 0;
+    return 0;
 }


--- a/PairHMM_JNI/shift_template.c
+++ b/PairHMM_JNI/shift_template.c
@ -1,88 +1,86 @@
-#ifdef PRECISION 
+#ifdef PRECISION

-#ifdef SIMD_TYPE_AVX
+#ifdef SIMD_ENGINE_AVX

-inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift,SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut)
+inline void CONCAT(CONCAT(_vector_shift,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut)
 {
-
-        IF_128 xlow , xhigh;
-        /* cast x to xlow */
-        xlow.f = VEC_CAST_256_128(x.d);
-        /* extract x,1 to xhigh */
-        xhigh.f = VEC_EXTRACT_128(x.d, 1);
-        /* extract xlow[3] */
-        IF_128 shiftOutL128;
-        shiftOutL128.i = _mm_srli_si128(xlow.i, SHIFT_CONST1);
-        /* extract xhigh[3] */
-        IF_MAIN_TYPE shiftOutH;
-        shiftOutH.i = VEC_EXTRACT_UNIT(xhigh.i, SHIFT_CONST2);
-        shiftOut = shiftOutH.f;
-        /* shift xlow */
-        xlow.i = _mm_slli_si128 (xlow.i, SHIFT_CONST3);
-        /* shift xhigh */
-        xhigh.i = _mm_slli_si128 (xhigh.i, SHIFT_CONST3);
-        /*movss shiftIn to xlow[0] */
-        _128_TYPE shiftIn128 = VEC_SET1_VAL128(shiftIn);
-        xlow.f = VEC_MOVE(xlow.f , shiftIn128);
-        /*movss xlow[3] to xhigh[0] */
-        xhigh.f = VEC_MOVE(xhigh.f, shiftOutL128.f);
-        /* cast xlow to x */
-        x.d = VEC_CAST_128_256(xlow.f);
-        /* insert xhigh to x,1 */
-        x.d = VEC_INSERT_VAL(x.d, xhigh.f, 1);
+    IF_128 xlow , xhigh;
+    /* cast x to xlow */
+    xlow.f = VEC_CAST_256_128(x.d);
+    /* extract x,1 to xhigh */
+    xhigh.f = VEC_EXTRACT_128(x.d, 1);
+    /* extract xlow[3] */
+    IF_128 shiftOutL128;
+    shiftOutL128.i = _mm_srli_si128(xlow.i, SHIFT_CONST1);
+    /* extract xhigh[3] */
+    IF_MAIN_TYPE shiftOutH;
+    shiftOutH.i = VEC_EXTRACT_UNIT(xhigh.i, SHIFT_CONST2);
+    shiftOut = shiftOutH.f;
+    /* shift xlow */
+    xlow.i = _mm_slli_si128 (xlow.i, SHIFT_CONST3);
+    /* shift xhigh */
+    xhigh.i = _mm_slli_si128 (xhigh.i, SHIFT_CONST3);
+    /*movss shiftIn to xlow[0] */
+    _128_TYPE shiftIn128 = VEC_SET1_VAL128(shiftIn);
+    xlow.f = VEC_MOVE(xlow.f , shiftIn128);
+    /*movss xlow[3] to xhigh[0] */
+    xhigh.f = VEC_MOVE(xhigh.f, shiftOutL128.f);
+    /* cast xlow to x */
+    x.d = VEC_CAST_128_256(xlow.f);
+    /* insert xhigh to x,1 */
+    x.d = VEC_INSERT_VAL(x.d, xhigh.f, 1);
 }


-inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn)
+inline void CONCAT(CONCAT(_vector_shift_last,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn)
 {
-
-        IF_128 xlow , xhigh;
-        /* cast x to xlow */
-        xlow.f = VEC_CAST_256_128(x.d);
-        /* extract x,1 to xhigh */
-        xhigh.f = VEC_EXTRACT_128(x.d, 1);
-        /* extract xlow[3] */
-        IF_128 shiftOutL128;
-        shiftOutL128.i = _mm_srli_si128(xlow.i, SHIFT_CONST1);
-        /* shift xlow */
-        xlow.i = _mm_slli_si128 (xlow.i, SHIFT_CONST3);
-        /* shift xhigh */
-        xhigh.i = _mm_slli_si128 (xhigh.i, SHIFT_CONST3);
-        /*movss shiftIn to xlow[0] */
-        _128_TYPE shiftIn128 = VEC_SET1_VAL128(shiftIn);
-        xlow.f = VEC_MOVE(xlow.f , shiftIn128);
-        /*movss xlow[3] to xhigh[0] */
-        xhigh.f = VEC_MOVE(xhigh.f, shiftOutL128.f);
-        /* cast xlow to x */
-        x.d = VEC_CAST_128_256(xlow.f);
-        /* insert xhigh to x,1 */
-        x.d = VEC_INSERT_VAL(x.d, xhigh.f, 1);
+    IF_128 xlow , xhigh;
+    /* cast x to xlow */
+    xlow.f = VEC_CAST_256_128(x.d);
+    /* extract x,1 to xhigh */
+    xhigh.f = VEC_EXTRACT_128(x.d, 1);
+    /* extract xlow[3] */
+    IF_128 shiftOutL128;
+    shiftOutL128.i = _mm_srli_si128(xlow.i, SHIFT_CONST1);
+    /* shift xlow */
+    xlow.i = _mm_slli_si128 (xlow.i, SHIFT_CONST3);
+    /* shift xhigh */
+    xhigh.i = _mm_slli_si128 (xhigh.i, SHIFT_CONST3);
+    /*movss shiftIn to xlow[0] */
+    _128_TYPE shiftIn128 = VEC_SET1_VAL128(shiftIn);
+    xlow.f = VEC_MOVE(xlow.f , shiftIn128);
+    /*movss xlow[3] to xhigh[0] */
+    xhigh.f = VEC_MOVE(xhigh.f, shiftOutL128.f);
+    /* cast xlow to x */
+    x.d = VEC_CAST_128_256(xlow.f);
+    /* insert xhigh to x,1 */
+    x.d = VEC_INSERT_VAL(x.d, xhigh.f, 1);
 }

 #endif

-#ifdef SIMD_TYPE_SSE
+#ifdef SIMD_ENGINE_SSE

-inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift, SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut)
+inline void CONCAT(CONCAT(_vector_shift,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut)
 {
-	IF_MAIN_TYPE tempIn, tempOut; 
-	tempIn.f = shiftIn;
-        /* extratc H */
-        tempOut.i = VEC_EXTRACT_UNIT(x.i, SHIFT_CONST1);
-	shiftOut = tempOut.f;
-        /* shift     */
-        x.i = _mm_slli_si128(x.i, SHIFT_CONST2);
-        /* insert  L */
-        x.i = VEC_INSERT_UNIT(x.i , tempIn.i, SHIFT_CONST3);
+    IF_MAIN_TYPE tempIn, tempOut;
+    tempIn.f = shiftIn;
+    /* extratc H */
+    tempOut.i = VEC_EXTRACT_UNIT(x.i, SHIFT_CONST1);
+    shiftOut = tempOut.f;
+    /* shift     */
+    x.i = _mm_slli_si128(x.i, SHIFT_CONST2);
+    /* insert  L */
+    x.i = VEC_INSERT_UNIT(x.i , tempIn.i, SHIFT_CONST3);
 }

-inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn)
+inline void CONCAT(CONCAT(_vector_shift_last,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn)
 {
-	IF_MAIN_TYPE temp; temp.f = shiftIn;
-        /* shift     */
-        x.i = _mm_slli_si128(x.i, SHIFT_CONST2);
-        /* insert  L */
-        x.i = VEC_INSERT_UNIT(x.i , temp.i, SHIFT_CONST3);
+    IF_MAIN_TYPE temp; temp.f = shiftIn;
+    /* shift     */
+    x.i = _mm_slli_si128(x.i, SHIFT_CONST2);
+    /* insert  L */
+    x.i = VEC_INSERT_UNIT(x.i , temp.i, SHIFT_CONST3);
 }

 #endif
--- a/PairHMM_JNI/sse_function_instantiations.cc
+++ b/PairHMM_JNI/sse_function_instantiations.cc
@ -1,10 +1,10 @@
 #include "template.h"

-#undef SIMD_TYPE
-#undef SIMD_TYPE_AVX
+#undef SIMD_ENGINE
+#undef SIMD_ENGINE_AVX

-#define SIMD_TYPE sse
-#define SIMD_TYPE_SSE
+#define SIMD_ENGINE sse
+#define SIMD_ENGINE_SSE

 #include "define-sse-float.h"
 #include "shift_template.c"
--- a/PairHMM_JNI/template.h
+++ b/PairHMM_JNI/template.h
@ -10,11 +10,11 @@
 #define MY 4
 #define YY 5

-#define MROWS  500
-#define MCOLS  1000
+//#define MROWS  500
+//#define MCOLS  1000

 #define CAT(X,Y) X####Y
-#define GEN_INTRINSIC(X,Y) CAT(X,Y)
+#define CONCAT(X,Y) CAT(X,Y)

 #define ALIGNED __attribute__((aligned(32)))

--- a/PairHMM_JNI/utils.cc
+++ b/PairHMM_JNI/utils.cc
@ -96,7 +96,7 @@ int read_testcase(testcase *tc, FILE* ifp)

 	tc->haplen = strlen(tc->hap);
 	tc->rslen = strlen(tc->rs);
-	assert(tc->rslen < MROWS);
+	//assert(tc->rslen < MROWS);
 	tc->ihap = (int *) malloc(tc->haplen*sizeof(int));
 	tc->irs = (int *) malloc(tc->rslen*sizeof(int));

@ -216,7 +216,7 @@ int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat)
  //cout << "Lengths "<<tc->haplen <<" "<<tc->rslen<<"\n";
  memcpy(tc->rs, tokens[1].c_str(),tokens[1].size());
  assert(tokens.size() == 2 + 4*(tc->rslen));
-  assert(tc->rslen < MROWS);
+  //assert(tc->rslen < MROWS);
  for(unsigned j=0;j<tc->rslen;++j)
    tc->q[j] = (char)convToInt(tokens[2+0*tc->rslen+j]);
  for(unsigned j=0;j<tc->rslen;++j)
--- a/PairHMM_JNI/vector_defs.h
+++ b/PairHMM_JNI/vector_defs.h
@ -1,9 +1,9 @@
-#undef SIMD_TYPE
-#undef SIMD_TYPE_AVX
-#undef SIMD_TYPE_SSE
+#undef SIMD_ENGINE
+#undef SIMD_ENGINE_AVX
+#undef SIMD_ENGINE_SSE

-#define SIMD_TYPE avx
-#define SIMD_TYPE_AVX
+#define SIMD_ENGINE avx
+#define SIMD_ENGINE_AVX

 #include "define-float.h"
 #include "vector_function_prototypes.h"
@ -11,11 +11,11 @@
 #include "define-double.h"
 #include "vector_function_prototypes.h"

-#undef  SIMD_TYPE
-#undef  SIMD_TYPE_AVX
+#undef  SIMD_ENGINE
+#undef  SIMD_ENGINE_AVX

-#define SIMD_TYPE sse
-#define SIMD_TYPE_SSE
+#define SIMD_ENGINE sse
+#define SIMD_ENGINE_SSE


 #include "define-sse-float.h"
@ -24,7 +24,7 @@
 #include "define-sse-double.h"
 #include "vector_function_prototypes.h"

-#undef SIMD_TYPE
-#undef SIMD_TYPE_AVX
-#undef SIMD_TYPE_SSE
+#undef SIMD_ENGINE
+#undef SIMD_ENGINE_AVX
+#undef SIMD_ENGINE_SSE

--- a/PairHMM_JNI/vector_function_prototypes.h
+++ b/PairHMM_JNI/vector_function_prototypes.h
@ -1,19 +1,19 @@
-inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift,SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut);
-inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last,SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn);
-inline void GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]);
-inline void GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess);
-inline void GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_,SIMD_TYPE), PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt);
-inline void GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec,SIMD_TYPE), PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen);
-template<class NUMBER> inline void GEN_INTRINSIC(GEN_INTRINSIC(initializeVectors,SIMD_TYPE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context<NUMBER> ctx, testcase *tc,  _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D);
-template<class NUMBER> inline void GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION,SIMD_TYPE), PRECISION)(
-    int stripIdx, Context<NUMBER> ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY,
-    _256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm,  _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM , 
-    _256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, 
+inline void CONCAT(CONCAT(_vector_shift,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut);
+inline void CONCAT(CONCAT(_vector_shift_last,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn);
+inline void CONCAT(CONCAT(precompute_masks_,SIMD_ENGINE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]);
+inline void CONCAT(CONCAT(init_masks_for_row_,SIMD_ENGINE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess);
+inline void CONCAT(CONCAT(update_masks_for_cols_,SIMD_ENGINE), PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt);
+inline void CONCAT(CONCAT(computeDistVec,SIMD_ENGINE), PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, SIMD_TYPE& distm, SIMD_TYPE& _1_distm, SIMD_TYPE& distmChosen);
+template<class NUMBER> inline void CONCAT(CONCAT(initializeVectors,SIMD_ENGINE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context<NUMBER> ctx, testcase *tc,  SIMD_TYPE *p_MM, SIMD_TYPE *p_GAPM, SIMD_TYPE *p_MX, SIMD_TYPE *p_XX, SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, SIMD_TYPE *distm1D);
+template<class NUMBER> inline void CONCAT(CONCAT(stripINITIALIZATION,SIMD_ENGINE), PRECISION)(
+    int stripIdx, Context<NUMBER> ctx, testcase *tc, SIMD_TYPE &pGAPM, SIMD_TYPE &pMM, SIMD_TYPE &pMX, SIMD_TYPE &pXX, SIMD_TYPE &pMY, SIMD_TYPE &pYY,
+    SIMD_TYPE &rs, UNION_TYPE &rsN, SIMD_TYPE &distm, SIMD_TYPE &_1_distm,  SIMD_TYPE *distm1D, SIMD_TYPE N_packed256, SIMD_TYPE *p_MM , SIMD_TYPE *p_GAPM , 
+    SIMD_TYPE *p_MX, SIMD_TYPE *p_XX , SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, 
    UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM);
-inline _256_TYPE GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM,SIMD_TYPE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256, 
-    _256_TYPE distm, _256_TYPE _1_distm);
-inline void GEN_INTRINSIC(GEN_INTRINSIC(computeMXY,SIMD_TYPE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, 
+inline SIMD_TYPE CONCAT(CONCAT(computeDISTM,SIMD_ENGINE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, SIMD_TYPE rs, UNION_TYPE rsN, SIMD_TYPE N_packed256, 
+    SIMD_TYPE distm, SIMD_TYPE _1_distm);
+inline void CONCAT(CONCAT(computeMXY,SIMD_ENGINE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, 
    UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1, 
-    _256_TYPE pMM, _256_TYPE pGAPM, _256_TYPE pMX, _256_TYPE pXX, _256_TYPE pMY, _256_TYPE pYY, _256_TYPE distmSel);
-template<class NUMBER> NUMBER GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_,SIMD_TYPE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL);
+    SIMD_TYPE pMM, SIMD_TYPE pGAPM, SIMD_TYPE pMX, SIMD_TYPE pXX, SIMD_TYPE pMY, SIMD_TYPE pYY, SIMD_TYPE distmSel);
+template<class NUMBER> NUMBER CONCAT(CONCAT(compute_full_prob_,SIMD_ENGINE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL);

--- a/build.xml
+++ b/build.xml
@ -270,21 +270,21 @@
        <mkdir dir="${lib.dir}"/>
        <mkdir dir="${ivy.jar.dir}"/>

-	<!-- Comment out the following lines to build the GATK without a network connection, assuming you have all of the libraries cached already -->
+	 <!-- Comment out the following lines to build the GATK without a network connection, assuming you have all of the libraries cached already -->

-	<!--<get src="http://repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/${ivy.jar.file}"-->
-	  <!--dest="${ivy.jar.dir}/${ivy.jar.file}"-->
-	  <!--usetimestamp="true"/>-->
-        <taskdef resource="org/apache/ivy/ant/antlib.xml"
-                 uri="antlib:org.apache.ivy.ant"
-                 classpath="${ivy.jar.dir}/${ivy.jar.file}"/>
+	 <get src="http://repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/${ivy.jar.file}"
+	     dest="${ivy.jar.dir}/${ivy.jar.file}"
+	     usetimestamp="true"/>
+	 <taskdef resource="org/apache/ivy/ant/antlib.xml"
+	     uri="antlib:org.apache.ivy.ant"
+	     classpath="${ivy.jar.dir}/${ivy.jar.file}"/>

-	       <!--<get src="http://repo1.maven.org/maven2/org/apache/maven/maven-ant-tasks/${maven-ant-tasks.install.version}/${maven-ant-tasks.jar.file}"-->
-		 <!--dest="${ivy.jar.dir}/${maven-ant-tasks.jar.file}"-->
-		 <!--usetimestamp="true"/>-->
-        <taskdef resource="org/apache/maven/artifact/ant/antlib.xml"
-                 uri="antlib:antlib:org.apache.maven.artifact.ant"
-                 classpath="${ivy.jar.dir}/${maven-ant-tasks.jar.file}"/>
+	 <get src="http://repo1.maven.org/maven2/org/apache/maven/maven-ant-tasks/${maven-ant-tasks.install.version}/${maven-ant-tasks.jar.file}"
+	     dest="${ivy.jar.dir}/${maven-ant-tasks.jar.file}"
+	     usetimestamp="true"/>
+	 <taskdef resource="org/apache/maven/artifact/ant/antlib.xml"
+	     uri="antlib:antlib:org.apache.maven.artifact.ant"
+	     classpath="${ivy.jar.dir}/${maven-ant-tasks.jar.file}"/>

        <!-- End network lines -->