From 5fab96b7ee5181e0164ccef6a135952f58726dad Mon Sep 17 00:00:00 2001
From: Karthik Gururaj <karthik.gururaj@intel.com>
Date: Tue, 14 Jan 2014 17:26:55 -0800
Subject: [PATCH 1/6] First import of AVX-JNI to git

---
 .gitignore                                    |   6 +
 CHEATSHEET                                    |   2 +
 Makefile                                      |  43 ++
 README                                        |  41 ++
 define-double.h                               | 154 ++++++
 define-float.h                                | 155 ++++++
 hmm_mask.cc                                   | 484 ++++++++++++++++++
 ...e_sting_utils_pairhmm_JNILoglessPairHMM.cc | 146 ++++++
 ...te_sting_utils_pairhmm_JNILoglessPairHMM.h |  61 +++
 pairhmm-1-base.cc                             | 276 ++++++++++
 pairhmm-template-kernel.cc                    | 399 +++++++++++++++
 pairhmm-template-main.cc                      | 114 +++++
 shift_template.c                              |  68 +++
 template.h                                    | 226 ++++++++
 14 files changed, 2175 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 CHEATSHEET
 create mode 100644 Makefile
 create mode 100644 README
 create mode 100644 define-double.h
 create mode 100644 define-float.h
 create mode 100644 hmm_mask.cc
 create mode 100644 org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
 create mode 100644 org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h
 create mode 100644 pairhmm-1-base.cc
 create mode 100644 pairhmm-template-kernel.cc
 create mode 100644 pairhmm-template-main.cc
 create mode 100644 shift_template.c
 create mode 100644 template.h

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..16e86f25d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+.svn
+*.o
+*.so
+tests
+.deps
+hmm_Mohammad
diff --git a/CHEATSHEET b/CHEATSHEET
new file mode 100644
index 000000000..f61bfdd7f
--- /dev/null
+++ b/CHEATSHEET
@@ -0,0 +1,2 @@
+/opt/intel/bin/icc pairhmm-template-main.cc -o pairhmm-template-main.o -O3 -W -Wall -lm -xAVX; ./pairhmm-template-main.o < mediumTest.in > med.out; head -n -5 med.out > medium.out; lua checkLikelihoods.lua medium.out ~/hmm/mediumTest.out 6e-5
+
diff --git a/Makefile b/Makefile
new file mode 100644
index 000000000..df5dbc463
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,43 @@
+#OMPCFLAGS=-fopenmp
+#OMPLDFLAGS=-lgomp
+
+#CFLAGS=-O2 -std=c++11 -W -Wall -march=corei7-avx -Wa,-q            -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas
+#CFLAGS=-O2             -W -Wall -march=corei7 -mfpmath=sse -msse4.2 -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas
+
+JAVA_ROOT=/opt/jdk1.7.0_25/
+JNI_COMPILATION_FLAGS=-D_REENTRANT -fPIC -I${JAVA_ROOT}/include -I${JAVA_ROOT}/include/linux
+
+CFLAGS=-g -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas -xAVX
+
+CXXFLAGS=$(CFLAGS)
+CC=icc
+CXX=icc
+
+LDFLAGS=-lm $(OMPLDFLAGS)
+
+#BIN:=pairhmm-1-base #pairhmm-2-omp pairhmm-3-hybrid-float-double pairhmm-4-hybrid-diagonal pairhmm-5-hybrid-diagonal-homogeneus pairhmm-6-onlythreediags pairhmm-7-presse pairhmm-8-sse #pairhmm-dev
+BIN:=libJNILoglessPairHMM.so #pairhmm-2-omp pairhmm-3-hybrid-float-double pairhmm-4-hybrid-diagonal pairhmm-5-hybrid-diagonal-homogeneus pairhmm-6-onlythreediags pairhmm-7-presse pairhmm-8-sse #pairhmm-dev
+
+#SOURCES=pairhmm-1-base.cc input.cc 
+SOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc hmm_mask.cc
+OBJECTS=$(SOURCES:.cc=.o)
+DEPDIR=.deps
+DF=$(DEPDIR)/$(*).d
+
+all: $(BIN)
+
+-include $(addprefix $(DEPDIR)/,$(SOURCES:.cc=.d))
+
+pairhmm-1-base:	pairhmm-1-base.o input.o
+	$(CXX) -o $@ $^ $(LDFLAGS)
+
+libJNILoglessPairHMM.so: $(OBJECTS) 
+	$(CXX) -shared -o libJNILoglessPairHMM.so $(OBJECTS) 
+
+%.o: %.cc
+	@mkdir -p $(DEPDIR)
+	$(COMPILE.cpp) -MMD -MF $(DF) $(JNI_COMPILATION_FLAGS) $(CXXFLAGS) $(OUTPUT_OPTION) $<
+
+
+clean:
+	rm -f $(BIN) *.o
diff --git a/README b/README
new file mode 100644
index 000000000..e16df5a8d
--- /dev/null
+++ b/README
@@ -0,0 +1,41 @@
+How to compile:
+/opt/intel/bin/icc pairhmm-template-main.cc -o pairhmm-template-main.o -O3 -W -Wall -lm -xAVX
+
+------------------------------------
+How to run:
+./pairhmm-template-main.o < ./test_data/largeTest.in > large.out
+./pairhmm-template-main.o < ./test_data/mediumTest.in > medium.out
+-------------------------------------
+Results:
+-------------------------------------
+MEDIUM FLOAT:
+AVX Read Time: 8019020
+AVX Compute Time: 29270238
+AVX Write Time: 242170
+AVX Total Time: 37531428
+# Double called: 0
+-------------------------------------
+MEDIUM HYBRID:
+AVX Read Time: 7596629
+AVX Compute Time: 29979220
+AVX Write Time: 248485
+AVX Total Time: 37824334
+# Double called: 20329
+-------------------------------------
+LARGE FLOAT:
+AVX Read Time: 111888586
+AVX Compute Time: 408131806
+AVX Write Time: 3477892
+AVX Total Time: 523498284
+# Double called: 0
+-------------------------------------
+LARGE HYBRID:
+AVX Read Time: 109728994
+AVX Compute Time: 419475317
+AVX Write Time: 3575364
+AVX Total Time: 532779675
+# Double called: 310042
+
+
+
+
diff --git a/define-double.h b/define-double.h
new file mode 100644
index 000000000..b2de7520a
--- /dev/null
+++ b/define-double.h
@@ -0,0 +1,154 @@
+#include <iostream>
+
+#ifdef PRECISION
+        #undef PRECISION
+        #undef MAIN_TYPE
+        #undef MAIN_TYPE_SIZE
+        #undef UNION_TYPE
+        #undef IF_128
+        #undef IF_MAIN_TYPE
+        #undef SHIFT_CONST1
+        #undef SHIFT_CONST2
+        #undef SHIFT_CONST3
+        #undef _128_TYPE
+	#undef _256_TYPE
+	#undef AVX_LENGTH
+	#undef MAVX_COUNT
+	#undef HAP_TYPE
+        #undef MASK_TYPE
+        #undef MASK_ALL_ONES
+
+        #undef SET_VEC_ZERO
+        #undef VEC_OR
+        #undef VEC_ADD
+	#undef VEC_SUB
+        #undef VEC_MUL
+        #undef VEC_BLEND
+        #undef VEC_BLENDV
+	#undef VEC_CAST_256_128
+	#undef VEC_EXTRACT_128
+	#undef VEC_EXTRACT_UNIT
+	#undef VEC_SET1_VAL128
+	#undef VEC_MOVE
+	#undef VEC_CAST_128_256
+	#undef VEC_INSERT_VAL
+	#undef VEC_CVT_128_256
+	#undef VEC_SET1_VAL
+	#undef VEC_POPCVT_CHAR
+	#undef VEC_LDPOPCVT_CHAR
+	#undef VEC_CMP_EQ
+	#undef VEC_SET_LSE
+	#undef SHIFT_HAP
+	#undef print256b
+        #undef MASK_VEC
+        #undef VEC_SSE_TO_AVX
+        #undef VEC_SHIFT_LEFT_1BIT
+        #undef MASK_ALL_ONES
+        #undef COMPARE_VECS
+        #undef _256_INT_TYPE
+#endif
+
+#define PRECISION d
+#define MAIN_TYPE double
+#define MAIN_TYPE_SIZE 64
+#define UNION_TYPE mix_D
+#define IF_128 IF_128d
+#define IF_MAIN_TYPE IF_64
+#define SHIFT_CONST1 8
+#define SHIFT_CONST2 1
+#define SHIFT_CONST3 8
+#define _128_TYPE __m128d
+#define _256_TYPE __m256d
+#define _256_INT_TYPE __m256i
+#define AVX_LENGTH 4
+#define MAVX_COUNT  (MROWS+7)/AVX_LENGTH
+#define HAP_TYPE __m128i
+#define MASK_TYPE uint64_t
+#define MASK_ALL_ONES 0xFFFFFFFFFFFFFFFF
+#define MASK_VEC MaskVec_D
+
+#define SET_VEC_ZERO(__vec)                     \
+  __vec= _mm256_setzero_pd()
+
+#define VEC_OR(__v1, __v2)                      \
+  _mm256_or_pd(__v1, __v2)
+
+#define VEC_ADD(__v1, __v2)                     \
+  _mm256_add_pd(__v1, __v2)
+
+#define VEC_SUB(__v1, __v2)                     \
+  _mm256_sub_pd(__v1, __v2)
+
+#define VEC_MUL(__v1, __v2)                     \
+  _mm256_mul_pd(__v1, __v2)
+
+#define VEC_BLEND(__v1, __v2, __mask)           \
+  _mm256_blend_pd(__v1, __v2, __mask)
+
+#define VEC_BLENDV(__v1, __v2, __maskV)         \
+  _mm256_blendv_pd(__v1, __v2, __maskV)
+
+#define VEC_CAST_256_128(__v1)			\
+  _mm256_castpd256_pd128 (__v1)
+
+#define VEC_EXTRACT_128(__v1, __im)		\
+  _mm256_extractf128_pd (__v1, __im)
+
+#define VEC_EXTRACT_UNIT(__v1, __im)		\
+  _mm_extract_epi64(__v1, __im)
+
+#define VEC_SET1_VAL128(__val)	 		\
+  _mm_set1_pd(__val)
+
+#define VEC_MOVE(__v1, __val)			\
+  _mm_move_sd(__v1, __val)
+
+#define VEC_CAST_128_256(__v1)			\
+  _mm256_castpd128_pd256(__v1)		
+
+#define VEC_INSERT_VAL(__v1, __val, __pos)	\
+  _mm256_insertf128_pd(__v1, __val, __pos)
+
+#define VEC_CVT_128_256(__v1)			\
+  _mm256_cvtepi32_pd(__v1)
+
+#define VEC_SET1_VAL(__val)			\
+  _mm256_set1_pd(__val)
+
+#define VEC_POPCVT_CHAR(__ch)			\
+  _mm256_cvtepi32_pd(_mm_set1_epi32(__ch))
+
+#define VEC_LDPOPCVT_CHAR(__addr)               \
+  _mm256_cvtepi32_pd(_mm_load_si128((__m128i const *)__addr))
+
+#define VEC_CMP_EQ(__v1, __v2)			\
+  _mm256_cmp_pd(__v1, __v2, _CMP_EQ_OQ)
+
+#define VEC_SET_LSE(__val)                      \
+  _mm256_set_pd(zero, zero, zero, __val);
+
+#define SHIFT_HAP(__v1, __val)			\
+  __v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0)
+
+#define print256b(__v1)				\
+  print256bDP(__v1)
+
+#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)	\
+  __vdst = _mm256_castpd128_pd256(__vsLow) ;		\
+  __vdst = _mm256_insertf128_pd(__vdst, __vsHigh, 1) ;
+
+#define VEC_SHIFT_LEFT_1BIT(__vs)		\
+  __vs = _mm_slli_epi64(__vs, 1) 
+
+
+#define COMPARE_VECS(__v1, __v2, __first, __last) {			\
+    double* ptr1 = (double*) (&__v1) ;					\
+    double* ptr2 = (double*) (&__v2) ;					\
+    for (int ei=__first; ei <= __last; ++ei) {				\
+      if (ptr1[ei] != ptr2[ei]) {					\
+	std::cout << "Double Mismatch at " << ei << ": "		\
+		  << ptr1[ei] << " vs. " << ptr2[ei] << std::endl ;	\
+	exit(0) ;							\
+      }									\
+    }									\
+  }
diff --git a/define-float.h b/define-float.h
new file mode 100644
index 000000000..4cba9a179
--- /dev/null
+++ b/define-float.h
@@ -0,0 +1,155 @@
+#include <iostream>
+
+#ifdef PRECISION
+	#undef PRECISION
+        #undef MAIN_TYPE
+        #undef MAIN_TYPE_SIZE
+        #undef UNION_TYPE
+        #undef IF_128
+        #undef IF_MAIN_TYPE
+        #undef SHIFT_CONST1
+        #undef SHIFT_CONST2
+        #undef SHIFT_CONST3
+        #undef _128_TYPE
+        #undef _256_TYPE
+        #undef AVX_LENGTH
+        #undef MAVX_COUNT
+	#undef HAP_TYPE
+        #undef MASK_TYPE
+        #undef MASK_ALL_ONES
+
+	#undef SET_VEC_ZERO(__vec)
+	#undef VEC_OR(__v1, __v2)
+	#undef VEC_ADD(__v1, __v2)
+        #undef VEC_SUB(__v1, __v2)
+	#undef VEC_MUL(__v1, __v2)
+	#undef VEC_BLEND(__v1, __v2, __mask)
+	#undef VEC_BLENDV(__v1, __v2, __maskV)
+        #undef VEC_CAST_256_128(__v1)
+        #undef VEC_EXTRACT_128(__v1, __im)
+        #undef VEC_EXTRACT_UNIT(__v1, __im)
+        #undef VEC_SET1_VAL128(__val)
+        #undef VEC_MOVE(__v1, __val)
+        #undef VEC_CAST_128_256(__v1)
+        #undef VEC_INSERT_VAL(__v1, __val, __pos)
+	#undef VEC_CVT_128_256(__v1)	
+	#undef VEC_SET1_VAL(__val)
+	#undef VEC_POPCVT_CHAR(__ch)
+	#undef VEC_LDPOPCVT_CHAR(__addr)
+	#undef VEC_CMP_EQ(__v1, __v2)
+	#undef VEC_SET_LSE(__val)
+	#undef SHIFT_HAP(__v1, __val)
+	#undef print256b(__v1)
+        #undef MASK_VEC
+        #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
+        #undef VEC_SHIFT_LEFT_1BIT(__vs)
+        #undef MASK_ALL_ONES
+        #undef COMPARE_VECS(__v1, __v2)
+        #undef _256_INT_TYPE
+#endif
+
+#define PRECISION s
+
+#define MAIN_TYPE float
+#define MAIN_TYPE_SIZE 32
+#define UNION_TYPE mix_F
+#define IF_128 IF_128f
+#define IF_MAIN_TYPE IF_32
+#define SHIFT_CONST1 12
+#define SHIFT_CONST2 3
+#define SHIFT_CONST3 4
+#define _128_TYPE __m128
+#define _256_TYPE __m256
+#define _256_INT_TYPE __m256i
+#define AVX_LENGTH 8
+#define MAVX_COUNT  (MROWS+7)/AVX_LENGTH
+#define HAP_TYPE UNION_TYPE
+#define MASK_TYPE uint32_t
+#define MASK_ALL_ONES 0xFFFFFFFF
+#define MASK_VEC MaskVec_F
+
+#define SET_VEC_ZERO(__vec)                     \
+  __vec= _mm256_setzero_ps()
+
+#define VEC_OR(__v1, __v2)                      \
+  _mm256_or_ps(__v1, __v2)
+
+#define VEC_ADD(__v1, __v2)                     \
+  _mm256_add_ps(__v1, __v2)
+
+#define VEC_SUB(__v1, __v2)                     \
+  _mm256_sub_ps(__v1, __v2)
+
+#define VEC_MUL(__v1, __v2)                     \
+  _mm256_mul_ps(__v1, __v2)
+
+#define VEC_BLEND(__v1, __v2, __mask)           \
+  _mm256_blend_ps(__v1, __v2, __mask)
+
+#define VEC_BLENDV(__v1, __v2, __maskV)         \
+  _mm256_blendv_ps(__v1, __v2, __maskV)
+
+#define VEC_CAST_256_128(__v1)			\
+  _mm256_castps256_ps128 (__v1)
+
+#define VEC_EXTRACT_128(__v1, __im)             \
+  _mm256_extractf128_ps (__v1, __im)
+
+#define VEC_EXTRACT_UNIT(__v1, __im)		\
+  _mm_extract_epi32(__v1, __im)
+
+#define VEC_SET1_VAL128(__val)			\
+  _mm_set1_ps(__val)
+
+#define VEC_MOVE(__v1, __val)			\
+  _mm_move_ss(__v1, __val)
+
+#define VEC_CAST_128_256(__v1)                  \
+  _mm256_castps128_ps256(__v1)
+
+#define VEC_INSERT_VAL(__v1, __val, __pos)      \
+  _mm256_insertf128_ps(__v1, __val, __pos)
+
+#define VEC_CVT_128_256(__v1)                   \
+  _mm256_cvtepi32_ps(__v1.i)
+
+#define VEC_SET1_VAL(__val)                 	\
+  _mm256_set1_ps(__val)
+
+#define VEC_POPCVT_CHAR(__ch)                   \
+  _mm256_cvtepi32_ps(_mm256_set1_epi32(__ch))
+
+#define VEC_LDPOPCVT_CHAR(__addr)		\
+   _mm256_cvtepi32_ps(_mm256_loadu_si256((__m256i const *)__addr))
+
+#define VEC_CMP_EQ(__v1, __v2)                  \
+  _mm256_cmp_ps(__v1, __v2, _CMP_EQ_OQ)
+
+#define VEC_SET_LSE(__val)			\
+  _mm256_set_ps(zero, zero, zero, zero, zero, zero, zero, __val);
+
+#define SHIFT_HAP(__v1, __val)			\
+  _vector_shift_lasts(__v1, __val.f);
+
+#define print256b(__v1)                         \
+  print256bFP(__v1)
+  
+#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)	\
+  __vdst = _mm256_castps128_ps256(__vsLow) ;		\
+  __vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ;
+
+#define VEC_SHIFT_LEFT_1BIT(__vs)		\
+  __vs = _mm_slli_epi32(__vs, 1) 
+
+#define COMPARE_VECS(__v1, __v2, __first, __last) {			\
+    float* ptr1 = (float*) (&__v1) ;					\
+    float* ptr2 = (float*) (&__v2) ;					\
+    for (int ei=__first; ei <= __last; ++ei) {				\
+      if (ptr1[ei] != ptr2[ei]) {					\
+	std::cout << "Float Mismatch at " << ei << ": "			\
+		  << ptr1[ei] << " vs. " << ptr2[ei] << std::endl ;	\
+	exit(0) ;							\
+      }									\
+    }									\
+  }
+
diff --git a/hmm_mask.cc b/hmm_mask.cc
new file mode 100644
index 000000000..aace7acf2
--- /dev/null
+++ b/hmm_mask.cc
@@ -0,0 +1,484 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <math.h>
+
+
+// /usr/intel/pkgs/icc/13.0.0e/bin/icc -o ed -O3 ed.cpp -xAVX  -openmp -openmp-link static
+
+
+#include <iostream>
+
+#include <malloc.h>
+#include <assert.h>
+#include <sys/time.h>
+#include <omp.h>
+#include <stdlib.h>
+#include <math.h>
+
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <smmintrin.h>
+
+
+#include <stdint.h>
+#include <algorithm>
+#include <vector>
+#include <set>
+#include <map>
+#include <memory.h>
+#include <string>
+#include <sstream>
+#include <fstream>
+
+using namespace std ;
+
+#define CHECK_MASK_CORRECTNESS
+
+template <class T>
+string getBinaryStr (T val, int numBitsToWrite) {
+  
+  ostringstream oss ;
+  uint64_t mask = ((T) 0x1) << (numBitsToWrite-1) ;
+  for (int i=numBitsToWrite-1; i >= 0; --i) {
+    oss << ((val & mask) >> i) ;
+    mask >>= 1 ;
+  }
+  return oss.str() ;
+}
+
+
+typedef struct 
+{
+	int rslen, haplen, *q, *i, *d, *c;
+	char *hap, *rs;
+} testcase;
+
+int normalize(char c)
+{
+	return ((int) (c - 33));
+}
+
+class ConvertChar {
+
+  static uint8_t conversionTable[255] ;
+
+
+public:
+
+  static void init() {
+    conversionTable['A'] = 0 ;
+    conversionTable['C'] = 1 ;
+    conversionTable['T'] = 2 ;
+    conversionTable['G'] = 3 ;
+    conversionTable['N'] = 4 ;
+  }
+
+  static inline uint8_t get(uint8_t input) {
+    return conversionTable[input] ;
+  }
+
+} ;
+
+uint8_t ConvertChar::conversionTable[255] ;
+
+
+
+int read_testcase(testcase *tc, FILE* ifp)
+{
+	char *q, *i, *d, *c, *line = NULL;
+	int _q, _i, _d, _c;
+	int x, size = 0;
+	ssize_t read;
+
+	read = getline(&line, (size_t *) &size, ifp);
+	if (read == -1)
+		return -1;
+
+
+	tc->hap = (char *) malloc(size);
+	tc->rs = (char *) malloc(size);
+	q = (char *) malloc(size);
+	i = (char *) malloc(size);
+	d = (char *) malloc(size);
+	c = (char *) malloc(size);
+
+	if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6)
+		return -1;
+
+	tc->haplen = strlen(tc->hap);
+	tc->rslen = strlen(tc->rs);
+	tc->q = (int *) malloc(sizeof(int) * tc->rslen);
+	tc->i = (int *) malloc(sizeof(int) * tc->rslen);
+	tc->d = (int *) malloc(sizeof(int) * tc->rslen);
+	tc->c = (int *) malloc(sizeof(int) * tc->rslen);
+
+	// Convert hap and rs to 3 bits
+	
+	/*
+	for (int ci=0; ci < tc->haplen; ++ci)
+	  tc->hap[ci] = ConvertChar::get(tc->hap[ci]) ;
+
+	for (int ci=0; ci < tc->rslen; ++ci)
+	  tc->rs[ci] = ConvertChar::get(tc->rs[ci]) ;
+	*/
+
+	for (x = 0; x < tc->rslen; x++)
+	{
+		_q = normalize(q[x]);
+		_i = normalize(i[x]);
+		_d = normalize(d[x]);
+		_c = normalize(c[x]);
+		tc->q[x] = (_q < 6) ? 6 : _q;
+		tc->i[x] = _i;
+		tc->d[x] = _d;
+		tc->c[x] = _c;
+	}
+	
+
+	free(q);
+	free(i);
+	free(d);
+	free(c);
+	free(line);
+
+	return 0;
+}
+
+
+
+
+#define ALIGN __attribute__ ((aligned(32)))
+
+
+typedef union ALIGN { 
+  //__m256i vi; 
+  __m128 vf ;
+  __m128i vi ;
+  __m128d vd; 
+
+  uint8_t b[16];
+  //uint16_t hw[8] ;
+  uint32_t w[4] ;
+  uint64_t dw[2] ;
+  
+  float f[4] ;
+  //double d[2] ;
+
+} v128 ;
+
+typedef union ALIGN { 
+  __m256i vi; 
+  __m256 vf ;
+  __m256d vd; 
+  //__m128i vi128[2] ;
+
+  uint8_t b[32];
+  //uint16_t hw[16] ;
+  uint32_t w[8] ;
+  uint64_t dw[4] ;
+  
+  float f[8] ;
+  double d[4] ;
+
+} v256 ;
+
+
+#define NUM_DISTINCT_CHARS 5
+#define AMBIG_CHAR 4
+
+#define VEC_ENTRY_CNT 8
+#define VEC_LEN 256
+#define VTYPE vf
+#define VTYPEI vi
+#define VENTRY f
+#define VENTRYI w
+#define MTYPE uint32_t
+#define MASK_ALL_ONES 0xFFFFFFFF
+
+
+#define VECTOR v256
+#define VECTOR_SSE v128
+
+#define SET_VEC_ZERO(__vec)			\
+  __vec= _mm256_setzero_ps()
+
+#define SET_VEC_ONES(__vec)			\
+  __vec = _mm256_set1_epi32(0xFFFFFFFF)
+
+#define VEC_OR(__v1, __v2)			\
+  _mm256_or_ps(__v1, __v2)
+
+#define VEC_ADD(__v1, __v2)			\
+  _mm256_add_ps(__v1, __v2)
+
+#define VEC_MUL(__v1, __v2)			\
+  _mm256_mul_ps(__v1, __v2) 
+
+#define VEC_BLENDV(__v1, __v2, __maskV)		\
+  _mm256_blendv_ps(__v1, __v2, __maskV)
+
+#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)	\
+  __vdst = _mm256_castps128_ps256(__vsLow) ;		\
+  __vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ;
+
+#define VECS_SHIFT_LEFT_1BIT(__vs)		\
+  __vs = _mm_slli_epi32(__vs, 1) 
+
+
+#define VECS_SHIFT_RIGHT_WHOLE(__vs, __cnt) {				\
+    uint64_t __mask = ;							\
+    uint64_t __shiftWord = ((((uint64_t) 0x1) << __cnt) - 1 ) & __vs.dw[1] ; \
+    __shiftWord <<= (64-cnt) ;						\
+    __vs = _mm_slri_epi64(__vs, __cnt) ;				\
+    __vs.dw[0] |= __shiftWord ;						\
+  }
+
+
+#define GET_MASK_WORD_OLD(__mask, __lastShiftOut, __shiftBy, __maskBitCnt){ \
+    MTYPE __bitMask = (((MTYPE)0x1) << __shiftBy) - 1 ;			\
+    MTYPE __nextShiftOut = (__mask & __bitMask) << (__maskBitCnt - __shiftBy) ; \
+    __mask >>= __shiftBy ;						\
+    __mask |= __lastShiftOut ;						\
+    __lastShiftOut = __nextShiftOut ;					\
+  }
+
+
+#define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \
+  MTYPE __bitMask = (((MTYPE)0x1) << __shiftBy) - 1 ;			\
+  MTYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \
+  __dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ;		\
+  __lastShiftOut = __nextShiftOut ;					\ 
+}
+
+
+void precompute_masks_avx(const testcase& tc, int COLS, int numMaskVecs,
+			  MTYPE (*maskArr)[NUM_DISTINCT_CHARS]) {
+
+  const int maskBitCnt = VEC_LEN / VEC_ENTRY_CNT ;
+
+  for (int vi=0; vi < numMaskVecs; ++vi) {
+    for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) {
+      maskArr[vi][rs] = 0 ;
+    }
+    maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ;
+  }
+ 
+  for (int col=1; col < COLS; ++col) {
+    int mIndex = (col-1) / maskBitCnt ;
+    int mOffset = (col-1) % maskBitCnt ;
+    MTYPE bitMask = ((MTYPE)0x1) << (maskBitCnt-1-mOffset) ;
+
+    char hapChar = tc.hap[col-1] ;
+
+    if (hapChar == AMBIG_CHAR) {
+      maskArr[mIndex][0] |= bitMask ;
+      maskArr[mIndex][1] |= bitMask ;
+      maskArr[mIndex][2] |= bitMask ;
+      maskArr[mIndex][3] |= bitMask ;
+    } 
+
+    //cout << hapChar << " " << mIndex << " " << getBinaryStr<MTYPE>(bitMask, 32)
+    // << endl ;
+    //cout << getBinaryStr<MTYPE>(maskArr[0][hapChar],32) << endl ;
+    //exit(0) ;
+		     
+    maskArr[mIndex][ConvertChar::get(hapChar)] |= bitMask ;
+
+
+    // bit corresponding to col 1 will be the MSB of the mask 0
+    // bit corresponding to col 2 will be the MSB-1 of the mask 0
+    // ...
+    // bit corresponding to col 32 will be the LSB of the mask 0
+    // bit corresponding to col 33 will be the MSB of the mask 1
+    // ...
+  }
+
+}
+
+
+void test_mask_computations (testcase& tc, int tcID, bool printDebug=false) {
+
+  int ROWS = tc.rslen + 1 ;
+  int COLS = tc.haplen + 1 ;
+
+  // only for testing
+  VECTOR mismatchData, matchData ;
+  //SET_VEC_ZERO(mismatchData.VTYPE) ;
+  //SET_VEC_ONES(matchData.VTYPEI) ;
+  for (int ei=0; ei < VEC_ENTRY_CNT; ++ei) {
+    matchData.VENTRY[ei] = 1.0 ;
+    mismatchData.VENTRY[ei] = 0.0 ;
+  }
+  
+  const int maskBitCnt = VEC_LEN / VEC_ENTRY_CNT ;
+  const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function
+
+  MTYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ;
+  precompute_masks_avx(tc, COLS, numMaskVecs, maskArr) ;
+
+#ifdef DEBUG 
+  if (printDebug) {
+    cout << "The first 32 hap chars are: " ;
+    for (int i=0; i < 32; ++i) {
+      cout << tc.hap[i] ;
+    }
+    cout << endl ;
+
+    cout << "Masks computed for A, C, T, G, N are: "  << endl ;
+    cout << getBinaryStr<MTYPE>(maskArr[0][0], 32)  << endl ;
+    cout << getBinaryStr<MTYPE>(maskArr[0][1], 32)  << endl ;
+    cout << getBinaryStr<MTYPE>(maskArr[0][2], 32)  << endl ;
+    cout << getBinaryStr<MTYPE>(maskArr[0][3], 32)  << endl ;
+    cout << getBinaryStr<MTYPE>(maskArr[0][4], 32)  << endl ;
+  }
+#endif // #ifdef DEBUG 
+
+  int beginRowIndex = 1 ;
+  while (beginRowIndex < ROWS) {
+
+    int numRowsToProcess = min(VEC_ENTRY_CNT, ROWS - beginRowIndex) ;
+
+    char rsArr[VEC_ENTRY_CNT] ;
+    for (int ri=0; ri < numRowsToProcess; ++ri) {
+      rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ;
+    }
+
+    // Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
+    VECTOR_SSE currMaskVecLow ; // corresponding to entries 0-3
+    VECTOR_SSE currMaskVecHigh ; // corresponding to entries 4-7
+
+    MTYPE lastMaskShiftOut[VEC_ENTRY_CNT] ;
+    for (int ei=0; ei < VEC_ENTRY_CNT; ++ei) 
+      lastMaskShiftOut[ei] = 0 ;
+
+    int col = 1 ;
+    int diag = 1 ;
+    for (int maskIndex=0; maskIndex < numMaskVecs; ++maskIndex) {
+      // set up the mask vectors for the next maskBitCnt columns
+
+      // For AVX, maskBitCnt = 32 (so, the operation below is amortized over 32 cols)
+      for (int ei=0; ei < VEC_ENTRY_CNT/2; ++ei) {
+	SET_MASK_WORD(currMaskVecLow.VENTRYI[ei], maskArr[maskIndex][rsArr[ei]], 
+		      lastMaskShiftOut[ei], ei, maskBitCnt) ;
+
+      	int ei2 = ei + VEC_ENTRY_CNT/2 ; // the second entry index
+	SET_MASK_WORD(currMaskVecHigh.VENTRYI[ei], maskArr[maskIndex][rsArr[ei2]], 
+		      lastMaskShiftOut[ei2], ei2, maskBitCnt) ;
+      }
+
+#ifdef DEBUG 
+      if (printDebug && maskIndex == 0) {
+	cout << "The masks for entry 1: " << endl  
+	     << getBinaryStr<MTYPE>(maskArr[0][rsArr[1]], 32) << endl
+	     << getBinaryStr<MTYPE>(currMaskVecLow.VENTRYI[1], 32) << endl ;
+
+      }
+#endif // #ifdef DEBUG 
+      
+      // iterate over mask bit indices and columns
+      for (int mbi=0; mbi < maskBitCnt && diag < COLS + ROWS -2; ++mbi, ++diag) {
+
+	VECTOR maskV ;
+	VEC_SSE_TO_AVX(currMaskVecLow.VTYPE, currMaskVecHigh.VTYPE, maskV.VTYPE) ;
+
+	VECTOR testData ;
+	testData.VTYPE = VEC_BLENDV(mismatchData.VTYPE, matchData.VTYPE,
+				    maskV.VTYPE) ;
+
+	VECS_SHIFT_LEFT_1BIT(currMaskVecLow.VTYPEI) ;
+	VECS_SHIFT_LEFT_1BIT(currMaskVecHigh.VTYPEI) ;
+
+#ifdef DEBUG 
+	if (printDebug && maskIndex == 0) {
+	  cout << "The mask for entry 1, mbi=" << mbi << ": "
+	       << getBinaryStr<MTYPE>(maskV.VENTRYI[1], 32) << endl ;
+
+	}
+#endif // #ifdef DEBUG 
+
+#ifdef CHECK_MASK_CORRECTNESS
+
+	int firstRowIndex = (diag < COLS) ? 0 : (diag - COLS) ;
+	int lastRowIndex = min(col-1, numRowsToProcess-1) ;
+
+	for (int ri=firstRowIndex; ri <= lastRowIndex; ++ri) {
+	  int currRow = beginRowIndex + ri ;
+	  int currCol = col - ri + firstRowIndex ;
+
+	  char hapChar = tc.hap[currCol-1] ;
+	  char rsChar = tc.rs[currRow-1] ;
+
+	  bool match = (hapChar == rsChar || hapChar == 'N' || rsChar == 'N') ;
+
+	  if ((bool) testData.VENTRYI[ri] != match) {
+	    cout << "Error: Incorrect mask for tc " << tcID << ", diag = " << diag
+		 << " (" << currRow  << ", " << currCol << ")" << endl ;
+
+	    cout << "The chars are: " << hapChar << " and " << rsChar << endl ;
+	    cout << "The selected value is: " << testData.VENTRYI[ri] << endl ;
+
+	    exit(0) ;
+	  }
+	}
+#endif // #ifdef CHECK_MASK_CORRECTNESS
+
+	if (diag < COLS)
+	  ++col ;
+
+      } // mbi
+    } // maskIndex
+
+    beginRowIndex += VEC_ENTRY_CNT ;
+  } // end of stripe
+
+  //cout << "Finished validating entry " << endl ;
+}
+
+
+int main () {
+
+  #define BATCH_SIZE 10000
+
+  ConvertChar::init() ;
+  
+  testcase* tcBatch = new testcase[BATCH_SIZE] ;
+  
+  int numBatches = 0 ;
+  int numRead = 0 ;
+
+  const int DEBUG_TC = -1 ;
+
+  FILE* ifp = stdin ;
+
+  do {
+    numRead = 0 ;
+
+    while (numRead < BATCH_SIZE) {
+      if (read_testcase(tcBatch+numRead, ifp) < 0)
+	break ;
+      
+      ++numRead ;
+    }
+
+    if (numRead == 0)
+      break ;
+
+    for (int ti=0; ti < numRead; ++ti) {
+
+      int tcID = numBatches * BATCH_SIZE + ti ;
+      test_mask_computations(tcBatch[ti], tcID, tcID == DEBUG_TC) ;
+    }
+
+    ++numBatches ;
+  } while (numRead == BATCH_SIZE) ;
+  
+  fclose(ifp) ;
+
+  delete[] tcBatch ;
+
+  return 0 ;
+}
diff --git a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
new file mode 100644
index 000000000..80065b8c9
--- /dev/null
+++ b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
@@ -0,0 +1,146 @@
+#include <jni.h>
+#include <assert.h>
+#include <stdio.h>
+#include "org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h"
+
+#include <cstdio>
+#include <string>
+#include <cstring>
+#include <cstdlib>
+#include <cmath>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include <immintrin.h>
+#include <emmintrin.h>
+#include <omp.h>
+
+#include "template.h"
+
+#include "define-double.h"
+#include "shift_template.c"
+#include "pairhmm-template-kernel.cc"
+
+using namespace std;
+
+
+#define MM 0
+#define GapM 1
+#define MX 2
+#define XX 3
+#define MY 4
+#define YY 5
+
+//#define DEBUG3 1
+#define DEBUG 1
+
+template<class T>
+string to_string(T obj)
+{
+  stringstream ss;
+  string ret_string;
+  ss.clear();
+  ss << std::scientific << obj;
+  ss >> ret_string;
+  ss.clear();
+  return ret_string;
+}
+
+void debug_dump(string filename, string s, bool to_append, bool add_newline=true)
+{
+  ofstream fptr;
+  fptr.open(filename.c_str(), to_append ? ofstream::app : ofstream::out);
+  fptr << s;
+  if(add_newline)
+    fptr << "\n";
+  fptr.close();
+}
+
+#define INT_STORE_ARRAY_SIZE 2048
+int insertionGOPIntArray[INT_STORE_ARRAY_SIZE];
+int deletionGOPIntArray[INT_STORE_ARRAY_SIZE];
+int overallGCPIntArray[INT_STORE_ARRAY_SIZE];
+int readQualsIntArray[INT_STORE_ARRAY_SIZE];
+
+JNIEXPORT jdouble JNICALL 
+Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadLikelihoodGivenHaplotypeLog10( 
+    JNIEnv* env, jobject thisObject,
+    jint readLength, jint haplotypeLength,
+    jbyteArray readBases, jbyteArray haplotypeBases, jbyteArray readQuals,
+    jbyteArray insertionGOP, jbyteArray deletionGOP, jbyteArray overallGCP,
+    jint hapStartIndex
+    )
+{
+  jboolean is_copy = JNI_FALSE;
+  jbyte* readBasesArray = (env)->GetByteArrayElements(readBases, &is_copy);
+  jbyte* haplotypeBasesArray = (env)->GetByteArrayElements(haplotypeBases, &is_copy);
+  jbyte* readQualsArray = (env)->GetByteArrayElements(readQuals, &is_copy);
+  jbyte* insertionGOPArray = (env)->GetByteArrayElements(insertionGOP, &is_copy);
+  jbyte* deletionGOPArray = (env)->GetByteArrayElements(deletionGOP, &is_copy);
+  jbyte* overallGCPArray = (env)->GetByteArrayElements(overallGCP, &is_copy);
+#ifdef DEBUG
+  assert(readBasesArray && "readBasesArray not initialized in JNI"); 
+  assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); 
+  assert(readQualsArray && "readQualsArray not initialized in JNI"); 
+  assert(insertionGOPArray && "insertionGOP array not initialized in JNI");
+  assert(deletionGOPArray && "deletionGOP array not initialized in JNI");
+  assert(overallGCPArray && "OverallGCP array not initialized in JNI");
+  assert(readLength < INT_STORE_ARRAY_SIZE);
+#endif
+  for(unsigned i=0;i<readLength;++i)
+  {
+    insertionGOPIntArray[i] = (int)insertionGOPArray[i];
+    deletionGOPIntArray[i] = (int)deletionGOPArray[i];
+    overallGCPIntArray[i] = (int)overallGCPArray[i];
+    readQualsIntArray[i] = (int)readQualsArray[i];
+  }
+
+  testcase tc;
+  tc.rslen = readLength;
+  tc.haplen = haplotypeLength;
+
+  tc.rs = (char*)readBasesArray;
+  tc.hap = (char*)haplotypeBasesArray;
+  tc.q = (int*)readQualsIntArray;
+  tc.i = (int*)insertionGOPIntArray;
+  tc.d = (int*)deletionGOPIntArray;
+  tc.c = (int*)overallGCPIntArray;
+
+  double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&tc);
+  double result = log10(result_avxd) - log10(ldexp(1.0, 1020));
+#ifdef DEBUG
+  debug_dump("return_values_jni.txt",to_string(result),true);
+#endif
+
+
+  env->ReleaseByteArrayElements(overallGCP, overallGCPArray, JNI_ABORT);
+  env->ReleaseByteArrayElements(deletionGOP, deletionGOPArray, JNI_ABORT);
+  env->ReleaseByteArrayElements(insertionGOP, insertionGOPArray, JNI_ABORT);
+  env->ReleaseByteArrayElements(readQuals, readQualsArray, JNI_ABORT);
+  env->ReleaseByteArrayElements(haplotypeBases, haplotypeBasesArray, JNI_ABORT);
+  env->ReleaseByteArrayElements(readBases, readBasesArray, JNI_ABORT);
+
+  return 0.0;
+}
+
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeProbabilities
+(JNIEnv* env, jclass thisObject,
+ jobjectArray transition, jbyteArray insertionGOP, jbyteArray deletionGOP, jbyteArray overallGCP
+ )
+{}
+
+  JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitialize
+(JNIEnv* env, jobject thisObject,
+ jint readMaxLength, jint haplotypeMaxLength)
+{}
+
+JNIEXPORT jdouble JNICALL 
+Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializePriorsAndUpdateCells( 
+    JNIEnv* env, jobject thisObject,
+    jboolean doInitialization, jint paddedReadLength, jint paddedHaplotypeLength,
+    jbyteArray readBases, jbyteArray haplotypeBases, jbyteArray readQuals,
+    jint hapStartIndex
+    )
+
+{ return 0.0; }
diff --git a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h
new file mode 100644
index 000000000..5408d2f57
--- /dev/null
+++ b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h
@@ -0,0 +1,61 @@
+/* DO NOT EDIT THIS FILE - it is machine generated */
+#include <jni.h>
+/* Header for class org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM */
+
+#ifndef _Included_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
+#define _Included_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
+#ifdef __cplusplus
+extern "C" {
+#endif
+#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_TRISTATE_CORRECTION
+#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_TRISTATE_CORRECTION 3.0
+#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToMatch
+#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToMatch 0L
+#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_indelToMatch
+#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_indelToMatch 1L
+#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToInsertion
+#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToInsertion 2L
+#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_insertionToInsertion
+#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_insertionToInsertion 3L
+#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToDeletion
+#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToDeletion 4L
+#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_deletionToDeletion
+#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_deletionToDeletion 5L
+/*
+ * Class:     org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
+ * Method:    jniInitialize
+ * Signature: (II)V
+ */
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitialize
+  (JNIEnv *, jobject, jint, jint);
+
+/*
+ * Class:     org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
+ * Method:    jniInitializeProbabilities
+ * Signature: ([[D[B[B[B)V
+ */
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeProbabilities
+  (JNIEnv *, jclass, jobjectArray, jbyteArray, jbyteArray, jbyteArray);
+
+/*
+ * Class:     org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
+ * Method:    jniInitializePriorsAndUpdateCells
+ * Signature: (ZII[B[B[BI)D
+ */
+JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializePriorsAndUpdateCells
+  (JNIEnv *, jobject, jboolean, jint, jint, jbyteArray, jbyteArray, jbyteArray, jint);
+
+
+/*
+ * Class:     org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
+ * Method:    jniSubComputeReadLikelihoodGivenHaplotypeLog10
+ * Signature: (II[B[B[B[B[B[BI)D
+ */
+JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadLikelihoodGivenHaplotypeLog10
+    (JNIEnv *, jobject, jint, jint, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jint);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/pairhmm-1-base.cc b/pairhmm-1-base.cc
new file mode 100644
index 000000000..dacd02536
--- /dev/null
+++ b/pairhmm-1-base.cc
@@ -0,0 +1,276 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include <sys/time.h>
+
+#define MM 0
+#define GapM 1
+#define MX 2
+#define XX 3
+#define MY 4
+#define YY 5
+
+//#define DEBUG
+
+/*
+	q: read quality
+	i: insertion penalty
+	d: deletion penalty
+	c: gap continuation penalty
+*/
+
+typedef struct 
+{
+	int rslen, haplen, *q, *i, *d, *c;
+	char *hap, *rs;
+} testcase;
+
+int normalize(char c)
+{
+	return ((int) (c - 33));
+}
+
+int read_testcase(testcase *tc)
+{
+	char *q, *i, *d, *c, *line = NULL;
+	int _q, _i, _d, _c;
+	int x, size = 0;
+	ssize_t read;
+
+	read = getline(&line, (size_t *) &size, stdin);
+	if (read == -1)
+		return -1;
+
+	tc->hap = (char *) malloc(size);
+	tc->rs = (char *) malloc(size);
+	q = (char *) malloc(size);
+	i = (char *) malloc(size);
+	d = (char *) malloc(size);
+	c = (char *) malloc(size);
+
+	if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6)
+		return -1;
+
+	tc->haplen = strlen(tc->hap);
+	tc->rslen = strlen(tc->rs);
+	tc->q = (int *) malloc(sizeof(int) * tc->rslen);
+	tc->i = (int *) malloc(sizeof(int) * tc->rslen);
+	tc->d = (int *) malloc(sizeof(int) * tc->rslen);
+	tc->c = (int *) malloc(sizeof(int) * tc->rslen);
+
+	for (x = 0; x < tc->rslen; x++)
+	{
+		_q = normalize(q[x]);
+		_i = normalize(i[x]);
+		_d = normalize(d[x]);
+		_c = normalize(c[x]);
+		tc->q[x] = (_q < 6) ? 6 : _q;
+		tc->i[x] = _i;
+		tc->d[x] = _d;
+		tc->c[x] = _c;
+	}
+
+	free(q);
+	free(i);
+	free(d);
+	free(c);
+	free(line);
+
+	return 0;
+}
+
+template<class T>
+struct Context{};
+
+template<>
+struct Context<double>
+{
+	Context()
+	{
+		for (int x = 0; x < 128; x++)
+			ph2pr[x] = pow(10.0, -((double)x) / 10.0);
+
+		INITIAL_CONSTANT = ldexp(1.0, 1020.0);
+		LOG10_INITIAL_CONSTANT = log10(INITIAL_CONSTANT);
+		RESULT_THRESHOLD = 0.0;
+	}
+
+	double LOG10(double v){ return log10(v); }
+
+	static double _(double n){ return n; }
+	static double _(float n){ return ((double) n); }
+	double ph2pr[128];
+	double INITIAL_CONSTANT;
+	double LOG10_INITIAL_CONSTANT;
+	double RESULT_THRESHOLD;
+};
+
+template<>
+struct Context<float>
+{
+	Context()
+	{
+		for (int x = 0; x < 128; x++)
+			ph2pr[x] = powf(10.f, -((float)x) / 10.f);
+
+		INITIAL_CONSTANT = ldexpf(1.f, 120.f);
+		LOG10_INITIAL_CONSTANT = log10f(INITIAL_CONSTANT);
+		RESULT_THRESHOLD = ldexpf(1.f, -110.f);
+	}
+
+	float LOG10(float v){ return log10f(v); }
+
+	static float _(double n){ return ((float) n); }
+	static float _(float n){ return n; }
+	float ph2pr[128];
+	float INITIAL_CONSTANT;
+	float LOG10_INITIAL_CONSTANT;
+	float RESULT_THRESHOLD;
+};
+
+template<class NUMBER>
+NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL)
+{
+	int r, c;
+	int ROWS = tc->rslen + 1;
+	int COLS = tc->haplen + 1;
+
+	Context<NUMBER> ctx;
+
+	NUMBER M[ROWS][COLS];
+	NUMBER X[ROWS][COLS];
+	NUMBER Y[ROWS][COLS];
+	NUMBER p[ROWS][6];
+
+	p[0][MM] = ctx._(0.0);
+	p[0][GapM] = ctx._(0.0);
+	p[0][MX] = ctx._(0.0);
+	p[0][XX] = ctx._(0.0);
+	p[0][MY] = ctx._(0.0);
+	p[0][YY] = ctx._(0.0);
+	for (r = 1; r < ROWS; r++)
+	{
+		int _i = tc->i[r-1] & 127;
+		int _d = tc->d[r-1] & 127;
+		int _c = tc->c[r-1] & 127;
+		p[r][MM] = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127];
+		p[r][GapM] = ctx._(1.0) - ctx.ph2pr[_c];
+		p[r][MX] = ctx.ph2pr[_i];
+		p[r][XX] = ctx.ph2pr[_c];
+		p[r][MY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
+		p[r][YY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
+	}
+
+	for (c = 0; c < COLS; c++)
+	{
+		M[0][c] = ctx._(0.0);
+		X[0][c] = ctx._(0.0);
+		Y[0][c] = ctx.INITIAL_CONSTANT / (tc->haplen);
+	}
+
+	for (r = 1; r < ROWS; r++)
+	{
+		M[r][0] = ctx._(0.0);
+		X[r][0] = X[r-1][0] * p[r][XX];
+		Y[r][0] = ctx._(0.0);
+	}
+
+	 NUMBER result = ctx._(0.0);
+
+	for (r = 1; r < ROWS; r++)
+		for (c = 1; c < COLS; c++)
+		{
+			char _rs = tc->rs[r-1];
+			char _hap = tc->hap[c-1];
+			int _q = tc->q[r-1] & 127;
+			NUMBER distm = ctx.ph2pr[_q];
+			if (_rs == _hap || _rs == 'N' || _hap == 'N')
+				distm = ctx._(1.0) - distm;
+			M[r][c] = distm * (M[r-1][c-1] * p[r][MM] + X[r-1][c-1] * p[r][GapM] + Y[r-1][c-1] * p[r][GapM]);
+			X[r][c] = M[r-1][c] * p[r][MX] + X[r-1][c] * p[r][XX];
+			Y[r][c] = M[r][c-1] * p[r][MY] + Y[r][c-1] * p[r][YY];
+		}
+
+	for (c = 0; c < COLS; c++)
+	{
+		result += M[ROWS-1][c] + X[ROWS-1][c];
+	}
+
+	if (before_last_log != NULL)
+		*before_last_log = result;
+
+	return result; //ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT;
+}
+
+#define BATCH_SIZE  10000
+#define RUN_HYBRID
+
+int main()
+{
+        testcase tc[BATCH_SIZE];
+        float result[BATCH_SIZE], result_avxf;
+        double result_avxd;
+        struct timeval start, end;
+        long long aggregateTimeRead = 0L;
+        long long aggregateTimeCompute = 0L;
+        long long aggregateTimeWrite = 0L;
+
+        bool noMoreData = false;
+        int count =0;
+        while (!noMoreData)
+        {
+                int read_count = BATCH_SIZE;
+                gettimeofday(&start, NULL);
+                for (int b=0;b<BATCH_SIZE;b++)
+                        if (read_testcase(&tc[b])==-1)
+                        {
+                                read_count = b;
+                                noMoreData = true;
+                                break;
+                        }
+                gettimeofday(&end, NULL);
+                aggregateTimeRead += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
+
+                gettimeofday(&start, NULL);
+                for (int b=0;b<read_count;b++)
+                {
+                        result_avxf = compute_full_prob<float>(&tc[b]);
+
+                        #ifdef RUN_HYBRID
+                                #define MIN_ACCEPTED 1e-28f
+                                if (result_avxf < MIN_ACCEPTED) {
+                                      count++;
+                                      result_avxd = compute_full_prob<double>(&tc[b]);
+                                      result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f));
+                                }
+                                else
+                                      result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
+                        #endif
+
+                        #ifndef RUN_HYBRID
+                                result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
+                        #endif
+
+                }
+                gettimeofday(&end, NULL);
+                aggregateTimeCompute += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
+
+                gettimeofday(&start, NULL);
+                for (int b=0;b<read_count;b++)
+                        printf("%E\n", result[b]);
+                gettimeofday(&end, NULL);
+                aggregateTimeWrite += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
+
+        }
+
+        printf("AVX Read Time: %ld\n", aggregateTimeRead);
+        printf("AVX Compute Time: %ld\n", aggregateTimeCompute);
+        printf("AVX Write Time: %ld\n", aggregateTimeWrite);
+        printf("AVX Total Time: %ld\n", aggregateTimeRead + aggregateTimeCompute + aggregateTimeWrite);
+        printf("# Double called: %d\n", count);
+
+        return 0;
+}
+
diff --git a/pairhmm-template-kernel.cc b/pairhmm-template-kernel.cc
new file mode 100644
index 000000000..dd71373b0
--- /dev/null
+++ b/pairhmm-template-kernel.cc
@@ -0,0 +1,399 @@
+#ifdef PRECISION
+
+#include <stdint.h>
+#include <assert.h>
+#include <stdlib.h>
+
+/*
+template <class T>
+string getBinaryStr (T val, int numBitsToWrite) {
+  
+  ostringstream oss ;
+  uint64_t mask = ((T) 0x1) << (numBitsToWrite-1) ;
+  for (int i=numBitsToWrite-1; i >= 0; --i) {
+    oss << ((val & mask) >> i) ;
+    mask >>= 1 ;
+  }
+  return oss.str() ;
+}
+*/
+
+void GEN_INTRINSIC(precompute_masks_, PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) {
+  
+  const int maskBitCnt = MAIN_TYPE_SIZE ;
+
+  for (int vi=0; vi < numMaskVecs; ++vi) {
+    for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) {
+      maskArr[vi][rs] = 0 ;
+    }
+    maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ;
+  }
+ 
+  for (int col=1; col < COLS; ++col) {
+    int mIndex = (col-1) / maskBitCnt ;
+    int mOffset = (col-1) % maskBitCnt ;
+    MASK_TYPE bitMask = ((MASK_TYPE)0x1) << (maskBitCnt-1-mOffset) ;
+
+    char hapChar = tc.hap[col-1] ;
+
+    if (hapChar == AMBIG_CHAR) {
+      for (int ci=0; ci < NUM_DISTINCT_CHARS; ++ci) 
+	maskArr[mIndex][ci] |= bitMask ;
+    } 
+
+    maskArr[mIndex][ConvertChar::get(hapChar)] |= bitMask ;
+    // bit corresponding to col 1 will be the MSB of the mask 0
+    // bit corresponding to col 2 will be the MSB-1 of the mask 0
+    // ...
+    // bit corresponding to col 32 will be the LSB of the mask 0
+    // bit corresponding to col 33 will be the MSB of the mask 1
+    // ...
+  }
+
+}
+
+void GEN_INTRINSIC(init_masks_for_row_, PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess) {
+
+  for (int ri=0; ri < numRowsToProcess; ++ri) {
+    rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ;
+  }
+
+  for (int ei=0; ei < AVX_LENGTH; ++ei) {
+    lastMaskShiftOut[ei] = 0 ;
+  }
+}
+
+#define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \
+  MASK_TYPE __bitMask = (((MASK_TYPE)0x1) << __shiftBy) - 1 ;			\
+  MASK_TYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \
+  __dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ;		\
+  __lastShiftOut = __nextShiftOut ;					\
+}
+
+
+void GEN_INTRINSIC(update_masks_for_cols_, PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt) {
+
+  for (int ei=0; ei < AVX_LENGTH/2; ++ei) {
+    SET_MASK_WORD(currMaskVecLow.masks[ei], maskArr[maskIndex][rsArr[ei]], 
+		  lastMaskShiftOut[ei], ei, maskBitCnt) ;
+    
+    int ei2 = ei + AVX_LENGTH/2 ; // the second entry index
+    SET_MASK_WORD(currMaskVecHigh.masks[ei], maskArr[maskIndex][rsArr[ei2]], 
+		  lastMaskShiftOut[ei2], ei2, maskBitCnt) ;
+  }
+
+}
+
+//void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen, const _256_TYPE& distmSel, int firstRowIndex, int lastRowIndex) {
+
+inline void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen) {
+  //#define computeDistVec() {					      
+
+  _256_TYPE maskV ;
+  VEC_SSE_TO_AVX(currMaskVecLow.vecf, currMaskVecHigh.vecf, maskV) ;
+
+  distmChosen = VEC_BLENDV(distm, _1_distm, maskV) ;
+
+  /*COMPARE_VECS(distmChosen, distmSel, firstRowIndex, lastRowIndex) ;*/
+
+  VEC_SHIFT_LEFT_1BIT(currMaskVecLow.vec) ;
+  VEC_SHIFT_LEFT_1BIT(currMaskVecHigh.vec) ;
+}								     
+
+
+/*
+template<class NUMBER>
+struct HmmData {
+  int ROWS ;
+  int COLS ;
+  
+  NUMBER shiftOutM[MROWS+MCOLS+AVX_LENGTH], shiftOutX[MROWS+MCOLS+AVX_LENGTH], shiftOutY[MROWS+MCOLS+AVX_LENGTH] ;
+  Context<NUMBER> ctx ;
+  testcase* tc ;
+  _256_TYPE p_MM[MAVX_COUNT], p_GAPM[MAVX_COUNT], p_MX[MAVX_COUNT], p_XX[MAVX_COUNT], p_MY[MAVX_COUNT], p_YY[MAVX_COUNT], distm1D[MAVX_COUNT] ;
+  _256_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY ;
+
+  UNION_TYPE M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y ;
+  UNION_TYPE rs , rsN ;
+  _256_TYPE distmSel;
+  _256_TYPE distm, _1_distm;
+
+} ;
+*/
+
+template<class NUMBER> void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context<NUMBER> ctx, testcase *tc,  _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D)
+{
+	NUMBER zero = ctx._(0.0);
+        NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
+        for (int s=0;s<ROWS+COLS+AVX_LENGTH;s++)
+        {
+                shiftOutM[s] = zero;
+                shiftOutX[s] = zero;
+                shiftOutY[s] = init_Y;
+        }
+
+        NUMBER *ptr_p_MM = (NUMBER *)p_MM;
+        NUMBER *ptr_p_XX = (NUMBER *)p_XX;
+        NUMBER *ptr_p_YY = (NUMBER *)p_YY;
+        NUMBER *ptr_p_MX = (NUMBER *)p_MX;
+        NUMBER *ptr_p_MY = (NUMBER *)p_MY;
+        NUMBER *ptr_p_GAPM = (NUMBER *)p_GAPM;
+
+        *ptr_p_MM = ctx._(0.0);
+        *ptr_p_XX = ctx._(0.0);
+        *ptr_p_YY = ctx._(0.0);
+        *ptr_p_MX = ctx._(0.0);
+        *ptr_p_MY = ctx._(0.0);
+        *ptr_p_GAPM = ctx._(0.0);
+
+
+        for (int r = 1; r < ROWS; r++)
+        {
+                int _i = tc->i[r-1] & 127;
+                int _d = tc->d[r-1] & 127;
+                int _c = tc->c[r-1] & 127;
+
+                *(ptr_p_MM+r-1) = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127];
+                *(ptr_p_GAPM+r-1) = ctx._(1.0) - ctx.ph2pr[_c];
+                *(ptr_p_MX+r-1) = ctx.ph2pr[_i];
+                *(ptr_p_XX+r-1) = ctx.ph2pr[_c];
+                *(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
+                *(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
+        }
+
+        NUMBER *ptr_distm1D = (NUMBER *)distm1D;
+        for (int r = 1; r < ROWS; r++)
+        {
+                int _q = tc->q[r-1] & 127;
+                ptr_distm1D[r-1] = ctx.ph2pr[_q];
+        }
+}
+
+
+template<class NUMBER> inline void GEN_INTRINSIC(stripINITIALIZATION, PRECISION)(
+			int stripIdx, Context<NUMBER> ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY,
+			_256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm,  _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM , 
+			_256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, 
+			UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM)	 
+{
+	int i = stripIdx;
+        pGAPM = p_GAPM[i];                                               
+        pMM   = p_MM[i];                                                 
+        pMX   = p_MX[i];                                                 
+        pXX   = p_XX[i];                                                 
+        pMY   = p_MY[i];                                                 
+        pYY   = p_YY[i];               
+
+	NUMBER zero = ctx._(0.0);        
+	NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
+	UNION_TYPE packed1;  packed1.d = VEC_SET1_VAL(1.0);
+	
+        /* compare rs and N */                                           
+        //rs = VEC_LDPOPCVT_CHAR((tc->irs+i*AVX_LENGTH));        
+        //rsN.d = VEC_CMP_EQ(N_packed256, rs);                             
+                                                                         
+        distm = distm1D[i];                                    
+        _1_distm = VEC_SUB(packed1.d, distm);                  
+                                                                         
+        /* initialize M_t_2, M_t_1, X_t_2, X_t_1, Y_t_2, Y_t_1 */        
+        M_t_2.d = VEC_SET1_VAL(zero);                                    
+        X_t_2.d = VEC_SET1_VAL(zero);                                    
+                                                                         
+        if (i==0) {                                                      
+                M_t_1.d = VEC_SET1_VAL(zero);                            
+                X_t_1.d = VEC_SET1_VAL(zero);                            
+                Y_t_2.d = VEC_SET_LSE(init_Y);                           
+                Y_t_1.d = VEC_SET1_VAL(zero);                            
+        }                                                                
+        else {                                                           
+                X_t_1.d = VEC_SET_LSE(shiftOutX[AVX_LENGTH]);            
+                M_t_1.d = VEC_SET_LSE(shiftOutM[AVX_LENGTH]);            
+                Y_t_2.d = VEC_SET1_VAL(zero);                            
+                Y_t_1.d = VEC_SET1_VAL(zero);                            
+        }                                                                
+        M_t_1_y = M_t_1;
+}        
+
+
+
+inline _256_TYPE GEN_INTRINSIC(computeDISTM, PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256, 
+						_256_TYPE distm, _256_TYPE _1_distm)
+{
+        UNION_TYPE hapN, rshap;
+        _256_TYPE  cond;
+	IF_32 shiftInHap;
+
+	int *hap_ptr = tc->ihap;
+
+        shiftInHap.i = (d<COLS) ? hap_ptr[d-1] : hap_ptr[COLS-1];              
+
+        /* shift hap */                                                        
+        SHIFT_HAP(hap, shiftInHap);                                            
+        _256_TYPE hapF = VEC_CVT_128_256(hap);                                 
+
+        rshap.d = VEC_CMP_EQ(rs, hapF);                                        
+        hapN.d  = VEC_CMP_EQ(N_packed256, hapF);                               
+
+        /* OR rsN, rshap, hapN */                                              
+        cond =  VEC_OR(rsN.d, rshap.d);                                        
+        cond =  VEC_OR(cond, hapN.d);                                          
+        
+        /* distm1D = (cond) ? 1-distm1D : distm1D;  */                         
+        _256_TYPE distmSel = VEC_BLENDV(distm, _1_distm, cond);
+        
+        return distmSel;
+}        
+
+
+inline void GEN_INTRINSIC(computeMXY, PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, 
+			UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1, 
+			_256_TYPE pMM, _256_TYPE pGAPM, _256_TYPE pMX, _256_TYPE pXX, _256_TYPE pMY, _256_TYPE pYY, _256_TYPE distmSel)
+{
+	/* Compute M_t <= distm * (p_MM*M_t_2 + p_GAPM*X_t_2 + p_GAPM*Y_t_2) */
+	M_t.d = VEC_MUL(VEC_ADD(VEC_ADD(VEC_MUL(M_t_2.d, pMM), VEC_MUL(X_t_2.d, pGAPM)), VEC_MUL(Y_t_2.d, pGAPM)), distmSel);
+	M_t_y = M_t;
+
+	/* Compute X_t */
+	X_t.d = VEC_ADD(VEC_MUL(M_t_1.d, pMX) , VEC_MUL(X_t_1.d, pXX));
+
+	/* Compute Y_t */
+	Y_t.d = VEC_ADD(VEC_MUL(M_t_1_y.d, pMY) , VEC_MUL(Y_t_1.d, pYY));
+}
+
+template<class NUMBER> NUMBER GEN_INTRINSIC(compute_full_prob_avx, PRECISION) (testcase *tc, NUMBER *before_last_log = NULL)
+{
+        _256_TYPE p_MM   [MAVX_COUNT], p_GAPM [MAVX_COUNT], p_MX   [MAVX_COUNT];
+	_256_TYPE p_XX   [MAVX_COUNT], p_MY   [MAVX_COUNT], p_YY   [MAVX_COUNT];
+	_256_TYPE distm1D[MAVX_COUNT];
+	NUMBER shiftOutM[MROWS+MCOLS+AVX_LENGTH], shiftOutX[MROWS+MCOLS+AVX_LENGTH], shiftOutY[MROWS+MCOLS+AVX_LENGTH];
+	UNION_TYPE  M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y;
+	_256_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY;
+
+        struct timeval start, end;
+        NUMBER result_avx2;
+        Context<NUMBER> ctx;
+        UNION_TYPE rs , rsN;
+        HAP_TYPE hap;
+        _256_TYPE distmSel, distmChosen ;
+	_256_TYPE distm, _1_distm;
+
+        int r, c;
+        int ROWS = tc->rslen + 1;
+        int COLS = tc->haplen + 1;
+        int AVX_COUNT = (ROWS+7)/8;
+        NUMBER zero = ctx._(0.0);
+        UNION_TYPE packed1;  packed1.d = VEC_SET1_VAL(1.0);
+	_256_TYPE N_packed256 = VEC_POPCVT_CHAR('N');
+        NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
+        int remainingRows = (ROWS-1) % AVX_LENGTH;
+        int strip_cnt = ((ROWS-1) / AVX_LENGTH) + (remainingRows!=0);
+
+	const int maskBitCnt = MAIN_TYPE_SIZE ;
+	const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function
+
+	MASK_TYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ;
+	GEN_INTRINSIC(precompute_masks_, PRECISION)(*tc, COLS, numMaskVecs, maskArr) ;
+
+	char rsArr[AVX_LENGTH] ;
+	MASK_TYPE lastMaskShiftOut[AVX_LENGTH] ;
+
+	GEN_INTRINSIC(initializeVectors, PRECISION)<NUMBER>(ROWS, COLS, shiftOutM, shiftOutX, shiftOutY, 
+							    ctx, tc, p_MM, p_GAPM, p_MX, p_XX, p_MY, p_YY, distm1D);
+
+	//for (int __ii=0; __ii < 10; ++__ii)
+        for (int i=0;i<strip_cnt-1;i++)
+        {
+		//STRIP_INITIALIZATION
+		GEN_INTRINSIC(stripINITIALIZATION, PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM ,
+							      p_MX, p_XX , p_MY, p_YY, M_t_2, X_t_2, M_t_1, X_t_1, Y_t_2, Y_t_1, M_t_1_y, shiftOutX, shiftOutM);
+
+		GEN_INTRINSIC(init_masks_for_row_, PRECISION)(*tc, rsArr, lastMaskShiftOut,
+							      i*AVX_LENGTH+1, AVX_LENGTH) ;
+		// Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
+		MASK_VEC currMaskVecLow ; // corresponding to lower half
+		MASK_VEC currMaskVecHigh ; // corresponding to upper half
+
+                for (int d=1;d<COLS+AVX_LENGTH;d++)
+                {
+		        if (d % MAIN_TYPE_SIZE == 1)
+			  GEN_INTRINSIC(update_masks_for_cols_, PRECISION)((d-1)/MAIN_TYPE_SIZE, currMaskVecLow, currMaskVecHigh, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;
+
+			int ShiftIdx = d+AVX_LENGTH;
+			//distmSel = GEN_INTRINSIC(computeDISTM, PRECISION)(d, COLS, tc, hap, rs.d, rsN, N_packed256, distm, _1_distm);
+
+			//int firstRowIndex = (d < COLS) ? 0 : (d-COLS+1) ;
+			//int lastRowIndex = std::min(d-1, AVX_LENGTH-1) ;
+			//GEN_INTRINSIC(computeDistVec, PRECISION) (currMaskVecLow, currMaskVecHigh, distm, _1_distm, distmChosen, distmSel, firstRowIndex, lastRowIndex) ;
+
+			GEN_INTRINSIC(computeDistVec, PRECISION) (currMaskVecLow, currMaskVecHigh, distm, _1_distm, distmChosen) ;
+			
+
+			GEN_INTRINSIC(computeMXY, PRECISION)(M_t, X_t, Y_t, M_t_y, M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, M_t_1_y, Y_t_1, 
+								pMM, pGAPM, pMX, pXX, pMY, pYY, distmChosen);
+
+                        GEN_INTRINSIC(_vector_shift, PRECISION)(M_t, shiftOutM[ShiftIdx], shiftOutM[d]);
+
+                        GEN_INTRINSIC(_vector_shift, PRECISION)(X_t, shiftOutX[ShiftIdx], shiftOutX[d]);
+
+                        GEN_INTRINSIC(_vector_shift, PRECISION)(Y_t_1, shiftOutY[ShiftIdx], shiftOutY[d]);
+
+                        M_t_2 = M_t_1; M_t_1 = M_t; X_t_2 = X_t_1; X_t_1 = X_t;
+                        Y_t_2 = Y_t_1; Y_t_1 = Y_t; M_t_1_y = M_t_y;
+
+		}
+	}
+
+        int i = strip_cnt-1;
+        {
+		//STRIP_INITIALIZATION
+		GEN_INTRINSIC(stripINITIALIZATION, PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM ,
+                        p_MX, p_XX , p_MY, p_YY, M_t_2, X_t_2, M_t_1, X_t_1, Y_t_2, Y_t_1, M_t_1_y, shiftOutX, shiftOutM);
+
+                if (remainingRows==0) remainingRows=AVX_LENGTH;
+
+		GEN_INTRINSIC(init_masks_for_row_, PRECISION)(*tc, rsArr, lastMaskShiftOut,
+							      i*AVX_LENGTH+1, remainingRows) ;
+
+                _256_TYPE sumM, sumX;
+                sumM = VEC_SET1_VAL(zero);
+                sumX = VEC_SET1_VAL(zero);
+
+		// Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
+		MASK_VEC currMaskVecLow ; // corresponding to lower half
+		MASK_VEC currMaskVecHigh ; // corresponding to upper half
+
+                for (int d=1;d<COLS+remainingRows-1;d++)
+                {
+
+  	 	        if (d % MAIN_TYPE_SIZE == 1)
+			  GEN_INTRINSIC(update_masks_for_cols_, PRECISION)((d-1)/MAIN_TYPE_SIZE, currMaskVecLow, currMaskVecHigh, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;
+
+			int ShiftIdx = d+AVX_LENGTH;
+			//distmSel = GEN_INTRINSIC(computeDISTM, PRECISION)(d, COLS, tc, hap, rs.d, rsN, N_packed256, distm, _1_distm);
+			GEN_INTRINSIC(computeDistVec, PRECISION) (currMaskVecLow, currMaskVecHigh, distm, _1_distm, distmChosen) ;
+
+			GEN_INTRINSIC(computeMXY, PRECISION)(M_t, X_t, Y_t, M_t_y, M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, M_t_1_y, Y_t_1,
+					                        pMM, pGAPM, pMX, pXX, pMY, pYY, distmChosen);
+
+                        sumM  = VEC_ADD(sumM, M_t.d);
+                        GEN_INTRINSIC(_vector_shift_last, PRECISION)(M_t, shiftOutM[ShiftIdx]);
+
+                        sumX  = VEC_ADD(sumX, X_t.d);
+                        GEN_INTRINSIC(_vector_shift_last, PRECISION)(X_t, shiftOutX[ShiftIdx]);
+
+                        GEN_INTRINSIC(_vector_shift_last, PRECISION)(Y_t_1, shiftOutY[ShiftIdx]);
+
+                        M_t_2 = M_t_1; M_t_1 = M_t; X_t_2 = X_t_1; X_t_1 = X_t;
+                        Y_t_2 = Y_t_1; Y_t_1 = Y_t; M_t_1_y = M_t_y;
+
+                }
+                UNION_TYPE sumMX;
+                sumMX.d = VEC_ADD(sumM, sumX);
+                result_avx2 = sumMX.f[remainingRows-1];		
+	}
+	//printf("result_avx2: %f\n", result_avx2);
+	return result_avx2;
+}
+
+#endif
+
diff --git a/pairhmm-template-main.cc b/pairhmm-template-main.cc
new file mode 100644
index 000000000..f20755b6b
--- /dev/null
+++ b/pairhmm-template-main.cc
@@ -0,0 +1,114 @@
+#include <stdio.h>
+#include <immintrin.h>
+#include <emmintrin.h>
+#include <omp.h>
+
+#include "template.h"
+
+#include "define-float.h"
+#include "shift_template.c"
+#include "pairhmm-template-kernel.cc"
+
+#include "define-double.h"
+#include "shift_template.c"
+#include "pairhmm-template-kernel.cc"
+
+#define BATCH_SIZE  10000
+//#define RUN_HYBRID
+
+uint8_t ConvertChar::conversionTable[255] ;
+int thread_level_parallelism_enabled = false ;
+
+double getCurrClk() {
+  struct timeval tv ;
+  gettimeofday(&tv, NULL);
+  return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
+}
+
+int main()
+{
+        testcase tc[BATCH_SIZE];
+        float result[BATCH_SIZE], result_avxf;
+        double result_avxd;
+        //struct timeval start, end;
+	double lastClk = 0.0 ;
+        double aggregateTimeRead = 0.0;
+        double aggregateTimeCompute = 0.0;
+        double aggregateTimeWrite = 0.0;
+
+	// Need to call it once to initialize the static array
+	ConvertChar::init() ;
+	
+
+	char* ompEnvVar = getenv("OMP_NUM_THREADS") ;
+	if (ompEnvVar != NULL && ompEnvVar != "" && ompEnvVar != "1" ) {
+	  thread_level_parallelism_enabled = true ;
+	}
+
+        bool noMoreData = false;
+        int count =0;
+        while (!noMoreData)
+        {
+                int read_count = BATCH_SIZE;
+ 
+		lastClk = getCurrClk() ;
+                for (int b=0;b<BATCH_SIZE;b++)
+                        if (read_testcase(&tc[b])==-1)
+                        {
+                                read_count = b;
+                                noMoreData = true;
+                                break;
+                        }
+                //gettimeofday(&end, NULL);
+                aggregateTimeRead += (getCurrClk() - lastClk) ;
+		//((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
+
+                //gettimeofday(&start, NULL);
+		lastClk = getCurrClk() ;
+
+#pragma omp parallel for schedule(dynamic) if(thread_level_parallelism_enabled)
+                for (int b=0;b<read_count;b++)
+                {
+                        result_avxf = GEN_INTRINSIC(compute_full_prob_avx, s)<float>(&tc[b]);
+
+                        #ifdef RUN_HYBRID
+                                #define MIN_ACCEPTED 1e-28f
+                                if (result_avxf < MIN_ACCEPTED) {
+                                      count++;
+                                      result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&tc[b]);
+                                      result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f));
+                                }
+                                else
+                                      result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
+                        #endif
+
+                        #ifndef RUN_HYBRID
+                                result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
+                        #endif
+
+                }
+                //gettimeofday(&end, NULL);
+                aggregateTimeCompute += (getCurrClk() - lastClk) ;
+		//((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
+
+                //gettimeofday(&start, NULL);
+		lastClk = getCurrClk() ;
+                for (int b=0;b<read_count;b++)
+                        printf("%E\n", result[b]);
+                //gettimeofday(&end, NULL);
+                aggregateTimeWrite += (getCurrClk() - lastClk) ;
+		//((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
+
+        }
+
+        printf("AVX Read Time: %.2f\n", aggregateTimeRead);
+        printf("AVX Compute Time: %.2f\n", aggregateTimeCompute);
+        printf("AVX Write Time: %.2f\n", aggregateTimeWrite);
+        printf("AVX Total Time: %.2f\n", aggregateTimeRead + aggregateTimeCompute + aggregateTimeWrite);
+        printf("# Double called: %d\n", count);
+
+        return 0;
+}
+
+
+
diff --git a/shift_template.c b/shift_template.c
new file mode 100644
index 000000000..90160dcb3
--- /dev/null
+++ b/shift_template.c
@@ -0,0 +1,68 @@
+#ifdef PRECISION 
+
+#include "template.h"
+
+
+inline void GEN_INTRINSIC(_vector_shift, PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut)
+{
+
+        IF_128 xlow , xhigh;
+        /* cast x to xlow */
+        xlow.f = VEC_CAST_256_128(x.d);
+        /* extract x,1 to xhigh */
+        xhigh.f = VEC_EXTRACT_128(x.d, 1);
+        /* extract xlow[3] */
+        IF_128 shiftOutL128;
+        shiftOutL128.i = _mm_srli_si128(xlow.i, SHIFT_CONST1);
+        /* extract xhigh[3] */
+        IF_MAIN_TYPE shiftOutH;
+        shiftOutH.i = VEC_EXTRACT_UNIT(xhigh.i, SHIFT_CONST2);
+        shiftOut = shiftOutH.f;
+        /* shift xlow */
+        xlow.i = _mm_slli_si128 (xlow.i, SHIFT_CONST3);
+        /* shift xhigh */
+        xhigh.i = _mm_slli_si128 (xhigh.i, SHIFT_CONST3);
+        /*movss shiftIn to xlow[0] */
+        _128_TYPE shiftIn128 = VEC_SET1_VAL128(shiftIn);
+        xlow.f = VEC_MOVE(xlow.f , shiftIn128);
+        /*movss xlow[3] to xhigh[0] */
+        xhigh.f = VEC_MOVE(xhigh.f, shiftOutL128.f);
+        /* cast xlow to x */
+        x.d = VEC_CAST_128_256(xlow.f);
+        /* insert xhigh to x,1 */
+        x.d = VEC_INSERT_VAL(x.d, xhigh.f, 1);
+}
+
+inline void GEN_INTRINSIC(_vector_shift_last, PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn)
+{
+
+        IF_128 xlow , xhigh;
+        /* cast x to xlow */
+        xlow.f = VEC_CAST_256_128(x.d);
+        /* extract x,1 to xhigh */
+        xhigh.f = VEC_EXTRACT_128(x.d, 1);
+        /* extract xlow[3] */
+        IF_128 shiftOutL128;
+        shiftOutL128.i = _mm_srli_si128(xlow.i, SHIFT_CONST1);
+        /* extract xhigh[3] */
+//        IF_MAIN_TYPE shiftOutH;
+//        shiftOutH.i = VEC_EXTRACT_UNIT(xhigh.i, SHIFT_CONST2);
+//        shiftOut = shiftOutH.f;
+        /* shift xlow */
+        xlow.i = _mm_slli_si128 (xlow.i, SHIFT_CONST3);
+        /* shift xhigh */
+        xhigh.i = _mm_slli_si128 (xhigh.i, SHIFT_CONST3);
+        /*movss shiftIn to xlow[0] */
+        _128_TYPE shiftIn128 = VEC_SET1_VAL128(shiftIn);
+        xlow.f = VEC_MOVE(xlow.f , shiftIn128);
+        /*movss xlow[3] to xhigh[0] */
+        xhigh.f = VEC_MOVE(xhigh.f, shiftOutL128.f);
+        /* cast xlow to x */
+        x.d = VEC_CAST_128_256(xlow.f);
+        /* insert xhigh to x,1 */
+        x.d = VEC_INSERT_VAL(x.d, xhigh.f, 1);
+}
+
+
+#endif
+
diff --git a/template.h b/template.h
new file mode 100644
index 000000000..ee8036a11
--- /dev/null
+++ b/template.h
@@ -0,0 +1,226 @@
+#ifndef TEMPLATES_H_
+#define TEMPLATES_H_
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+#include <assert.h>
+
+#include <sys/time.h>
+
+#include <immintrin.h>
+
+#include <ctype.h>
+
+#define MROWS  500
+#define MCOLS  1000
+
+#define CAT(X,Y) X####Y
+#define GEN_INTRINSIC(X,Y) CAT(X,Y)
+
+#define ALIGNED __attribute__((aligned(32)))
+
+typedef union __attribute__((aligned(32))) {
+        ALIGNED __m256 ALIGNED d;
+        ALIGNED __m128i ALIGNED s[2];
+        ALIGNED float  ALIGNED f[8];
+        ALIGNED __m256i ALIGNED i;
+} ALIGNED mix_F ALIGNED;
+
+typedef union ALIGNED {
+  __m128i vec ;
+  __m128 vecf ;
+  uint32_t masks[4] ;
+} MaskVec_F ;
+
+typedef union ALIGNED
+{
+        ALIGNED __m128i ALIGNED i;
+        ALIGNED __m128 ALIGNED f;
+} ALIGNED IF_128f ALIGNED;
+
+typedef union ALIGNED
+{
+        ALIGNED int    ALIGNED i;
+        ALIGNED float  ALIGNED f;
+} ALIGNED IF_32 ALIGNED;
+
+typedef union __attribute__((aligned(32))) {
+        ALIGNED __m256d ALIGNED d;
+        ALIGNED __m128i ALIGNED s[2];
+        ALIGNED double  ALIGNED f[4];
+        ALIGNED __m256i ALIGNED i;
+} ALIGNED mix_D ALIGNED;
+
+typedef union ALIGNED {
+  __m128i vec ;
+  __m128d vecf ;
+  uint64_t masks[2] ;
+} MaskVec_D ;
+
+typedef union ALIGNED
+{
+        ALIGNED __m128i ALIGNED i;
+        ALIGNED __m128d ALIGNED f;
+} ALIGNED IF_128d ALIGNED;
+
+typedef union ALIGNED
+{
+        ALIGNED int64_t ALIGNED i;
+        ALIGNED double  ALIGNED f;
+} ALIGNED IF_64 ALIGNED;
+
+template<class T>
+struct Context{};
+
+template<>
+struct Context<double>
+{
+        Context()
+        {
+                for (int x = 0; x < 128; x++)
+                        ph2pr[x] = pow(10.0, -((double)x) / 10.0);
+
+                INITIAL_CONSTANT = ldexp(1.0, 1020.0);
+                LOG10_INITIAL_CONSTANT = log10(INITIAL_CONSTANT);
+                RESULT_THRESHOLD = 0.0;
+        }
+
+        double LOG10(double v){ return log10(v); }
+
+        static double _(double n){ return n; }
+        static double _(float n){ return ((double) n); }
+        double ph2pr[128];
+        double INITIAL_CONSTANT;
+        double LOG10_INITIAL_CONSTANT;
+        double RESULT_THRESHOLD;
+};
+
+template<>
+struct Context<float>
+{
+        Context()
+        {
+                for (int x = 0; x < 128; x++)
+                {
+                        ph2pr[x] = powf(10.f, -((float)x) / 10.f);
+                }
+
+                INITIAL_CONSTANT = ldexpf(1.f, 120.f);
+                LOG10_INITIAL_CONSTANT = log10f(INITIAL_CONSTANT);
+                RESULT_THRESHOLD = ldexpf(1.f, -110.f);
+        }
+
+        float LOG10(float v){ return log10f(v); }
+
+        static float _(double n){ return ((float) n); }
+        static float _(float n){ return n; }
+        float ph2pr[128];
+        float INITIAL_CONSTANT;
+        float LOG10_INITIAL_CONSTANT;
+        float RESULT_THRESHOLD;
+};
+
+
+typedef struct
+{
+        int rslen, haplen, *q, *i, *d, *c;
+        char *hap, *rs;
+        int *ihap;
+        int *irs;
+} testcase;
+
+int normalize(char c);
+
+
+int read_testcase(testcase *tc)
+{
+        char *q, *i, *d, *c, *line = NULL;
+        int _q, _i, _d, _c;
+        int x, size = 0;
+        ssize_t read;
+
+        read = getline(&line, (size_t *) &size, stdin);
+        if (read == -1)
+                return -1;
+
+        tc->hap = (char *) malloc(size);
+        tc->rs = (char *) malloc(size);
+        q = (char *) malloc(size);
+        i = (char *) malloc(size);
+        d = (char *) malloc(size);
+        c = (char *) malloc(size);
+        tc->ihap = (int *) malloc(size*sizeof(int));
+        tc->irs = (int *) malloc(size*sizeof(int));
+
+
+        if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6)
+                return -1;
+
+        tc->haplen = strlen(tc->hap);
+        tc->rslen = strlen(tc->rs);
+        tc->q = (int *) malloc(sizeof(int) * tc->rslen);
+        tc->i = (int *) malloc(sizeof(int) * tc->rslen);
+        tc->d = (int *) malloc(sizeof(int) * tc->rslen);
+        tc->c = (int *) malloc(sizeof(int) * tc->rslen);
+
+        tc->ihap = (int *) malloc(tc->haplen*sizeof(int));
+        tc->irs = (int *) malloc(tc->rslen*sizeof(int));
+
+        for (x = 0; x < tc->rslen; x++)
+        {
+                _q = normalize(q[x]);
+                _i = normalize(i[x]);
+                _d = normalize(d[x]);
+                _c = normalize(c[x]);
+                tc->q[x] = (_q < 6) ? 6 : _q;
+                tc->i[x] = _i;
+                tc->d[x] = _d;
+                tc->c[x] = _c;
+                tc->irs[x] = tc->rs[x];
+        }
+
+        for (x = 0; x < tc->haplen; x++)
+                tc->ihap[x] = tc->hap[x];
+
+        free(q);
+        free(i);
+        free(d);
+        free(c);
+        free(line);
+
+        return 0;
+}
+
+#define NUM_DISTINCT_CHARS 5
+#define AMBIG_CHAR 4
+
+class ConvertChar {
+
+  static uint8_t conversionTable[255] ;
+
+public:
+
+  static void init() {
+    assert (NUM_DISTINCT_CHARS == 5) ;
+    assert (AMBIG_CHAR == 4) ;
+
+    conversionTable['A'] = 0 ;
+    conversionTable['C'] = 1 ;
+    conversionTable['T'] = 2 ;
+    conversionTable['G'] = 3 ;
+    conversionTable['N'] = 4 ;
+  }
+
+  static inline uint8_t get(uint8_t input) {
+    return conversionTable[input] ;
+  }
+
+} ;
+
+
+#endif
+
+

From 8240ea826ebe1f47ba7548cb8971f4d5803b0fbb Mon Sep 17 00:00:00 2001
From: Karthik Gururaj <karthik.gururaj@intel.com>
Date: Wed, 15 Jan 2014 10:48:58 -0800
Subject: [PATCH 2/6] Changes: 1. Added TRISTATE_CORRECTION in
 pairhmm-template-kernel.cc (function stripINITIALIZATION) 2. Added VEC_DIV
 macros to define-double.h and define-float.h 3. Edited initializeVectors to
 match Java C++ original: *(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) :
 ctx.ph2pr[_d]; *(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) :
 ctx.ph2pr[_c]; Modified: *(ptr_p_MY+r-1) = ctx.ph2pr[_d]; *(ptr_p_YY+r-1) =
 ctx.ph2pr[_c];

---
 .gitignore                                    |  2 ++
 Makefile                                      | 19 +++++++------
 define-double.h                               |  4 +++
 define-float.h                                |  4 +++
 hmm_mask.cc                                   |  3 +-
 ...e_sting_utils_pairhmm_JNILoglessPairHMM.cc | 28 +++++++++----------
 pairhmm-template-kernel.cc                    | 24 ++++++++++++++--
 pairhmm-template-main.cc                      |  2 +-
 template.h                                    | 15 ++++++++++
 9 files changed, 72 insertions(+), 29 deletions(-)

diff --git a/.gitignore b/.gitignore
index 16e86f25d..dcc44daa4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,5 @@
 tests
 .deps
 hmm_Mohammad
+pairhmm-template-main
+*.swp
diff --git a/Makefile b/Makefile
index df5dbc463..f858a7241 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-#OMPCFLAGS=-fopenmp
+OMPCFLAGS=-fopenmp
 #OMPLDFLAGS=-lgomp
 
 #CFLAGS=-O2 -std=c++11 -W -Wall -march=corei7-avx -Wa,-q            -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas
@@ -7,7 +7,7 @@
 JAVA_ROOT=/opt/jdk1.7.0_25/
 JNI_COMPILATION_FLAGS=-D_REENTRANT -fPIC -I${JAVA_ROOT}/include -I${JAVA_ROOT}/include/linux
 
-CFLAGS=-g -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas -xAVX
+CFLAGS=-O3 -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas -xAVX
 
 CXXFLAGS=$(CFLAGS)
 CC=icc
@@ -16,11 +16,12 @@ CXX=icc
 LDFLAGS=-lm $(OMPLDFLAGS)
 
 #BIN:=pairhmm-1-base #pairhmm-2-omp pairhmm-3-hybrid-float-double pairhmm-4-hybrid-diagonal pairhmm-5-hybrid-diagonal-homogeneus pairhmm-6-onlythreediags pairhmm-7-presse pairhmm-8-sse #pairhmm-dev
-BIN:=libJNILoglessPairHMM.so #pairhmm-2-omp pairhmm-3-hybrid-float-double pairhmm-4-hybrid-diagonal pairhmm-5-hybrid-diagonal-homogeneus pairhmm-6-onlythreediags pairhmm-7-presse pairhmm-8-sse #pairhmm-dev
+BIN:=libJNILoglessPairHMM.so pairhmm-template-main
 
 #SOURCES=pairhmm-1-base.cc input.cc 
-SOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc hmm_mask.cc
-OBJECTS=$(SOURCES:.cc=.o)
+LIBSOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc hmm_mask.cc
+SOURCES=$(LIBSOURCES) pairhmm-template-main.cc
+LIBOBJECTS=$(LIBSOURCES:.cc=.o)
 DEPDIR=.deps
 DF=$(DEPDIR)/$(*).d
 
@@ -28,11 +29,11 @@ all: $(BIN)
 
 -include $(addprefix $(DEPDIR)/,$(SOURCES:.cc=.d))
 
-pairhmm-1-base:	pairhmm-1-base.o input.o
-	$(CXX) -o $@ $^ $(LDFLAGS)
+pairhmm-template-main:	pairhmm-template-main.o hmm_mask.o
+	$(CXX) -fopenmp -o $@ $^ $(LDFLAGS)
 
-libJNILoglessPairHMM.so: $(OBJECTS) 
-	$(CXX) -shared -o libJNILoglessPairHMM.so $(OBJECTS) 
+libJNILoglessPairHMM.so: $(LIBOBJECTS) 
+	$(CXX) -shared -o $@ $(LIBOBJECTS) 
 
 %.o: %.cc
 	@mkdir -p $(DEPDIR)
diff --git a/define-double.h b/define-double.h
index b2de7520a..5d31741d6 100644
--- a/define-double.h
+++ b/define-double.h
@@ -23,6 +23,7 @@
         #undef VEC_ADD
 	#undef VEC_SUB
         #undef VEC_MUL
+        #undef VEC_DIV
         #undef VEC_BLEND
         #undef VEC_BLENDV
 	#undef VEC_CAST_256_128
@@ -82,6 +83,9 @@
 #define VEC_MUL(__v1, __v2)                     \
   _mm256_mul_pd(__v1, __v2)
 
+#define VEC_DIV(__v1, __v2)                     \
+  _mm256_div_pd(__v1, __v2)
+
 #define VEC_BLEND(__v1, __v2, __mask)           \
   _mm256_blend_pd(__v1, __v2, __mask)
 
diff --git a/define-float.h b/define-float.h
index 4cba9a179..0e2822b2d 100644
--- a/define-float.h
+++ b/define-float.h
@@ -23,6 +23,7 @@
 	#undef VEC_ADD(__v1, __v2)
         #undef VEC_SUB(__v1, __v2)
 	#undef VEC_MUL(__v1, __v2)
+	#undef VEC_DIV(__v1, __v2)
 	#undef VEC_BLEND(__v1, __v2, __mask)
 	#undef VEC_BLENDV(__v1, __v2, __maskV)
         #undef VEC_CAST_256_128(__v1)
@@ -83,6 +84,9 @@
 #define VEC_MUL(__v1, __v2)                     \
   _mm256_mul_ps(__v1, __v2)
 
+#define VEC_DIV(__v1, __v2)                     \
+  _mm256_div_ps(__v1, __v2)
+
 #define VEC_BLEND(__v1, __v2, __mask)           \
   _mm256_blend_ps(__v1, __v2, __mask)
 
diff --git a/hmm_mask.cc b/hmm_mask.cc
index aace7acf2..f128a39d9 100644
--- a/hmm_mask.cc
+++ b/hmm_mask.cc
@@ -438,7 +438,7 @@ void test_mask_computations (testcase& tc, int tcID, bool printDebug=false) {
   //cout << "Finished validating entry " << endl ;
 }
 
-
+#ifdef HMM_MASK_MAIN
 int main () {
 
   #define BATCH_SIZE 10000
@@ -482,3 +482,4 @@ int main () {
 
   return 0 ;
 }
+#endif
diff --git a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
index 80065b8c9..29668a306 100644
--- a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
+++ b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
@@ -15,6 +15,10 @@
 #include <immintrin.h>
 #include <emmintrin.h>
 #include <omp.h>
+using namespace std;
+//#define DEBUG3 1
+#define DEBUG 1
+
 
 #include "template.h"
 
@@ -22,7 +26,6 @@
 #include "shift_template.c"
 #include "pairhmm-template-kernel.cc"
 
-using namespace std;
 
 
 #define MM 0
@@ -32,22 +35,17 @@ using namespace std;
 #define MY 4
 #define YY 5
 
-//#define DEBUG3 1
-#define DEBUG 1
-
-template<class T>
-string to_string(T obj)
+class LoadTimeInitializer
 {
-  stringstream ss;
-  string ret_string;
-  ss.clear();
-  ss << std::scientific << obj;
-  ss >> ret_string;
-  ss.clear();
-  return ret_string;
-}
+  public:
+    LoadTimeInitializer()		//will be called when library is loaded
+    {
+      ConvertChar::init();
+    }
+};
+LoadTimeInitializer g_load_time_initializer;
 
-void debug_dump(string filename, string s, bool to_append, bool add_newline=true)
+void debug_dump(string filename, string s, bool to_append, bool add_newline)
 {
   ofstream fptr;
   fptr.open(filename.c_str(), to_append ? ofstream::app : ofstream::out);
diff --git a/pairhmm-template-kernel.cc b/pairhmm-template-kernel.cc
index dd71373b0..24bcf7d1a 100644
--- a/pairhmm-template-kernel.cc
+++ b/pairhmm-template-kernel.cc
@@ -157,8 +157,18 @@ template<class NUMBER> void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS
                 *(ptr_p_GAPM+r-1) = ctx._(1.0) - ctx.ph2pr[_c];
                 *(ptr_p_MX+r-1) = ctx.ph2pr[_i];
                 *(ptr_p_XX+r-1) = ctx.ph2pr[_c];
-                *(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
-                *(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
+                *(ptr_p_MY+r-1) = ctx.ph2pr[_d];
+                *(ptr_p_YY+r-1) = ctx.ph2pr[_c];
+#ifdef DEBUG3
+		debug_dump("transitions_jni.txt",to_string(*(ptr_p_MM+r-1)  ),true);
+		debug_dump("transitions_jni.txt",to_string(*(ptr_p_GAPM+r-1)),true);
+		debug_dump("transitions_jni.txt",to_string(*(ptr_p_MX+r-1)  ),true);
+		debug_dump("transitions_jni.txt",to_string(*(ptr_p_XX+r-1)  ),true);
+		debug_dump("transitions_jni.txt",to_string(*(ptr_p_MY+r-1)  ),true);
+		debug_dump("transitions_jni.txt",to_string(*(ptr_p_YY+r-1)  ),true);
+#endif
+		//*(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
+		//*(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
         }
 
         NUMBER *ptr_distm1D = (NUMBER *)distm1D;
@@ -166,6 +176,9 @@ template<class NUMBER> void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS
         {
                 int _q = tc->q[r-1] & 127;
                 ptr_distm1D[r-1] = ctx.ph2pr[_q];
+#ifdef DEBUG3
+		debug_dump("priors_jni.txt",to_string(ptr_distm1D[r-1]),true);
+#endif
         }
 }
 
@@ -187,13 +200,18 @@ template<class NUMBER> inline void GEN_INTRINSIC(stripINITIALIZATION, PRECISION)
 	NUMBER zero = ctx._(0.0);        
 	NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
 	UNION_TYPE packed1;  packed1.d = VEC_SET1_VAL(1.0);
+#define TRISTATE_CORRECTION_FACTOR 3.0
+	UNION_TYPE packed3;  packed3.d = VEC_SET1_VAL(TRISTATE_CORRECTION_FACTOR);
 	
         /* compare rs and N */                                           
         //rs = VEC_LDPOPCVT_CHAR((tc->irs+i*AVX_LENGTH));        
         //rsN.d = VEC_CMP_EQ(N_packed256, rs);                             
                                                                          
         distm = distm1D[i];                                    
-        _1_distm = VEC_SUB(packed1.d, distm);                  
+        _1_distm = VEC_SUB(packed1.d, distm);
+#ifndef DO_NOT_USE_TRISTATE_CORRECTION
+	distm = VEC_DIV(distm, packed3.d); 
+#endif
                                                                          
         /* initialize M_t_2, M_t_1, X_t_2, X_t_1, Y_t_2, Y_t_1 */        
         M_t_2.d = VEC_SET1_VAL(zero);                                    
diff --git a/pairhmm-template-main.cc b/pairhmm-template-main.cc
index f20755b6b..87d50eb5b 100644
--- a/pairhmm-template-main.cc
+++ b/pairhmm-template-main.cc
@@ -16,7 +16,7 @@
 #define BATCH_SIZE  10000
 //#define RUN_HYBRID
 
-uint8_t ConvertChar::conversionTable[255] ;
+//uint8_t ConvertChar::conversionTable[255] ;
 int thread_level_parallelism_enabled = false ;
 
 double getCurrClk() {
diff --git a/template.h b/template.h
index ee8036a11..4c8a4b5e3 100644
--- a/template.h
+++ b/template.h
@@ -13,6 +13,7 @@
 #include <immintrin.h>
 
 #include <ctype.h>
+#include <string>
 
 #define MROWS  500
 #define MCOLS  1000
@@ -132,6 +133,20 @@ typedef struct
         int *irs;
 } testcase;
 
+
+template<class T>
+std::string to_string(T obj)
+{
+  stringstream ss;
+  string ret_string;
+  ss.clear();
+  ss << std::scientific << obj;
+  ss >> ret_string;
+  ss.clear();
+  return ret_string;
+}
+void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true);
+
 int normalize(char c);
 
 

From e90405cd1fd1f8d824194f91f971aa32da667cc7 Mon Sep 17 00:00:00 2001
From: Karthik Gururaj <karthik.gururaj@intel.com>
Date: Thu, 16 Jan 2014 19:53:50 -0800
Subject: [PATCH 3/6] 1. Nested loops over reads and haplotypes moved to C++
 through JNI 2. OpenMP support added 3. Using direct access to Java primitive
 arrays 4. Debug messages disabled

---
 .gitignore                                    |   2 +
 Makefile                                      |  17 +-
 convert_char.cc                               | 186 ++++++++
 hmm_mask.cc                                   |  60 +--
 ...e_sting_utils_pairhmm_JNILoglessPairHMM.cc | 417 +++++++++++++++---
 ...te_sting_utils_pairhmm_JNILoglessPairHMM.h |  15 +
 pairhmm-1-base.cc                             | 324 ++++++--------
 pairhmm-template-main.cc                      |   8 +-
 template.h                                    |  79 +---
 9 files changed, 737 insertions(+), 371 deletions(-)
 create mode 100644 convert_char.cc

diff --git a/.gitignore b/.gitignore
index dcc44daa4..cb70b53a7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,5 @@ tests
 hmm_Mohammad
 pairhmm-template-main
 *.swp
+checker
+reformat
diff --git a/Makefile b/Makefile
index f858a7241..70bf96dd3 100644
--- a/Makefile
+++ b/Makefile
@@ -16,11 +16,11 @@ CXX=icc
 LDFLAGS=-lm $(OMPLDFLAGS)
 
 #BIN:=pairhmm-1-base #pairhmm-2-omp pairhmm-3-hybrid-float-double pairhmm-4-hybrid-diagonal pairhmm-5-hybrid-diagonal-homogeneus pairhmm-6-onlythreediags pairhmm-7-presse pairhmm-8-sse #pairhmm-dev
-BIN:=libJNILoglessPairHMM.so pairhmm-template-main
+BIN:=libJNILoglessPairHMM.so pairhmm-template-main checker
 
 #SOURCES=pairhmm-1-base.cc input.cc 
-LIBSOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc hmm_mask.cc
-SOURCES=$(LIBSOURCES) pairhmm-template-main.cc
+LIBSOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc convert_char.cc
+SOURCES=$(LIBSOURCES) pairhmm-template-main.cc pairhmm-1-base.cc
 LIBOBJECTS=$(LIBSOURCES:.cc=.o)
 DEPDIR=.deps
 DF=$(DEPDIR)/$(*).d
@@ -29,11 +29,14 @@ all: $(BIN)
 
 -include $(addprefix $(DEPDIR)/,$(SOURCES:.cc=.d))
 
-pairhmm-template-main:	pairhmm-template-main.o hmm_mask.o
-	$(CXX) -fopenmp -o $@ $^ $(LDFLAGS)
+checker: pairhmm-1-base.o convert_char.o
+	$(CXX) $(OMPCFLAGS) -o $@ $^ $(LDFLAGS)
+
+pairhmm-template-main:	pairhmm-template-main.o convert_char.o
+	$(CXX) $(OMPCFLAGS) -o $@ $^ $(LDFLAGS)
 
 libJNILoglessPairHMM.so: $(LIBOBJECTS) 
-	$(CXX) -shared -o $@ $(LIBOBJECTS) 
+	$(CXX) $(OMPCFLAGS) -shared -o $@ $(LIBOBJECTS) 
 
 %.o: %.cc
 	@mkdir -p $(DEPDIR)
@@ -41,4 +44,4 @@ libJNILoglessPairHMM.so: $(LIBOBJECTS)
 
 
 clean:
-	rm -f $(BIN) *.o
+	rm -rf $(BIN) *.o $(DEPDIR)
diff --git a/convert_char.cc b/convert_char.cc
new file mode 100644
index 000000000..01c5a5137
--- /dev/null
+++ b/convert_char.cc
@@ -0,0 +1,186 @@
+#include "template.h"
+uint8_t ConvertChar::conversionTable[255];
+using namespace std;
+
+int normalize(char c)
+{
+	return ((int) (c - 33));
+}
+
+int read_testcase(testcase *tc, FILE* ifp)
+{
+	char *q, *i, *d, *c, *line = NULL;
+	int _q, _i, _d, _c;
+	int x, size = 0;
+	ssize_t read;
+
+	read = getline(&line, (size_t *) &size, ifp == 0 ? stdin : ifp);
+	if (read == -1)
+		return -1;
+
+
+	tc->hap = (char *) malloc(size);
+	tc->rs = (char *) malloc(size);
+	q = (char *) malloc(size);
+	i = (char *) malloc(size);
+	d = (char *) malloc(size);
+	c = (char *) malloc(size);
+
+	if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6)
+		return -1;
+
+	tc->haplen = strlen(tc->hap);
+	tc->rslen = strlen(tc->rs);
+	assert(tc->rslen < MROWS);
+	tc->ihap = (int *) malloc(tc->haplen*sizeof(int));
+	tc->irs = (int *) malloc(tc->rslen*sizeof(int));
+
+	//tc->q = (int *) malloc(sizeof(int) * tc->rslen);
+	//tc->i = (int *) malloc(sizeof(int) * tc->rslen);
+	//tc->d = (int *) malloc(sizeof(int) * tc->rslen);
+	//tc->c = (int *) malloc(sizeof(int) * tc->rslen);
+
+	for (x = 0; x < tc->rslen; x++)
+	{
+		_q = normalize(q[x]);
+		_i = normalize(i[x]);
+		_d = normalize(d[x]);
+		_c = normalize(c[x]);
+		tc->q[x] = (_q < 6) ? 6 : _q;
+		tc->i[x] = _i;
+		tc->d[x] = _d;
+		tc->c[x] = _c;
+		tc->irs[x] = tc->rs[x];
+	}
+	for (x = 0; x < tc->haplen; x++)
+	  tc->ihap[x] = tc->hap[x];
+
+
+	free(q);
+	free(i);
+	free(d);
+	free(c);
+	free(line);
+
+	return 0;
+}
+
+unsigned MAX_LINE_LENGTH = 65536;
+int convToInt(std::string s)
+{
+  int i;
+  std::istringstream strin(s);
+  strin >> i;
+  return i;
+}
+
+void tokenize(std::ifstream& fptr, std::vector<std::string>& tokens)
+{
+  int i = 0;
+  std::string tmp;
+  std::vector<std::string> myVec;
+  vector<char> line;
+  line.clear();
+  line.resize(MAX_LINE_LENGTH);
+  vector<char> tmpline;
+  tmpline.clear();
+  tmpline.resize(MAX_LINE_LENGTH);
+  myVec.clear();
+
+  while(!fptr.eof())
+  {
+    i = 0;
+    bool still_read_line = true;
+    unsigned line_position = 0;
+    while(still_read_line)
+    {
+      fptr.getline(&(tmpline[0]), MAX_LINE_LENGTH);
+      if(line_position + MAX_LINE_LENGTH > line.size())
+	line.resize(2*line.size());
+      for(unsigned i=0;i<MAX_LINE_LENGTH && tmpline[i] != '\0';++i,++line_position)
+	line[line_position] = tmpline[i];
+      if(fptr.eof() || !fptr.fail()) 
+      {
+	still_read_line = false;
+	line[line_position++] = '\0';
+      }
+    }
+    std::istringstream kap(&(line[0]));
+
+    while(!kap.eof())
+    {
+      kap >> std::skipws >> tmp;
+      if(tmp != "")
+      {
+	myVec.push_back(tmp);
+	++i;
+	//std::cout <<tmp <<"#";
+      }
+      tmp = "";
+    }
+    //std::cout << "\n";
+    if(myVec.size() > 0)
+      break;
+  }
+  tokens.clear();
+  //std::cout << "Why "<<myVec.size()<<"\n";
+  tokens.resize(myVec.size());
+  for(i=0;i<(int)myVec.size();++i)
+    tokens[i] = myVec[i];
+  line.clear();
+  tmpline.clear();
+}
+
+int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat)
+{
+  static bool first_call = true;
+  vector<string> tokens;
+  tokens.clear();
+  tokenize(fptr, tokens);
+  if(tokens.size() == 0)
+    return -1;
+  tc->hap = new char[tokens[0].size()+2];
+  tc->haplen = tokens[0].size();
+  memcpy(tc->hap, tokens[0].c_str(), tokens[0].size());
+  tc->rs = new char[tokens[1].size()+2];
+  tc->rslen = tokens[1].size();
+  //cout << "Lengths "<<tc->haplen <<" "<<tc->rslen<<"\n";
+  memcpy(tc->rs, tokens[1].c_str(),tokens[1].size());
+  assert(tokens.size() == 2 + 4*(tc->rslen));
+  assert(tc->rslen < MROWS);
+  for(unsigned j=0;j<tc->rslen;++j)
+    tc->q[j] = convToInt(tokens[2+0*tc->rslen+j]);
+  for(unsigned j=0;j<tc->rslen;++j)
+    tc->i[j] = convToInt(tokens[2+1*tc->rslen+j]);
+  for(unsigned j=0;j<tc->rslen;++j)
+    tc->d[j] = convToInt(tokens[2+2*tc->rslen+j]);
+  for(unsigned j=0;j<tc->rslen;++j)
+    tc->c[j] = convToInt(tokens[2+3*tc->rslen+j]);
+ 
+  if(reformat)
+  {
+    ofstream ofptr;
+    ofptr.open("reformat/debug_dump.txt",first_call ? ios::out : ios::app);
+    assert(ofptr.is_open());
+    ofptr << tokens[0] << " ";
+    ofptr << tokens[1] << " ";
+    for(unsigned j=0;j<tc->rslen;++j)
+      ofptr << ((char)(tc->q[j]+33));
+    ofptr << " ";
+    for(unsigned j=0;j<tc->rslen;++j)
+      ofptr << ((char)(tc->i[j]+33));
+    ofptr << " ";
+    for(unsigned j=0;j<tc->rslen;++j)
+      ofptr << ((char)(tc->d[j]+33));
+    ofptr << " ";
+    for(unsigned j=0;j<tc->rslen;++j)
+      ofptr << ((char)(tc->c[j]+33));
+    ofptr << " 0 false\n";
+
+    ofptr.close();
+    first_call = false;
+  }
+
+
+  return tokens.size();
+}
diff --git a/hmm_mask.cc b/hmm_mask.cc
index f128a39d9..c335a0160 100644
--- a/hmm_mask.cc
+++ b/hmm_mask.cc
@@ -30,9 +30,7 @@
 #include <set>
 #include <map>
 #include <memory.h>
-#include <string>
-#include <sstream>
-#include <fstream>
+#include "template.h"
 
 using namespace std ;
 
@@ -50,43 +48,14 @@ string getBinaryStr (T val, int numBitsToWrite) {
   return oss.str() ;
 }
 
-
-typedef struct 
-{
-	int rslen, haplen, *q, *i, *d, *c;
-	char *hap, *rs;
-} testcase;
-
 int normalize(char c)
 {
 	return ((int) (c - 33));
 }
 
-class ConvertChar {
-
-  static uint8_t conversionTable[255] ;
-
-
-public:
-
-  static void init() {
-    conversionTable['A'] = 0 ;
-    conversionTable['C'] = 1 ;
-    conversionTable['T'] = 2 ;
-    conversionTable['G'] = 3 ;
-    conversionTable['N'] = 4 ;
-  }
-
-  static inline uint8_t get(uint8_t input) {
-    return conversionTable[input] ;
-  }
-
-} ;
 
 uint8_t ConvertChar::conversionTable[255] ;
 
-
-
 int read_testcase(testcase *tc, FILE* ifp)
 {
 	char *q, *i, *d, *c, *line = NULL;
@@ -94,7 +63,7 @@ int read_testcase(testcase *tc, FILE* ifp)
 	int x, size = 0;
 	ssize_t read;
 
-	read = getline(&line, (size_t *) &size, ifp);
+	read = getline(&line, (size_t *) &size, ifp == 0 ? stdin : ifp);
 	if (read == -1)
 		return -1;
 
@@ -111,20 +80,14 @@ int read_testcase(testcase *tc, FILE* ifp)
 
 	tc->haplen = strlen(tc->hap);
 	tc->rslen = strlen(tc->rs);
-	tc->q = (int *) malloc(sizeof(int) * tc->rslen);
-	tc->i = (int *) malloc(sizeof(int) * tc->rslen);
-	tc->d = (int *) malloc(sizeof(int) * tc->rslen);
-	tc->c = (int *) malloc(sizeof(int) * tc->rslen);
+	assert(tc->rslen < MROWS);
+	tc->ihap = (int *) malloc(tc->haplen*sizeof(int));
+	tc->irs = (int *) malloc(tc->rslen*sizeof(int));
 
-	// Convert hap and rs to 3 bits
-	
-	/*
-	for (int ci=0; ci < tc->haplen; ++ci)
-	  tc->hap[ci] = ConvertChar::get(tc->hap[ci]) ;
-
-	for (int ci=0; ci < tc->rslen; ++ci)
-	  tc->rs[ci] = ConvertChar::get(tc->rs[ci]) ;
-	*/
+	//tc->q = (int *) malloc(sizeof(int) * tc->rslen);
+	//tc->i = (int *) malloc(sizeof(int) * tc->rslen);
+	//tc->d = (int *) malloc(sizeof(int) * tc->rslen);
+	//tc->c = (int *) malloc(sizeof(int) * tc->rslen);
 
 	for (x = 0; x < tc->rslen; x++)
 	{
@@ -136,8 +99,11 @@ int read_testcase(testcase *tc, FILE* ifp)
 		tc->i[x] = _i;
 		tc->d[x] = _d;
 		tc->c[x] = _c;
+		tc->irs[x] = tc->rs[x];
 	}
-	
+	for (x = 0; x < tc->haplen; x++)
+	  tc->ihap[x] = tc->hap[x];
+
 
 	free(q);
 	free(i);
diff --git a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
index 29668a306..b8298161e 100644
--- a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
+++ b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
@@ -2,6 +2,7 @@
 #include <assert.h>
 #include <stdio.h>
 #include "org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h"
+#include <vector>
 
 #include <cstdio>
 #include <string>
@@ -12,12 +13,14 @@
 #include <fstream>
 #include <sstream>
 
+
 #include <immintrin.h>
 #include <emmintrin.h>
 #include <omp.h>
 using namespace std;
+//#define DEBUG 1
+//#define DEBUG0_1 1
 //#define DEBUG3 1
-#define DEBUG 1
 
 
 #include "template.h"
@@ -42,6 +45,12 @@ class LoadTimeInitializer
     {
       ConvertChar::init();
     }
+    jfieldID m_readBasesFID;
+    jfieldID m_readQualsFID;
+    jfieldID m_insertionGOPFID;
+    jfieldID m_deletionGOPFID;
+    jfieldID m_overallGCPFID;
+    jfieldID m_haplotypeBasesFID;
 };
 LoadTimeInitializer g_load_time_initializer;
 
@@ -55,73 +64,6 @@ void debug_dump(string filename, string s, bool to_append, bool add_newline)
   fptr.close();
 }
 
-#define INT_STORE_ARRAY_SIZE 2048
-int insertionGOPIntArray[INT_STORE_ARRAY_SIZE];
-int deletionGOPIntArray[INT_STORE_ARRAY_SIZE];
-int overallGCPIntArray[INT_STORE_ARRAY_SIZE];
-int readQualsIntArray[INT_STORE_ARRAY_SIZE];
-
-JNIEXPORT jdouble JNICALL 
-Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadLikelihoodGivenHaplotypeLog10( 
-    JNIEnv* env, jobject thisObject,
-    jint readLength, jint haplotypeLength,
-    jbyteArray readBases, jbyteArray haplotypeBases, jbyteArray readQuals,
-    jbyteArray insertionGOP, jbyteArray deletionGOP, jbyteArray overallGCP,
-    jint hapStartIndex
-    )
-{
-  jboolean is_copy = JNI_FALSE;
-  jbyte* readBasesArray = (env)->GetByteArrayElements(readBases, &is_copy);
-  jbyte* haplotypeBasesArray = (env)->GetByteArrayElements(haplotypeBases, &is_copy);
-  jbyte* readQualsArray = (env)->GetByteArrayElements(readQuals, &is_copy);
-  jbyte* insertionGOPArray = (env)->GetByteArrayElements(insertionGOP, &is_copy);
-  jbyte* deletionGOPArray = (env)->GetByteArrayElements(deletionGOP, &is_copy);
-  jbyte* overallGCPArray = (env)->GetByteArrayElements(overallGCP, &is_copy);
-#ifdef DEBUG
-  assert(readBasesArray && "readBasesArray not initialized in JNI"); 
-  assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); 
-  assert(readQualsArray && "readQualsArray not initialized in JNI"); 
-  assert(insertionGOPArray && "insertionGOP array not initialized in JNI");
-  assert(deletionGOPArray && "deletionGOP array not initialized in JNI");
-  assert(overallGCPArray && "OverallGCP array not initialized in JNI");
-  assert(readLength < INT_STORE_ARRAY_SIZE);
-#endif
-  for(unsigned i=0;i<readLength;++i)
-  {
-    insertionGOPIntArray[i] = (int)insertionGOPArray[i];
-    deletionGOPIntArray[i] = (int)deletionGOPArray[i];
-    overallGCPIntArray[i] = (int)overallGCPArray[i];
-    readQualsIntArray[i] = (int)readQualsArray[i];
-  }
-
-  testcase tc;
-  tc.rslen = readLength;
-  tc.haplen = haplotypeLength;
-
-  tc.rs = (char*)readBasesArray;
-  tc.hap = (char*)haplotypeBasesArray;
-  tc.q = (int*)readQualsIntArray;
-  tc.i = (int*)insertionGOPIntArray;
-  tc.d = (int*)deletionGOPIntArray;
-  tc.c = (int*)overallGCPIntArray;
-
-  double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&tc);
-  double result = log10(result_avxd) - log10(ldexp(1.0, 1020));
-#ifdef DEBUG
-  debug_dump("return_values_jni.txt",to_string(result),true);
-#endif
-
-
-  env->ReleaseByteArrayElements(overallGCP, overallGCPArray, JNI_ABORT);
-  env->ReleaseByteArrayElements(deletionGOP, deletionGOPArray, JNI_ABORT);
-  env->ReleaseByteArrayElements(insertionGOP, insertionGOPArray, JNI_ABORT);
-  env->ReleaseByteArrayElements(readQuals, readQualsArray, JNI_ABORT);
-  env->ReleaseByteArrayElements(haplotypeBases, haplotypeBasesArray, JNI_ABORT);
-  env->ReleaseByteArrayElements(readBases, readBasesArray, JNI_ABORT);
-
-  return 0.0;
-}
-
 JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeProbabilities
 (JNIEnv* env, jclass thisObject,
  jobjectArray transition, jbyteArray insertionGOP, jbyteArray deletionGOP, jbyteArray overallGCP
@@ -142,3 +84,342 @@ Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializePrior
     )
 
 { return 0.0; }
+
+#define DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY 1
+
+#ifdef DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY
+//Gets direct access to Java arrays
+#define GET_BYTE_ARRAY_ELEMENTS env->GetPrimitiveArrayCritical
+#define RELEASE_BYTE_ARRAY_ELEMENTS env->ReleasePrimitiveArrayCritical
+#define JNI_RELEASE_MODE JNI_ABORT
+#define GET_DOUBLE_ARRAY_ELEMENTS env->GetPrimitiveArrayCritical
+#define RELEASE_DOUBLE_ARRAY_ELEMENTS env->ReleasePrimitiveArrayCritical
+
+#else
+//Likely makes copy of Java arrays to JNI C++ space
+#define GET_BYTE_ARRAY_ELEMENTS env->GetByteArrayElements
+#define RELEASE_BYTE_ARRAY_ELEMENTS env->ReleaseByteArrayElements
+#define JNI_RELEASE_MODE JNI_ABORT
+#define GET_DOUBLE_ARRAY_ELEMENTS env->GetDoubleArrayElements
+#define RELEASE_DOUBLE_ARRAY_ELEMENTS env->ReleaseDoubleArrayElements
+
+#endif		//ifdef DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY
+
+JNIEXPORT jdouble JNICALL 
+Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadLikelihoodGivenHaplotypeLog10( 
+    JNIEnv* env, jobject thisObject,
+    jint readLength, jint haplotypeLength,
+    jbyteArray readBases, jbyteArray haplotypeBases, jbyteArray readQuals,
+    jbyteArray insertionGOP, jbyteArray deletionGOP, jbyteArray overallGCP,
+    jint hapStartIndex
+    )
+{
+  jboolean is_copy = JNI_FALSE;
+  jbyte* readBasesArray =   (jbyte*)GET_BYTE_ARRAY_ELEMENTS(readBases, &is_copy);
+  jbyte* haplotypeBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(haplotypeBases, &is_copy);
+  jbyte* readQualsArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(readQuals, &is_copy);
+  jbyte* insertionGOPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(insertionGOP, &is_copy);
+  jbyte* deletionGOPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(deletionGOP, &is_copy);
+  jbyte* overallGCPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(overallGCP, &is_copy);
+#ifdef DEBUG
+  assert(readBasesArray && "readBasesArray not initialized in JNI"); 
+  assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); 
+  assert(readQualsArray && "readQualsArray not initialized in JNI"); 
+  assert(insertionGOPArray && "insertionGOP array not initialized in JNI");
+  assert(deletionGOPArray && "deletionGOP array not initialized in JNI");
+  assert(overallGCPArray && "OverallGCP array not initialized in JNI");
+  assert(readLength < MROWS);
+#endif
+  testcase tc;
+  tc.rslen = readLength;
+  tc.haplen = haplotypeLength;
+  tc.rs = (char*)readBasesArray;
+  tc.hap = (char*)haplotypeBasesArray;
+  for(unsigned i=0;i<readLength;++i)
+  {
+    tc.q[i] = (int)readQualsArray[i];
+    tc.i[i] = (int)insertionGOPArray[i];
+    tc.d[i] = (int)deletionGOPArray[i];
+    tc.c[i] = (int)overallGCPArray[i];
+  }
+
+  double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&tc);
+  double result = log10(result_avxd) - log10(ldexp(1.0, 1020));
+#ifdef DEBUG
+  debug_dump("return_values_jni.txt",to_string(result),true);
+#endif
+
+
+  RELEASE_BYTE_ARRAY_ELEMENTS(overallGCP, overallGCPArray, JNI_RELEASE_MODE);
+  RELEASE_BYTE_ARRAY_ELEMENTS(deletionGOP, deletionGOPArray, JNI_RELEASE_MODE);
+  RELEASE_BYTE_ARRAY_ELEMENTS(insertionGOP, insertionGOPArray, JNI_RELEASE_MODE);
+  RELEASE_BYTE_ARRAY_ELEMENTS(readQuals, readQualsArray, JNI_RELEASE_MODE);
+  RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBases, haplotypeBasesArray, JNI_RELEASE_MODE);
+  RELEASE_BYTE_ARRAY_ELEMENTS(readBases, readBasesArray, JNI_RELEASE_MODE);
+
+  return 0.0;
+}
+
+//Should be called only once for the whole Java process - initializes field ids for the classes JNIReadDataHolderClass
+//and JNIHaplotypeDataHolderClass
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniGlobalInit
+  (JNIEnv* env, jobject thisObject, jclass readDataHolderClass, jclass haplotypeDataHolderClass)
+{
+  assert(readDataHolderClass);
+  assert(haplotypeDataHolderClass);
+  jfieldID fid;
+  fid = env->GetFieldID(readDataHolderClass, "readBases", "[B");
+  assert(fid && "JNI pairHMM: Could not get FID for readBases");
+  g_load_time_initializer.m_readBasesFID = fid;
+  fid = env->GetFieldID(readDataHolderClass, "readQuals", "[B");
+  assert(fid && "JNI pairHMM: Could not get FID for readQuals");
+  g_load_time_initializer.m_readQualsFID = fid;
+  fid = env->GetFieldID(readDataHolderClass, "insertionGOP", "[B");
+  assert(fid && "JNI pairHMM: Could not get FID for insertionGOP");
+  g_load_time_initializer.m_insertionGOPFID = fid;
+  fid = env->GetFieldID(readDataHolderClass, "deletionGOP", "[B");
+  assert(fid && "JNI pairHMM: Could not get FID for deletionGOP");
+  g_load_time_initializer.m_deletionGOPFID = fid;
+  fid = env->GetFieldID(readDataHolderClass, "overallGCP", "[B");
+  assert(fid && "JNI pairHMM: Could not get FID for overallGCP");
+  g_load_time_initializer.m_overallGCPFID = fid;
+
+  fid = env->GetFieldID(haplotypeDataHolderClass, "haplotypeBases", "[B");
+  assert(fid && "JNI pairHMM: Could not get FID for haplotypeBases");
+  g_load_time_initializer.m_haplotypeBasesFID = fid;
+}
+
+
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniComputeLikelihoods
+  (JNIEnv* env, jobject thisObject, jint numReads, jint numHaplotypes, 
+   jobjectArray readDataArray, jobjectArray haplotypeDataArray, jdoubleArray likelihoodArray, jint maxNumThreadsToUse)
+{
+  jboolean is_copy = JNI_FALSE;
+  //To ensure, GET_BYTE_ARRAY_ELEMENTS is invoked only once for each haplotype, store bytearrays in a vector
+  vector<pair<jbyteArray, jbyte*> > haplotypeBasesArrayVector;
+  haplotypeBasesArrayVector.clear();
+  haplotypeBasesArrayVector.resize(numHaplotypes);
+#ifdef DEBUG0_1
+  cout << "JNI numReads "<<numReads<<" numHaplotypes "<<numHaplotypes<<"\n";
+#endif
+  for(unsigned j=0;j<numHaplotypes;++j)
+  {
+    jobject haplotypeObject = env->GetObjectArrayElement(haplotypeDataArray, j);
+    jbyteArray haplotypeBases = (jbyteArray)env->GetObjectField(haplotypeObject, g_load_time_initializer.m_haplotypeBasesFID);
+#ifdef DEBUG
+    assert(haplotypeBases && ("haplotypeBases is NULL at index : "+to_string(j)+"\n").c_str());
+#endif
+    jbyte* haplotypeBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(haplotypeBases, &is_copy);
+#ifdef DEBUG
+    assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); 
+    assert(env->GetArrayLength(haplotypeBases) < MCOLS);
+#ifdef DEBUG0_1
+    cout << "JNI haplotype length "<<env->GetArrayLength(haplotypeBases)<<"\n";
+#endif
+#endif
+    haplotypeBasesArrayVector[j] = make_pair(haplotypeBases, haplotypeBasesArray);
+#ifdef DEBUG3
+    for(unsigned k=0;k<env->GetArrayLength(haplotypeBases);++k)
+      debug_dump("haplotype_bases_jni.txt",to_string((int)haplotypeBasesArray[k]),true);
+#endif
+  }
+
+  unsigned numTestCases = numReads*numHaplotypes;
+  vector<testcase> tc_array;
+  tc_array.clear();
+  tc_array.resize(numTestCases);
+  unsigned tc_idx = 0;
+  //Store arrays for release later
+  vector<vector<pair<jbyteArray,jbyte*> > > readBasesArrayVector;
+  readBasesArrayVector.clear();
+  readBasesArrayVector.resize(numReads);
+  for(unsigned i=0;i<numReads;++i)
+  {
+    //Get bytearray fields from read
+    jobject readObject = env->GetObjectArrayElement(readDataArray, i);
+    jbyteArray readBases = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_readBasesFID);
+    jbyteArray insertionGOP = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_insertionGOPFID);
+    jbyteArray deletionGOP = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_deletionGOPFID);
+    jbyteArray overallGCP = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_overallGCPFID);
+    jbyteArray readQuals = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_readQualsFID);
+
+#ifdef DEBUG
+    assert(readBases && ("readBases is NULL at index : "+to_string(i)+"\n").c_str());
+    assert(insertionGOP && ("insertionGOP is NULL at index : "+to_string(i)+"\n").c_str());
+    assert(deletionGOP && ("deletionGOP is NULL at index : "+to_string(i)+"\n").c_str());
+    assert(overallGCP && ("overallGCP is NULL at index : "+to_string(i)+"\n").c_str());
+    assert(readQuals && ("readQuals is NULL at index : "+to_string(i)+"\n").c_str());
+#endif
+    jsize readLength = env->GetArrayLength(readBases);
+
+    jbyte* readBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(readBases, &is_copy);	//order of GET-RELEASE is important
+    jbyte* readQualsArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(readQuals, &is_copy);
+    jbyte* insertionGOPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(insertionGOP, &is_copy);
+    jbyte* deletionGOPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(deletionGOP, &is_copy);
+    jbyte* overallGCPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(overallGCP, &is_copy);
+#ifdef DEBUG
+    assert(readBasesArray && "readBasesArray not initialized in JNI"); 
+    assert(readQualsArray && "readQualsArray not initialized in JNI"); 
+    assert(insertionGOPArray && "insertionGOP array not initialized in JNI");
+    assert(deletionGOPArray && "deletionGOP array not initialized in JNI");
+    assert(overallGCPArray && "overallGCP array not initialized in JNI");
+    assert(readLength < MROWS); 
+    assert(readLength == env->GetArrayLength(readQuals));
+    assert(readLength == env->GetArrayLength(insertionGOP));
+    assert(readLength == env->GetArrayLength(deletionGOP));
+    assert(readLength == env->GetArrayLength(overallGCP));
+#ifdef DEBUG0_1
+    cout << "JNI read length "<<readLength<<"\n";
+#endif
+#endif
+#ifdef DEBUG3
+    for(unsigned j=0;j<readLength;++j)
+    {
+      debug_dump("reads_jni.txt",to_string((int)readBasesArray[j]),true);
+      debug_dump("reads_jni.txt",to_string((int)readQualsArray[j]),true);
+      debug_dump("reads_jni.txt",to_string((int)insertionGOPArray[j]),true);
+      debug_dump("reads_jni.txt",to_string((int)deletionGOPArray[j]),true);
+      debug_dump("reads_jni.txt",to_string((int)overallGCPArray[j]),true);
+    }
+#endif
+
+    for(unsigned j=0;j<numHaplotypes;++j)
+    {
+      jsize haplotypeLength = env->GetArrayLength(haplotypeBasesArrayVector[j].first);
+      jbyte* haplotypeBasesArray = haplotypeBasesArrayVector[j].second;
+      tc_array[tc_idx].rslen = (int)readLength;
+      tc_array[tc_idx].haplen = (int)haplotypeLength;
+      tc_array[tc_idx].rs = (char*)readBasesArray;
+      tc_array[tc_idx].hap = (char*)haplotypeBasesArray;
+      //Can be avoided 
+      for(unsigned k=0;k<readLength;++k)
+      {
+	tc_array[tc_idx].q[k] = (int)readQualsArray[k];
+	tc_array[tc_idx].i[k] = (int)insertionGOPArray[k];
+	tc_array[tc_idx].d[k] = (int)deletionGOPArray[k];
+	tc_array[tc_idx].c[k] = (int)overallGCPArray[k];
+      }
+
+      ++tc_idx;  
+    }
+    RELEASE_BYTE_ARRAY_ELEMENTS(overallGCP, overallGCPArray, JNI_RELEASE_MODE);	//order of GET-RELEASE is important
+    RELEASE_BYTE_ARRAY_ELEMENTS(deletionGOP, deletionGOPArray, JNI_RELEASE_MODE);
+    RELEASE_BYTE_ARRAY_ELEMENTS(insertionGOP, insertionGOPArray, JNI_RELEASE_MODE);
+    RELEASE_BYTE_ARRAY_ELEMENTS(readQuals, readQualsArray, JNI_RELEASE_MODE);
+
+    //Release readBases at end because it is used by compute_full_prob
+    readBasesArrayVector[i].clear();
+    readBasesArrayVector[i].resize(1);
+    readBasesArrayVector[i][0] = make_pair(readBases, readBasesArray);
+  }
+
+  jdouble* likelihoodDoubleArray = (jdouble*)GET_DOUBLE_ARRAY_ELEMENTS(likelihoodArray, &is_copy);
+  assert(likelihoodDoubleArray && "likelihoodArray is NULL");
+  assert(env->GetArrayLength(likelihoodArray) == numTestCases);
+#pragma omp parallel for schedule (guided,2) private(tc_idx) num_threads(maxNumThreadsToUse) 
+  for(tc_idx=0;tc_idx<numTestCases;++tc_idx)
+  {
+    double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&(tc_array[tc_idx]));
+    double result = log10(result_avxd) - log10(ldexp(1.0, 1020));
+    likelihoodDoubleArray[tc_idx] = result;
+  }
+#ifdef DEBUG
+  for(tc_idx=0;tc_idx<numTestCases;++tc_idx)
+  {
+    debug_dump("return_values_jni.txt",to_string(likelihoodDoubleArray[tc_idx]),true);
+  }
+#endif
+  RELEASE_DOUBLE_ARRAY_ELEMENTS(likelihoodArray, likelihoodDoubleArray, 0);
+
+  //Release read arrays first
+  for(int i=readBasesArrayVector.size()-1;i>=0;--i)//note the order - reverse of GET
+  {
+    for(int j=readBasesArrayVector.size()-1;j>=0;--j)
+      RELEASE_BYTE_ARRAY_ELEMENTS(readBasesArrayVector[i][j].first, readBasesArrayVector[i][j].second, JNI_RELEASE_MODE);
+    readBasesArrayVector[i].clear();
+  }
+  readBasesArrayVector.clear();
+
+  //Now release haplotype arrays
+  for(int j=numHaplotypes-1;j>=0;--j)	//note the order - reverse of GET
+    RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBasesArrayVector[j].first, haplotypeBasesArrayVector[j].second, JNI_RELEASE_MODE);
+  haplotypeBasesArrayVector.clear();
+  tc_array.clear();
+}
+
+
+
+
+    /*
+    protected final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
+	    final byte[] readBases,
+	    final byte[] readQuals,
+	    final byte[] insertionGOP,
+	    final byte[] deletionGOP,
+	    final byte[] overallGCP,
+	    final boolean recacheReadValues,
+	    final byte[] nextHaploytpeBases) {
+
+	paddedReadLength = readBases.length + 1;
+	paddedHaplotypeLength = haplotypeBases.length + 1;
+	double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, 0, true, 0);
+	if ( ! MathUtils.goodLog10Probability(result) )
+	    throw new IllegalStateException("PairHMM Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f", Arrays.toString(haplotypeBases), Arrays.toString(readBases), result));
+	// Warning: Careful if using the PairHMM in parallel! (this update has to be taken care of).
+	// Warning: This assumes no downstream modification of the haplotype bases (saves us from copying the array). It is okay for the haplotype caller and the Unified Genotyper.
+	// KG: seems pointless is not being used anywhere
+	previousHaplotypeBases = haplotypeBases;
+	return result;
+	return 0;
+    }*/
+
+//public PerReadAlleleLikelihoodMap computeLikelihoods(final List<GATKSAMRecord> reads, final Map<Allele, Haplotype> alleleHaplotypeMap, final Map<GATKSAMRecord, byte[]> GCPArrayMap)
+    //{
+	////PerReadAlleleLikelihoodMap retValue = super.computeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap);
+	//// (re)initialize the pairHMM only if necessary
+	//final int readMaxLength = findMaxReadLength(reads);
+	//final int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap);
+	//if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength)
+	//{ initialize(readMaxLength, haplotypeMaxLength); }
+
+	//if ( ! initialized )
+	    //throw new IllegalStateException("Must call initialize before calling jniComputeLikelihoods");
+	//final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
+	//jniComputeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap, likelihoodMap);
+	//for(GATKSAMRecord read : reads){
+	    //final byte[] readBases = read.getReadBases();
+	    //final byte[] readQuals = read.getBaseQualities();
+	    //final byte[] readInsQuals = read.getBaseInsertionQualities();
+	    //final byte[] readDelQuals = read.getBaseDeletionQualities();
+	    //final byte[] overallGCP = GCPArrayMap.get(read);
+
+	    //// peak at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation)
+	    //byte[] currentHaplotypeBases = null;
+	    //boolean isFirstHaplotype = true;
+	    //Allele currentAllele = null;
+	    //double log10l;
+	    //for (final Allele allele : alleleHaplotypeMap.keySet()){
+		//final Haplotype haplotype = alleleHaplotypeMap.get(allele);
+		//final byte[] nextHaplotypeBases = haplotype.getBases();
+		//if (currentHaplotypeBases != null) {
+		    //log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases,
+			    //readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases);
+		    //likelihoodMap.add(read, currentAllele, log10l);
+		//}
+		//// update the current haplotype
+		//currentHaplotypeBases = nextHaplotypeBases;
+		//currentAllele = allele;
+	    //}
+	    //// process the final haplotype
+	    //if (currentHaplotypeBases != null) {
+
+		//// there is no next haplotype, so pass null for nextHaplotypeBases.
+		//log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases,
+			//readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, null);
+		//likelihoodMap.add(read, currentAllele, log10l);
+	    //}
+	//}
+	//if(debug)
+	    //debugClose();
+	//return likelihoodMap;
+    //}
diff --git a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h
index 5408d2f57..1fff9fa5a 100644
--- a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h
+++ b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h
@@ -54,6 +54,21 @@ JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILogless
 JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadLikelihoodGivenHaplotypeLog10
     (JNIEnv *, jobject, jint, jint, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jint);
 
+/*
+ * Class:     org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
+ * Method:    jniGlobalInit
+ * Signature: (Ljava/lang/Class;Ljava/lang/Class;)V
+ */
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniGlobalInit
+  (JNIEnv *, jobject, jclass, jclass);
+
+/*
+ * Class:     org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
+ * Method:    jniComputeLikelihoods
+ * Signature: ([Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIReadDataHolderClass;[Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIHaplotypeDataHolderClass;[D)V
+ */
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniComputeLikelihoods
+  (JNIEnv *, jobject, jint, jint, jobjectArray, jobjectArray, jdoubleArray, jint);
 
 #ifdef __cplusplus
 }
diff --git a/pairhmm-1-base.cc b/pairhmm-1-base.cc
index dacd02536..90f95aeec 100644
--- a/pairhmm-1-base.cc
+++ b/pairhmm-1-base.cc
@@ -1,10 +1,6 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <math.h>
-
-#include <sys/time.h>
-
+//#define DEBUG 1
+//#define DEBUG0_1 1
+//#define DEBUG3 1
 #define MM 0
 #define GapM 1
 #define MX 2
@@ -12,123 +8,34 @@
 #define MY 4
 #define YY 5
 
-//#define DEBUG
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include <cmath>
+#include <omp.h>
+#include <emmintrin.h>
+#include "template.h"
 
-/*
-	q: read quality
-	i: insertion penalty
-	d: deletion penalty
-	c: gap continuation penalty
-*/
+//#include "define-float.h"
+//#include "shift_template.c"
+//#include "pairhmm-template-kernel.cc"
 
-typedef struct 
+#include "define-double.h"
+#include "shift_template.c"
+#include "pairhmm-template-kernel.cc"
+
+
+using namespace std;
+class LoadTimeInitializer
 {
-	int rslen, haplen, *q, *i, *d, *c;
-	char *hap, *rs;
-} testcase;
-
-int normalize(char c)
-{
-	return ((int) (c - 33));
-}
-
-int read_testcase(testcase *tc)
-{
-	char *q, *i, *d, *c, *line = NULL;
-	int _q, _i, _d, _c;
-	int x, size = 0;
-	ssize_t read;
-
-	read = getline(&line, (size_t *) &size, stdin);
-	if (read == -1)
-		return -1;
-
-	tc->hap = (char *) malloc(size);
-	tc->rs = (char *) malloc(size);
-	q = (char *) malloc(size);
-	i = (char *) malloc(size);
-	d = (char *) malloc(size);
-	c = (char *) malloc(size);
-
-	if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6)
-		return -1;
-
-	tc->haplen = strlen(tc->hap);
-	tc->rslen = strlen(tc->rs);
-	tc->q = (int *) malloc(sizeof(int) * tc->rslen);
-	tc->i = (int *) malloc(sizeof(int) * tc->rslen);
-	tc->d = (int *) malloc(sizeof(int) * tc->rslen);
-	tc->c = (int *) malloc(sizeof(int) * tc->rslen);
-
-	for (x = 0; x < tc->rslen; x++)
-	{
-		_q = normalize(q[x]);
-		_i = normalize(i[x]);
-		_d = normalize(d[x]);
-		_c = normalize(c[x]);
-		tc->q[x] = (_q < 6) ? 6 : _q;
-		tc->i[x] = _i;
-		tc->d[x] = _d;
-		tc->c[x] = _c;
-	}
-
-	free(q);
-	free(i);
-	free(d);
-	free(c);
-	free(line);
-
-	return 0;
-}
-
-template<class T>
-struct Context{};
-
-template<>
-struct Context<double>
-{
-	Context()
-	{
-		for (int x = 0; x < 128; x++)
-			ph2pr[x] = pow(10.0, -((double)x) / 10.0);
-
-		INITIAL_CONSTANT = ldexp(1.0, 1020.0);
-		LOG10_INITIAL_CONSTANT = log10(INITIAL_CONSTANT);
-		RESULT_THRESHOLD = 0.0;
-	}
-
-	double LOG10(double v){ return log10(v); }
-
-	static double _(double n){ return n; }
-	static double _(float n){ return ((double) n); }
-	double ph2pr[128];
-	double INITIAL_CONSTANT;
-	double LOG10_INITIAL_CONSTANT;
-	double RESULT_THRESHOLD;
+  public:
+    LoadTimeInitializer()		//will be called when library is loaded
+    {
+      ConvertChar::init();
+    }
 };
+LoadTimeInitializer g_load_time_initializer;
 
-template<>
-struct Context<float>
-{
-	Context()
-	{
-		for (int x = 0; x < 128; x++)
-			ph2pr[x] = powf(10.f, -((float)x) / 10.f);
-
-		INITIAL_CONSTANT = ldexpf(1.f, 120.f);
-		LOG10_INITIAL_CONSTANT = log10f(INITIAL_CONSTANT);
-		RESULT_THRESHOLD = ldexpf(1.f, -110.f);
-	}
-
-	float LOG10(float v){ return log10f(v); }
-
-	static float _(double n){ return ((float) n); }
-	static float _(float n){ return n; }
-	float ph2pr[128];
-	float INITIAL_CONSTANT;
-	float LOG10_INITIAL_CONSTANT;
-	float RESULT_THRESHOLD;
-};
 
 template<class NUMBER>
 NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL)
@@ -139,10 +46,10 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL)
 
 	Context<NUMBER> ctx;
 
-	NUMBER M[ROWS][COLS];
-	NUMBER X[ROWS][COLS];
-	NUMBER Y[ROWS][COLS];
-	NUMBER p[ROWS][6];
+	NUMBER M[MROWS][MCOLS];
+	NUMBER X[MROWS][MCOLS];
+	NUMBER Y[MROWS][MCOLS];
+	NUMBER p[MROWS][6];
 
 	p[0][MM] = ctx._(0.0);
 	p[0][GapM] = ctx._(0.0);
@@ -159,8 +66,10 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL)
 		p[r][GapM] = ctx._(1.0) - ctx.ph2pr[_c];
 		p[r][MX] = ctx.ph2pr[_i];
 		p[r][XX] = ctx.ph2pr[_c];
-		p[r][MY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
-		p[r][YY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
+		p[r][MY] = ctx.ph2pr[_d];
+		p[r][YY] = ctx.ph2pr[_c];
+		//p[r][MY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
+		//p[r][YY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
 	}
 
 	for (c = 0; c < COLS; c++)
@@ -188,6 +97,8 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL)
 			NUMBER distm = ctx.ph2pr[_q];
 			if (_rs == _hap || _rs == 'N' || _hap == 'N')
 				distm = ctx._(1.0) - distm;
+			else
+			  distm = distm/3;
 			M[r][c] = distm * (M[r-1][c-1] * p[r][MM] + X[r-1][c-1] * p[r][GapM] + Y[r-1][c-1] * p[r][GapM]);
 			X[r][c] = M[r-1][c] * p[r][MX] + X[r-1][c] * p[r][XX];
 			Y[r][c] = M[r][c-1] * p[r][MY] + Y[r][c-1] * p[r][YY];
@@ -201,76 +112,127 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL)
 	if (before_last_log != NULL)
 		*before_last_log = result;
 
-	return result; //ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT;
+	return ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT;
 }
 
 #define BATCH_SIZE  10000
 #define RUN_HYBRID
 
-int main()
+int main(int argc, char** argv)
 {
-        testcase tc[BATCH_SIZE];
-        float result[BATCH_SIZE], result_avxf;
-        double result_avxd;
-        struct timeval start, end;
-        long long aggregateTimeRead = 0L;
-        long long aggregateTimeCompute = 0L;
-        long long aggregateTimeWrite = 0L;
+  if(argc < 2)
+  {
+    cerr << "Needs path to input file as argument\n";
+    exit(0);
+  }
+  bool use_old_read_testcase = false;
+  if(argc >= 3 && string(argv[2]) == "1")
+    use_old_read_testcase = true;
 
-        bool noMoreData = false;
-        int count =0;
-        while (!noMoreData)
-        {
-                int read_count = BATCH_SIZE;
-                gettimeofday(&start, NULL);
-                for (int b=0;b<BATCH_SIZE;b++)
-                        if (read_testcase(&tc[b])==-1)
-                        {
-                                read_count = b;
-                                noMoreData = true;
-                                break;
-                        }
-                gettimeofday(&end, NULL);
-                aggregateTimeRead += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
+  testcase tc;
+  if(use_old_read_testcase)
+  {
+    FILE* fptr = fopen(argv[1],"r");
+    while(!feof(fptr))
+    {
+      if(read_testcase(&tc, fptr) >= 0)
+      {
+	double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&tc);
+	double result = log10(result_avxd) - log10(ldexp(1.0, 1020));
 
-                gettimeofday(&start, NULL);
-                for (int b=0;b<read_count;b++)
-                {
-                        result_avxf = compute_full_prob<float>(&tc[b]);
+	cout << std::scientific << compute_full_prob<double>(&tc) << " "<<result<<"\n";
+	delete tc.rs;
+	delete tc.hap;
+      }
+    }
+    fclose(fptr);
+  }
+  else
+  {
+    std::ifstream ifptr;
+    std::vector<std::string> tokens;
+    ifptr.open(argv[1]);
+    assert(ifptr.is_open());
+    while(1)
+    {
+      tokens.clear();
+      if(read_mod_testcase(ifptr, &tc, false) < 0)
+	break;
+      //double result = 0;
+      double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&tc);
+      double result = log10(result_avxd) - log10(ldexp(1.0, 1020));
 
-                        #ifdef RUN_HYBRID
-                                #define MIN_ACCEPTED 1e-28f
-                                if (result_avxf < MIN_ACCEPTED) {
-                                      count++;
-                                      result_avxd = compute_full_prob<double>(&tc[b]);
-                                      result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f));
-                                }
-                                else
-                                      result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
-                        #endif
+      cout << std::scientific << compute_full_prob<double>(&tc) << " "<<result<<"\n";
+      delete tc.rs;
+      delete tc.hap;
+    }
+    ifptr.close();
+  }
+  return 0;  
 
-                        #ifndef RUN_HYBRID
-                                result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
-                        #endif
+#if 0
+  float result[BATCH_SIZE], result_avxf;
+  double result_avxd;
+  struct timeval start, end;
+  long long aggregateTimeRead = 0L;
+  long long aggregateTimeCompute = 0L;
+  long long aggregateTimeWrite = 0L;
 
-                }
-                gettimeofday(&end, NULL);
-                aggregateTimeCompute += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
+  bool noMoreData = false;
+  int count =0;
+  while (!noMoreData)
+  {
+    int read_count = BATCH_SIZE;
+    gettimeofday(&start, NULL);
+    for (int b=0;b<BATCH_SIZE;b++)
+      if (read_testcase(&tc[b], NULL)==-1)
+      {
+	read_count = b;
+	noMoreData = true;
+	break;
+      }
+    gettimeofday(&end, NULL);
+    aggregateTimeRead += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
 
-                gettimeofday(&start, NULL);
-                for (int b=0;b<read_count;b++)
-                        printf("%E\n", result[b]);
-                gettimeofday(&end, NULL);
-                aggregateTimeWrite += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
+    gettimeofday(&start, NULL);
+    for (int b=0;b<read_count;b++)
+    {
+      result_avxf = compute_full_prob<float>(&tc[b]);
 
-        }
+#ifdef RUN_HYBRID
+#define MIN_ACCEPTED 1e-28f
+      if (result_avxf < MIN_ACCEPTED) {
+	count++;
+	result_avxd = compute_full_prob<double>(&tc[b]);
+	result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f));
+      }
+      else
+	result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
+#endif
 
-        printf("AVX Read Time: %ld\n", aggregateTimeRead);
-        printf("AVX Compute Time: %ld\n", aggregateTimeCompute);
-        printf("AVX Write Time: %ld\n", aggregateTimeWrite);
-        printf("AVX Total Time: %ld\n", aggregateTimeRead + aggregateTimeCompute + aggregateTimeWrite);
-        printf("# Double called: %d\n", count);
+#ifndef RUN_HYBRID
+      result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
+#endif
 
-        return 0;
+    }
+    gettimeofday(&end, NULL);
+    aggregateTimeCompute += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
+
+    gettimeofday(&start, NULL);
+    for (int b=0;b<read_count;b++)
+      printf("%E\n", result[b]);
+    gettimeofday(&end, NULL);
+    aggregateTimeWrite += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
+
+  }
+
+  printf("AVX Read Time: %ld\n", aggregateTimeRead);
+  printf("AVX Compute Time: %ld\n", aggregateTimeCompute);
+  printf("AVX Write Time: %ld\n", aggregateTimeWrite);
+  printf("AVX Total Time: %ld\n", aggregateTimeRead + aggregateTimeCompute + aggregateTimeWrite);
+  printf("# Double called: %d\n", count);
+
+  return 0;
+#endif
 }
 
diff --git a/pairhmm-template-main.cc b/pairhmm-template-main.cc
index 87d50eb5b..9aeb0dddd 100644
--- a/pairhmm-template-main.cc
+++ b/pairhmm-template-main.cc
@@ -25,9 +25,11 @@ double getCurrClk() {
   return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
 }
 
+
 int main()
 {
-        testcase tc[BATCH_SIZE];
+        
+	testcase tc[BATCH_SIZE];
         float result[BATCH_SIZE], result_avxf;
         double result_avxd;
         //struct timeval start, end;
@@ -38,7 +40,7 @@ int main()
 
 	// Need to call it once to initialize the static array
 	ConvertChar::init() ;
-	
+
 
 	char* ompEnvVar = getenv("OMP_NUM_THREADS") ;
 	if (ompEnvVar != NULL && ompEnvVar != "" && ompEnvVar != "1" ) {
@@ -53,7 +55,7 @@ int main()
  
 		lastClk = getCurrClk() ;
                 for (int b=0;b<BATCH_SIZE;b++)
-                        if (read_testcase(&tc[b])==-1)
+                        if (read_testcase(&tc[b], NULL)==-1)
                         {
                                 read_count = b;
                                 noMoreData = true;
diff --git a/template.h b/template.h
index 4c8a4b5e3..e70784073 100644
--- a/template.h
+++ b/template.h
@@ -14,6 +14,11 @@
 
 #include <ctype.h>
 #include <string>
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include <vector>
+
 
 #define MROWS  500
 #define MCOLS  1000
@@ -127,18 +132,20 @@ struct Context<float>
 
 typedef struct
 {
-        int rslen, haplen, *q, *i, *d, *c;
+        int rslen, haplen;
+	/*int *q, *i, *d, *c;*/
+	int q[MROWS], i[MROWS], d[MROWS], c[MROWS];
         char *hap, *rs;
-        int *ihap;
-        int *irs;
+	int *ihap;
+	int *irs;
 } testcase;
 
 
 template<class T>
 std::string to_string(T obj)
 {
-  stringstream ss;
-  string ret_string;
+  std::stringstream ss;
+  std::string ret_string;
   ss.clear();
   ss << std::scientific << obj;
   ss >> ret_string;
@@ -148,66 +155,8 @@ std::string to_string(T obj)
 void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true);
 
 int normalize(char c);
-
-
-int read_testcase(testcase *tc)
-{
-        char *q, *i, *d, *c, *line = NULL;
-        int _q, _i, _d, _c;
-        int x, size = 0;
-        ssize_t read;
-
-        read = getline(&line, (size_t *) &size, stdin);
-        if (read == -1)
-                return -1;
-
-        tc->hap = (char *) malloc(size);
-        tc->rs = (char *) malloc(size);
-        q = (char *) malloc(size);
-        i = (char *) malloc(size);
-        d = (char *) malloc(size);
-        c = (char *) malloc(size);
-        tc->ihap = (int *) malloc(size*sizeof(int));
-        tc->irs = (int *) malloc(size*sizeof(int));
-
-
-        if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6)
-                return -1;
-
-        tc->haplen = strlen(tc->hap);
-        tc->rslen = strlen(tc->rs);
-        tc->q = (int *) malloc(sizeof(int) * tc->rslen);
-        tc->i = (int *) malloc(sizeof(int) * tc->rslen);
-        tc->d = (int *) malloc(sizeof(int) * tc->rslen);
-        tc->c = (int *) malloc(sizeof(int) * tc->rslen);
-
-        tc->ihap = (int *) malloc(tc->haplen*sizeof(int));
-        tc->irs = (int *) malloc(tc->rslen*sizeof(int));
-
-        for (x = 0; x < tc->rslen; x++)
-        {
-                _q = normalize(q[x]);
-                _i = normalize(i[x]);
-                _d = normalize(d[x]);
-                _c = normalize(c[x]);
-                tc->q[x] = (_q < 6) ? 6 : _q;
-                tc->i[x] = _i;
-                tc->d[x] = _d;
-                tc->c[x] = _c;
-                tc->irs[x] = tc->rs[x];
-        }
-
-        for (x = 0; x < tc->haplen; x++)
-                tc->ihap[x] = tc->hap[x];
-
-        free(q);
-        free(i);
-        free(d);
-        free(c);
-        free(line);
-
-        return 0;
-}
+int read_testcase(testcase *tc, FILE* ifp);
+int read_mod_testcase(std::ifstream& fptr, testcase* tc, bool reformat=false);
 
 #define NUM_DISTINCT_CHARS 5
 #define AMBIG_CHAR 4

From 90938b86100dcfa8c63f26ac7b0c6ca6ec0f0dad Mon Sep 17 00:00:00 2001
From: Karthik Gururaj <karthik.gururaj@intel.com>
Date: Thu, 16 Jan 2014 19:58:04 -0800
Subject: [PATCH 4/6] Minor typo in comments fixed

---
 ...e_sting_utils_pairhmm_JNILoglessPairHMM.cc | 78 +------------------
 1 file changed, 1 insertion(+), 77 deletions(-)

diff --git a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
index b8298161e..852caa00d 100644
--- a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
+++ b/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
@@ -316,7 +316,7 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai
   jdouble* likelihoodDoubleArray = (jdouble*)GET_DOUBLE_ARRAY_ELEMENTS(likelihoodArray, &is_copy);
   assert(likelihoodDoubleArray && "likelihoodArray is NULL");
   assert(env->GetArrayLength(likelihoodArray) == numTestCases);
-#pragma omp parallel for schedule (guided,2) private(tc_idx) num_threads(maxNumThreadsToUse) 
+#pragma omp parallel for schedule (dynamic,10) private(tc_idx) num_threads(maxNumThreadsToUse) 
   for(tc_idx=0;tc_idx<numTestCases;++tc_idx)
   {
     double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&(tc_array[tc_idx]));
@@ -347,79 +347,3 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai
   tc_array.clear();
 }
 
-
-
-
-    /*
-    protected final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
-	    final byte[] readBases,
-	    final byte[] readQuals,
-	    final byte[] insertionGOP,
-	    final byte[] deletionGOP,
-	    final byte[] overallGCP,
-	    final boolean recacheReadValues,
-	    final byte[] nextHaploytpeBases) {
-
-	paddedReadLength = readBases.length + 1;
-	paddedHaplotypeLength = haplotypeBases.length + 1;
-	double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, 0, true, 0);
-	if ( ! MathUtils.goodLog10Probability(result) )
-	    throw new IllegalStateException("PairHMM Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f", Arrays.toString(haplotypeBases), Arrays.toString(readBases), result));
-	// Warning: Careful if using the PairHMM in parallel! (this update has to be taken care of).
-	// Warning: This assumes no downstream modification of the haplotype bases (saves us from copying the array). It is okay for the haplotype caller and the Unified Genotyper.
-	// KG: seems pointless is not being used anywhere
-	previousHaplotypeBases = haplotypeBases;
-	return result;
-	return 0;
-    }*/
-
-//public PerReadAlleleLikelihoodMap computeLikelihoods(final List<GATKSAMRecord> reads, final Map<Allele, Haplotype> alleleHaplotypeMap, final Map<GATKSAMRecord, byte[]> GCPArrayMap)
-    //{
-	////PerReadAlleleLikelihoodMap retValue = super.computeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap);
-	//// (re)initialize the pairHMM only if necessary
-	//final int readMaxLength = findMaxReadLength(reads);
-	//final int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap);
-	//if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength)
-	//{ initialize(readMaxLength, haplotypeMaxLength); }
-
-	//if ( ! initialized )
-	    //throw new IllegalStateException("Must call initialize before calling jniComputeLikelihoods");
-	//final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
-	//jniComputeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap, likelihoodMap);
-	//for(GATKSAMRecord read : reads){
-	    //final byte[] readBases = read.getReadBases();
-	    //final byte[] readQuals = read.getBaseQualities();
-	    //final byte[] readInsQuals = read.getBaseInsertionQualities();
-	    //final byte[] readDelQuals = read.getBaseDeletionQualities();
-	    //final byte[] overallGCP = GCPArrayMap.get(read);
-
-	    //// peak at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation)
-	    //byte[] currentHaplotypeBases = null;
-	    //boolean isFirstHaplotype = true;
-	    //Allele currentAllele = null;
-	    //double log10l;
-	    //for (final Allele allele : alleleHaplotypeMap.keySet()){
-		//final Haplotype haplotype = alleleHaplotypeMap.get(allele);
-		//final byte[] nextHaplotypeBases = haplotype.getBases();
-		//if (currentHaplotypeBases != null) {
-		    //log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases,
-			    //readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases);
-		    //likelihoodMap.add(read, currentAllele, log10l);
-		//}
-		//// update the current haplotype
-		//currentHaplotypeBases = nextHaplotypeBases;
-		//currentAllele = allele;
-	    //}
-	    //// process the final haplotype
-	    //if (currentHaplotypeBases != null) {
-
-		//// there is no next haplotype, so pass null for nextHaplotypeBases.
-		//log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases,
-			//readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, null);
-		//likelihoodMap.add(read, currentAllele, log10l);
-	    //}
-	//}
-	//if(debug)
-	    //debugClose();
-	//return likelihoodMap;
-    //}

From 532485ca599cc38fdf9c92a61be51a080f268c18 Mon Sep 17 00:00:00 2001
From: Karthik Gururaj <karthik.gururaj@intel.com>
Date: Thu, 16 Jan 2014 20:26:41 -0800
Subject: [PATCH 5/6] Removed unnecessary files

---
 CHEATSHEET |  2 --
 README     | 41 -----------------------------------------
 2 files changed, 43 deletions(-)
 delete mode 100644 CHEATSHEET
 delete mode 100644 README

diff --git a/CHEATSHEET b/CHEATSHEET
deleted file mode 100644
index f61bfdd7f..000000000
--- a/CHEATSHEET
+++ /dev/null
@@ -1,2 +0,0 @@
-/opt/intel/bin/icc pairhmm-template-main.cc -o pairhmm-template-main.o -O3 -W -Wall -lm -xAVX; ./pairhmm-template-main.o < mediumTest.in > med.out; head -n -5 med.out > medium.out; lua checkLikelihoods.lua medium.out ~/hmm/mediumTest.out 6e-5
-
diff --git a/README b/README
deleted file mode 100644
index e16df5a8d..000000000
--- a/README
+++ /dev/null
@@ -1,41 +0,0 @@
-How to compile:
-/opt/intel/bin/icc pairhmm-template-main.cc -o pairhmm-template-main.o -O3 -W -Wall -lm -xAVX
-
-------------------------------------
-How to run:
-./pairhmm-template-main.o < ./test_data/largeTest.in > large.out
-./pairhmm-template-main.o < ./test_data/mediumTest.in > medium.out
--------------------------------------
-Results:
--------------------------------------
-MEDIUM FLOAT:
-AVX Read Time: 8019020
-AVX Compute Time: 29270238
-AVX Write Time: 242170
-AVX Total Time: 37531428
-# Double called: 0
--------------------------------------
-MEDIUM HYBRID:
-AVX Read Time: 7596629
-AVX Compute Time: 29979220
-AVX Write Time: 248485
-AVX Total Time: 37824334
-# Double called: 20329
--------------------------------------
-LARGE FLOAT:
-AVX Read Time: 111888586
-AVX Compute Time: 408131806
-AVX Write Time: 3477892
-AVX Total Time: 523498284
-# Double called: 0
--------------------------------------
-LARGE HYBRID:
-AVX Read Time: 109728994
-AVX Compute Time: 419475317
-AVX Write Time: 3575364
-AVX Total Time: 532779675
-# Double called: 310042
-
-
-
-

From e6c6f8e313525e833c65b913d0fd5031cb855f19 Mon Sep 17 00:00:00 2001
From: Karthik Gururaj <karthik.gururaj@intel.com>
Date: Thu, 16 Jan 2014 20:28:50 -0800
Subject: [PATCH 6/6] Renamed directory

---
 .gitignore => PairHMM_JNI/.gitignore                              | 0
 Makefile => PairHMM_JNI/Makefile                                  | 0
 convert_char.cc => PairHMM_JNI/convert_char.cc                    | 0
 define-double.h => PairHMM_JNI/define-double.h                    | 0
 define-float.h => PairHMM_JNI/define-float.h                      | 0
 hmm_mask.cc => PairHMM_JNI/hmm_mask.cc                            | 0
 .../org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc   | 0
 .../org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h    | 0
 pairhmm-1-base.cc => PairHMM_JNI/pairhmm-1-base.cc                | 0
 .../pairhmm-template-kernel.cc                                    | 0
 pairhmm-template-main.cc => PairHMM_JNI/pairhmm-template-main.cc  | 0
 shift_template.c => PairHMM_JNI/shift_template.c                  | 0
 template.h => PairHMM_JNI/template.h                              | 0
 13 files changed, 0 insertions(+), 0 deletions(-)
 rename .gitignore => PairHMM_JNI/.gitignore (100%)
 rename Makefile => PairHMM_JNI/Makefile (100%)
 rename convert_char.cc => PairHMM_JNI/convert_char.cc (100%)
 rename define-double.h => PairHMM_JNI/define-double.h (100%)
 rename define-float.h => PairHMM_JNI/define-float.h (100%)
 rename hmm_mask.cc => PairHMM_JNI/hmm_mask.cc (100%)
 rename org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc => PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc (100%)
 rename org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h => PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h (100%)
 rename pairhmm-1-base.cc => PairHMM_JNI/pairhmm-1-base.cc (100%)
 rename pairhmm-template-kernel.cc => PairHMM_JNI/pairhmm-template-kernel.cc (100%)
 rename pairhmm-template-main.cc => PairHMM_JNI/pairhmm-template-main.cc (100%)
 rename shift_template.c => PairHMM_JNI/shift_template.c (100%)
 rename template.h => PairHMM_JNI/template.h (100%)

diff --git a/.gitignore b/PairHMM_JNI/.gitignore
similarity index 100%
rename from .gitignore
rename to PairHMM_JNI/.gitignore
diff --git a/Makefile b/PairHMM_JNI/Makefile
similarity index 100%
rename from Makefile
rename to PairHMM_JNI/Makefile
diff --git a/convert_char.cc b/PairHMM_JNI/convert_char.cc
similarity index 100%
rename from convert_char.cc
rename to PairHMM_JNI/convert_char.cc
diff --git a/define-double.h b/PairHMM_JNI/define-double.h
similarity index 100%
rename from define-double.h
rename to PairHMM_JNI/define-double.h
diff --git a/define-float.h b/PairHMM_JNI/define-float.h
similarity index 100%
rename from define-float.h
rename to PairHMM_JNI/define-float.h
diff --git a/hmm_mask.cc b/PairHMM_JNI/hmm_mask.cc
similarity index 100%
rename from hmm_mask.cc
rename to PairHMM_JNI/hmm_mask.cc
diff --git a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
similarity index 100%
rename from org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
rename to PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
diff --git a/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h
similarity index 100%
rename from org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h
rename to PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h
diff --git a/pairhmm-1-base.cc b/PairHMM_JNI/pairhmm-1-base.cc
similarity index 100%
rename from pairhmm-1-base.cc
rename to PairHMM_JNI/pairhmm-1-base.cc
diff --git a/pairhmm-template-kernel.cc b/PairHMM_JNI/pairhmm-template-kernel.cc
similarity index 100%
rename from pairhmm-template-kernel.cc
rename to PairHMM_JNI/pairhmm-template-kernel.cc
diff --git a/pairhmm-template-main.cc b/PairHMM_JNI/pairhmm-template-main.cc
similarity index 100%
rename from pairhmm-template-main.cc
rename to PairHMM_JNI/pairhmm-template-main.cc
diff --git a/shift_template.c b/PairHMM_JNI/shift_template.c
similarity index 100%
rename from shift_template.c
rename to PairHMM_JNI/shift_template.c
diff --git a/template.h b/PairHMM_JNI/template.h
similarity index 100%
rename from template.h
rename to PairHMM_JNI/template.h