Merge branch 'master' of /home/karthikg/broad/archive/hmm_intra into intel_pairhmm

2014-01-16 20:29:51 -08:00 · 2014-01-16 20:29:51 -08:00 · b57de8eec1
parent e8a5022777 e6c6f8e313
commit b57de8eec1
13 changed files with 2465 additions and 0 deletions
--- a/PairHMM_JNI/.gitignore
+++ b/PairHMM_JNI/.gitignore
@ -0,0 +1,10 @@
+.svn
+*.o
+*.so
+tests
+.deps
+hmm_Mohammad
+pairhmm-template-main
+*.swp
+checker
+reformat
--- a/PairHMM_JNI/Makefile
+++ b/PairHMM_JNI/Makefile
@ -0,0 +1,47 @@
+OMPCFLAGS=-fopenmp
+#OMPLDFLAGS=-lgomp
+
+#CFLAGS=-O2 -std=c++11 -W -Wall -march=corei7-avx -Wa,-q            -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas
+#CFLAGS=-O2             -W -Wall -march=corei7 -mfpmath=sse -msse4.2 -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas
+
+JAVA_ROOT=/opt/jdk1.7.0_25/
+JNI_COMPILATION_FLAGS=-D_REENTRANT -fPIC -I${JAVA_ROOT}/include -I${JAVA_ROOT}/include/linux
+
+CFLAGS=-O3 -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas -xAVX
+
+CXXFLAGS=$(CFLAGS)
+CC=icc
+CXX=icc
+
+LDFLAGS=-lm $(OMPLDFLAGS)
+
+#BIN:=pairhmm-1-base #pairhmm-2-omp pairhmm-3-hybrid-float-double pairhmm-4-hybrid-diagonal pairhmm-5-hybrid-diagonal-homogeneus pairhmm-6-onlythreediags pairhmm-7-presse pairhmm-8-sse #pairhmm-dev
+BIN:=libJNILoglessPairHMM.so pairhmm-template-main checker
+
+#SOURCES=pairhmm-1-base.cc input.cc 
+LIBSOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc convert_char.cc
+SOURCES=$(LIBSOURCES) pairhmm-template-main.cc pairhmm-1-base.cc
+LIBOBJECTS=$(LIBSOURCES:.cc=.o)
+DEPDIR=.deps
+DF=$(DEPDIR)/$(*).d
+
+all: $(BIN)
+
+-include $(addprefix $(DEPDIR)/,$(SOURCES:.cc=.d))
+
+checker: pairhmm-1-base.o convert_char.o
+	$(CXX) $(OMPCFLAGS) -o $@ $^ $(LDFLAGS)
+
+pairhmm-template-main:	pairhmm-template-main.o convert_char.o
+	$(CXX) $(OMPCFLAGS) -o $@ $^ $(LDFLAGS)
+
+libJNILoglessPairHMM.so: $(LIBOBJECTS) 
+	$(CXX) $(OMPCFLAGS) -shared -o $@ $(LIBOBJECTS) 
+
+%.o: %.cc
+	@mkdir -p $(DEPDIR)
+	$(COMPILE.cpp) -MMD -MF $(DF) $(JNI_COMPILATION_FLAGS) $(CXXFLAGS) $(OUTPUT_OPTION) $<
+
+
+clean:
+	rm -rf $(BIN) *.o $(DEPDIR)
--- a/PairHMM_JNI/convert_char.cc
+++ b/PairHMM_JNI/convert_char.cc
@ -0,0 +1,186 @@
+#include "template.h"
+uint8_t ConvertChar::conversionTable[255];
+using namespace std;
+
+int normalize(char c)
+{
+	return ((int) (c - 33));
+}
+
+int read_testcase(testcase *tc, FILE* ifp)
+{
+	char *q, *i, *d, *c, *line = NULL;
+	int _q, _i, _d, _c;
+	int x, size = 0;
+	ssize_t read;
+
+	read = getline(&line, (size_t *) &size, ifp == 0 ? stdin : ifp);
+	if (read == -1)
+		return -1;
+
+
+	tc->hap = (char *) malloc(size);
+	tc->rs = (char *) malloc(size);
+	q = (char *) malloc(size);
+	i = (char *) malloc(size);
+	d = (char *) malloc(size);
+	c = (char *) malloc(size);
+
+	if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6)
+		return -1;
+
+	tc->haplen = strlen(tc->hap);
+	tc->rslen = strlen(tc->rs);
+	assert(tc->rslen < MROWS);
+	tc->ihap = (int *) malloc(tc->haplen*sizeof(int));
+	tc->irs = (int *) malloc(tc->rslen*sizeof(int));
+
+	//tc->q = (int *) malloc(sizeof(int) * tc->rslen);
+	//tc->i = (int *) malloc(sizeof(int) * tc->rslen);
+	//tc->d = (int *) malloc(sizeof(int) * tc->rslen);
+	//tc->c = (int *) malloc(sizeof(int) * tc->rslen);
+
+	for (x = 0; x < tc->rslen; x++)
+	{
+		_q = normalize(q[x]);
+		_i = normalize(i[x]);
+		_d = normalize(d[x]);
+		_c = normalize(c[x]);
+		tc->q[x] = (_q < 6) ? 6 : _q;
+		tc->i[x] = _i;
+		tc->d[x] = _d;
+		tc->c[x] = _c;
+		tc->irs[x] = tc->rs[x];
+	}
+	for (x = 0; x < tc->haplen; x++)
+	  tc->ihap[x] = tc->hap[x];
+
+
+	free(q);
+	free(i);
+	free(d);
+	free(c);
+	free(line);
+
+	return 0;
+}
+
+unsigned MAX_LINE_LENGTH = 65536;
+int convToInt(std::string s)
+{
+  int i;
+  std::istringstream strin(s);
+  strin >> i;
+  return i;
+}
+
+void tokenize(std::ifstream& fptr, std::vector<std::string>& tokens)
+{
+  int i = 0;
+  std::string tmp;
+  std::vector<std::string> myVec;
+  vector<char> line;
+  line.clear();
+  line.resize(MAX_LINE_LENGTH);
+  vector<char> tmpline;
+  tmpline.clear();
+  tmpline.resize(MAX_LINE_LENGTH);
+  myVec.clear();
+
+  while(!fptr.eof())
+  {
+    i = 0;
+    bool still_read_line = true;
+    unsigned line_position = 0;
+    while(still_read_line)
+    {
+      fptr.getline(&(tmpline[0]), MAX_LINE_LENGTH);
+      if(line_position + MAX_LINE_LENGTH > line.size())
+	line.resize(2*line.size());
+      for(unsigned i=0;i<MAX_LINE_LENGTH && tmpline[i] != '\0';++i,++line_position)
+	line[line_position] = tmpline[i];
+      if(fptr.eof() || !fptr.fail()) 
+      {
+	still_read_line = false;
+	line[line_position++] = '\0';
+      }
+    }
+    std::istringstream kap(&(line[0]));
+
+    while(!kap.eof())
+    {
+      kap >> std::skipws >> tmp;
+      if(tmp != "")
+      {
+	myVec.push_back(tmp);
+	++i;
+	//std::cout <<tmp <<"#";
+      }
+      tmp = "";
+    }
+    //std::cout << "\n";
+    if(myVec.size() > 0)
+      break;
+  }
+  tokens.clear();
+  //std::cout << "Why "<<myVec.size()<<"\n";
+  tokens.resize(myVec.size());
+  for(i=0;i<(int)myVec.size();++i)
+    tokens[i] = myVec[i];
+  line.clear();
+  tmpline.clear();
+}
+
+int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat)
+{
+  static bool first_call = true;
+  vector<string> tokens;
+  tokens.clear();
+  tokenize(fptr, tokens);
+  if(tokens.size() == 0)
+    return -1;
+  tc->hap = new char[tokens[0].size()+2];
+  tc->haplen = tokens[0].size();
+  memcpy(tc->hap, tokens[0].c_str(), tokens[0].size());
+  tc->rs = new char[tokens[1].size()+2];
+  tc->rslen = tokens[1].size();
+  //cout << "Lengths "<<tc->haplen <<" "<<tc->rslen<<"\n";
+  memcpy(tc->rs, tokens[1].c_str(),tokens[1].size());
+  assert(tokens.size() == 2 + 4*(tc->rslen));
+  assert(tc->rslen < MROWS);
+  for(unsigned j=0;j<tc->rslen;++j)
+    tc->q[j] = convToInt(tokens[2+0*tc->rslen+j]);
+  for(unsigned j=0;j<tc->rslen;++j)
+    tc->i[j] = convToInt(tokens[2+1*tc->rslen+j]);
+  for(unsigned j=0;j<tc->rslen;++j)
+    tc->d[j] = convToInt(tokens[2+2*tc->rslen+j]);
+  for(unsigned j=0;j<tc->rslen;++j)
+    tc->c[j] = convToInt(tokens[2+3*tc->rslen+j]);
+ 
+  if(reformat)
+  {
+    ofstream ofptr;
+    ofptr.open("reformat/debug_dump.txt",first_call ? ios::out : ios::app);
+    assert(ofptr.is_open());
+    ofptr << tokens[0] << " ";
+    ofptr << tokens[1] << " ";
+    for(unsigned j=0;j<tc->rslen;++j)
+      ofptr << ((char)(tc->q[j]+33));
+    ofptr << " ";
+    for(unsigned j=0;j<tc->rslen;++j)
+      ofptr << ((char)(tc->i[j]+33));
+    ofptr << " ";
+    for(unsigned j=0;j<tc->rslen;++j)
+      ofptr << ((char)(tc->d[j]+33));
+    ofptr << " ";
+    for(unsigned j=0;j<tc->rslen;++j)
+      ofptr << ((char)(tc->c[j]+33));
+    ofptr << " 0 false\n";
+
+    ofptr.close();
+    first_call = false;
+  }
+
+
+  return tokens.size();
+}
--- a/PairHMM_JNI/define-double.h
+++ b/PairHMM_JNI/define-double.h
@ -0,0 +1,158 @@
+#include <iostream>
+
+#ifdef PRECISION
+        #undef PRECISION
+        #undef MAIN_TYPE
+        #undef MAIN_TYPE_SIZE
+        #undef UNION_TYPE
+        #undef IF_128
+        #undef IF_MAIN_TYPE
+        #undef SHIFT_CONST1
+        #undef SHIFT_CONST2
+        #undef SHIFT_CONST3
+        #undef _128_TYPE
+	#undef _256_TYPE
+	#undef AVX_LENGTH
+	#undef MAVX_COUNT
+	#undef HAP_TYPE
+        #undef MASK_TYPE
+        #undef MASK_ALL_ONES
+
+        #undef SET_VEC_ZERO
+        #undef VEC_OR
+        #undef VEC_ADD
+	#undef VEC_SUB
+        #undef VEC_MUL
+        #undef VEC_DIV
+        #undef VEC_BLEND
+        #undef VEC_BLENDV
+	#undef VEC_CAST_256_128
+	#undef VEC_EXTRACT_128
+	#undef VEC_EXTRACT_UNIT
+	#undef VEC_SET1_VAL128
+	#undef VEC_MOVE
+	#undef VEC_CAST_128_256
+	#undef VEC_INSERT_VAL
+	#undef VEC_CVT_128_256
+	#undef VEC_SET1_VAL
+	#undef VEC_POPCVT_CHAR
+	#undef VEC_LDPOPCVT_CHAR
+	#undef VEC_CMP_EQ
+	#undef VEC_SET_LSE
+	#undef SHIFT_HAP
+	#undef print256b
+        #undef MASK_VEC
+        #undef VEC_SSE_TO_AVX
+        #undef VEC_SHIFT_LEFT_1BIT
+        #undef MASK_ALL_ONES
+        #undef COMPARE_VECS
+        #undef _256_INT_TYPE
+#endif
+
+#define PRECISION d
+#define MAIN_TYPE double
+#define MAIN_TYPE_SIZE 64
+#define UNION_TYPE mix_D
+#define IF_128 IF_128d
+#define IF_MAIN_TYPE IF_64
+#define SHIFT_CONST1 8
+#define SHIFT_CONST2 1
+#define SHIFT_CONST3 8
+#define _128_TYPE __m128d
+#define _256_TYPE __m256d
+#define _256_INT_TYPE __m256i
+#define AVX_LENGTH 4
+#define MAVX_COUNT  (MROWS+7)/AVX_LENGTH
+#define HAP_TYPE __m128i
+#define MASK_TYPE uint64_t
+#define MASK_ALL_ONES 0xFFFFFFFFFFFFFFFF
+#define MASK_VEC MaskVec_D
+
+#define SET_VEC_ZERO(__vec)                     \
+  __vec= _mm256_setzero_pd()
+
+#define VEC_OR(__v1, __v2)                      \
+  _mm256_or_pd(__v1, __v2)
+
+#define VEC_ADD(__v1, __v2)                     \
+  _mm256_add_pd(__v1, __v2)
+
+#define VEC_SUB(__v1, __v2)                     \
+  _mm256_sub_pd(__v1, __v2)
+
+#define VEC_MUL(__v1, __v2)                     \
+  _mm256_mul_pd(__v1, __v2)
+
+#define VEC_DIV(__v1, __v2)                     \
+  _mm256_div_pd(__v1, __v2)
+
+#define VEC_BLEND(__v1, __v2, __mask)           \
+  _mm256_blend_pd(__v1, __v2, __mask)
+
+#define VEC_BLENDV(__v1, __v2, __maskV)         \
+  _mm256_blendv_pd(__v1, __v2, __maskV)
+
+#define VEC_CAST_256_128(__v1)			\
+  _mm256_castpd256_pd128 (__v1)
+
+#define VEC_EXTRACT_128(__v1, __im)		\
+  _mm256_extractf128_pd (__v1, __im)
+
+#define VEC_EXTRACT_UNIT(__v1, __im)		\
+  _mm_extract_epi64(__v1, __im)
+
+#define VEC_SET1_VAL128(__val)	 		\
+  _mm_set1_pd(__val)
+
+#define VEC_MOVE(__v1, __val)			\
+  _mm_move_sd(__v1, __val)
+
+#define VEC_CAST_128_256(__v1)			\
+  _mm256_castpd128_pd256(__v1)		
+
+#define VEC_INSERT_VAL(__v1, __val, __pos)	\
+  _mm256_insertf128_pd(__v1, __val, __pos)
+
+#define VEC_CVT_128_256(__v1)			\
+  _mm256_cvtepi32_pd(__v1)
+
+#define VEC_SET1_VAL(__val)			\
+  _mm256_set1_pd(__val)
+
+#define VEC_POPCVT_CHAR(__ch)			\
+  _mm256_cvtepi32_pd(_mm_set1_epi32(__ch))
+
+#define VEC_LDPOPCVT_CHAR(__addr)               \
+  _mm256_cvtepi32_pd(_mm_load_si128((__m128i const *)__addr))
+
+#define VEC_CMP_EQ(__v1, __v2)			\
+  _mm256_cmp_pd(__v1, __v2, _CMP_EQ_OQ)
+
+#define VEC_SET_LSE(__val)                      \
+  _mm256_set_pd(zero, zero, zero, __val);
+
+#define SHIFT_HAP(__v1, __val)			\
+  __v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0)
+
+#define print256b(__v1)				\
+  print256bDP(__v1)
+
+#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)	\
+  __vdst = _mm256_castpd128_pd256(__vsLow) ;		\
+  __vdst = _mm256_insertf128_pd(__vdst, __vsHigh, 1) ;
+
+#define VEC_SHIFT_LEFT_1BIT(__vs)		\
+  __vs = _mm_slli_epi64(__vs, 1) 
+
+
+#define COMPARE_VECS(__v1, __v2, __first, __last) {			\
+    double* ptr1 = (double*) (&__v1) ;					\
+    double* ptr2 = (double*) (&__v2) ;					\
+    for (int ei=__first; ei <= __last; ++ei) {				\
+      if (ptr1[ei] != ptr2[ei]) {					\
+	std::cout << "Double Mismatch at " << ei << ": "		\
+		  << ptr1[ei] << " vs. " << ptr2[ei] << std::endl ;	\
+	exit(0) ;							\
+      }									\
+    }									\
+  }
--- a/PairHMM_JNI/define-float.h
+++ b/PairHMM_JNI/define-float.h
@ -0,0 +1,159 @@
+#include <iostream>
+
+#ifdef PRECISION
+	#undef PRECISION
+        #undef MAIN_TYPE
+        #undef MAIN_TYPE_SIZE
+        #undef UNION_TYPE
+        #undef IF_128
+        #undef IF_MAIN_TYPE
+        #undef SHIFT_CONST1
+        #undef SHIFT_CONST2
+        #undef SHIFT_CONST3
+        #undef _128_TYPE
+        #undef _256_TYPE
+        #undef AVX_LENGTH
+        #undef MAVX_COUNT
+	#undef HAP_TYPE
+        #undef MASK_TYPE
+        #undef MASK_ALL_ONES
+
+	#undef SET_VEC_ZERO(__vec)
+	#undef VEC_OR(__v1, __v2)
+	#undef VEC_ADD(__v1, __v2)
+        #undef VEC_SUB(__v1, __v2)
+	#undef VEC_MUL(__v1, __v2)
+	#undef VEC_DIV(__v1, __v2)
+	#undef VEC_BLEND(__v1, __v2, __mask)
+	#undef VEC_BLENDV(__v1, __v2, __maskV)
+        #undef VEC_CAST_256_128(__v1)
+        #undef VEC_EXTRACT_128(__v1, __im)
+        #undef VEC_EXTRACT_UNIT(__v1, __im)
+        #undef VEC_SET1_VAL128(__val)
+        #undef VEC_MOVE(__v1, __val)
+        #undef VEC_CAST_128_256(__v1)
+        #undef VEC_INSERT_VAL(__v1, __val, __pos)
+	#undef VEC_CVT_128_256(__v1)	
+	#undef VEC_SET1_VAL(__val)
+	#undef VEC_POPCVT_CHAR(__ch)
+	#undef VEC_LDPOPCVT_CHAR(__addr)
+	#undef VEC_CMP_EQ(__v1, __v2)
+	#undef VEC_SET_LSE(__val)
+	#undef SHIFT_HAP(__v1, __val)
+	#undef print256b(__v1)
+        #undef MASK_VEC
+        #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
+        #undef VEC_SHIFT_LEFT_1BIT(__vs)
+        #undef MASK_ALL_ONES
+        #undef COMPARE_VECS(__v1, __v2)
+        #undef _256_INT_TYPE
+#endif
+
+#define PRECISION s
+
+#define MAIN_TYPE float
+#define MAIN_TYPE_SIZE 32
+#define UNION_TYPE mix_F
+#define IF_128 IF_128f
+#define IF_MAIN_TYPE IF_32
+#define SHIFT_CONST1 12
+#define SHIFT_CONST2 3
+#define SHIFT_CONST3 4
+#define _128_TYPE __m128
+#define _256_TYPE __m256
+#define _256_INT_TYPE __m256i
+#define AVX_LENGTH 8
+#define MAVX_COUNT  (MROWS+7)/AVX_LENGTH
+#define HAP_TYPE UNION_TYPE
+#define MASK_TYPE uint32_t
+#define MASK_ALL_ONES 0xFFFFFFFF
+#define MASK_VEC MaskVec_F
+
+#define SET_VEC_ZERO(__vec)                     \
+  __vec= _mm256_setzero_ps()
+
+#define VEC_OR(__v1, __v2)                      \
+  _mm256_or_ps(__v1, __v2)
+
+#define VEC_ADD(__v1, __v2)                     \
+  _mm256_add_ps(__v1, __v2)
+
+#define VEC_SUB(__v1, __v2)                     \
+  _mm256_sub_ps(__v1, __v2)
+
+#define VEC_MUL(__v1, __v2)                     \
+  _mm256_mul_ps(__v1, __v2)
+
+#define VEC_DIV(__v1, __v2)                     \
+  _mm256_div_ps(__v1, __v2)
+
+#define VEC_BLEND(__v1, __v2, __mask)           \
+  _mm256_blend_ps(__v1, __v2, __mask)
+
+#define VEC_BLENDV(__v1, __v2, __maskV)         \
+  _mm256_blendv_ps(__v1, __v2, __maskV)
+
+#define VEC_CAST_256_128(__v1)			\
+  _mm256_castps256_ps128 (__v1)
+
+#define VEC_EXTRACT_128(__v1, __im)             \
+  _mm256_extractf128_ps (__v1, __im)
+
+#define VEC_EXTRACT_UNIT(__v1, __im)		\
+  _mm_extract_epi32(__v1, __im)
+
+#define VEC_SET1_VAL128(__val)			\
+  _mm_set1_ps(__val)
+
+#define VEC_MOVE(__v1, __val)			\
+  _mm_move_ss(__v1, __val)
+
+#define VEC_CAST_128_256(__v1)                  \
+  _mm256_castps128_ps256(__v1)
+
+#define VEC_INSERT_VAL(__v1, __val, __pos)      \
+  _mm256_insertf128_ps(__v1, __val, __pos)
+
+#define VEC_CVT_128_256(__v1)                   \
+  _mm256_cvtepi32_ps(__v1.i)
+
+#define VEC_SET1_VAL(__val)                 	\
+  _mm256_set1_ps(__val)
+
+#define VEC_POPCVT_CHAR(__ch)                   \
+  _mm256_cvtepi32_ps(_mm256_set1_epi32(__ch))
+
+#define VEC_LDPOPCVT_CHAR(__addr)		\
+   _mm256_cvtepi32_ps(_mm256_loadu_si256((__m256i const *)__addr))
+
+#define VEC_CMP_EQ(__v1, __v2)                  \
+  _mm256_cmp_ps(__v1, __v2, _CMP_EQ_OQ)
+
+#define VEC_SET_LSE(__val)			\
+  _mm256_set_ps(zero, zero, zero, zero, zero, zero, zero, __val);
+
+#define SHIFT_HAP(__v1, __val)			\
+  _vector_shift_lasts(__v1, __val.f);
+
+#define print256b(__v1)                         \
+  print256bFP(__v1)
+  
+#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)	\
+  __vdst = _mm256_castps128_ps256(__vsLow) ;		\
+  __vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ;
+
+#define VEC_SHIFT_LEFT_1BIT(__vs)		\
+  __vs = _mm_slli_epi32(__vs, 1) 
+
+#define COMPARE_VECS(__v1, __v2, __first, __last) {			\
+    float* ptr1 = (float*) (&__v1) ;					\
+    float* ptr2 = (float*) (&__v2) ;					\
+    for (int ei=__first; ei <= __last; ++ei) {				\
+      if (ptr1[ei] != ptr2[ei]) {					\
+	std::cout << "Float Mismatch at " << ei << ": "			\
+		  << ptr1[ei] << " vs. " << ptr2[ei] << std::endl ;	\
+	exit(0) ;							\
+      }									\
+    }									\
+  }
+
--- a/PairHMM_JNI/hmm_mask.cc
+++ b/PairHMM_JNI/hmm_mask.cc
@ -0,0 +1,451 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <math.h>
+
+
+// /usr/intel/pkgs/icc/13.0.0e/bin/icc -o ed -O3 ed.cpp -xAVX  -openmp -openmp-link static
+
+
+#include <iostream>
+
+#include <malloc.h>
+#include <assert.h>
+#include <sys/time.h>
+#include <omp.h>
+#include <stdlib.h>
+#include <math.h>
+
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <smmintrin.h>
+
+
+#include <stdint.h>
+#include <algorithm>
+#include <vector>
+#include <set>
+#include <map>
+#include <memory.h>
+#include "template.h"
+
+using namespace std ;
+
+#define CHECK_MASK_CORRECTNESS
+
+template <class T>
+string getBinaryStr (T val, int numBitsToWrite) {
+  
+  ostringstream oss ;
+  uint64_t mask = ((T) 0x1) << (numBitsToWrite-1) ;
+  for (int i=numBitsToWrite-1; i >= 0; --i) {
+    oss << ((val & mask) >> i) ;
+    mask >>= 1 ;
+  }
+  return oss.str() ;
+}
+
+int normalize(char c)
+{
+	return ((int) (c - 33));
+}
+
+
+uint8_t ConvertChar::conversionTable[255] ;
+
+int read_testcase(testcase *tc, FILE* ifp)
+{
+	char *q, *i, *d, *c, *line = NULL;
+	int _q, _i, _d, _c;
+	int x, size = 0;
+	ssize_t read;
+
+	read = getline(&line, (size_t *) &size, ifp == 0 ? stdin : ifp);
+	if (read == -1)
+		return -1;
+
+
+	tc->hap = (char *) malloc(size);
+	tc->rs = (char *) malloc(size);
+	q = (char *) malloc(size);
+	i = (char *) malloc(size);
+	d = (char *) malloc(size);
+	c = (char *) malloc(size);
+
+	if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6)
+		return -1;
+
+	tc->haplen = strlen(tc->hap);
+	tc->rslen = strlen(tc->rs);
+	assert(tc->rslen < MROWS);
+	tc->ihap = (int *) malloc(tc->haplen*sizeof(int));
+	tc->irs = (int *) malloc(tc->rslen*sizeof(int));
+
+	//tc->q = (int *) malloc(sizeof(int) * tc->rslen);
+	//tc->i = (int *) malloc(sizeof(int) * tc->rslen);
+	//tc->d = (int *) malloc(sizeof(int) * tc->rslen);
+	//tc->c = (int *) malloc(sizeof(int) * tc->rslen);
+
+	for (x = 0; x < tc->rslen; x++)
+	{
+		_q = normalize(q[x]);
+		_i = normalize(i[x]);
+		_d = normalize(d[x]);
+		_c = normalize(c[x]);
+		tc->q[x] = (_q < 6) ? 6 : _q;
+		tc->i[x] = _i;
+		tc->d[x] = _d;
+		tc->c[x] = _c;
+		tc->irs[x] = tc->rs[x];
+	}
+	for (x = 0; x < tc->haplen; x++)
+	  tc->ihap[x] = tc->hap[x];
+
+
+	free(q);
+	free(i);
+	free(d);
+	free(c);
+	free(line);
+
+	return 0;
+}
+
+
+
+
+#define ALIGN __attribute__ ((aligned(32)))
+
+
+typedef union ALIGN { 
+  //__m256i vi; 
+  __m128 vf ;
+  __m128i vi ;
+  __m128d vd; 
+
+  uint8_t b[16];
+  //uint16_t hw[8] ;
+  uint32_t w[4] ;
+  uint64_t dw[2] ;
+  
+  float f[4] ;
+  //double d[2] ;
+
+} v128 ;
+
+typedef union ALIGN { 
+  __m256i vi; 
+  __m256 vf ;
+  __m256d vd; 
+  //__m128i vi128[2] ;
+
+  uint8_t b[32];
+  //uint16_t hw[16] ;
+  uint32_t w[8] ;
+  uint64_t dw[4] ;
+  
+  float f[8] ;
+  double d[4] ;
+
+} v256 ;
+
+
+#define NUM_DISTINCT_CHARS 5
+#define AMBIG_CHAR 4
+
+#define VEC_ENTRY_CNT 8
+#define VEC_LEN 256
+#define VTYPE vf
+#define VTYPEI vi
+#define VENTRY f
+#define VENTRYI w
+#define MTYPE uint32_t
+#define MASK_ALL_ONES 0xFFFFFFFF
+
+
+#define VECTOR v256
+#define VECTOR_SSE v128
+
+#define SET_VEC_ZERO(__vec)			\
+  __vec= _mm256_setzero_ps()
+
+#define SET_VEC_ONES(__vec)			\
+  __vec = _mm256_set1_epi32(0xFFFFFFFF)
+
+#define VEC_OR(__v1, __v2)			\
+  _mm256_or_ps(__v1, __v2)
+
+#define VEC_ADD(__v1, __v2)			\
+  _mm256_add_ps(__v1, __v2)
+
+#define VEC_MUL(__v1, __v2)			\
+  _mm256_mul_ps(__v1, __v2) 
+
+#define VEC_BLENDV(__v1, __v2, __maskV)		\
+  _mm256_blendv_ps(__v1, __v2, __maskV)
+
+#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)	\
+  __vdst = _mm256_castps128_ps256(__vsLow) ;		\
+  __vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ;
+
+#define VECS_SHIFT_LEFT_1BIT(__vs)		\
+  __vs = _mm_slli_epi32(__vs, 1) 
+
+
+#define VECS_SHIFT_RIGHT_WHOLE(__vs, __cnt) {				\
+    uint64_t __mask = ;							\
+    uint64_t __shiftWord = ((((uint64_t) 0x1) << __cnt) - 1 ) & __vs.dw[1] ; \
+    __shiftWord <<= (64-cnt) ;						\
+    __vs = _mm_slri_epi64(__vs, __cnt) ;				\
+    __vs.dw[0] |= __shiftWord ;						\
+  }
+
+
+#define GET_MASK_WORD_OLD(__mask, __lastShiftOut, __shiftBy, __maskBitCnt){ \
+    MTYPE __bitMask = (((MTYPE)0x1) << __shiftBy) - 1 ;			\
+    MTYPE __nextShiftOut = (__mask & __bitMask) << (__maskBitCnt - __shiftBy) ; \
+    __mask >>= __shiftBy ;						\
+    __mask |= __lastShiftOut ;						\
+    __lastShiftOut = __nextShiftOut ;					\
+  }
+
+
+#define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \
+  MTYPE __bitMask = (((MTYPE)0x1) << __shiftBy) - 1 ;			\
+  MTYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \
+  __dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ;		\
+  __lastShiftOut = __nextShiftOut ;					\ 
+}
+
+
+void precompute_masks_avx(const testcase& tc, int COLS, int numMaskVecs,
+			  MTYPE (*maskArr)[NUM_DISTINCT_CHARS]) {
+
+  const int maskBitCnt = VEC_LEN / VEC_ENTRY_CNT ;
+
+  for (int vi=0; vi < numMaskVecs; ++vi) {
+    for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) {
+      maskArr[vi][rs] = 0 ;
+    }
+    maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ;
+  }
+ 
+  for (int col=1; col < COLS; ++col) {
+    int mIndex = (col-1) / maskBitCnt ;
+    int mOffset = (col-1) % maskBitCnt ;
+    MTYPE bitMask = ((MTYPE)0x1) << (maskBitCnt-1-mOffset) ;
+
+    char hapChar = tc.hap[col-1] ;
+
+    if (hapChar == AMBIG_CHAR) {
+      maskArr[mIndex][0] |= bitMask ;
+      maskArr[mIndex][1] |= bitMask ;
+      maskArr[mIndex][2] |= bitMask ;
+      maskArr[mIndex][3] |= bitMask ;
+    } 
+
+    //cout << hapChar << " " << mIndex << " " << getBinaryStr<MTYPE>(bitMask, 32)
+    // << endl ;
+    //cout << getBinaryStr<MTYPE>(maskArr[0][hapChar],32) << endl ;
+    //exit(0) ;
+		     
+    maskArr[mIndex][ConvertChar::get(hapChar)] |= bitMask ;
+
+
+    // bit corresponding to col 1 will be the MSB of the mask 0
+    // bit corresponding to col 2 will be the MSB-1 of the mask 0
+    // ...
+    // bit corresponding to col 32 will be the LSB of the mask 0
+    // bit corresponding to col 33 will be the MSB of the mask 1
+    // ...
+  }
+
+}
+
+
+void test_mask_computations (testcase& tc, int tcID, bool printDebug=false) {
+
+  int ROWS = tc.rslen + 1 ;
+  int COLS = tc.haplen + 1 ;
+
+  // only for testing
+  VECTOR mismatchData, matchData ;
+  //SET_VEC_ZERO(mismatchData.VTYPE) ;
+  //SET_VEC_ONES(matchData.VTYPEI) ;
+  for (int ei=0; ei < VEC_ENTRY_CNT; ++ei) {
+    matchData.VENTRY[ei] = 1.0 ;
+    mismatchData.VENTRY[ei] = 0.0 ;
+  }
+  
+  const int maskBitCnt = VEC_LEN / VEC_ENTRY_CNT ;
+  const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function
+
+  MTYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ;
+  precompute_masks_avx(tc, COLS, numMaskVecs, maskArr) ;
+
+#ifdef DEBUG 
+  if (printDebug) {
+    cout << "The first 32 hap chars are: " ;
+    for (int i=0; i < 32; ++i) {
+      cout << tc.hap[i] ;
+    }
+    cout << endl ;
+
+    cout << "Masks computed for A, C, T, G, N are: "  << endl ;
+    cout << getBinaryStr<MTYPE>(maskArr[0][0], 32)  << endl ;
+    cout << getBinaryStr<MTYPE>(maskArr[0][1], 32)  << endl ;
+    cout << getBinaryStr<MTYPE>(maskArr[0][2], 32)  << endl ;
+    cout << getBinaryStr<MTYPE>(maskArr[0][3], 32)  << endl ;
+    cout << getBinaryStr<MTYPE>(maskArr[0][4], 32)  << endl ;
+  }
+#endif // #ifdef DEBUG 
+
+  int beginRowIndex = 1 ;
+  while (beginRowIndex < ROWS) {
+
+    int numRowsToProcess = min(VEC_ENTRY_CNT, ROWS - beginRowIndex) ;
+
+    char rsArr[VEC_ENTRY_CNT] ;
+    for (int ri=0; ri < numRowsToProcess; ++ri) {
+      rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ;
+    }
+
+    // Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
+    VECTOR_SSE currMaskVecLow ; // corresponding to entries 0-3
+    VECTOR_SSE currMaskVecHigh ; // corresponding to entries 4-7
+
+    MTYPE lastMaskShiftOut[VEC_ENTRY_CNT] ;
+    for (int ei=0; ei < VEC_ENTRY_CNT; ++ei) 
+      lastMaskShiftOut[ei] = 0 ;
+
+    int col = 1 ;
+    int diag = 1 ;
+    for (int maskIndex=0; maskIndex < numMaskVecs; ++maskIndex) {
+      // set up the mask vectors for the next maskBitCnt columns
+
+      // For AVX, maskBitCnt = 32 (so, the operation below is amortized over 32 cols)
+      for (int ei=0; ei < VEC_ENTRY_CNT/2; ++ei) {
+	SET_MASK_WORD(currMaskVecLow.VENTRYI[ei], maskArr[maskIndex][rsArr[ei]], 
+		      lastMaskShiftOut[ei], ei, maskBitCnt) ;
+
+      	int ei2 = ei + VEC_ENTRY_CNT/2 ; // the second entry index
+	SET_MASK_WORD(currMaskVecHigh.VENTRYI[ei], maskArr[maskIndex][rsArr[ei2]], 
+		      lastMaskShiftOut[ei2], ei2, maskBitCnt) ;
+      }
+
+#ifdef DEBUG 
+      if (printDebug && maskIndex == 0) {
+	cout << "The masks for entry 1: " << endl  
+	     << getBinaryStr<MTYPE>(maskArr[0][rsArr[1]], 32) << endl
+	     << getBinaryStr<MTYPE>(currMaskVecLow.VENTRYI[1], 32) << endl ;
+
+      }
+#endif // #ifdef DEBUG 
+      
+      // iterate over mask bit indices and columns
+      for (int mbi=0; mbi < maskBitCnt && diag < COLS + ROWS -2; ++mbi, ++diag) {
+
+	VECTOR maskV ;
+	VEC_SSE_TO_AVX(currMaskVecLow.VTYPE, currMaskVecHigh.VTYPE, maskV.VTYPE) ;
+
+	VECTOR testData ;
+	testData.VTYPE = VEC_BLENDV(mismatchData.VTYPE, matchData.VTYPE,
+				    maskV.VTYPE) ;
+
+	VECS_SHIFT_LEFT_1BIT(currMaskVecLow.VTYPEI) ;
+	VECS_SHIFT_LEFT_1BIT(currMaskVecHigh.VTYPEI) ;
+
+#ifdef DEBUG 
+	if (printDebug && maskIndex == 0) {
+	  cout << "The mask for entry 1, mbi=" << mbi << ": "
+	       << getBinaryStr<MTYPE>(maskV.VENTRYI[1], 32) << endl ;
+
+	}
+#endif // #ifdef DEBUG 
+
+#ifdef CHECK_MASK_CORRECTNESS
+
+	int firstRowIndex = (diag < COLS) ? 0 : (diag - COLS) ;
+	int lastRowIndex = min(col-1, numRowsToProcess-1) ;
+
+	for (int ri=firstRowIndex; ri <= lastRowIndex; ++ri) {
+	  int currRow = beginRowIndex + ri ;
+	  int currCol = col - ri + firstRowIndex ;
+
+	  char hapChar = tc.hap[currCol-1] ;
+	  char rsChar = tc.rs[currRow-1] ;
+
+	  bool match = (hapChar == rsChar || hapChar == 'N' || rsChar == 'N') ;
+
+	  if ((bool) testData.VENTRYI[ri] != match) {
+	    cout << "Error: Incorrect mask for tc " << tcID << ", diag = " << diag
+		 << " (" << currRow  << ", " << currCol << ")" << endl ;
+
+	    cout << "The chars are: " << hapChar << " and " << rsChar << endl ;
+	    cout << "The selected value is: " << testData.VENTRYI[ri] << endl ;
+
+	    exit(0) ;
+	  }
+	}
+#endif // #ifdef CHECK_MASK_CORRECTNESS
+
+	if (diag < COLS)
+	  ++col ;
+
+      } // mbi
+    } // maskIndex
+
+    beginRowIndex += VEC_ENTRY_CNT ;
+  } // end of stripe
+
+  //cout << "Finished validating entry " << endl ;
+}
+
+#ifdef HMM_MASK_MAIN
+int main () {
+
+  #define BATCH_SIZE 10000
+
+  ConvertChar::init() ;
+  
+  testcase* tcBatch = new testcase[BATCH_SIZE] ;
+  
+  int numBatches = 0 ;
+  int numRead = 0 ;
+
+  const int DEBUG_TC = -1 ;
+
+  FILE* ifp = stdin ;
+
+  do {
+    numRead = 0 ;
+
+    while (numRead < BATCH_SIZE) {
+      if (read_testcase(tcBatch+numRead, ifp) < 0)
+	break ;
+      
+      ++numRead ;
+    }
+
+    if (numRead == 0)
+      break ;
+
+    for (int ti=0; ti < numRead; ++ti) {
+
+      int tcID = numBatches * BATCH_SIZE + ti ;
+      test_mask_computations(tcBatch[ti], tcID, tcID == DEBUG_TC) ;
+    }
+
+    ++numBatches ;
+  } while (numRead == BATCH_SIZE) ;
+  
+  fclose(ifp) ;
+
+  delete[] tcBatch ;
+
+  return 0 ;
+}
+#endif
--- a/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
+++ b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
@ -0,0 +1,349 @@
+#include <jni.h>
+#include <assert.h>
+#include <stdio.h>
+#include "org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h"
+#include <vector>
+
+#include <cstdio>
+#include <string>
+#include <cstring>
+#include <cstdlib>
+#include <cmath>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+
+#include <immintrin.h>
+#include <emmintrin.h>
+#include <omp.h>
+using namespace std;
+//#define DEBUG 1
+//#define DEBUG0_1 1
+//#define DEBUG3 1
+
+
+#include "template.h"
+
+#include "define-double.h"
+#include "shift_template.c"
+#include "pairhmm-template-kernel.cc"
+
+
+
+#define MM 0
+#define GapM 1
+#define MX 2
+#define XX 3
+#define MY 4
+#define YY 5
+
+class LoadTimeInitializer
+{
+  public:
+    LoadTimeInitializer()		//will be called when library is loaded
+    {
+      ConvertChar::init();
+    }
+    jfieldID m_readBasesFID;
+    jfieldID m_readQualsFID;
+    jfieldID m_insertionGOPFID;
+    jfieldID m_deletionGOPFID;
+    jfieldID m_overallGCPFID;
+    jfieldID m_haplotypeBasesFID;
+};
+LoadTimeInitializer g_load_time_initializer;
+
+void debug_dump(string filename, string s, bool to_append, bool add_newline)
+{
+  ofstream fptr;
+  fptr.open(filename.c_str(), to_append ? ofstream::app : ofstream::out);
+  fptr << s;
+  if(add_newline)
+    fptr << "\n";
+  fptr.close();
+}
+
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeProbabilities
+(JNIEnv* env, jclass thisObject,
+ jobjectArray transition, jbyteArray insertionGOP, jbyteArray deletionGOP, jbyteArray overallGCP
+ )
+{}
+
+  JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitialize
+(JNIEnv* env, jobject thisObject,
+ jint readMaxLength, jint haplotypeMaxLength)
+{}
+
+JNIEXPORT jdouble JNICALL 
+Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializePriorsAndUpdateCells( 
+    JNIEnv* env, jobject thisObject,
+    jboolean doInitialization, jint paddedReadLength, jint paddedHaplotypeLength,
+    jbyteArray readBases, jbyteArray haplotypeBases, jbyteArray readQuals,
+    jint hapStartIndex
+    )
+
+{ return 0.0; }
+
+#define DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY 1
+
+#ifdef DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY
+//Gets direct access to Java arrays
+#define GET_BYTE_ARRAY_ELEMENTS env->GetPrimitiveArrayCritical
+#define RELEASE_BYTE_ARRAY_ELEMENTS env->ReleasePrimitiveArrayCritical
+#define JNI_RELEASE_MODE JNI_ABORT
+#define GET_DOUBLE_ARRAY_ELEMENTS env->GetPrimitiveArrayCritical
+#define RELEASE_DOUBLE_ARRAY_ELEMENTS env->ReleasePrimitiveArrayCritical
+
+#else
+//Likely makes copy of Java arrays to JNI C++ space
+#define GET_BYTE_ARRAY_ELEMENTS env->GetByteArrayElements
+#define RELEASE_BYTE_ARRAY_ELEMENTS env->ReleaseByteArrayElements
+#define JNI_RELEASE_MODE JNI_ABORT
+#define GET_DOUBLE_ARRAY_ELEMENTS env->GetDoubleArrayElements
+#define RELEASE_DOUBLE_ARRAY_ELEMENTS env->ReleaseDoubleArrayElements
+
+#endif		//ifdef DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY
+
+JNIEXPORT jdouble JNICALL 
+Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadLikelihoodGivenHaplotypeLog10( 
+    JNIEnv* env, jobject thisObject,
+    jint readLength, jint haplotypeLength,
+    jbyteArray readBases, jbyteArray haplotypeBases, jbyteArray readQuals,
+    jbyteArray insertionGOP, jbyteArray deletionGOP, jbyteArray overallGCP,
+    jint hapStartIndex
+    )
+{
+  jboolean is_copy = JNI_FALSE;
+  jbyte* readBasesArray =   (jbyte*)GET_BYTE_ARRAY_ELEMENTS(readBases, &is_copy);
+  jbyte* haplotypeBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(haplotypeBases, &is_copy);
+  jbyte* readQualsArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(readQuals, &is_copy);
+  jbyte* insertionGOPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(insertionGOP, &is_copy);
+  jbyte* deletionGOPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(deletionGOP, &is_copy);
+  jbyte* overallGCPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(overallGCP, &is_copy);
+#ifdef DEBUG
+  assert(readBasesArray && "readBasesArray not initialized in JNI"); 
+  assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); 
+  assert(readQualsArray && "readQualsArray not initialized in JNI"); 
+  assert(insertionGOPArray && "insertionGOP array not initialized in JNI");
+  assert(deletionGOPArray && "deletionGOP array not initialized in JNI");
+  assert(overallGCPArray && "OverallGCP array not initialized in JNI");
+  assert(readLength < MROWS);
+#endif
+  testcase tc;
+  tc.rslen = readLength;
+  tc.haplen = haplotypeLength;
+  tc.rs = (char*)readBasesArray;
+  tc.hap = (char*)haplotypeBasesArray;
+  for(unsigned i=0;i<readLength;++i)
+  {
+    tc.q[i] = (int)readQualsArray[i];
+    tc.i[i] = (int)insertionGOPArray[i];
+    tc.d[i] = (int)deletionGOPArray[i];
+    tc.c[i] = (int)overallGCPArray[i];
+  }
+
+  double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&tc);
+  double result = log10(result_avxd) - log10(ldexp(1.0, 1020));
+#ifdef DEBUG
+  debug_dump("return_values_jni.txt",to_string(result),true);
+#endif
+
+
+  RELEASE_BYTE_ARRAY_ELEMENTS(overallGCP, overallGCPArray, JNI_RELEASE_MODE);
+  RELEASE_BYTE_ARRAY_ELEMENTS(deletionGOP, deletionGOPArray, JNI_RELEASE_MODE);
+  RELEASE_BYTE_ARRAY_ELEMENTS(insertionGOP, insertionGOPArray, JNI_RELEASE_MODE);
+  RELEASE_BYTE_ARRAY_ELEMENTS(readQuals, readQualsArray, JNI_RELEASE_MODE);
+  RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBases, haplotypeBasesArray, JNI_RELEASE_MODE);
+  RELEASE_BYTE_ARRAY_ELEMENTS(readBases, readBasesArray, JNI_RELEASE_MODE);
+
+  return 0.0;
+}
+
+//Should be called only once for the whole Java process - initializes field ids for the classes JNIReadDataHolderClass
+//and JNIHaplotypeDataHolderClass
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniGlobalInit
+  (JNIEnv* env, jobject thisObject, jclass readDataHolderClass, jclass haplotypeDataHolderClass)
+{
+  assert(readDataHolderClass);
+  assert(haplotypeDataHolderClass);
+  jfieldID fid;
+  fid = env->GetFieldID(readDataHolderClass, "readBases", "[B");
+  assert(fid && "JNI pairHMM: Could not get FID for readBases");
+  g_load_time_initializer.m_readBasesFID = fid;
+  fid = env->GetFieldID(readDataHolderClass, "readQuals", "[B");
+  assert(fid && "JNI pairHMM: Could not get FID for readQuals");
+  g_load_time_initializer.m_readQualsFID = fid;
+  fid = env->GetFieldID(readDataHolderClass, "insertionGOP", "[B");
+  assert(fid && "JNI pairHMM: Could not get FID for insertionGOP");
+  g_load_time_initializer.m_insertionGOPFID = fid;
+  fid = env->GetFieldID(readDataHolderClass, "deletionGOP", "[B");
+  assert(fid && "JNI pairHMM: Could not get FID for deletionGOP");
+  g_load_time_initializer.m_deletionGOPFID = fid;
+  fid = env->GetFieldID(readDataHolderClass, "overallGCP", "[B");
+  assert(fid && "JNI pairHMM: Could not get FID for overallGCP");
+  g_load_time_initializer.m_overallGCPFID = fid;
+
+  fid = env->GetFieldID(haplotypeDataHolderClass, "haplotypeBases", "[B");
+  assert(fid && "JNI pairHMM: Could not get FID for haplotypeBases");
+  g_load_time_initializer.m_haplotypeBasesFID = fid;
+}
+
+
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniComputeLikelihoods
+  (JNIEnv* env, jobject thisObject, jint numReads, jint numHaplotypes, 
+   jobjectArray readDataArray, jobjectArray haplotypeDataArray, jdoubleArray likelihoodArray, jint maxNumThreadsToUse)
+{
+  jboolean is_copy = JNI_FALSE;
+  //To ensure, GET_BYTE_ARRAY_ELEMENTS is invoked only once for each haplotype, store bytearrays in a vector
+  vector<pair<jbyteArray, jbyte*> > haplotypeBasesArrayVector;
+  haplotypeBasesArrayVector.clear();
+  haplotypeBasesArrayVector.resize(numHaplotypes);
+#ifdef DEBUG0_1
+  cout << "JNI numReads "<<numReads<<" numHaplotypes "<<numHaplotypes<<"\n";
+#endif
+  for(unsigned j=0;j<numHaplotypes;++j)
+  {
+    jobject haplotypeObject = env->GetObjectArrayElement(haplotypeDataArray, j);
+    jbyteArray haplotypeBases = (jbyteArray)env->GetObjectField(haplotypeObject, g_load_time_initializer.m_haplotypeBasesFID);
+#ifdef DEBUG
+    assert(haplotypeBases && ("haplotypeBases is NULL at index : "+to_string(j)+"\n").c_str());
+#endif
+    jbyte* haplotypeBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(haplotypeBases, &is_copy);
+#ifdef DEBUG
+    assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); 
+    assert(env->GetArrayLength(haplotypeBases) < MCOLS);
+#ifdef DEBUG0_1
+    cout << "JNI haplotype length "<<env->GetArrayLength(haplotypeBases)<<"\n";
+#endif
+#endif
+    haplotypeBasesArrayVector[j] = make_pair(haplotypeBases, haplotypeBasesArray);
+#ifdef DEBUG3
+    for(unsigned k=0;k<env->GetArrayLength(haplotypeBases);++k)
+      debug_dump("haplotype_bases_jni.txt",to_string((int)haplotypeBasesArray[k]),true);
+#endif
+  }
+
+  unsigned numTestCases = numReads*numHaplotypes;
+  vector<testcase> tc_array;
+  tc_array.clear();
+  tc_array.resize(numTestCases);
+  unsigned tc_idx = 0;
+  //Store arrays for release later
+  vector<vector<pair<jbyteArray,jbyte*> > > readBasesArrayVector;
+  readBasesArrayVector.clear();
+  readBasesArrayVector.resize(numReads);
+  for(unsigned i=0;i<numReads;++i)
+  {
+    //Get bytearray fields from read
+    jobject readObject = env->GetObjectArrayElement(readDataArray, i);
+    jbyteArray readBases = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_readBasesFID);
+    jbyteArray insertionGOP = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_insertionGOPFID);
+    jbyteArray deletionGOP = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_deletionGOPFID);
+    jbyteArray overallGCP = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_overallGCPFID);
+    jbyteArray readQuals = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_readQualsFID);
+
+#ifdef DEBUG
+    assert(readBases && ("readBases is NULL at index : "+to_string(i)+"\n").c_str());
+    assert(insertionGOP && ("insertionGOP is NULL at index : "+to_string(i)+"\n").c_str());
+    assert(deletionGOP && ("deletionGOP is NULL at index : "+to_string(i)+"\n").c_str());
+    assert(overallGCP && ("overallGCP is NULL at index : "+to_string(i)+"\n").c_str());
+    assert(readQuals && ("readQuals is NULL at index : "+to_string(i)+"\n").c_str());
+#endif
+    jsize readLength = env->GetArrayLength(readBases);
+
+    jbyte* readBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(readBases, &is_copy);	//order of GET-RELEASE is important
+    jbyte* readQualsArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(readQuals, &is_copy);
+    jbyte* insertionGOPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(insertionGOP, &is_copy);
+    jbyte* deletionGOPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(deletionGOP, &is_copy);
+    jbyte* overallGCPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(overallGCP, &is_copy);
+#ifdef DEBUG
+    assert(readBasesArray && "readBasesArray not initialized in JNI"); 
+    assert(readQualsArray && "readQualsArray not initialized in JNI"); 
+    assert(insertionGOPArray && "insertionGOP array not initialized in JNI");
+    assert(deletionGOPArray && "deletionGOP array not initialized in JNI");
+    assert(overallGCPArray && "overallGCP array not initialized in JNI");
+    assert(readLength < MROWS); 
+    assert(readLength == env->GetArrayLength(readQuals));
+    assert(readLength == env->GetArrayLength(insertionGOP));
+    assert(readLength == env->GetArrayLength(deletionGOP));
+    assert(readLength == env->GetArrayLength(overallGCP));
+#ifdef DEBUG0_1
+    cout << "JNI read length "<<readLength<<"\n";
+#endif
+#endif
+#ifdef DEBUG3
+    for(unsigned j=0;j<readLength;++j)
+    {
+      debug_dump("reads_jni.txt",to_string((int)readBasesArray[j]),true);
+      debug_dump("reads_jni.txt",to_string((int)readQualsArray[j]),true);
+      debug_dump("reads_jni.txt",to_string((int)insertionGOPArray[j]),true);
+      debug_dump("reads_jni.txt",to_string((int)deletionGOPArray[j]),true);
+      debug_dump("reads_jni.txt",to_string((int)overallGCPArray[j]),true);
+    }
+#endif
+
+    for(unsigned j=0;j<numHaplotypes;++j)
+    {
+      jsize haplotypeLength = env->GetArrayLength(haplotypeBasesArrayVector[j].first);
+      jbyte* haplotypeBasesArray = haplotypeBasesArrayVector[j].second;
+      tc_array[tc_idx].rslen = (int)readLength;
+      tc_array[tc_idx].haplen = (int)haplotypeLength;
+      tc_array[tc_idx].rs = (char*)readBasesArray;
+      tc_array[tc_idx].hap = (char*)haplotypeBasesArray;
+      //Can be avoided 
+      for(unsigned k=0;k<readLength;++k)
+      {
+	tc_array[tc_idx].q[k] = (int)readQualsArray[k];
+	tc_array[tc_idx].i[k] = (int)insertionGOPArray[k];
+	tc_array[tc_idx].d[k] = (int)deletionGOPArray[k];
+	tc_array[tc_idx].c[k] = (int)overallGCPArray[k];
+      }
+
+      ++tc_idx;  
+    }
+    RELEASE_BYTE_ARRAY_ELEMENTS(overallGCP, overallGCPArray, JNI_RELEASE_MODE);	//order of GET-RELEASE is important
+    RELEASE_BYTE_ARRAY_ELEMENTS(deletionGOP, deletionGOPArray, JNI_RELEASE_MODE);
+    RELEASE_BYTE_ARRAY_ELEMENTS(insertionGOP, insertionGOPArray, JNI_RELEASE_MODE);
+    RELEASE_BYTE_ARRAY_ELEMENTS(readQuals, readQualsArray, JNI_RELEASE_MODE);
+
+    //Release readBases at end because it is used by compute_full_prob
+    readBasesArrayVector[i].clear();
+    readBasesArrayVector[i].resize(1);
+    readBasesArrayVector[i][0] = make_pair(readBases, readBasesArray);
+  }
+
+  jdouble* likelihoodDoubleArray = (jdouble*)GET_DOUBLE_ARRAY_ELEMENTS(likelihoodArray, &is_copy);
+  assert(likelihoodDoubleArray && "likelihoodArray is NULL");
+  assert(env->GetArrayLength(likelihoodArray) == numTestCases);
+#pragma omp parallel for schedule (dynamic,10) private(tc_idx) num_threads(maxNumThreadsToUse) 
+  for(tc_idx=0;tc_idx<numTestCases;++tc_idx)
+  {
+    double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&(tc_array[tc_idx]));
+    double result = log10(result_avxd) - log10(ldexp(1.0, 1020));
+    likelihoodDoubleArray[tc_idx] = result;
+  }
+#ifdef DEBUG
+  for(tc_idx=0;tc_idx<numTestCases;++tc_idx)
+  {
+    debug_dump("return_values_jni.txt",to_string(likelihoodDoubleArray[tc_idx]),true);
+  }
+#endif
+  RELEASE_DOUBLE_ARRAY_ELEMENTS(likelihoodArray, likelihoodDoubleArray, 0);
+
+  //Release read arrays first
+  for(int i=readBasesArrayVector.size()-1;i>=0;--i)//note the order - reverse of GET
+  {
+    for(int j=readBasesArrayVector.size()-1;j>=0;--j)
+      RELEASE_BYTE_ARRAY_ELEMENTS(readBasesArrayVector[i][j].first, readBasesArrayVector[i][j].second, JNI_RELEASE_MODE);
+    readBasesArrayVector[i].clear();
+  }
+  readBasesArrayVector.clear();
+
+  //Now release haplotype arrays
+  for(int j=numHaplotypes-1;j>=0;--j)	//note the order - reverse of GET
+    RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBasesArrayVector[j].first, haplotypeBasesArrayVector[j].second, JNI_RELEASE_MODE);
+  haplotypeBasesArrayVector.clear();
+  tc_array.clear();
+}
+
--- a/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h
+++ b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h
@ -0,0 +1,76 @@
+/* DO NOT EDIT THIS FILE - it is machine generated */
+#include <jni.h>
+/* Header for class org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM */
+
+#ifndef _Included_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
+#define _Included_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
+#ifdef __cplusplus
+extern "C" {
+#endif
+#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_TRISTATE_CORRECTION
+#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_TRISTATE_CORRECTION 3.0
+#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToMatch
+#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToMatch 0L
+#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_indelToMatch
+#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_indelToMatch 1L
+#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToInsertion
+#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToInsertion 2L
+#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_insertionToInsertion
+#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_insertionToInsertion 3L
+#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToDeletion
+#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToDeletion 4L
+#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_deletionToDeletion
+#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_deletionToDeletion 5L
+/*
+ * Class:     org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
+ * Method:    jniInitialize
+ * Signature: (II)V
+ */
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitialize
+  (JNIEnv *, jobject, jint, jint);
+
+/*
+ * Class:     org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
+ * Method:    jniInitializeProbabilities
+ * Signature: ([[D[B[B[B)V
+ */
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeProbabilities
+  (JNIEnv *, jclass, jobjectArray, jbyteArray, jbyteArray, jbyteArray);
+
+/*
+ * Class:     org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
+ * Method:    jniInitializePriorsAndUpdateCells
+ * Signature: (ZII[B[B[BI)D
+ */
+JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializePriorsAndUpdateCells
+  (JNIEnv *, jobject, jboolean, jint, jint, jbyteArray, jbyteArray, jbyteArray, jint);
+
+
+/*
+ * Class:     org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
+ * Method:    jniSubComputeReadLikelihoodGivenHaplotypeLog10
+ * Signature: (II[B[B[B[B[B[BI)D
+ */
+JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadLikelihoodGivenHaplotypeLog10
+    (JNIEnv *, jobject, jint, jint, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jint);
+
+/*
+ * Class:     org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
+ * Method:    jniGlobalInit
+ * Signature: (Ljava/lang/Class;Ljava/lang/Class;)V
+ */
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniGlobalInit
+  (JNIEnv *, jobject, jclass, jclass);
+
+/*
+ * Class:     org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
+ * Method:    jniComputeLikelihoods
+ * Signature: ([Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIReadDataHolderClass;[Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIHaplotypeDataHolderClass;[D)V
+ */
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniComputeLikelihoods
+  (JNIEnv *, jobject, jint, jint, jobjectArray, jobjectArray, jdoubleArray, jint);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
--- a/PairHMM_JNI/pairhmm-1-base.cc
+++ b/PairHMM_JNI/pairhmm-1-base.cc
@ -0,0 +1,238 @@
+//#define DEBUG 1
+//#define DEBUG0_1 1
+//#define DEBUG3 1
+#define MM 0
+#define GapM 1
+#define MX 2
+#define XX 3
+#define MY 4
+#define YY 5
+
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include <cmath>
+#include <omp.h>
+#include <emmintrin.h>
+#include "template.h"
+
+//#include "define-float.h"
+//#include "shift_template.c"
+//#include "pairhmm-template-kernel.cc"
+
+#include "define-double.h"
+#include "shift_template.c"
+#include "pairhmm-template-kernel.cc"
+
+
+using namespace std;
+class LoadTimeInitializer
+{
+  public:
+    LoadTimeInitializer()		//will be called when library is loaded
+    {
+      ConvertChar::init();
+    }
+};
+LoadTimeInitializer g_load_time_initializer;
+
+
+template<class NUMBER>
+NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL)
+{
+	int r, c;
+	int ROWS = tc->rslen + 1;
+	int COLS = tc->haplen + 1;
+
+	Context<NUMBER> ctx;
+
+	NUMBER M[MROWS][MCOLS];
+	NUMBER X[MROWS][MCOLS];
+	NUMBER Y[MROWS][MCOLS];
+	NUMBER p[MROWS][6];
+
+	p[0][MM] = ctx._(0.0);
+	p[0][GapM] = ctx._(0.0);
+	p[0][MX] = ctx._(0.0);
+	p[0][XX] = ctx._(0.0);
+	p[0][MY] = ctx._(0.0);
+	p[0][YY] = ctx._(0.0);
+	for (r = 1; r < ROWS; r++)
+	{
+		int _i = tc->i[r-1] & 127;
+		int _d = tc->d[r-1] & 127;
+		int _c = tc->c[r-1] & 127;
+		p[r][MM] = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127];
+		p[r][GapM] = ctx._(1.0) - ctx.ph2pr[_c];
+		p[r][MX] = ctx.ph2pr[_i];
+		p[r][XX] = ctx.ph2pr[_c];
+		p[r][MY] = ctx.ph2pr[_d];
+		p[r][YY] = ctx.ph2pr[_c];
+		//p[r][MY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
+		//p[r][YY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
+	}
+
+	for (c = 0; c < COLS; c++)
+	{
+		M[0][c] = ctx._(0.0);
+		X[0][c] = ctx._(0.0);
+		Y[0][c] = ctx.INITIAL_CONSTANT / (tc->haplen);
+	}
+
+	for (r = 1; r < ROWS; r++)
+	{
+		M[r][0] = ctx._(0.0);
+		X[r][0] = X[r-1][0] * p[r][XX];
+		Y[r][0] = ctx._(0.0);
+	}
+
+	 NUMBER result = ctx._(0.0);
+
+	for (r = 1; r < ROWS; r++)
+		for (c = 1; c < COLS; c++)
+		{
+			char _rs = tc->rs[r-1];
+			char _hap = tc->hap[c-1];
+			int _q = tc->q[r-1] & 127;
+			NUMBER distm = ctx.ph2pr[_q];
+			if (_rs == _hap || _rs == 'N' || _hap == 'N')
+				distm = ctx._(1.0) - distm;
+			else
+			  distm = distm/3;
+			M[r][c] = distm * (M[r-1][c-1] * p[r][MM] + X[r-1][c-1] * p[r][GapM] + Y[r-1][c-1] * p[r][GapM]);
+			X[r][c] = M[r-1][c] * p[r][MX] + X[r-1][c] * p[r][XX];
+			Y[r][c] = M[r][c-1] * p[r][MY] + Y[r][c-1] * p[r][YY];
+		}
+
+	for (c = 0; c < COLS; c++)
+	{
+		result += M[ROWS-1][c] + X[ROWS-1][c];
+	}
+
+	if (before_last_log != NULL)
+		*before_last_log = result;
+
+	return ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT;
+}
+
+#define BATCH_SIZE  10000
+#define RUN_HYBRID
+
+int main(int argc, char** argv)
+{
+  if(argc < 2)
+  {
+    cerr << "Needs path to input file as argument\n";
+    exit(0);
+  }
+  bool use_old_read_testcase = false;
+  if(argc >= 3 && string(argv[2]) == "1")
+    use_old_read_testcase = true;
+
+  testcase tc;
+  if(use_old_read_testcase)
+  {
+    FILE* fptr = fopen(argv[1],"r");
+    while(!feof(fptr))
+    {
+      if(read_testcase(&tc, fptr) >= 0)
+      {
+	double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&tc);
+	double result = log10(result_avxd) - log10(ldexp(1.0, 1020));
+
+	cout << std::scientific << compute_full_prob<double>(&tc) << " "<<result<<"\n";
+	delete tc.rs;
+	delete tc.hap;
+      }
+    }
+    fclose(fptr);
+  }
+  else
+  {
+    std::ifstream ifptr;
+    std::vector<std::string> tokens;
+    ifptr.open(argv[1]);
+    assert(ifptr.is_open());
+    while(1)
+    {
+      tokens.clear();
+      if(read_mod_testcase(ifptr, &tc, false) < 0)
+	break;
+      //double result = 0;
+      double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&tc);
+      double result = log10(result_avxd) - log10(ldexp(1.0, 1020));
+
+      cout << std::scientific << compute_full_prob<double>(&tc) << " "<<result<<"\n";
+      delete tc.rs;
+      delete tc.hap;
+    }
+    ifptr.close();
+  }
+  return 0;  
+
+#if 0
+  float result[BATCH_SIZE], result_avxf;
+  double result_avxd;
+  struct timeval start, end;
+  long long aggregateTimeRead = 0L;
+  long long aggregateTimeCompute = 0L;
+  long long aggregateTimeWrite = 0L;
+
+  bool noMoreData = false;
+  int count =0;
+  while (!noMoreData)
+  {
+    int read_count = BATCH_SIZE;
+    gettimeofday(&start, NULL);
+    for (int b=0;b<BATCH_SIZE;b++)
+      if (read_testcase(&tc[b], NULL)==-1)
+      {
+	read_count = b;
+	noMoreData = true;
+	break;
+      }
+    gettimeofday(&end, NULL);
+    aggregateTimeRead += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
+
+    gettimeofday(&start, NULL);
+    for (int b=0;b<read_count;b++)
+    {
+      result_avxf = compute_full_prob<float>(&tc[b]);
+
+#ifdef RUN_HYBRID
+#define MIN_ACCEPTED 1e-28f
+      if (result_avxf < MIN_ACCEPTED) {
+	count++;
+	result_avxd = compute_full_prob<double>(&tc[b]);
+	result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f));
+      }
+      else
+	result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
+#endif
+
+#ifndef RUN_HYBRID
+      result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
+#endif
+
+    }
+    gettimeofday(&end, NULL);
+    aggregateTimeCompute += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
+
+    gettimeofday(&start, NULL);
+    for (int b=0;b<read_count;b++)
+      printf("%E\n", result[b]);
+    gettimeofday(&end, NULL);
+    aggregateTimeWrite += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
+
+  }
+
+  printf("AVX Read Time: %ld\n", aggregateTimeRead);
+  printf("AVX Compute Time: %ld\n", aggregateTimeCompute);
+  printf("AVX Write Time: %ld\n", aggregateTimeWrite);
+  printf("AVX Total Time: %ld\n", aggregateTimeRead + aggregateTimeCompute + aggregateTimeWrite);
+  printf("# Double called: %d\n", count);
+
+  return 0;
+#endif
+}
+
--- a/PairHMM_JNI/pairhmm-template-kernel.cc
+++ b/PairHMM_JNI/pairhmm-template-kernel.cc
@ -0,0 +1,417 @@
+#ifdef PRECISION
+
+#include <stdint.h>
+#include <assert.h>
+#include <stdlib.h>
+
+/*
+template <class T>
+string getBinaryStr (T val, int numBitsToWrite) {
+  
+  ostringstream oss ;
+  uint64_t mask = ((T) 0x1) << (numBitsToWrite-1) ;
+  for (int i=numBitsToWrite-1; i >= 0; --i) {
+    oss << ((val & mask) >> i) ;
+    mask >>= 1 ;
+  }
+  return oss.str() ;
+}
+*/
+
+void GEN_INTRINSIC(precompute_masks_, PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) {
+  
+  const int maskBitCnt = MAIN_TYPE_SIZE ;
+
+  for (int vi=0; vi < numMaskVecs; ++vi) {
+    for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) {
+      maskArr[vi][rs] = 0 ;
+    }
+    maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ;
+  }
+ 
+  for (int col=1; col < COLS; ++col) {
+    int mIndex = (col-1) / maskBitCnt ;
+    int mOffset = (col-1) % maskBitCnt ;
+    MASK_TYPE bitMask = ((MASK_TYPE)0x1) << (maskBitCnt-1-mOffset) ;
+
+    char hapChar = tc.hap[col-1] ;
+
+    if (hapChar == AMBIG_CHAR) {
+      for (int ci=0; ci < NUM_DISTINCT_CHARS; ++ci) 
+	maskArr[mIndex][ci] |= bitMask ;
+    } 
+
+    maskArr[mIndex][ConvertChar::get(hapChar)] |= bitMask ;
+    // bit corresponding to col 1 will be the MSB of the mask 0
+    // bit corresponding to col 2 will be the MSB-1 of the mask 0
+    // ...
+    // bit corresponding to col 32 will be the LSB of the mask 0
+    // bit corresponding to col 33 will be the MSB of the mask 1
+    // ...
+  }
+
+}
+
+void GEN_INTRINSIC(init_masks_for_row_, PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess) {
+
+  for (int ri=0; ri < numRowsToProcess; ++ri) {
+    rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ;
+  }
+
+  for (int ei=0; ei < AVX_LENGTH; ++ei) {
+    lastMaskShiftOut[ei] = 0 ;
+  }
+}
+
+#define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \
+  MASK_TYPE __bitMask = (((MASK_TYPE)0x1) << __shiftBy) - 1 ;			\
+  MASK_TYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \
+  __dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ;		\
+  __lastShiftOut = __nextShiftOut ;					\
+}
+
+
+void GEN_INTRINSIC(update_masks_for_cols_, PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt) {
+
+  for (int ei=0; ei < AVX_LENGTH/2; ++ei) {
+    SET_MASK_WORD(currMaskVecLow.masks[ei], maskArr[maskIndex][rsArr[ei]], 
+		  lastMaskShiftOut[ei], ei, maskBitCnt) ;
+    
+    int ei2 = ei + AVX_LENGTH/2 ; // the second entry index
+    SET_MASK_WORD(currMaskVecHigh.masks[ei], maskArr[maskIndex][rsArr[ei2]], 
+		  lastMaskShiftOut[ei2], ei2, maskBitCnt) ;
+  }
+
+}
+
+//void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen, const _256_TYPE& distmSel, int firstRowIndex, int lastRowIndex) {
+
+inline void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen) {
+  //#define computeDistVec() {					      
+
+  _256_TYPE maskV ;
+  VEC_SSE_TO_AVX(currMaskVecLow.vecf, currMaskVecHigh.vecf, maskV) ;
+
+  distmChosen = VEC_BLENDV(distm, _1_distm, maskV) ;
+
+  /*COMPARE_VECS(distmChosen, distmSel, firstRowIndex, lastRowIndex) ;*/
+
+  VEC_SHIFT_LEFT_1BIT(currMaskVecLow.vec) ;
+  VEC_SHIFT_LEFT_1BIT(currMaskVecHigh.vec) ;
+}								     
+
+
+/*
+template<class NUMBER>
+struct HmmData {
+  int ROWS ;
+  int COLS ;
+  
+  NUMBER shiftOutM[MROWS+MCOLS+AVX_LENGTH], shiftOutX[MROWS+MCOLS+AVX_LENGTH], shiftOutY[MROWS+MCOLS+AVX_LENGTH] ;
+  Context<NUMBER> ctx ;
+  testcase* tc ;
+  _256_TYPE p_MM[MAVX_COUNT], p_GAPM[MAVX_COUNT], p_MX[MAVX_COUNT], p_XX[MAVX_COUNT], p_MY[MAVX_COUNT], p_YY[MAVX_COUNT], distm1D[MAVX_COUNT] ;
+  _256_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY ;
+
+  UNION_TYPE M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y ;
+  UNION_TYPE rs , rsN ;
+  _256_TYPE distmSel;
+  _256_TYPE distm, _1_distm;
+
+} ;
+*/
+
+template<class NUMBER> void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context<NUMBER> ctx, testcase *tc,  _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D)
+{
+	NUMBER zero = ctx._(0.0);
+        NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
+        for (int s=0;s<ROWS+COLS+AVX_LENGTH;s++)
+        {
+                shiftOutM[s] = zero;
+                shiftOutX[s] = zero;
+                shiftOutY[s] = init_Y;
+        }
+
+        NUMBER *ptr_p_MM = (NUMBER *)p_MM;
+        NUMBER *ptr_p_XX = (NUMBER *)p_XX;
+        NUMBER *ptr_p_YY = (NUMBER *)p_YY;
+        NUMBER *ptr_p_MX = (NUMBER *)p_MX;
+        NUMBER *ptr_p_MY = (NUMBER *)p_MY;
+        NUMBER *ptr_p_GAPM = (NUMBER *)p_GAPM;
+
+        *ptr_p_MM = ctx._(0.0);
+        *ptr_p_XX = ctx._(0.0);
+        *ptr_p_YY = ctx._(0.0);
+        *ptr_p_MX = ctx._(0.0);
+        *ptr_p_MY = ctx._(0.0);
+        *ptr_p_GAPM = ctx._(0.0);
+
+
+        for (int r = 1; r < ROWS; r++)
+        {
+                int _i = tc->i[r-1] & 127;
+                int _d = tc->d[r-1] & 127;
+                int _c = tc->c[r-1] & 127;
+
+                *(ptr_p_MM+r-1) = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127];
+                *(ptr_p_GAPM+r-1) = ctx._(1.0) - ctx.ph2pr[_c];
+                *(ptr_p_MX+r-1) = ctx.ph2pr[_i];
+                *(ptr_p_XX+r-1) = ctx.ph2pr[_c];
+                *(ptr_p_MY+r-1) = ctx.ph2pr[_d];
+                *(ptr_p_YY+r-1) = ctx.ph2pr[_c];
+#ifdef DEBUG3
+		debug_dump("transitions_jni.txt",to_string(*(ptr_p_MM+r-1)  ),true);
+		debug_dump("transitions_jni.txt",to_string(*(ptr_p_GAPM+r-1)),true);
+		debug_dump("transitions_jni.txt",to_string(*(ptr_p_MX+r-1)  ),true);
+		debug_dump("transitions_jni.txt",to_string(*(ptr_p_XX+r-1)  ),true);
+		debug_dump("transitions_jni.txt",to_string(*(ptr_p_MY+r-1)  ),true);
+		debug_dump("transitions_jni.txt",to_string(*(ptr_p_YY+r-1)  ),true);
+#endif
+		//*(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
+		//*(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
+        }
+
+        NUMBER *ptr_distm1D = (NUMBER *)distm1D;
+        for (int r = 1; r < ROWS; r++)
+        {
+                int _q = tc->q[r-1] & 127;
+                ptr_distm1D[r-1] = ctx.ph2pr[_q];
+#ifdef DEBUG3
+		debug_dump("priors_jni.txt",to_string(ptr_distm1D[r-1]),true);
+#endif
+        }
+}
+
+
+template<class NUMBER> inline void GEN_INTRINSIC(stripINITIALIZATION, PRECISION)(
+			int stripIdx, Context<NUMBER> ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY,
+			_256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm,  _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM , 
+			_256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, 
+			UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM)	 
+{
+	int i = stripIdx;
+        pGAPM = p_GAPM[i];                                               
+        pMM   = p_MM[i];                                                 
+        pMX   = p_MX[i];                                                 
+        pXX   = p_XX[i];                                                 
+        pMY   = p_MY[i];                                                 
+        pYY   = p_YY[i];               
+
+	NUMBER zero = ctx._(0.0);        
+	NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
+	UNION_TYPE packed1;  packed1.d = VEC_SET1_VAL(1.0);
+#define TRISTATE_CORRECTION_FACTOR 3.0
+	UNION_TYPE packed3;  packed3.d = VEC_SET1_VAL(TRISTATE_CORRECTION_FACTOR);
+	
+        /* compare rs and N */                                           
+        //rs = VEC_LDPOPCVT_CHAR((tc->irs+i*AVX_LENGTH));        
+        //rsN.d = VEC_CMP_EQ(N_packed256, rs);                             
+                                                                         
+        distm = distm1D[i];                                    
+        _1_distm = VEC_SUB(packed1.d, distm);
+#ifndef DO_NOT_USE_TRISTATE_CORRECTION
+	distm = VEC_DIV(distm, packed3.d); 
+#endif
+                                                                         
+        /* initialize M_t_2, M_t_1, X_t_2, X_t_1, Y_t_2, Y_t_1 */        
+        M_t_2.d = VEC_SET1_VAL(zero);                                    
+        X_t_2.d = VEC_SET1_VAL(zero);                                    
+                                                                         
+        if (i==0) {                                                      
+                M_t_1.d = VEC_SET1_VAL(zero);                            
+                X_t_1.d = VEC_SET1_VAL(zero);                            
+                Y_t_2.d = VEC_SET_LSE(init_Y);                           
+                Y_t_1.d = VEC_SET1_VAL(zero);                            
+        }                                                                
+        else {                                                           
+                X_t_1.d = VEC_SET_LSE(shiftOutX[AVX_LENGTH]);            
+                M_t_1.d = VEC_SET_LSE(shiftOutM[AVX_LENGTH]);            
+                Y_t_2.d = VEC_SET1_VAL(zero);                            
+                Y_t_1.d = VEC_SET1_VAL(zero);                            
+        }                                                                
+        M_t_1_y = M_t_1;
+}        
+
+
+
+inline _256_TYPE GEN_INTRINSIC(computeDISTM, PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256, 
+						_256_TYPE distm, _256_TYPE _1_distm)
+{
+        UNION_TYPE hapN, rshap;
+        _256_TYPE  cond;
+	IF_32 shiftInHap;
+
+	int *hap_ptr = tc->ihap;
+
+        shiftInHap.i = (d<COLS) ? hap_ptr[d-1] : hap_ptr[COLS-1];              
+
+        /* shift hap */                                                        
+        SHIFT_HAP(hap, shiftInHap);                                            
+        _256_TYPE hapF = VEC_CVT_128_256(hap);                                 
+
+        rshap.d = VEC_CMP_EQ(rs, hapF);                                        
+        hapN.d  = VEC_CMP_EQ(N_packed256, hapF);                               
+
+        /* OR rsN, rshap, hapN */                                              
+        cond =  VEC_OR(rsN.d, rshap.d);                                        
+        cond =  VEC_OR(cond, hapN.d);                                          
+        
+        /* distm1D = (cond) ? 1-distm1D : distm1D;  */                         
+        _256_TYPE distmSel = VEC_BLENDV(distm, _1_distm, cond);
+        
+        return distmSel;
+}        
+
+
+inline void GEN_INTRINSIC(computeMXY, PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, 
+			UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1, 
+			_256_TYPE pMM, _256_TYPE pGAPM, _256_TYPE pMX, _256_TYPE pXX, _256_TYPE pMY, _256_TYPE pYY, _256_TYPE distmSel)
+{
+	/* Compute M_t <= distm * (p_MM*M_t_2 + p_GAPM*X_t_2 + p_GAPM*Y_t_2) */
+	M_t.d = VEC_MUL(VEC_ADD(VEC_ADD(VEC_MUL(M_t_2.d, pMM), VEC_MUL(X_t_2.d, pGAPM)), VEC_MUL(Y_t_2.d, pGAPM)), distmSel);
+	M_t_y = M_t;
+
+	/* Compute X_t */
+	X_t.d = VEC_ADD(VEC_MUL(M_t_1.d, pMX) , VEC_MUL(X_t_1.d, pXX));
+
+	/* Compute Y_t */
+	Y_t.d = VEC_ADD(VEC_MUL(M_t_1_y.d, pMY) , VEC_MUL(Y_t_1.d, pYY));
+}
+
+template<class NUMBER> NUMBER GEN_INTRINSIC(compute_full_prob_avx, PRECISION) (testcase *tc, NUMBER *before_last_log = NULL)
+{
+        _256_TYPE p_MM   [MAVX_COUNT], p_GAPM [MAVX_COUNT], p_MX   [MAVX_COUNT];
+	_256_TYPE p_XX   [MAVX_COUNT], p_MY   [MAVX_COUNT], p_YY   [MAVX_COUNT];
+	_256_TYPE distm1D[MAVX_COUNT];
+	NUMBER shiftOutM[MROWS+MCOLS+AVX_LENGTH], shiftOutX[MROWS+MCOLS+AVX_LENGTH], shiftOutY[MROWS+MCOLS+AVX_LENGTH];
+	UNION_TYPE  M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y;
+	_256_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY;
+
+        struct timeval start, end;
+        NUMBER result_avx2;
+        Context<NUMBER> ctx;
+        UNION_TYPE rs , rsN;
+        HAP_TYPE hap;
+        _256_TYPE distmSel, distmChosen ;
+	_256_TYPE distm, _1_distm;
+
+        int r, c;
+        int ROWS = tc->rslen + 1;
+        int COLS = tc->haplen + 1;
+        int AVX_COUNT = (ROWS+7)/8;
+        NUMBER zero = ctx._(0.0);
+        UNION_TYPE packed1;  packed1.d = VEC_SET1_VAL(1.0);
+	_256_TYPE N_packed256 = VEC_POPCVT_CHAR('N');
+        NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
+        int remainingRows = (ROWS-1) % AVX_LENGTH;
+        int strip_cnt = ((ROWS-1) / AVX_LENGTH) + (remainingRows!=0);
+
+	const int maskBitCnt = MAIN_TYPE_SIZE ;
+	const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function
+
+	MASK_TYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ;
+	GEN_INTRINSIC(precompute_masks_, PRECISION)(*tc, COLS, numMaskVecs, maskArr) ;
+
+	char rsArr[AVX_LENGTH] ;
+	MASK_TYPE lastMaskShiftOut[AVX_LENGTH] ;
+
+	GEN_INTRINSIC(initializeVectors, PRECISION)<NUMBER>(ROWS, COLS, shiftOutM, shiftOutX, shiftOutY, 
+							    ctx, tc, p_MM, p_GAPM, p_MX, p_XX, p_MY, p_YY, distm1D);
+
+	//for (int __ii=0; __ii < 10; ++__ii)
+        for (int i=0;i<strip_cnt-1;i++)
+        {
+		//STRIP_INITIALIZATION
+		GEN_INTRINSIC(stripINITIALIZATION, PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM ,
+							      p_MX, p_XX , p_MY, p_YY, M_t_2, X_t_2, M_t_1, X_t_1, Y_t_2, Y_t_1, M_t_1_y, shiftOutX, shiftOutM);
+
+		GEN_INTRINSIC(init_masks_for_row_, PRECISION)(*tc, rsArr, lastMaskShiftOut,
+							      i*AVX_LENGTH+1, AVX_LENGTH) ;
+		// Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
+		MASK_VEC currMaskVecLow ; // corresponding to lower half
+		MASK_VEC currMaskVecHigh ; // corresponding to upper half
+
+                for (int d=1;d<COLS+AVX_LENGTH;d++)
+                {
+		        if (d % MAIN_TYPE_SIZE == 1)
+			  GEN_INTRINSIC(update_masks_for_cols_, PRECISION)((d-1)/MAIN_TYPE_SIZE, currMaskVecLow, currMaskVecHigh, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;
+
+			int ShiftIdx = d+AVX_LENGTH;
+			//distmSel = GEN_INTRINSIC(computeDISTM, PRECISION)(d, COLS, tc, hap, rs.d, rsN, N_packed256, distm, _1_distm);
+
+			//int firstRowIndex = (d < COLS) ? 0 : (d-COLS+1) ;
+			//int lastRowIndex = std::min(d-1, AVX_LENGTH-1) ;
+			//GEN_INTRINSIC(computeDistVec, PRECISION) (currMaskVecLow, currMaskVecHigh, distm, _1_distm, distmChosen, distmSel, firstRowIndex, lastRowIndex) ;
+
+			GEN_INTRINSIC(computeDistVec, PRECISION) (currMaskVecLow, currMaskVecHigh, distm, _1_distm, distmChosen) ;
+			
+
+			GEN_INTRINSIC(computeMXY, PRECISION)(M_t, X_t, Y_t, M_t_y, M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, M_t_1_y, Y_t_1, 
+								pMM, pGAPM, pMX, pXX, pMY, pYY, distmChosen);
+
+                        GEN_INTRINSIC(_vector_shift, PRECISION)(M_t, shiftOutM[ShiftIdx], shiftOutM[d]);
+
+                        GEN_INTRINSIC(_vector_shift, PRECISION)(X_t, shiftOutX[ShiftIdx], shiftOutX[d]);
+
+                        GEN_INTRINSIC(_vector_shift, PRECISION)(Y_t_1, shiftOutY[ShiftIdx], shiftOutY[d]);
+
+                        M_t_2 = M_t_1; M_t_1 = M_t; X_t_2 = X_t_1; X_t_1 = X_t;
+                        Y_t_2 = Y_t_1; Y_t_1 = Y_t; M_t_1_y = M_t_y;
+
+		}
+	}
+
+        int i = strip_cnt-1;
+        {
+		//STRIP_INITIALIZATION
+		GEN_INTRINSIC(stripINITIALIZATION, PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM ,
+                        p_MX, p_XX , p_MY, p_YY, M_t_2, X_t_2, M_t_1, X_t_1, Y_t_2, Y_t_1, M_t_1_y, shiftOutX, shiftOutM);
+
+                if (remainingRows==0) remainingRows=AVX_LENGTH;
+
+		GEN_INTRINSIC(init_masks_for_row_, PRECISION)(*tc, rsArr, lastMaskShiftOut,
+							      i*AVX_LENGTH+1, remainingRows) ;
+
+                _256_TYPE sumM, sumX;
+                sumM = VEC_SET1_VAL(zero);
+                sumX = VEC_SET1_VAL(zero);
+
+		// Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
+		MASK_VEC currMaskVecLow ; // corresponding to lower half
+		MASK_VEC currMaskVecHigh ; // corresponding to upper half
+
+                for (int d=1;d<COLS+remainingRows-1;d++)
+                {
+
+  	 	        if (d % MAIN_TYPE_SIZE == 1)
+			  GEN_INTRINSIC(update_masks_for_cols_, PRECISION)((d-1)/MAIN_TYPE_SIZE, currMaskVecLow, currMaskVecHigh, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;
+
+			int ShiftIdx = d+AVX_LENGTH;
+			//distmSel = GEN_INTRINSIC(computeDISTM, PRECISION)(d, COLS, tc, hap, rs.d, rsN, N_packed256, distm, _1_distm);
+			GEN_INTRINSIC(computeDistVec, PRECISION) (currMaskVecLow, currMaskVecHigh, distm, _1_distm, distmChosen) ;
+
+			GEN_INTRINSIC(computeMXY, PRECISION)(M_t, X_t, Y_t, M_t_y, M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, M_t_1_y, Y_t_1,
+					                        pMM, pGAPM, pMX, pXX, pMY, pYY, distmChosen);
+
+                        sumM  = VEC_ADD(sumM, M_t.d);
+                        GEN_INTRINSIC(_vector_shift_last, PRECISION)(M_t, shiftOutM[ShiftIdx]);
+
+                        sumX  = VEC_ADD(sumX, X_t.d);
+                        GEN_INTRINSIC(_vector_shift_last, PRECISION)(X_t, shiftOutX[ShiftIdx]);
+
+                        GEN_INTRINSIC(_vector_shift_last, PRECISION)(Y_t_1, shiftOutY[ShiftIdx]);
+
+                        M_t_2 = M_t_1; M_t_1 = M_t; X_t_2 = X_t_1; X_t_1 = X_t;
+                        Y_t_2 = Y_t_1; Y_t_1 = Y_t; M_t_1_y = M_t_y;
+
+                }
+                UNION_TYPE sumMX;
+                sumMX.d = VEC_ADD(sumM, sumX);
+                result_avx2 = sumMX.f[remainingRows-1];		
+	}
+	//printf("result_avx2: %f\n", result_avx2);
+	return result_avx2;
+}
+
+#endif
+
--- a/PairHMM_JNI/pairhmm-template-main.cc
+++ b/PairHMM_JNI/pairhmm-template-main.cc
@ -0,0 +1,116 @@
+#include <stdio.h>
+#include <immintrin.h>
+#include <emmintrin.h>
+#include <omp.h>
+
+#include "template.h"
+
+#include "define-float.h"
+#include "shift_template.c"
+#include "pairhmm-template-kernel.cc"
+
+#include "define-double.h"
+#include "shift_template.c"
+#include "pairhmm-template-kernel.cc"
+
+#define BATCH_SIZE  10000
+//#define RUN_HYBRID
+
+//uint8_t ConvertChar::conversionTable[255] ;
+int thread_level_parallelism_enabled = false ;
+
+double getCurrClk() {
+  struct timeval tv ;
+  gettimeofday(&tv, NULL);
+  return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
+}
+
+
+int main()
+{
+        
+	testcase tc[BATCH_SIZE];
+        float result[BATCH_SIZE], result_avxf;
+        double result_avxd;
+        //struct timeval start, end;
+	double lastClk = 0.0 ;
+        double aggregateTimeRead = 0.0;
+        double aggregateTimeCompute = 0.0;
+        double aggregateTimeWrite = 0.0;
+
+	// Need to call it once to initialize the static array
+	ConvertChar::init() ;
+
+
+	char* ompEnvVar = getenv("OMP_NUM_THREADS") ;
+	if (ompEnvVar != NULL && ompEnvVar != "" && ompEnvVar != "1" ) {
+	  thread_level_parallelism_enabled = true ;
+	}
+
+        bool noMoreData = false;
+        int count =0;
+        while (!noMoreData)
+        {
+                int read_count = BATCH_SIZE;
+ 
+		lastClk = getCurrClk() ;
+                for (int b=0;b<BATCH_SIZE;b++)
+                        if (read_testcase(&tc[b], NULL)==-1)
+                        {
+                                read_count = b;
+                                noMoreData = true;
+                                break;
+                        }
+                //gettimeofday(&end, NULL);
+                aggregateTimeRead += (getCurrClk() - lastClk) ;
+		//((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
+
+                //gettimeofday(&start, NULL);
+		lastClk = getCurrClk() ;
+
+#pragma omp parallel for schedule(dynamic) if(thread_level_parallelism_enabled)
+                for (int b=0;b<read_count;b++)
+                {
+                        result_avxf = GEN_INTRINSIC(compute_full_prob_avx, s)<float>(&tc[b]);
+
+                        #ifdef RUN_HYBRID
+                                #define MIN_ACCEPTED 1e-28f
+                                if (result_avxf < MIN_ACCEPTED) {
+                                      count++;
+                                      result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&tc[b]);
+                                      result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f));
+                                }
+                                else
+                                      result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
+                        #endif
+
+                        #ifndef RUN_HYBRID
+                                result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
+                        #endif
+
+                }
+                //gettimeofday(&end, NULL);
+                aggregateTimeCompute += (getCurrClk() - lastClk) ;
+		//((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
+
+                //gettimeofday(&start, NULL);
+		lastClk = getCurrClk() ;
+                for (int b=0;b<read_count;b++)
+                        printf("%E\n", result[b]);
+                //gettimeofday(&end, NULL);
+                aggregateTimeWrite += (getCurrClk() - lastClk) ;
+		//((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
+
+        }
+
+        printf("AVX Read Time: %.2f\n", aggregateTimeRead);
+        printf("AVX Compute Time: %.2f\n", aggregateTimeCompute);
+        printf("AVX Write Time: %.2f\n", aggregateTimeWrite);
+        printf("AVX Total Time: %.2f\n", aggregateTimeRead + aggregateTimeCompute + aggregateTimeWrite);
+        printf("# Double called: %d\n", count);
+
+        return 0;
+}
+
+
+
--- a/PairHMM_JNI/shift_template.c
+++ b/PairHMM_JNI/shift_template.c
@ -0,0 +1,68 @@
+#ifdef PRECISION 
+
+#include "template.h"
+
+
+inline void GEN_INTRINSIC(_vector_shift, PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut)
+{
+
+        IF_128 xlow , xhigh;
+        /* cast x to xlow */
+        xlow.f = VEC_CAST_256_128(x.d);
+        /* extract x,1 to xhigh */
+        xhigh.f = VEC_EXTRACT_128(x.d, 1);
+        /* extract xlow[3] */
+        IF_128 shiftOutL128;
+        shiftOutL128.i = _mm_srli_si128(xlow.i, SHIFT_CONST1);
+        /* extract xhigh[3] */
+        IF_MAIN_TYPE shiftOutH;
+        shiftOutH.i = VEC_EXTRACT_UNIT(xhigh.i, SHIFT_CONST2);
+        shiftOut = shiftOutH.f;
+        /* shift xlow */
+        xlow.i = _mm_slli_si128 (xlow.i, SHIFT_CONST3);
+        /* shift xhigh */
+        xhigh.i = _mm_slli_si128 (xhigh.i, SHIFT_CONST3);
+        /*movss shiftIn to xlow[0] */
+        _128_TYPE shiftIn128 = VEC_SET1_VAL128(shiftIn);
+        xlow.f = VEC_MOVE(xlow.f , shiftIn128);
+        /*movss xlow[3] to xhigh[0] */
+        xhigh.f = VEC_MOVE(xhigh.f, shiftOutL128.f);
+        /* cast xlow to x */
+        x.d = VEC_CAST_128_256(xlow.f);
+        /* insert xhigh to x,1 */
+        x.d = VEC_INSERT_VAL(x.d, xhigh.f, 1);
+}
+
+inline void GEN_INTRINSIC(_vector_shift_last, PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn)
+{
+
+        IF_128 xlow , xhigh;
+        /* cast x to xlow */
+        xlow.f = VEC_CAST_256_128(x.d);
+        /* extract x,1 to xhigh */
+        xhigh.f = VEC_EXTRACT_128(x.d, 1);
+        /* extract xlow[3] */
+        IF_128 shiftOutL128;
+        shiftOutL128.i = _mm_srli_si128(xlow.i, SHIFT_CONST1);
+        /* extract xhigh[3] */
+//        IF_MAIN_TYPE shiftOutH;
+//        shiftOutH.i = VEC_EXTRACT_UNIT(xhigh.i, SHIFT_CONST2);
+//        shiftOut = shiftOutH.f;
+        /* shift xlow */
+        xlow.i = _mm_slli_si128 (xlow.i, SHIFT_CONST3);
+        /* shift xhigh */
+        xhigh.i = _mm_slli_si128 (xhigh.i, SHIFT_CONST3);
+        /*movss shiftIn to xlow[0] */
+        _128_TYPE shiftIn128 = VEC_SET1_VAL128(shiftIn);
+        xlow.f = VEC_MOVE(xlow.f , shiftIn128);
+        /*movss xlow[3] to xhigh[0] */
+        xhigh.f = VEC_MOVE(xhigh.f, shiftOutL128.f);
+        /* cast xlow to x */
+        x.d = VEC_CAST_128_256(xlow.f);
+        /* insert xhigh to x,1 */
+        x.d = VEC_INSERT_VAL(x.d, xhigh.f, 1);
+}
+
+
+#endif
+
--- a/PairHMM_JNI/template.h
+++ b/PairHMM_JNI/template.h
@ -0,0 +1,190 @@
+#ifndef TEMPLATES_H_
+#define TEMPLATES_H_
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+#include <assert.h>
+
+#include <sys/time.h>
+
+#include <immintrin.h>
+
+#include <ctype.h>
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include <vector>
+
+
+#define MROWS  500
+#define MCOLS  1000
+
+#define CAT(X,Y) X####Y
+#define GEN_INTRINSIC(X,Y) CAT(X,Y)
+
+#define ALIGNED __attribute__((aligned(32)))
+
+typedef union __attribute__((aligned(32))) {
+        ALIGNED __m256 ALIGNED d;
+        ALIGNED __m128i ALIGNED s[2];
+        ALIGNED float  ALIGNED f[8];
+        ALIGNED __m256i ALIGNED i;
+} ALIGNED mix_F ALIGNED;
+
+typedef union ALIGNED {
+  __m128i vec ;
+  __m128 vecf ;
+  uint32_t masks[4] ;
+} MaskVec_F ;
+
+typedef union ALIGNED
+{
+        ALIGNED __m128i ALIGNED i;
+        ALIGNED __m128 ALIGNED f;
+} ALIGNED IF_128f ALIGNED;
+
+typedef union ALIGNED
+{
+        ALIGNED int    ALIGNED i;
+        ALIGNED float  ALIGNED f;
+} ALIGNED IF_32 ALIGNED;
+
+typedef union __attribute__((aligned(32))) {
+        ALIGNED __m256d ALIGNED d;
+        ALIGNED __m128i ALIGNED s[2];
+        ALIGNED double  ALIGNED f[4];
+        ALIGNED __m256i ALIGNED i;
+} ALIGNED mix_D ALIGNED;
+
+typedef union ALIGNED {
+  __m128i vec ;
+  __m128d vecf ;
+  uint64_t masks[2] ;
+} MaskVec_D ;
+
+typedef union ALIGNED
+{
+        ALIGNED __m128i ALIGNED i;
+        ALIGNED __m128d ALIGNED f;
+} ALIGNED IF_128d ALIGNED;
+
+typedef union ALIGNED
+{
+        ALIGNED int64_t ALIGNED i;
+        ALIGNED double  ALIGNED f;
+} ALIGNED IF_64 ALIGNED;
+
+template<class T>
+struct Context{};
+
+template<>
+struct Context<double>
+{
+        Context()
+        {
+                for (int x = 0; x < 128; x++)
+                        ph2pr[x] = pow(10.0, -((double)x) / 10.0);
+
+                INITIAL_CONSTANT = ldexp(1.0, 1020.0);
+                LOG10_INITIAL_CONSTANT = log10(INITIAL_CONSTANT);
+                RESULT_THRESHOLD = 0.0;
+        }
+
+        double LOG10(double v){ return log10(v); }
+
+        static double _(double n){ return n; }
+        static double _(float n){ return ((double) n); }
+        double ph2pr[128];
+        double INITIAL_CONSTANT;
+        double LOG10_INITIAL_CONSTANT;
+        double RESULT_THRESHOLD;
+};
+
+template<>
+struct Context<float>
+{
+        Context()
+        {
+                for (int x = 0; x < 128; x++)
+                {
+                        ph2pr[x] = powf(10.f, -((float)x) / 10.f);
+                }
+
+                INITIAL_CONSTANT = ldexpf(1.f, 120.f);
+                LOG10_INITIAL_CONSTANT = log10f(INITIAL_CONSTANT);
+                RESULT_THRESHOLD = ldexpf(1.f, -110.f);
+        }
+
+        float LOG10(float v){ return log10f(v); }
+
+        static float _(double n){ return ((float) n); }
+        static float _(float n){ return n; }
+        float ph2pr[128];
+        float INITIAL_CONSTANT;
+        float LOG10_INITIAL_CONSTANT;
+        float RESULT_THRESHOLD;
+};
+
+
+typedef struct
+{
+        int rslen, haplen;
+	/*int *q, *i, *d, *c;*/
+	int q[MROWS], i[MROWS], d[MROWS], c[MROWS];
+        char *hap, *rs;
+	int *ihap;
+	int *irs;
+} testcase;
+
+
+template<class T>
+std::string to_string(T obj)
+{
+  std::stringstream ss;
+  std::string ret_string;
+  ss.clear();
+  ss << std::scientific << obj;
+  ss >> ret_string;
+  ss.clear();
+  return ret_string;
+}
+void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true);
+
+int normalize(char c);
+int read_testcase(testcase *tc, FILE* ifp);
+int read_mod_testcase(std::ifstream& fptr, testcase* tc, bool reformat=false);
+
+#define NUM_DISTINCT_CHARS 5
+#define AMBIG_CHAR 4
+
+class ConvertChar {
+
+  static uint8_t conversionTable[255] ;
+
+public:
+
+  static void init() {
+    assert (NUM_DISTINCT_CHARS == 5) ;
+    assert (AMBIG_CHAR == 4) ;
+
+    conversionTable['A'] = 0 ;
+    conversionTable['C'] = 1 ;
+    conversionTable['T'] = 2 ;
+    conversionTable['G'] = 3 ;
+    conversionTable['N'] = 4 ;
+  }
+
+  static inline uint8_t get(uint8_t input) {
+    return conversionTable[input] ;
+  }
+
+} ;
+
+
+#endif
+
+