diff --git a/.gitignore b/.gitignore
index 65f111587..ede62ef81 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,3 +24,16 @@ dump/
 lib/
 out/
 /atlassian-ide-plugin.xml
+kg_tmp/
+maven-ant-tasks-2.1.3.jar
+null-sequenceGraph.25.0.0.raw_readthreading_graph.dot
+null-sequenceGraph.25.0.1.cleaned_readthreading_graph.dot
+null-sequenceGraph.25.0.1.initial_seqgraph.dot
+null-sequenceGraph.3.0.0.raw_readthreading_graph.dot
+null-sequenceGraph.3.0.1.cleaned_readthreading_graph.dot
+null-sequenceGraph.3.0.1.initial_seqgraph.dot
+org/
+package-list
+resources/
+velocity.log
+
diff --git a/PairHMM_JNI/.gitignore b/PairHMM_JNI/.gitignore
index 9722ad970..ce903a6fa 100644
--- a/PairHMM_JNI/.gitignore
+++ b/PairHMM_JNI/.gitignore
@@ -1,6 +1,6 @@
 .svn
 *.o
-#*.so
+*.so
 tests
 .deps
 hmm_Mohammad
@@ -8,3 +8,6 @@ pairhmm-template-main
 *.swp
 checker
 reformat
+subdir_checkout.sh
+avx/
+sse/
diff --git a/PairHMM_JNI/LoadTimeInitializer.cc b/PairHMM_JNI/LoadTimeInitializer.cc
new file mode 100644
index 000000000..534043371
--- /dev/null
+++ b/PairHMM_JNI/LoadTimeInitializer.cc
@@ -0,0 +1,76 @@
+#include "LoadTimeInitializer.h"
+#include "utils.h"
+using namespace std;
+
+LoadTimeInitializer g_load_time_initializer;
+
+LoadTimeInitializer::LoadTimeInitializer()		//will be called when library is loaded
+{
+  ConvertChar::init();
+  m_sumNumReads = 0;
+  m_sumSquareNumReads = 0;
+  m_sumNumHaplotypes = 0;
+  m_sumSquareNumHaplotypes = 0;
+  m_sumNumTestcases = 0;
+  m_sumSquareNumTestcases = 0;
+  m_maxNumTestcases = 0;
+  m_num_invocations = 0;
+  m_compute_time = 0;
+  m_data_transfer_time = 0;
+
+  m_filename_to_fptr.clear();
+
+  initialize_function_pointers();
+  cout.flush();
+}
+
+void LoadTimeInitializer::print_profiling()
+{
+  double mean_val;
+  cout << "Compute time "<<m_compute_time*1e-9<<"\n";
+  cout << "Data initialization time "<<m_data_transfer_time*1e-9<<"\n";
+  cout <<"Invocations : "<<m_num_invocations<<"\n";
+  cout << "term\tsum\tsumSq\tmean\tvar\tmax\n";
+  mean_val = m_sumNumReads/m_num_invocations;
+  cout << "reads\t"<<m_sumNumReads<<"\t"<<m_sumSquareNumReads<<"\t"<<mean_val<<"\t"<<
+    (m_sumSquareNumReads/m_num_invocations)-mean_val*mean_val<<"\n";
+  mean_val = m_sumNumHaplotypes/m_num_invocations;
+  cout << "haplotypes\t"<<m_sumNumHaplotypes<<"\t"<<m_sumSquareNumHaplotypes<<"\t"<<mean_val<<"\t"<<
+    (m_sumSquareNumHaplotypes/m_num_invocations)-mean_val*mean_val<<"\n";
+  mean_val = m_sumNumTestcases/m_num_invocations;
+  cout << "numtestcases\t"<<m_sumNumTestcases<<"\t"<<m_sumSquareNumTestcases<<"\t"<<mean_val<<"\t"<<
+    (m_sumSquareNumTestcases/m_num_invocations)-mean_val*mean_val<<"\t"<<m_maxNumTestcases<<"\n";
+  cout.flush();
+}
+
+void LoadTimeInitializer::debug_dump(string filename, string s, bool to_append, bool add_newline)
+{
+  map<string, ofstream*>::iterator mI = m_filename_to_fptr.find(filename);
+  ofstream* fptr = 0;
+  if(mI == m_filename_to_fptr.end())
+  {
+    m_filename_to_fptr[filename] = new ofstream();
+    fptr = m_filename_to_fptr[filename];
+    fptr->open(filename.c_str(), to_append ? ios::app : ios::out);
+    assert(fptr->is_open());
+  }
+  else
+    fptr = (*mI).second;
+  //ofstream fptr;
+  //fptr.open(filename.c_str(), to_append ? ofstream::app : ofstream::out);
+  (*fptr) << s;
+  if(add_newline)
+    (*fptr) << "\n";
+  //fptr.close();
+}
+void LoadTimeInitializer::debug_close()
+{
+  for(map<string,ofstream*>::iterator mB = m_filename_to_fptr.begin(), mE = m_filename_to_fptr.end();
+      mB != mE;mB++)
+  {
+    (*mB).second->close();
+    delete (*mB).second;
+  }
+  m_filename_to_fptr.clear();
+}
+
diff --git a/PairHMM_JNI/LoadTimeInitializer.h b/PairHMM_JNI/LoadTimeInitializer.h
new file mode 100644
index 000000000..cbda057a8
--- /dev/null
+++ b/PairHMM_JNI/LoadTimeInitializer.h
@@ -0,0 +1,37 @@
+#ifndef LOAD_TIME_INITIALIZER_H
+#define LOAD_TIME_INITIALIZER_H
+#include "headers.h"
+#include <jni.h>
+class LoadTimeInitializer
+{
+  public:
+    LoadTimeInitializer();		//will be called when library is loaded
+    void print_profiling();
+    void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true);
+    void debug_close();
+    
+    jfieldID m_readBasesFID;
+    jfieldID m_readQualsFID;
+    jfieldID m_insertionGOPFID;
+    jfieldID m_deletionGOPFID;
+    jfieldID m_overallGCPFID;
+    jfieldID m_haplotypeBasesFID;
+    //used to compute avg, variance of #testcases
+    double m_sumNumReads;
+    double m_sumSquareNumReads;
+    double m_sumNumHaplotypes;
+    double m_sumSquareNumHaplotypes;
+    double m_sumNumTestcases;
+    double m_sumSquareNumTestcases;
+    unsigned m_maxNumTestcases;
+    unsigned m_num_invocations;
+    //timing
+    uint64_t m_compute_time;
+    uint64_t m_data_transfer_time;
+  private:
+    std::map<std::string, std::ofstream*> m_filename_to_fptr;
+};
+extern LoadTimeInitializer g_load_time_initializer;
+
+
+#endif
diff --git a/PairHMM_JNI/Makefile b/PairHMM_JNI/Makefile
index 70bf96dd3..9b461a833 100644
--- a/PairHMM_JNI/Makefile
+++ b/PairHMM_JNI/Makefile
@@ -1,5 +1,5 @@
-OMPCFLAGS=-fopenmp
-#OMPLDFLAGS=-lgomp
+#OMPCFLAGS=-fopenmp
+#OMPLFLAGS=-fopenmp #-openmp-link static
 
 #CFLAGS=-O2 -std=c++11 -W -Wall -march=corei7-avx -Wa,-q            -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas
 #CFLAGS=-O2             -W -Wall -march=corei7 -mfpmath=sse -msse4.2 -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas
@@ -7,40 +7,59 @@ OMPCFLAGS=-fopenmp
 JAVA_ROOT=/opt/jdk1.7.0_25/
 JNI_COMPILATION_FLAGS=-D_REENTRANT -fPIC -I${JAVA_ROOT}/include -I${JAVA_ROOT}/include/linux
 
-CFLAGS=-O3 -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas -xAVX
-
-CXXFLAGS=$(CFLAGS)
+COMMON_COMPILATION_FLAGS=$(JNI_COMPILATION_FLAGS) -O3 -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas
 CC=icc
 CXX=icc
 
-LDFLAGS=-lm $(OMPLDFLAGS)
+LDFLAGS=-lm -lrt $(OMPLDFLAGS)
 
-#BIN:=pairhmm-1-base #pairhmm-2-omp pairhmm-3-hybrid-float-double pairhmm-4-hybrid-diagonal pairhmm-5-hybrid-diagonal-homogeneus pairhmm-6-onlythreediags pairhmm-7-presse pairhmm-8-sse #pairhmm-dev
-BIN:=libJNILoglessPairHMM.so pairhmm-template-main checker
+BIN=libJNILoglessPairHMM.so pairhmm-template-main checker
+#BIN=checker
 
-#SOURCES=pairhmm-1-base.cc input.cc 
-LIBSOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc convert_char.cc
-SOURCES=$(LIBSOURCES) pairhmm-template-main.cc pairhmm-1-base.cc
-LIBOBJECTS=$(LIBSOURCES:.cc=.o)
 DEPDIR=.deps
 DF=$(DEPDIR)/$(*).d
 
+#Common across libJNI and sandbox
+COMMON_SOURCES=utils.cc avx_function_instantiations.cc baseline.cc sse_function_instantiations.cc LoadTimeInitializer.cc
+#Part of libJNI
+LIBSOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc $(COMMON_SOURCES)
+SOURCES=$(LIBSOURCES) pairhmm-template-main.cc pairhmm-1-base.cc
+LIBOBJECTS=$(LIBSOURCES:.cc=.o)
+COMMON_OBJECTS=$(COMMON_SOURCES:.cc=.o)
+
+
+#No vectorization for these files
+NO_VECTOR_SOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc pairhmm-template-main.cc pairhmm-1-base.cc utils.cc baseline.cc LoadTimeInitializer.cc
+#Use -xAVX for these files
+AVX_SOURCES=avx_function_instantiations.cc
+#Use -xSSE4.2 for these files
+SSE_SOURCES=sse_function_instantiations.cc
+
+NO_VECTOR_OBJECTS=$(NO_VECTOR_SOURCES:.cc=.o)
+AVX_OBJECTS=$(AVX_SOURCES:.cc=.o)
+SSE_OBJECTS=$(SSE_SOURCES:.cc=.o)
+$(NO_VECTOR_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS)
+$(AVX_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) -xAVX
+$(SSE_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) -xSSE4.2
+OBJECTS=$(NO_VECTOR_OBJECTS) $(AVX_OBJECTS) $(SSE_OBJECTS)
+
 all: $(BIN)
 
 -include $(addprefix $(DEPDIR)/,$(SOURCES:.cc=.d))
 
-checker: pairhmm-1-base.o convert_char.o
-	$(CXX) $(OMPCFLAGS) -o $@ $^ $(LDFLAGS)
+checker: pairhmm-1-base.o $(COMMON_OBJECTS)
+	$(CXX) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)
 
-pairhmm-template-main:	pairhmm-template-main.o convert_char.o
-	$(CXX) $(OMPCFLAGS) -o $@ $^ $(LDFLAGS)
+pairhmm-template-main:	pairhmm-template-main.o $(COMMON_OBJECTS)
+	$(CXX) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)
 
 libJNILoglessPairHMM.so: $(LIBOBJECTS) 
-	$(CXX) $(OMPCFLAGS) -shared -o $@ $(LIBOBJECTS) 
+	$(CXX) $(OMPLFLAGS) -shared -o $@ $(LIBOBJECTS) ${LDFLAGS}
 
-%.o: %.cc
+
+$(OBJECTS): %.o: %.cc
 	@mkdir -p $(DEPDIR)
-	$(COMPILE.cpp) -MMD -MF $(DF) $(JNI_COMPILATION_FLAGS) $(CXXFLAGS) $(OUTPUT_OPTION) $<
+	$(CXX) -c -MMD -MF $(DF) $(CXXFLAGS) $(OUTPUT_OPTION) $<
 
 
 clean:
diff --git a/PairHMM_JNI/avx_function_instantiations.cc b/PairHMM_JNI/avx_function_instantiations.cc
new file mode 100644
index 000000000..4118fc5cf
--- /dev/null
+++ b/PairHMM_JNI/avx_function_instantiations.cc
@@ -0,0 +1,19 @@
+#include "template.h"
+
+#undef SIMD_TYPE
+#undef SIMD_TYPE_SSE
+
+#define SIMD_TYPE avx
+#define SIMD_TYPE_AVX
+
+#include "define-float.h"
+#include "shift_template.c"
+#include "pairhmm-template-kernel.cc"
+
+#include "define-double.h"
+#include "shift_template.c"
+#include "pairhmm-template-kernel.cc"
+
+template double compute_full_prob_avxd<double>(testcase* tc, double* nextlog);
+template float compute_full_prob_avxs<float>(testcase* tc, float* nextlog);
+
diff --git a/PairHMM_JNI/baseline.cc b/PairHMM_JNI/baseline.cc
new file mode 100644
index 000000000..b953c4436
--- /dev/null
+++ b/PairHMM_JNI/baseline.cc
@@ -0,0 +1,85 @@
+#include "headers.h"
+#include "template.h"
+
+template<class NUMBER>
+NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL)
+{
+	int r, c;
+	int ROWS = tc->rslen + 1;
+	int COLS = tc->haplen + 1;
+
+	Context<NUMBER> ctx;
+
+	NUMBER M[MROWS][MCOLS];
+	NUMBER X[MROWS][MCOLS];
+	NUMBER Y[MROWS][MCOLS];
+	NUMBER p[MROWS][6];
+
+	p[0][MM] = ctx._(0.0);
+	p[0][GapM] = ctx._(0.0);
+	p[0][MX] = ctx._(0.0);
+	p[0][XX] = ctx._(0.0);
+	p[0][MY] = ctx._(0.0);
+	p[0][YY] = ctx._(0.0);
+	for (r = 1; r < ROWS; r++)
+	{
+		int _i = tc->i[r-1] & 127;
+		int _d = tc->d[r-1] & 127;
+		int _c = tc->c[r-1] & 127;
+		p[r][MM] = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127];
+		p[r][GapM] = ctx._(1.0) - ctx.ph2pr[_c];
+		p[r][MX] = ctx.ph2pr[_i];
+		p[r][XX] = ctx.ph2pr[_c];
+		p[r][MY] = ctx.ph2pr[_d];
+		p[r][YY] = ctx.ph2pr[_c];
+		//p[r][MY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
+		//p[r][YY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
+	}
+
+	for (c = 0; c < COLS; c++)
+	{
+		M[0][c] = ctx._(0.0);
+		X[0][c] = ctx._(0.0);
+		Y[0][c] = ctx.INITIAL_CONSTANT / (tc->haplen);
+	}
+
+	for (r = 1; r < ROWS; r++)
+	{
+		M[r][0] = ctx._(0.0);
+		X[r][0] = X[r-1][0] * p[r][XX];
+		Y[r][0] = ctx._(0.0);
+	}
+
+	 NUMBER result = ctx._(0.0);
+
+	for (r = 1; r < ROWS; r++)
+		for (c = 1; c < COLS; c++)
+		{
+			char _rs = tc->rs[r-1];
+			char _hap = tc->hap[c-1];
+			int _q = tc->q[r-1] & 127;
+			NUMBER distm = ctx.ph2pr[_q];
+			if (_rs == _hap || _rs == 'N' || _hap == 'N')
+				distm = ctx._(1.0) - distm;
+			else
+			  distm = distm/3;
+			M[r][c] = distm * (M[r-1][c-1] * p[r][MM] + X[r-1][c-1] * p[r][GapM] + Y[r-1][c-1] * p[r][GapM]);
+			X[r][c] = M[r-1][c] * p[r][MX] + X[r-1][c] * p[r][XX];
+			Y[r][c] = M[r][c-1] * p[r][MY] + Y[r][c-1] * p[r][YY];
+		}
+
+	for (c = 0; c < COLS; c++)
+	{
+		result += M[ROWS-1][c] + X[ROWS-1][c];
+	}
+
+	if (before_last_log != NULL)
+		*before_last_log = result;
+
+	return result;
+	//return ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT;
+}
+
+template double compute_full_prob<double>(testcase* tc, double* nextbuf);
+template float compute_full_prob<float>(testcase* tc, float* nextbuf);
+
diff --git a/PairHMM_JNI/define-double.h b/PairHMM_JNI/define-double.h
index 5d31741d6..502b919fe 100644
--- a/PairHMM_JNI/define-double.h
+++ b/PairHMM_JNI/define-double.h
@@ -18,35 +18,36 @@
         #undef MASK_TYPE
         #undef MASK_ALL_ONES
 
-        #undef SET_VEC_ZERO
-        #undef VEC_OR
-        #undef VEC_ADD
-	#undef VEC_SUB
-        #undef VEC_MUL
-        #undef VEC_DIV
-        #undef VEC_BLEND
-        #undef VEC_BLENDV
-	#undef VEC_CAST_256_128
-	#undef VEC_EXTRACT_128
-	#undef VEC_EXTRACT_UNIT
-	#undef VEC_SET1_VAL128
-	#undef VEC_MOVE
-	#undef VEC_CAST_128_256
-	#undef VEC_INSERT_VAL
-	#undef VEC_CVT_128_256
-	#undef VEC_SET1_VAL
-	#undef VEC_POPCVT_CHAR
-	#undef VEC_LDPOPCVT_CHAR
-	#undef VEC_CMP_EQ
-	#undef VEC_SET_LSE
-	#undef SHIFT_HAP
-	#undef print256b
+        #undef SET_VEC_ZERO(__vec)
+        #undef VEC_OR(__v1, __v2)
+        #undef VEC_ADD(__v1, __v2)
+        #undef VEC_SUB(__v1, __v2)
+        #undef VEC_MUL(__v1, __v2)
+        #undef VEC_DIV(__v1, __v2)
+        #undef VEC_BLEND(__v1, __v2, __mask)
+        #undef VEC_BLENDV(__v1, __v2, __maskV)
+        #undef VEC_CAST_256_128(__v1)
+        #undef VEC_EXTRACT_128(__v1, __im)
+        #undef VEC_EXTRACT_UNIT(__v1, __im)
+        #undef VEC_SET1_VAL128(__val)
+        #undef VEC_MOVE(__v1, __val)
+        #undef VEC_CAST_128_256(__v1)
+        #undef VEC_INSERT_VAL(__v1, __val, __pos)
+        #undef VEC_CVT_128_256(__v1)
+        #undef VEC_SET1_VAL(__val)
+        #undef VEC_POPCVT_CHAR(__ch)
+        #undef VEC_LDPOPCVT_CHAR(__addr)
+        #undef VEC_CMP_EQ(__v1, __v2)
+        #undef VEC_SET_LSE(__val)
+        #undef SHIFT_HAP(__v1, __val)
+        #undef print256b(__v1)
         #undef MASK_VEC
-        #undef VEC_SSE_TO_AVX
-        #undef VEC_SHIFT_LEFT_1BIT
+        #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
+        #undef VEC_SHIFT_LEFT_1BIT(__vs)
         #undef MASK_ALL_ONES
-        #undef COMPARE_VECS
+        #undef COMPARE_VECS(__v1, __v2)
         #undef _256_INT_TYPE
+        #undef BITMASK_VEC
 #endif
 
 #define PRECISION d
@@ -83,7 +84,7 @@
 #define VEC_MUL(__v1, __v2)                     \
   _mm256_mul_pd(__v1, __v2)
 
-#define VEC_DIV(__v1, __v2)                     \
+#define VEC_DIV(__v1, __v2)			\
   _mm256_div_pd(__v1, __v2)
 
 #define VEC_BLEND(__v1, __v2, __mask)           \
@@ -156,3 +157,32 @@
       }									\
     }									\
   }
+
+class BitMaskVec_double {
+
+  MASK_VEC low_, high_ ;
+  _256_TYPE combined_ ;
+
+public:
+  
+  inline MASK_TYPE& getLowEntry(int index) {
+    return low_.masks[index] ;
+  }
+  inline MASK_TYPE& getHighEntry(int index) {
+    return high_.masks[index] ;
+  }
+  
+  inline const _256_TYPE& getCombinedMask() {
+    VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ;
+
+    return combined_ ;
+  }
+  
+  inline void shift_left_1bit() {
+    VEC_SHIFT_LEFT_1BIT(low_.vec) ;
+    VEC_SHIFT_LEFT_1BIT(high_.vec) ;
+  }
+
+} ;
+
+#define BITMASK_VEC BitMaskVec_double
diff --git a/PairHMM_JNI/define-float.h b/PairHMM_JNI/define-float.h
index 0e2822b2d..3cc57ec38 100644
--- a/PairHMM_JNI/define-float.h
+++ b/PairHMM_JNI/define-float.h
@@ -47,6 +47,7 @@
         #undef MASK_ALL_ONES
         #undef COMPARE_VECS(__v1, __v2)
         #undef _256_INT_TYPE
+        #undef BITMASK_VEC
 #endif
 
 #define PRECISION s
@@ -84,7 +85,7 @@
 #define VEC_MUL(__v1, __v2)                     \
   _mm256_mul_ps(__v1, __v2)
 
-#define VEC_DIV(__v1, __v2)                     \
+#define VEC_DIV(__v1, __v2)			\
   _mm256_div_ps(__v1, __v2)
 
 #define VEC_BLEND(__v1, __v2, __mask)           \
@@ -133,7 +134,7 @@
   _mm256_set_ps(zero, zero, zero, zero, zero, zero, zero, __val);
 
 #define SHIFT_HAP(__v1, __val)			\
-  _vector_shift_lasts(__v1, __val.f);
+  _vector_shift_lastavxs(__v1, __val.f);
 
 #define print256b(__v1)                         \
   print256bFP(__v1)
@@ -157,3 +158,31 @@
     }									\
   }
 
+class BitMaskVec_float {
+
+  MASK_VEC low_, high_ ;
+  _256_TYPE combined_ ;
+
+public:
+  
+  inline MASK_TYPE& getLowEntry(int index) {
+    return low_.masks[index] ;
+  }
+  inline MASK_TYPE& getHighEntry(int index) {
+    return high_.masks[index] ;
+  }
+  
+  inline const _256_TYPE& getCombinedMask() {
+    VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ;
+
+    return combined_ ;
+  }
+  
+  inline void shift_left_1bit() {
+    VEC_SHIFT_LEFT_1BIT(low_.vec) ;
+    VEC_SHIFT_LEFT_1BIT(high_.vec) ;
+  }
+
+} ;
+
+#define BITMASK_VEC BitMaskVec_float
diff --git a/PairHMM_JNI/define-sse-double.h b/PairHMM_JNI/define-sse-double.h
new file mode 100644
index 000000000..a30b2e5f5
--- /dev/null
+++ b/PairHMM_JNI/define-sse-double.h
@@ -0,0 +1,153 @@
+#ifdef PRECISION
+        #undef PRECISION
+        #undef MAIN_TYPE
+        #undef MAIN_TYPE_SIZE
+        #undef UNION_TYPE
+        #undef IF_128
+        #undef IF_MAIN_TYPE
+        #undef SHIFT_CONST1
+        #undef SHIFT_CONST2
+        #undef SHIFT_CONST3
+        #undef _128_TYPE
+        #undef _256_TYPE
+        #undef AVX_LENGTH
+        #undef MAVX_COUNT
+        #undef HAP_TYPE
+        #undef MASK_TYPE
+        #undef MASK_ALL_ONES
+
+	#undef VEC_EXTRACT_UNIT(__v1, __im)
+	#undef VEC_INSERT_UNIT(__v1,__ins,__im)
+        #undef SET_VEC_ZERO(__vec)
+        #undef VEC_OR(__v1, __v2)
+        #undef VEC_ADD(__v1, __v2)
+        #undef VEC_SUB(__v1, __v2)
+        #undef VEC_MUL(__v1, __v2)
+	#undef VEC_DIV(__v1, __v2)
+        #undef VEC_BLEND(__v1, __v2, __mask)
+        #undef VEC_BLENDV(__v1, __v2, __maskV)
+        #undef VEC_CAST_256_128(__v1)
+        #undef VEC_EXTRACT_128(__v1, __im)
+        #undef VEC_EXTRACT_UNIT(__v1, __im)
+        #undef VEC_SET1_VAL128(__val)
+        #undef VEC_MOVE(__v1, __val)
+        #undef VEC_CAST_128_256(__v1)
+        #undef VEC_INSERT_VAL(__v1, __val, __pos)
+        #undef VEC_CVT_128_256(__v1)
+        #undef VEC_SET1_VAL(__val)
+        #undef VEC_POPCVT_CHAR(__ch)
+        #undef VEC_LDPOPCVT_CHAR(__addr)
+        #undef VEC_CMP_EQ(__v1, __v2)
+        #undef VEC_SET_LSE(__val)
+        #undef SHIFT_HAP(__v1, __val)
+        #undef print256b(__v1)
+        #undef MASK_VEC
+        #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
+        #undef VEC_SHIFT_LEFT_1BIT(__vs)
+        #undef MASK_ALL_ONES
+        #undef COMPARE_VECS(__v1, __v2)
+        #undef _256_INT_TYPE
+        #undef BITMASK_VEC
+#endif
+
+#define SSE
+#define PRECISION d
+
+#define MAIN_TYPE double
+#define MAIN_TYPE_SIZE 64
+#define UNION_TYPE mix_D128
+#define IF_128 IF_128d
+#define IF_MAIN_TYPE IF_64
+#define SHIFT_CONST1 1
+#define SHIFT_CONST2 8
+#define SHIFT_CONST3 0
+#define _128_TYPE __m128d
+#define _256_TYPE __m128d
+#define _256_INT_TYPE __m128i
+#define AVX_LENGTH 2
+#define MAVX_COUNT  (MROWS+3)/AVX_LENGTH
+#define HAP_TYPE __m128i
+#define MASK_TYPE uint64_t
+#define MASK_ALL_ONES 0xFFFFFFFFFFFFFFFFL
+#define MASK_VEC MaskVec_D
+
+#define VEC_EXTRACT_UNIT(__v1, __im)            \
+  _mm_extract_epi64(__v1, __im)
+
+#define VEC_INSERT_UNIT(__v1,__ins,__im)	\
+   _mm_insert_epi64(__v1,__ins,__im)
+
+#define VEC_OR(__v1, __v2)                      \
+  _mm_or_pd(__v1, __v2)
+
+#define VEC_ADD(__v1, __v2)                     \
+  _mm_add_pd(__v1, __v2)
+
+#define VEC_SUB(__v1, __v2)                     \
+  _mm_sub_pd(__v1, __v2)
+
+#define VEC_MUL(__v1, __v2)                     \
+  _mm_mul_pd(__v1, __v2)
+
+#define VEC_DIV(__v1, __v2)                     \
+  _mm_div_pd(__v1, __v2)
+
+#define VEC_CMP_EQ(__v1, __v2)                  \
+  _mm_cmpeq_pd(__v1, __v2)
+
+#define VEC_BLEND(__v1, __v2, __mask)           \
+  _mm_blend_pd(__v1, __v2, __mask)
+
+#define VEC_BLENDV(__v1, __v2, __maskV)         \
+  _mm_blendv_pd(__v1, __v2, __maskV)
+
+#define SHIFT_HAP(__v1, __val)                  \
+  __v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0)
+
+#define VEC_CVT_128_256(__v1)                   \
+  _mm_cvtepi32_pd(__v1)
+
+#define VEC_SET1_VAL(__val)			\
+  _mm_set1_pd(__val)
+   
+#define VEC_POPCVT_CHAR(__ch)                   \
+  _mm_cvtepi32_pd(_mm_set1_epi32(__ch)) 
+
+#define VEC_SET_LSE(__val)                      \
+  _mm_set_pd(zero, __val);  
+
+#define VEC_LDPOPCVT_CHAR(__addr)               \
+   _mm_cvtepi32_pd(_mm_loadu_si128((__m128i const *)__addr)) 
+
+#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)       \
+   __vdst = _mm_castsi128_pd(_mm_set_epi64(__vsHigh, __vsLow))
+
+#define VEC_SHIFT_LEFT_1BIT(__vs)               \
+  __vs = _mm_slli_epi64(__vs, 1)
+
+
+class BitMaskVec_sse_double {
+
+  MASK_VEC combined_ ;
+
+public:
+  
+  inline MASK_TYPE& getLowEntry(int index) {
+    return combined_.masks[index] ;
+  }
+  inline MASK_TYPE& getHighEntry(int index) {
+    return combined_.masks[AVX_LENGTH/2+index] ;
+  }
+  
+  inline const _256_TYPE& getCombinedMask() {
+    return combined_.vecf ;
+  }
+  
+  inline void shift_left_1bit() {
+    VEC_SHIFT_LEFT_1BIT(combined_.vec) ;
+  }
+
+} ;
+
+#define BITMASK_VEC BitMaskVec_sse_double
+
diff --git a/PairHMM_JNI/define-sse-float.h b/PairHMM_JNI/define-sse-float.h
new file mode 100644
index 000000000..6612b28e6
--- /dev/null
+++ b/PairHMM_JNI/define-sse-float.h
@@ -0,0 +1,151 @@
+#ifdef PRECISION
+        #undef PRECISION
+        #undef MAIN_TYPE
+        #undef MAIN_TYPE_SIZE
+        #undef UNION_TYPE
+        #undef IF_128
+        #undef IF_MAIN_TYPE
+        #undef SHIFT_CONST1
+        #undef SHIFT_CONST2
+        #undef SHIFT_CONST3
+        #undef _128_TYPE
+        #undef _256_TYPE
+        #undef AVX_LENGTH
+        #undef MAVX_COUNT
+        #undef HAP_TYPE
+        #undef MASK_TYPE
+        #undef MASK_ALL_ONES
+
+	#undef VEC_EXTRACT_UNIT(__v1, __im)
+	#undef VEC_INSERT_UNIT(__v1,__ins,__im)
+        #undef SET_VEC_ZERO(__vec)
+        #undef VEC_OR(__v1, __v2)
+        #undef VEC_ADD(__v1, __v2)
+        #undef VEC_SUB(__v1, __v2)
+        #undef VEC_MUL(__v1, __v2)
+	#undef VEC_DIV(__v1, __v2)
+        #undef VEC_BLEND(__v1, __v2, __mask)
+        #undef VEC_BLENDV(__v1, __v2, __maskV)
+        #undef VEC_CAST_256_128(__v1)
+        #undef VEC_EXTRACT_128(__v1, __im)
+        #undef VEC_EXTRACT_UNIT(__v1, __im)
+        #undef VEC_SET1_VAL128(__val)
+        #undef VEC_MOVE(__v1, __val)
+        #undef VEC_CAST_128_256(__v1)
+        #undef VEC_INSERT_VAL(__v1, __val, __pos)
+        #undef VEC_CVT_128_256(__v1)
+        #undef VEC_SET1_VAL(__val)
+        #undef VEC_POPCVT_CHAR(__ch)
+        #undef VEC_LDPOPCVT_CHAR(__addr)
+        #undef VEC_CMP_EQ(__v1, __v2)
+        #undef VEC_SET_LSE(__val)
+        #undef SHIFT_HAP(__v1, __val)
+        #undef print256b(__v1)
+        #undef MASK_VEC
+        #undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
+        #undef VEC_SHIFT_LEFT_1BIT(__vs)
+        #undef MASK_ALL_ONES
+        #undef COMPARE_VECS(__v1, __v2)
+        #undef _256_INT_TYPE
+        #undef BITMASK_VEC
+#endif
+
+#define SSE
+#define PRECISION s
+
+#define MAIN_TYPE float
+#define MAIN_TYPE_SIZE 32
+#define UNION_TYPE mix_F128
+#define IF_128 IF_128f
+#define IF_MAIN_TYPE IF_32
+#define SHIFT_CONST1 3
+#define SHIFT_CONST2 4
+#define SHIFT_CONST3 0
+#define _128_TYPE __m128
+#define _256_TYPE __m128
+#define _256_INT_TYPE __m128i
+#define AVX_LENGTH 4
+#define MAVX_COUNT  (MROWS+3)/AVX_LENGTH
+#define HAP_TYPE UNION_TYPE
+#define MASK_TYPE uint32_t
+#define MASK_ALL_ONES 0xFFFFFFFF
+#define MASK_VEC MaskVec_F
+
+#define VEC_EXTRACT_UNIT(__v1, __im)            \
+  _mm_extract_epi32(__v1, __im)
+
+#define VEC_INSERT_UNIT(__v1,__ins,__im)	\
+   _mm_insert_epi32(__v1,__ins,__im)
+
+#define VEC_OR(__v1, __v2)                      \
+  _mm_or_ps(__v1, __v2)
+
+#define VEC_ADD(__v1, __v2)                     \
+  _mm_add_ps(__v1, __v2)
+
+#define VEC_SUB(__v1, __v2)                     \
+  _mm_sub_ps(__v1, __v2)
+
+#define VEC_MUL(__v1, __v2)                     \
+  _mm_mul_ps(__v1, __v2)
+
+#define VEC_DIV(__v1, __v2)                     \
+  _mm_div_ps(__v1, __v2)
+
+#define VEC_CMP_EQ(__v1, __v2)                  \
+  _mm_cmpeq_ps(__v1, __v2)
+
+#define VEC_BLEND(__v1, __v2, __mask)           \
+  _mm_blend_ps(__v1, __v2, __mask)
+
+#define VEC_BLENDV(__v1, __v2, __maskV)         \
+  _mm_blendv_ps(__v1, __v2, __maskV)
+
+#define SHIFT_HAP(__v1, __val)                  \
+  _vector_shift_lastsses(__v1, __val.f)
+
+#define VEC_CVT_128_256(__v1)                   \
+  _mm_cvtepi32_ps(__v1.i)
+
+#define VEC_SET1_VAL(__val)			\
+  _mm_set1_ps(__val)
+   
+#define VEC_POPCVT_CHAR(__ch)                   \
+  _mm_cvtepi32_ps(_mm_set1_epi32(__ch)) 
+
+#define VEC_SET_LSE(__val)                      \
+  _mm_set_ps(zero, zero, zero, __val);  
+
+#define VEC_LDPOPCVT_CHAR(__addr)               \
+   _mm_cvtepi32_ps(_mm_loadu_si128((__m128i const *)__addr)) 
+
+#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)       \
+   __vdst = _mm_cvtpi32x2_ps(__vsLow, __vsHigh)
+
+#define VEC_SHIFT_LEFT_1BIT(__vs)               \
+  __vs = _mm_slli_epi32(__vs, 1)
+
+class BitMaskVec_sse_float {
+
+  MASK_VEC combined_ ;
+
+public:
+  
+  inline MASK_TYPE& getLowEntry(int index) {
+    return combined_.masks[index] ;
+  }
+  inline MASK_TYPE& getHighEntry(int index) {
+    return combined_.masks[AVX_LENGTH/2+index] ;
+  }
+  
+  inline const _256_TYPE& getCombinedMask() {
+    return combined_.vecf ;
+  }
+  
+  inline void shift_left_1bit() {
+    VEC_SHIFT_LEFT_1BIT(combined_.vec) ;
+  }
+
+} ;
+
+#define BITMASK_VEC BitMaskVec_sse_float
diff --git a/PairHMM_JNI/headers.h b/PairHMM_JNI/headers.h
new file mode 100644
index 000000000..f502ff5ce
--- /dev/null
+++ b/PairHMM_JNI/headers.h
@@ -0,0 +1,25 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+#include <assert.h>
+#include <ctype.h>
+
+#include <sys/time.h>
+
+#include <immintrin.h>
+#include <emmintrin.h>
+#include <omp.h>
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include <vector>
+#include <map>
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include <cmath>
+
diff --git a/PairHMM_JNI/hmm_mask.cc b/PairHMM_JNI/hmm_mask.cc
deleted file mode 100644
index c335a0160..000000000
--- a/PairHMM_JNI/hmm_mask.cc
+++ /dev/null
@@ -1,451 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <math.h>
-
-
-// /usr/intel/pkgs/icc/13.0.0e/bin/icc -o ed -O3 ed.cpp -xAVX  -openmp -openmp-link static
-
-
-#include <iostream>
-
-#include <malloc.h>
-#include <assert.h>
-#include <sys/time.h>
-#include <omp.h>
-#include <stdlib.h>
-#include <math.h>
-
-
-#include <immintrin.h>
-#include <xmmintrin.h>
-#include <emmintrin.h>
-#include <tmmintrin.h>
-#include <smmintrin.h>
-
-
-#include <stdint.h>
-#include <algorithm>
-#include <vector>
-#include <set>
-#include <map>
-#include <memory.h>
-#include "template.h"
-
-using namespace std ;
-
-#define CHECK_MASK_CORRECTNESS
-
-template <class T>
-string getBinaryStr (T val, int numBitsToWrite) {
-  
-  ostringstream oss ;
-  uint64_t mask = ((T) 0x1) << (numBitsToWrite-1) ;
-  for (int i=numBitsToWrite-1; i >= 0; --i) {
-    oss << ((val & mask) >> i) ;
-    mask >>= 1 ;
-  }
-  return oss.str() ;
-}
-
-int normalize(char c)
-{
-	return ((int) (c - 33));
-}
-
-
-uint8_t ConvertChar::conversionTable[255] ;
-
-int read_testcase(testcase *tc, FILE* ifp)
-{
-	char *q, *i, *d, *c, *line = NULL;
-	int _q, _i, _d, _c;
-	int x, size = 0;
-	ssize_t read;
-
-	read = getline(&line, (size_t *) &size, ifp == 0 ? stdin : ifp);
-	if (read == -1)
-		return -1;
-
-
-	tc->hap = (char *) malloc(size);
-	tc->rs = (char *) malloc(size);
-	q = (char *) malloc(size);
-	i = (char *) malloc(size);
-	d = (char *) malloc(size);
-	c = (char *) malloc(size);
-
-	if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6)
-		return -1;
-
-	tc->haplen = strlen(tc->hap);
-	tc->rslen = strlen(tc->rs);
-	assert(tc->rslen < MROWS);
-	tc->ihap = (int *) malloc(tc->haplen*sizeof(int));
-	tc->irs = (int *) malloc(tc->rslen*sizeof(int));
-
-	//tc->q = (int *) malloc(sizeof(int) * tc->rslen);
-	//tc->i = (int *) malloc(sizeof(int) * tc->rslen);
-	//tc->d = (int *) malloc(sizeof(int) * tc->rslen);
-	//tc->c = (int *) malloc(sizeof(int) * tc->rslen);
-
-	for (x = 0; x < tc->rslen; x++)
-	{
-		_q = normalize(q[x]);
-		_i = normalize(i[x]);
-		_d = normalize(d[x]);
-		_c = normalize(c[x]);
-		tc->q[x] = (_q < 6) ? 6 : _q;
-		tc->i[x] = _i;
-		tc->d[x] = _d;
-		tc->c[x] = _c;
-		tc->irs[x] = tc->rs[x];
-	}
-	for (x = 0; x < tc->haplen; x++)
-	  tc->ihap[x] = tc->hap[x];
-
-
-	free(q);
-	free(i);
-	free(d);
-	free(c);
-	free(line);
-
-	return 0;
-}
-
-
-
-
-#define ALIGN __attribute__ ((aligned(32)))
-
-
-typedef union ALIGN { 
-  //__m256i vi; 
-  __m128 vf ;
-  __m128i vi ;
-  __m128d vd; 
-
-  uint8_t b[16];
-  //uint16_t hw[8] ;
-  uint32_t w[4] ;
-  uint64_t dw[2] ;
-  
-  float f[4] ;
-  //double d[2] ;
-
-} v128 ;
-
-typedef union ALIGN { 
-  __m256i vi; 
-  __m256 vf ;
-  __m256d vd; 
-  //__m128i vi128[2] ;
-
-  uint8_t b[32];
-  //uint16_t hw[16] ;
-  uint32_t w[8] ;
-  uint64_t dw[4] ;
-  
-  float f[8] ;
-  double d[4] ;
-
-} v256 ;
-
-
-#define NUM_DISTINCT_CHARS 5
-#define AMBIG_CHAR 4
-
-#define VEC_ENTRY_CNT 8
-#define VEC_LEN 256
-#define VTYPE vf
-#define VTYPEI vi
-#define VENTRY f
-#define VENTRYI w
-#define MTYPE uint32_t
-#define MASK_ALL_ONES 0xFFFFFFFF
-
-
-#define VECTOR v256
-#define VECTOR_SSE v128
-
-#define SET_VEC_ZERO(__vec)			\
-  __vec= _mm256_setzero_ps()
-
-#define SET_VEC_ONES(__vec)			\
-  __vec = _mm256_set1_epi32(0xFFFFFFFF)
-
-#define VEC_OR(__v1, __v2)			\
-  _mm256_or_ps(__v1, __v2)
-
-#define VEC_ADD(__v1, __v2)			\
-  _mm256_add_ps(__v1, __v2)
-
-#define VEC_MUL(__v1, __v2)			\
-  _mm256_mul_ps(__v1, __v2) 
-
-#define VEC_BLENDV(__v1, __v2, __maskV)		\
-  _mm256_blendv_ps(__v1, __v2, __maskV)
-
-#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)	\
-  __vdst = _mm256_castps128_ps256(__vsLow) ;		\
-  __vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ;
-
-#define VECS_SHIFT_LEFT_1BIT(__vs)		\
-  __vs = _mm_slli_epi32(__vs, 1) 
-
-
-#define VECS_SHIFT_RIGHT_WHOLE(__vs, __cnt) {				\
-    uint64_t __mask = ;							\
-    uint64_t __shiftWord = ((((uint64_t) 0x1) << __cnt) - 1 ) & __vs.dw[1] ; \
-    __shiftWord <<= (64-cnt) ;						\
-    __vs = _mm_slri_epi64(__vs, __cnt) ;				\
-    __vs.dw[0] |= __shiftWord ;						\
-  }
-
-
-#define GET_MASK_WORD_OLD(__mask, __lastShiftOut, __shiftBy, __maskBitCnt){ \
-    MTYPE __bitMask = (((MTYPE)0x1) << __shiftBy) - 1 ;			\
-    MTYPE __nextShiftOut = (__mask & __bitMask) << (__maskBitCnt - __shiftBy) ; \
-    __mask >>= __shiftBy ;						\
-    __mask |= __lastShiftOut ;						\
-    __lastShiftOut = __nextShiftOut ;					\
-  }
-
-
-#define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \
-  MTYPE __bitMask = (((MTYPE)0x1) << __shiftBy) - 1 ;			\
-  MTYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \
-  __dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ;		\
-  __lastShiftOut = __nextShiftOut ;					\ 
-}
-
-
-void precompute_masks_avx(const testcase& tc, int COLS, int numMaskVecs,
-			  MTYPE (*maskArr)[NUM_DISTINCT_CHARS]) {
-
-  const int maskBitCnt = VEC_LEN / VEC_ENTRY_CNT ;
-
-  for (int vi=0; vi < numMaskVecs; ++vi) {
-    for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) {
-      maskArr[vi][rs] = 0 ;
-    }
-    maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ;
-  }
- 
-  for (int col=1; col < COLS; ++col) {
-    int mIndex = (col-1) / maskBitCnt ;
-    int mOffset = (col-1) % maskBitCnt ;
-    MTYPE bitMask = ((MTYPE)0x1) << (maskBitCnt-1-mOffset) ;
-
-    char hapChar = tc.hap[col-1] ;
-
-    if (hapChar == AMBIG_CHAR) {
-      maskArr[mIndex][0] |= bitMask ;
-      maskArr[mIndex][1] |= bitMask ;
-      maskArr[mIndex][2] |= bitMask ;
-      maskArr[mIndex][3] |= bitMask ;
-    } 
-
-    //cout << hapChar << " " << mIndex << " " << getBinaryStr<MTYPE>(bitMask, 32)
-    // << endl ;
-    //cout << getBinaryStr<MTYPE>(maskArr[0][hapChar],32) << endl ;
-    //exit(0) ;
-		     
-    maskArr[mIndex][ConvertChar::get(hapChar)] |= bitMask ;
-
-
-    // bit corresponding to col 1 will be the MSB of the mask 0
-    // bit corresponding to col 2 will be the MSB-1 of the mask 0
-    // ...
-    // bit corresponding to col 32 will be the LSB of the mask 0
-    // bit corresponding to col 33 will be the MSB of the mask 1
-    // ...
-  }
-
-}
-
-
-void test_mask_computations (testcase& tc, int tcID, bool printDebug=false) {
-
-  int ROWS = tc.rslen + 1 ;
-  int COLS = tc.haplen + 1 ;
-
-  // only for testing
-  VECTOR mismatchData, matchData ;
-  //SET_VEC_ZERO(mismatchData.VTYPE) ;
-  //SET_VEC_ONES(matchData.VTYPEI) ;
-  for (int ei=0; ei < VEC_ENTRY_CNT; ++ei) {
-    matchData.VENTRY[ei] = 1.0 ;
-    mismatchData.VENTRY[ei] = 0.0 ;
-  }
-  
-  const int maskBitCnt = VEC_LEN / VEC_ENTRY_CNT ;
-  const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function
-
-  MTYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ;
-  precompute_masks_avx(tc, COLS, numMaskVecs, maskArr) ;
-
-#ifdef DEBUG 
-  if (printDebug) {
-    cout << "The first 32 hap chars are: " ;
-    for (int i=0; i < 32; ++i) {
-      cout << tc.hap[i] ;
-    }
-    cout << endl ;
-
-    cout << "Masks computed for A, C, T, G, N are: "  << endl ;
-    cout << getBinaryStr<MTYPE>(maskArr[0][0], 32)  << endl ;
-    cout << getBinaryStr<MTYPE>(maskArr[0][1], 32)  << endl ;
-    cout << getBinaryStr<MTYPE>(maskArr[0][2], 32)  << endl ;
-    cout << getBinaryStr<MTYPE>(maskArr[0][3], 32)  << endl ;
-    cout << getBinaryStr<MTYPE>(maskArr[0][4], 32)  << endl ;
-  }
-#endif // #ifdef DEBUG 
-
-  int beginRowIndex = 1 ;
-  while (beginRowIndex < ROWS) {
-
-    int numRowsToProcess = min(VEC_ENTRY_CNT, ROWS - beginRowIndex) ;
-
-    char rsArr[VEC_ENTRY_CNT] ;
-    for (int ri=0; ri < numRowsToProcess; ++ri) {
-      rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ;
-    }
-
-    // Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
-    VECTOR_SSE currMaskVecLow ; // corresponding to entries 0-3
-    VECTOR_SSE currMaskVecHigh ; // corresponding to entries 4-7
-
-    MTYPE lastMaskShiftOut[VEC_ENTRY_CNT] ;
-    for (int ei=0; ei < VEC_ENTRY_CNT; ++ei) 
-      lastMaskShiftOut[ei] = 0 ;
-
-    int col = 1 ;
-    int diag = 1 ;
-    for (int maskIndex=0; maskIndex < numMaskVecs; ++maskIndex) {
-      // set up the mask vectors for the next maskBitCnt columns
-
-      // For AVX, maskBitCnt = 32 (so, the operation below is amortized over 32 cols)
-      for (int ei=0; ei < VEC_ENTRY_CNT/2; ++ei) {
-	SET_MASK_WORD(currMaskVecLow.VENTRYI[ei], maskArr[maskIndex][rsArr[ei]], 
-		      lastMaskShiftOut[ei], ei, maskBitCnt) ;
-
-      	int ei2 = ei + VEC_ENTRY_CNT/2 ; // the second entry index
-	SET_MASK_WORD(currMaskVecHigh.VENTRYI[ei], maskArr[maskIndex][rsArr[ei2]], 
-		      lastMaskShiftOut[ei2], ei2, maskBitCnt) ;
-      }
-
-#ifdef DEBUG 
-      if (printDebug && maskIndex == 0) {
-	cout << "The masks for entry 1: " << endl  
-	     << getBinaryStr<MTYPE>(maskArr[0][rsArr[1]], 32) << endl
-	     << getBinaryStr<MTYPE>(currMaskVecLow.VENTRYI[1], 32) << endl ;
-
-      }
-#endif // #ifdef DEBUG 
-      
-      // iterate over mask bit indices and columns
-      for (int mbi=0; mbi < maskBitCnt && diag < COLS + ROWS -2; ++mbi, ++diag) {
-
-	VECTOR maskV ;
-	VEC_SSE_TO_AVX(currMaskVecLow.VTYPE, currMaskVecHigh.VTYPE, maskV.VTYPE) ;
-
-	VECTOR testData ;
-	testData.VTYPE = VEC_BLENDV(mismatchData.VTYPE, matchData.VTYPE,
-				    maskV.VTYPE) ;
-
-	VECS_SHIFT_LEFT_1BIT(currMaskVecLow.VTYPEI) ;
-	VECS_SHIFT_LEFT_1BIT(currMaskVecHigh.VTYPEI) ;
-
-#ifdef DEBUG 
-	if (printDebug && maskIndex == 0) {
-	  cout << "The mask for entry 1, mbi=" << mbi << ": "
-	       << getBinaryStr<MTYPE>(maskV.VENTRYI[1], 32) << endl ;
-
-	}
-#endif // #ifdef DEBUG 
-
-#ifdef CHECK_MASK_CORRECTNESS
-
-	int firstRowIndex = (diag < COLS) ? 0 : (diag - COLS) ;
-	int lastRowIndex = min(col-1, numRowsToProcess-1) ;
-
-	for (int ri=firstRowIndex; ri <= lastRowIndex; ++ri) {
-	  int currRow = beginRowIndex + ri ;
-	  int currCol = col - ri + firstRowIndex ;
-
-	  char hapChar = tc.hap[currCol-1] ;
-	  char rsChar = tc.rs[currRow-1] ;
-
-	  bool match = (hapChar == rsChar || hapChar == 'N' || rsChar == 'N') ;
-
-	  if ((bool) testData.VENTRYI[ri] != match) {
-	    cout << "Error: Incorrect mask for tc " << tcID << ", diag = " << diag
-		 << " (" << currRow  << ", " << currCol << ")" << endl ;
-
-	    cout << "The chars are: " << hapChar << " and " << rsChar << endl ;
-	    cout << "The selected value is: " << testData.VENTRYI[ri] << endl ;
-
-	    exit(0) ;
-	  }
-	}
-#endif // #ifdef CHECK_MASK_CORRECTNESS
-
-	if (diag < COLS)
-	  ++col ;
-
-      } // mbi
-    } // maskIndex
-
-    beginRowIndex += VEC_ENTRY_CNT ;
-  } // end of stripe
-
-  //cout << "Finished validating entry " << endl ;
-}
-
-#ifdef HMM_MASK_MAIN
-int main () {
-
-  #define BATCH_SIZE 10000
-
-  ConvertChar::init() ;
-  
-  testcase* tcBatch = new testcase[BATCH_SIZE] ;
-  
-  int numBatches = 0 ;
-  int numRead = 0 ;
-
-  const int DEBUG_TC = -1 ;
-
-  FILE* ifp = stdin ;
-
-  do {
-    numRead = 0 ;
-
-    while (numRead < BATCH_SIZE) {
-      if (read_testcase(tcBatch+numRead, ifp) < 0)
-	break ;
-      
-      ++numRead ;
-    }
-
-    if (numRead == 0)
-      break ;
-
-    for (int ti=0; ti < numRead; ++ti) {
-
-      int tcID = numBatches * BATCH_SIZE + ti ;
-      test_mask_computations(tcBatch[ti], tcID, tcID == DEBUG_TC) ;
-    }
-
-    ++numBatches ;
-  } while (numRead == BATCH_SIZE) ;
-  
-  fclose(ifp) ;
-
-  delete[] tcBatch ;
-
-  return 0 ;
-}
-#endif
diff --git a/PairHMM_JNI/libJNILoglessPairHMM.so b/PairHMM_JNI/libJNILoglessPairHMM.so
deleted file mode 100755
index 435034f78..000000000
Binary files a/PairHMM_JNI/libJNILoglessPairHMM.so and /dev/null differ
diff --git a/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
index 59d00928e..d575b8271 100644
--- a/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
+++ b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
@@ -1,70 +1,19 @@
+#include "headers.h"
 #include <jni.h>
-#include <assert.h>
-#include <stdio.h>
 #include "org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h"
-#include <vector>
 
-#include <cstdio>
-#include <string>
-#include <cstring>
-#include <cstdlib>
-#include <cmath>
-#include <iostream>
-#include <fstream>
-#include <sstream>
-
-
-#include <immintrin.h>
-#include <emmintrin.h>
-#include <omp.h>
-using namespace std;
 
 #define ENABLE_ASSERTIONS 1
+#define DO_PROFILING 1
 //#define DEBUG 1
 //#define DEBUG0_1 1
 //#define DEBUG3 1
 
-
 #include "template.h"
+#include "utils.h"
+#include "LoadTimeInitializer.h"
 
-#include "define-double.h"
-#include "shift_template.c"
-#include "pairhmm-template-kernel.cc"
-
-
-
-#define MM 0
-#define GapM 1
-#define MX 2
-#define XX 3
-#define MY 4
-#define YY 5
-
-class LoadTimeInitializer
-{
-  public:
-    LoadTimeInitializer()		//will be called when library is loaded
-    {
-      ConvertChar::init();
-    }
-    jfieldID m_readBasesFID;
-    jfieldID m_readQualsFID;
-    jfieldID m_insertionGOPFID;
-    jfieldID m_deletionGOPFID;
-    jfieldID m_overallGCPFID;
-    jfieldID m_haplotypeBasesFID;
-};
-LoadTimeInitializer g_load_time_initializer;
-
-void debug_dump(string filename, string s, bool to_append, bool add_newline)
-{
-  ofstream fptr;
-  fptr.open(filename.c_str(), to_append ? ofstream::app : ofstream::out);
-  fptr << s;
-  if(add_newline)
-    fptr << "\n";
-  fptr.close();
-}
+using namespace std;
 
 JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeProbabilities
 (JNIEnv* env, jclass thisObject,
@@ -93,7 +42,7 @@ Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializePrior
 //Gets direct access to Java arrays
 #define GET_BYTE_ARRAY_ELEMENTS env->GetPrimitiveArrayCritical
 #define RELEASE_BYTE_ARRAY_ELEMENTS env->ReleasePrimitiveArrayCritical
-#define JNI_RELEASE_MODE JNI_ABORT
+#define JNI_RO_RELEASE_MODE JNI_ABORT
 #define GET_DOUBLE_ARRAY_ELEMENTS env->GetPrimitiveArrayCritical
 #define RELEASE_DOUBLE_ARRAY_ELEMENTS env->ReleasePrimitiveArrayCritical
 
@@ -101,7 +50,7 @@ Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializePrior
 //Likely makes copy of Java arrays to JNI C++ space
 #define GET_BYTE_ARRAY_ELEMENTS env->GetByteArrayElements
 #define RELEASE_BYTE_ARRAY_ELEMENTS env->ReleaseByteArrayElements
-#define JNI_RELEASE_MODE JNI_ABORT
+#define JNI_RO_RELEASE_MODE JNI_ABORT
 #define GET_DOUBLE_ARRAY_ELEMENTS env->GetDoubleArrayElements
 #define RELEASE_DOUBLE_ARRAY_ELEMENTS env->ReleaseDoubleArrayElements
 
@@ -145,19 +94,19 @@ Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadL
     tc.c[i] = (int)overallGCPArray[i];
   }
 
-  double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&tc);
+  double result_avxd = g_compute_full_prob_double(&tc, 0);
   double result = log10(result_avxd) - log10(ldexp(1.0, 1020));
 #ifdef DEBUG
-  debug_dump("return_values_jni.txt",to_string(result),true);
+  g_load_time_initializer.debug_dump("return_values_jni.txt",to_string(result),true);
 #endif
 
 
-  RELEASE_BYTE_ARRAY_ELEMENTS(overallGCP, overallGCPArray, JNI_RELEASE_MODE);
-  RELEASE_BYTE_ARRAY_ELEMENTS(deletionGOP, deletionGOPArray, JNI_RELEASE_MODE);
-  RELEASE_BYTE_ARRAY_ELEMENTS(insertionGOP, insertionGOPArray, JNI_RELEASE_MODE);
-  RELEASE_BYTE_ARRAY_ELEMENTS(readQuals, readQualsArray, JNI_RELEASE_MODE);
-  RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBases, haplotypeBasesArray, JNI_RELEASE_MODE);
-  RELEASE_BYTE_ARRAY_ELEMENTS(readBases, readBasesArray, JNI_RELEASE_MODE);
+  RELEASE_BYTE_ARRAY_ELEMENTS(overallGCP, overallGCPArray, JNI_RO_RELEASE_MODE);
+  RELEASE_BYTE_ARRAY_ELEMENTS(deletionGOP, deletionGOPArray, JNI_RO_RELEASE_MODE);
+  RELEASE_BYTE_ARRAY_ELEMENTS(insertionGOP, insertionGOPArray, JNI_RO_RELEASE_MODE);
+  RELEASE_BYTE_ARRAY_ELEMENTS(readQuals, readQualsArray, JNI_RO_RELEASE_MODE);
+  RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBases, haplotypeBasesArray, JNI_RO_RELEASE_MODE);
+  RELEASE_BYTE_ARRAY_ELEMENTS(readBases, readBasesArray, JNI_RO_RELEASE_MODE);
 
   return 0.0;
 }
@@ -191,6 +140,46 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai
   g_load_time_initializer.m_haplotypeBasesFID = fid;
 }
 
+//Since the list of haplotypes against which the reads are evaluated in PairHMM is the same for a region,
+//transfer the list only once
+vector<pair<jbyteArray, jbyte*> > g_haplotypeBasesArrayVector;
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeHaplotypes
+  (JNIEnv * env, jobject thisObject, jint numHaplotypes, jobjectArray haplotypeDataArray)
+{
+  jboolean is_copy = JNI_FALSE;
+  //To ensure, GET_BYTE_ARRAY_ELEMENTS is invoked only once for each haplotype, store bytearrays in a vector
+  vector<pair<jbyteArray, jbyte*> >& haplotypeBasesArrayVector = g_haplotypeBasesArrayVector;
+  haplotypeBasesArrayVector.clear();
+  haplotypeBasesArrayVector.resize(numHaplotypes);
+  for(unsigned j=0;j<numHaplotypes;++j)
+  {
+    jobject haplotypeObject = env->GetObjectArrayElement(haplotypeDataArray, j);
+    jbyteArray haplotypeBases = (jbyteArray)env->GetObjectField(haplotypeObject, g_load_time_initializer.m_haplotypeBasesFID);
+#ifdef ENABLE_ASSERTIONS
+    assert(haplotypeBases && ("haplotypeBases is NULL at index : "+to_string(j)+"\n").c_str());
+#endif
+    //Need a global reference as this will be accessed across multiple JNI calls to JNIComputeLikelihoods()
+    jbyteArray haplotypeBasesGlobalRef = (jbyteArray)env->NewGlobalRef(haplotypeBases);
+#ifdef ENABLE_ASSERTIONS
+    assert(haplotypeBasesGlobalRef && ("Could not get global ref to haplotypeBases at index : "+to_string(j)+"\n").c_str());
+#endif
+    env->DeleteLocalRef(haplotypeBases);	//free the local reference
+    jbyte* haplotypeBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(haplotypeBasesGlobalRef, &is_copy);
+#ifdef ENABLE_ASSERTIONS
+    assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); 
+    assert(env->GetArrayLength(haplotypeBasesGlobalRef) < MCOLS);
+#endif
+#ifdef DEBUG0_1
+    cout << "JNI haplotype length "<<env->GetArrayLength(haplotypeBasesGlobalRef)<<"\n";
+#endif
+    haplotypeBasesArrayVector[j] = make_pair(haplotypeBasesGlobalRef, haplotypeBasesArray);
+#ifdef DEBUG3
+    for(unsigned k=0;k<env->GetArrayLength(haplotypeBases);++k)
+      g_load_time_initializer.debug_dump("haplotype_bases_jni.txt",to_string((int)haplotypeBasesArray[k]),true);
+#endif
+  }
+}
+
 //JNI function to invoke compute_full_prob_avx
 //readDataArray - array of JNIReadDataHolderClass objects which contain the readBases, readQuals etc
 //haplotypeDataArray - array of JNIHaplotypeDataHolderClass objects which contain the haplotypeBases
@@ -200,37 +189,16 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai
   (JNIEnv* env, jobject thisObject, jint numReads, jint numHaplotypes, 
    jobjectArray readDataArray, jobjectArray haplotypeDataArray, jdoubleArray likelihoodArray, jint maxNumThreadsToUse)
 {
-  jboolean is_copy = JNI_FALSE;
-  //To ensure, GET_BYTE_ARRAY_ELEMENTS is invoked only once for each haplotype, store bytearrays in a vector
-  vector<pair<jbyteArray, jbyte*> > haplotypeBasesArrayVector;
-  haplotypeBasesArrayVector.clear();
-  haplotypeBasesArrayVector.resize(numHaplotypes);
 #ifdef DEBUG0_1
   cout << "JNI numReads "<<numReads<<" numHaplotypes "<<numHaplotypes<<"\n";
 #endif
-  for(unsigned j=0;j<numHaplotypes;++j)
-  {
-    jobject haplotypeObject = env->GetObjectArrayElement(haplotypeDataArray, j);
-    jbyteArray haplotypeBases = (jbyteArray)env->GetObjectField(haplotypeObject, g_load_time_initializer.m_haplotypeBasesFID);
-#ifdef ENABLE_ASSERTIONS
-    assert(haplotypeBases && ("haplotypeBases is NULL at index : "+to_string(j)+"\n").c_str());
-#endif
-    jbyte* haplotypeBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(haplotypeBases, &is_copy);
-#ifdef ENABLE_ASSERTIONS
-    assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); 
-    assert(env->GetArrayLength(haplotypeBases) < MCOLS);
-#endif
-#ifdef DEBUG0_1
-    cout << "JNI haplotype length "<<env->GetArrayLength(haplotypeBases)<<"\n";
-#endif
-    haplotypeBasesArrayVector[j] = make_pair(haplotypeBases, haplotypeBasesArray);
-#ifdef DEBUG3
-    for(unsigned k=0;k<env->GetArrayLength(haplotypeBases);++k)
-      debug_dump("haplotype_bases_jni.txt",to_string((int)haplotypeBasesArray[k]),true);
-#endif
-  }
+  double start_time = 0;
+  //haplotype vector from earlier store - note the reference to vector, not copying
+  vector<pair<jbyteArray, jbyte*> >& haplotypeBasesArrayVector = g_haplotypeBasesArrayVector;
+  jboolean is_copy = JNI_FALSE;
 
   unsigned numTestCases = numReads*numHaplotypes;
+  //vector to store results
   vector<testcase> tc_array;
   tc_array.clear();
   tc_array.resize(numTestCases);
@@ -239,6 +207,9 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai
   vector<vector<pair<jbyteArray,jbyte*> > > readBasesArrayVector;
   readBasesArrayVector.clear();
   readBasesArrayVector.resize(numReads);
+#ifdef DO_PROFILING
+  start_time = get_time();
+#endif
   for(unsigned i=0;i<numReads;++i)
   {
     //Get bytearray fields from read
@@ -281,11 +252,11 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai
 #ifdef DEBUG3
     for(unsigned j=0;j<readLength;++j)
     {
-      debug_dump("reads_jni.txt",to_string((int)readBasesArray[j]),true);
-      debug_dump("reads_jni.txt",to_string((int)readQualsArray[j]),true);
-      debug_dump("reads_jni.txt",to_string((int)insertionGOPArray[j]),true);
-      debug_dump("reads_jni.txt",to_string((int)deletionGOPArray[j]),true);
-      debug_dump("reads_jni.txt",to_string((int)overallGCPArray[j]),true);
+      g_load_time_initializer.debug_dump("reads_jni.txt",to_string((int)readBasesArray[j]),true);
+      g_load_time_initializer.debug_dump("reads_jni.txt",to_string((int)readQualsArray[j]),true);
+      g_load_time_initializer.debug_dump("reads_jni.txt",to_string((int)insertionGOPArray[j]),true);
+      g_load_time_initializer.debug_dump("reads_jni.txt",to_string((int)deletionGOPArray[j]),true);
+      g_load_time_initializer.debug_dump("reads_jni.txt",to_string((int)overallGCPArray[j]),true);
     }
 #endif
 
@@ -295,63 +266,115 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai
       jbyte* haplotypeBasesArray = haplotypeBasesArrayVector[j].second;
       tc_array[tc_idx].rslen = (int)readLength;
       tc_array[tc_idx].haplen = (int)haplotypeLength;
-      tc_array[tc_idx].rs = (char*)readBasesArray;
       tc_array[tc_idx].hap = (char*)haplotypeBasesArray;
-      //Can be avoided 
-      for(unsigned k=0;k<readLength;++k)
-      {
-	tc_array[tc_idx].q[k] = (int)readQualsArray[k];
-	tc_array[tc_idx].i[k] = (int)insertionGOPArray[k];
-	tc_array[tc_idx].d[k] = (int)deletionGOPArray[k];
-	tc_array[tc_idx].c[k] = (int)overallGCPArray[k];
-      }
-
+      tc_array[tc_idx].rs = (char*)readBasesArray;
+      tc_array[tc_idx].q = (char*)readQualsArray;
+      tc_array[tc_idx].i = (char*)insertionGOPArray;
+      tc_array[tc_idx].d = (char*)deletionGOPArray;
+      tc_array[tc_idx].c = (char*)overallGCPArray;
       ++tc_idx;  
     }
-    RELEASE_BYTE_ARRAY_ELEMENTS(overallGCP, overallGCPArray, JNI_RELEASE_MODE);	//order of GET-RELEASE is important
-    RELEASE_BYTE_ARRAY_ELEMENTS(deletionGOP, deletionGOPArray, JNI_RELEASE_MODE);
-    RELEASE_BYTE_ARRAY_ELEMENTS(insertionGOP, insertionGOPArray, JNI_RELEASE_MODE);
-    RELEASE_BYTE_ARRAY_ELEMENTS(readQuals, readQualsArray, JNI_RELEASE_MODE);
-
-    //Release readBases at end because it is used by compute_full_prob
+    //Release read arrays at end because they are used by compute_full_prob
+    //Maintain order in which GET_BYTE_ARRAY_ELEMENTS called
     readBasesArrayVector[i].clear();
-    readBasesArrayVector[i].resize(1);
+    readBasesArrayVector[i].resize(5);
     readBasesArrayVector[i][0] = make_pair(readBases, readBasesArray);
+    readBasesArrayVector[i][1] = make_pair(readQuals, readQualsArray);
+    readBasesArrayVector[i][2] = make_pair(insertionGOP, insertionGOPArray);
+    readBasesArrayVector[i][3] = make_pair(deletionGOP, deletionGOPArray);
+    readBasesArrayVector[i][4] = make_pair(overallGCP, overallGCPArray);
   }
+#ifdef DO_PROFILING
+  g_load_time_initializer.m_data_transfer_time += get_time();
+#endif
 
   jdouble* likelihoodDoubleArray = (jdouble*)GET_DOUBLE_ARRAY_ELEMENTS(likelihoodArray, &is_copy);
 #ifdef ENABLE_ASSERTIONS
   assert(likelihoodDoubleArray && "likelihoodArray is NULL");
   assert(env->GetArrayLength(likelihoodArray) == numTestCases);
 #endif
+#ifdef DO_PROFILING
+  start_time = get_time();
+#endif
 #pragma omp parallel for schedule (dynamic,10) private(tc_idx) num_threads(maxNumThreadsToUse) 
   for(tc_idx=0;tc_idx<numTestCases;++tc_idx)
   {
-    double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&(tc_array[tc_idx]));
-    double result = log10(result_avxd) - log10(ldexp(1.0, 1020));
+    float result_avxf = g_compute_full_prob_float(&(tc_array[tc_idx]), 0);
+    double result = 0;
+    if (result_avxf < MIN_ACCEPTED) {
+      double result_avxd = g_compute_full_prob_double(&(tc_array[tc_idx]), 0);
+      result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
+    }
+    else
+      result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
+
     likelihoodDoubleArray[tc_idx] = result;
   }
+#ifdef DO_PROFILING
+  g_load_time_initializer.m_compute_time += get_time();
+#endif
 #ifdef DEBUG
   for(tc_idx=0;tc_idx<numTestCases;++tc_idx)
   {
-    debug_dump("return_values_jni.txt",to_string(likelihoodDoubleArray[tc_idx]),true);
+    g_load_time_initializer.debug_dump("return_values_jni.txt",to_string(likelihoodDoubleArray[tc_idx]),true);
   }
 #endif
-  RELEASE_DOUBLE_ARRAY_ELEMENTS(likelihoodArray, likelihoodDoubleArray, 0);
-
+#ifdef DO_PROFILING
+  start_time = get_time();
+#endif
+  RELEASE_DOUBLE_ARRAY_ELEMENTS(likelihoodArray, likelihoodDoubleArray, 0); //release mode 0, copy back results to Java memory
+  
   //Release read arrays first
   for(int i=readBasesArrayVector.size()-1;i>=0;--i)//note the order - reverse of GET
   {
-    for(int j=readBasesArrayVector.size()-1;j>=0;--j)
-      RELEASE_BYTE_ARRAY_ELEMENTS(readBasesArrayVector[i][j].first, readBasesArrayVector[i][j].second, JNI_RELEASE_MODE);
+    for(int j=readBasesArrayVector[i].size()-1;j>=0;--j)
+      RELEASE_BYTE_ARRAY_ELEMENTS(readBasesArrayVector[i][j].first, readBasesArrayVector[i][j].second, JNI_RO_RELEASE_MODE);
     readBasesArrayVector[i].clear();
   }
   readBasesArrayVector.clear();
-
-  //Now release haplotype arrays
-  for(int j=numHaplotypes-1;j>=0;--j)	//note the order - reverse of GET
-    RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBasesArrayVector[j].first, haplotypeBasesArrayVector[j].second, JNI_RELEASE_MODE);
-  haplotypeBasesArrayVector.clear();
+#ifdef DO_PROFILING
+  g_load_time_initializer.m_data_transfer_time += get_time();
+#endif
   tc_array.clear();
+#ifdef DO_PROFILING
+  g_load_time_initializer.m_sumNumReads += numReads;
+  g_load_time_initializer.m_sumSquareNumReads += numReads*numReads;
+  g_load_time_initializer.m_sumNumHaplotypes += numHaplotypes;
+  g_load_time_initializer.m_sumSquareNumHaplotypes += numHaplotypes*numHaplotypes;
+  g_load_time_initializer.m_sumNumTestcases += numTestCases;
+  g_load_time_initializer.m_sumSquareNumTestcases += numTestCases*numTestCases;
+  g_load_time_initializer.m_maxNumTestcases = numTestCases > g_load_time_initializer.m_maxNumTestcases ? numTestCases
+    : g_load_time_initializer.m_maxNumTestcases;
+  ++(g_load_time_initializer.m_num_invocations);
+#endif
+#ifdef DEBUG
+  g_load_time_initializer.debug_close();
+#endif
+}
+
+//Release haplotypes at the end of a region
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniFinalizeRegion
+  (JNIEnv * env, jobject thisObject)
+{
+  vector<pair<jbyteArray, jbyte*> >& haplotypeBasesArrayVector = g_haplotypeBasesArrayVector;
+  //Now release haplotype arrays
+  for(int j=haplotypeBasesArrayVector.size()-1;j>=0;--j)	//note the order - reverse of GET
+  {
+    RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBasesArrayVector[j].first, haplotypeBasesArrayVector[j].second, JNI_RO_RELEASE_MODE);
+    env->DeleteGlobalRef(haplotypeBasesArrayVector[j].first);	//free the global reference
+  }
+  haplotypeBasesArrayVector.clear(); 
+}
+
+
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniClose
+  (JNIEnv* env, jobject thisObject)
+{
+#ifdef DO_PROFILING
+  g_load_time_initializer.print_profiling();
+#endif
+#ifdef DEBUG
+  g_load_time_initializer.debug_close();
+#endif
 }
 
diff --git a/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h
index 1fff9fa5a..06ee99d7d 100644
--- a/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h
+++ b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h
@@ -21,6 +21,34 @@ extern "C" {
 #define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_matchToDeletion 4L
 #undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_deletionToDeletion
 #define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_deletionToDeletion 5L
+#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug
+#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug 0L
+#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_verify
+#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_verify 0L
+#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug0_1
+#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug0_1 0L
+#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug1
+#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug1 0L
+#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug2
+#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug2 0L
+#undef org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug3
+#define org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_debug3 0L
+/*
+ * Class:     org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
+ * Method:    jniGlobalInit
+ * Signature: (Ljava/lang/Class;Ljava/lang/Class;)V
+ */
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniGlobalInit
+  (JNIEnv *, jobject, jclass, jclass);
+
+/*
+ * Class:     org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
+ * Method:    jniClose
+ * Signature: ()V
+ */
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniClose
+  (JNIEnv *, jobject);
+
 /*
  * Class:     org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
  * Method:    jniInitialize
@@ -45,27 +73,34 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai
 JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializePriorsAndUpdateCells
   (JNIEnv *, jobject, jboolean, jint, jint, jbyteArray, jbyteArray, jbyteArray, jint);
 
-
 /*
  * Class:     org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
  * Method:    jniSubComputeReadLikelihoodGivenHaplotypeLog10
  * Signature: (II[B[B[B[B[B[BI)D
  */
 JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniSubComputeReadLikelihoodGivenHaplotypeLog10
-    (JNIEnv *, jobject, jint, jint, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jint);
+  (JNIEnv *, jobject, jint, jint, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jint);
 
 /*
  * Class:     org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
- * Method:    jniGlobalInit
- * Signature: (Ljava/lang/Class;Ljava/lang/Class;)V
+ * Method:    jniInitializeHaplotypes
+ * Signature: (I[Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIHaplotypeDataHolderClass;)V
  */
-JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniGlobalInit
-  (JNIEnv *, jobject, jclass, jclass);
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeHaplotypes
+  (JNIEnv *, jobject, jint, jobjectArray);
+
+/*
+ * Class:     org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
+ * Method:    jniFinalizeRegion
+ * Signature: ()V
+ */
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniFinalizeRegion
+  (JNIEnv *, jobject);
 
 /*
  * Class:     org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM
  * Method:    jniComputeLikelihoods
- * Signature: ([Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIReadDataHolderClass;[Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIHaplotypeDataHolderClass;[D)V
+ * Signature: (II[Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIReadDataHolderClass;[Lorg/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM/JNIHaplotypeDataHolderClass;[DI)V
  */
 JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniComputeLikelihoods
   (JNIEnv *, jobject, jint, jint, jobjectArray, jobjectArray, jdoubleArray, jint);
diff --git a/PairHMM_JNI/pairhmm-1-base.cc b/PairHMM_JNI/pairhmm-1-base.cc
index 90f95aeec..aaf45848b 100644
--- a/PairHMM_JNI/pairhmm-1-base.cc
+++ b/PairHMM_JNI/pairhmm-1-base.cc
@@ -1,123 +1,19 @@
 //#define DEBUG 1
 //#define DEBUG0_1 1
 //#define DEBUG3 1
-#define MM 0
-#define GapM 1
-#define MX 2
-#define XX 3
-#define MY 4
-#define YY 5
-
-#include <cstdio>
-#include <cstring>
-#include <cstdlib>
-#include <cmath>
-#include <omp.h>
-#include <emmintrin.h>
+#include "headers.h"
 #include "template.h"
-
-//#include "define-float.h"
-//#include "shift_template.c"
-//#include "pairhmm-template-kernel.cc"
-
-#include "define-double.h"
-#include "shift_template.c"
-#include "pairhmm-template-kernel.cc"
-
+#include "utils.h"
+#include "LoadTimeInitializer.h"
 
 using namespace std;
-class LoadTimeInitializer
-{
-  public:
-    LoadTimeInitializer()		//will be called when library is loaded
-    {
-      ConvertChar::init();
-    }
-};
-LoadTimeInitializer g_load_time_initializer;
 
 
-template<class NUMBER>
-NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL)
-{
-	int r, c;
-	int ROWS = tc->rslen + 1;
-	int COLS = tc->haplen + 1;
-
-	Context<NUMBER> ctx;
-
-	NUMBER M[MROWS][MCOLS];
-	NUMBER X[MROWS][MCOLS];
-	NUMBER Y[MROWS][MCOLS];
-	NUMBER p[MROWS][6];
-
-	p[0][MM] = ctx._(0.0);
-	p[0][GapM] = ctx._(0.0);
-	p[0][MX] = ctx._(0.0);
-	p[0][XX] = ctx._(0.0);
-	p[0][MY] = ctx._(0.0);
-	p[0][YY] = ctx._(0.0);
-	for (r = 1; r < ROWS; r++)
-	{
-		int _i = tc->i[r-1] & 127;
-		int _d = tc->d[r-1] & 127;
-		int _c = tc->c[r-1] & 127;
-		p[r][MM] = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127];
-		p[r][GapM] = ctx._(1.0) - ctx.ph2pr[_c];
-		p[r][MX] = ctx.ph2pr[_i];
-		p[r][XX] = ctx.ph2pr[_c];
-		p[r][MY] = ctx.ph2pr[_d];
-		p[r][YY] = ctx.ph2pr[_c];
-		//p[r][MY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
-		//p[r][YY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
-	}
-
-	for (c = 0; c < COLS; c++)
-	{
-		M[0][c] = ctx._(0.0);
-		X[0][c] = ctx._(0.0);
-		Y[0][c] = ctx.INITIAL_CONSTANT / (tc->haplen);
-	}
-
-	for (r = 1; r < ROWS; r++)
-	{
-		M[r][0] = ctx._(0.0);
-		X[r][0] = X[r-1][0] * p[r][XX];
-		Y[r][0] = ctx._(0.0);
-	}
-
-	 NUMBER result = ctx._(0.0);
-
-	for (r = 1; r < ROWS; r++)
-		for (c = 1; c < COLS; c++)
-		{
-			char _rs = tc->rs[r-1];
-			char _hap = tc->hap[c-1];
-			int _q = tc->q[r-1] & 127;
-			NUMBER distm = ctx.ph2pr[_q];
-			if (_rs == _hap || _rs == 'N' || _hap == 'N')
-				distm = ctx._(1.0) - distm;
-			else
-			  distm = distm/3;
-			M[r][c] = distm * (M[r-1][c-1] * p[r][MM] + X[r-1][c-1] * p[r][GapM] + Y[r-1][c-1] * p[r][GapM]);
-			X[r][c] = M[r-1][c] * p[r][MX] + X[r-1][c] * p[r][XX];
-			Y[r][c] = M[r][c-1] * p[r][MY] + Y[r][c-1] * p[r][YY];
-		}
-
-	for (c = 0; c < COLS; c++)
-	{
-		result += M[ROWS-1][c] + X[ROWS-1][c];
-	}
-
-	if (before_last_log != NULL)
-		*before_last_log = result;
-
-	return ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT;
-}
-
 #define BATCH_SIZE  10000
 #define RUN_HYBRID
 
+
+
 int main(int argc, char** argv)
 {
   if(argc < 2)
@@ -128,111 +24,82 @@ int main(int argc, char** argv)
   bool use_old_read_testcase = false;
   if(argc >= 3 && string(argv[2]) == "1")
     use_old_read_testcase = true;
+  unsigned chunk_size = 100;
+  if(argc >= 4)
+    chunk_size = strtol(argv[3],0,10);
 
-  testcase tc;
+  std::ifstream ifptr;
+  FILE* fptr = 0;
   if(use_old_read_testcase)
   {
-    FILE* fptr = fopen(argv[1],"r");
-    while(!feof(fptr))
-    {
-      if(read_testcase(&tc, fptr) >= 0)
-      {
-	double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&tc);
-	double result = log10(result_avxd) - log10(ldexp(1.0, 1020));
-
-	cout << std::scientific << compute_full_prob<double>(&tc) << " "<<result<<"\n";
-	delete tc.rs;
-	delete tc.hap;
-      }
-    }
-    fclose(fptr);
+    fptr = fopen(argv[1],"r");
+    assert(fptr);
   }
   else
   {
-    std::ifstream ifptr;
-    std::vector<std::string> tokens;
     ifptr.open(argv[1]);
     assert(ifptr.is_open());
-    while(1)
-    {
-      tokens.clear();
-      if(read_mod_testcase(ifptr, &tc, false) < 0)
-	break;
-      //double result = 0;
-      double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&tc);
-      double result = log10(result_avxd) - log10(ldexp(1.0, 1020));
-
-      cout << std::scientific << compute_full_prob<double>(&tc) << " "<<result<<"\n";
-      delete tc.rs;
-      delete tc.hap;
-    }
-    ifptr.close();
   }
-  return 0;  
 
-#if 0
-  float result[BATCH_SIZE], result_avxf;
-  double result_avxd;
-  struct timeval start, end;
-  long long aggregateTimeRead = 0L;
-  long long aggregateTimeCompute = 0L;
-  long long aggregateTimeWrite = 0L;
-
-  bool noMoreData = false;
-  int count =0;
-  while (!noMoreData)
+  vector<testcase> tc_vector;
+  tc_vector.clear();
+  testcase tc;
+  uint64_t total_time = 0;
+  while(1)
   {
-    int read_count = BATCH_SIZE;
-    gettimeofday(&start, NULL);
-    for (int b=0;b<BATCH_SIZE;b++)
-      if (read_testcase(&tc[b], NULL)==-1)
-      {
-	read_count = b;
-	noMoreData = true;
-	break;
-      }
-    gettimeofday(&end, NULL);
-    aggregateTimeRead += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
-
-    gettimeofday(&start, NULL);
-    for (int b=0;b<read_count;b++)
+    int break_value = use_old_read_testcase ? read_testcase(&tc, fptr) : read_mod_testcase(ifptr,&tc,true);
+    if(break_value >= 0)
+      tc_vector.push_back(tc);
+    if(tc_vector.size() == BATCH_SIZE || (break_value < 0 && tc_vector.size() > 0))
     {
-      result_avxf = compute_full_prob<float>(&tc[b]);
+      vector<double> results_vec;
+      results_vec.clear();
+      results_vec.resize(tc_vector.size());
+      get_time();
+#pragma omp parallel for schedule(dynamic,chunk_size)  num_threads(12)
+      for(unsigned i=0;i<tc_vector.size();++i)
+      {
+	testcase& tc = tc_vector[i];
+	float result_avxf = g_compute_full_prob_float(&tc, 0);
+	double result = 0;
+	if (result_avxf < MIN_ACCEPTED) {
+	  double result_avxd = g_compute_full_prob_double(&tc, 0);
+	  result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
+	}
+	else
+	  result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
 
-#ifdef RUN_HYBRID
-#define MIN_ACCEPTED 1e-28f
-      if (result_avxf < MIN_ACCEPTED) {
-	count++;
-	result_avxd = compute_full_prob<double>(&tc[b]);
-	result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f));
+	results_vec[i] = result;
       }
-      else
-	result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
-#endif
-
-#ifndef RUN_HYBRID
-      result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f));
-#endif
-
+      total_time +=  get_time();
+#pragma omp parallel for schedule(dynamic,chunk_size)
+      for(unsigned i=0;i<tc_vector.size();++i)
+      {
+	testcase& tc = tc_vector[i];
+	double baseline_result = compute_full_prob<double>(&tc);
+	baseline_result = log10(baseline_result) - log10(ldexp(1.0, 1020.0)); 
+	double abs_error = fabs(baseline_result-results_vec[i]);
+	double rel_error = (baseline_result != 0) ? fabs(abs_error/baseline_result) : 0;
+	if(abs_error > 1e-5 && rel_error > 1e-5)
+	  cout << std::scientific << baseline_result << " "<<results_vec[i]<<"\n";
+	delete tc_vector[i].rs;
+	delete tc_vector[i].hap;
+	delete tc_vector[i].q;
+	delete tc_vector[i].i;
+	delete tc_vector[i].d;
+	delete tc_vector[i].c;
+      }
+      results_vec.clear();
+      tc_vector.clear();
     }
-    gettimeofday(&end, NULL);
-    aggregateTimeCompute += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
-
-    gettimeofday(&start, NULL);
-    for (int b=0;b<read_count;b++)
-      printf("%E\n", result[b]);
-    gettimeofday(&end, NULL);
-    aggregateTimeWrite += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
-
+    if(break_value < 0)
+      break;
   }
-
-  printf("AVX Read Time: %ld\n", aggregateTimeRead);
-  printf("AVX Compute Time: %ld\n", aggregateTimeCompute);
-  printf("AVX Write Time: %ld\n", aggregateTimeWrite);
-  printf("AVX Total Time: %ld\n", aggregateTimeRead + aggregateTimeCompute + aggregateTimeWrite);
-  printf("# Double called: %d\n", count);
-
-  return 0;
-#endif
+  cout << "Total time "<< ((double)total_time)/1e9 << "\n";
+  if(use_old_read_testcase)
+    fclose(fptr);
+  else
+    ifptr.close();
+  return 0;  
 }
 
diff --git a/PairHMM_JNI/pairhmm-template-kernel.cc b/PairHMM_JNI/pairhmm-template-kernel.cc
index 24bcf7d1a..66dc557aa 100644
--- a/PairHMM_JNI/pairhmm-template-kernel.cc
+++ b/PairHMM_JNI/pairhmm-template-kernel.cc
@@ -4,6 +4,10 @@
 #include <assert.h>
 #include <stdlib.h>
 
+//#define DEBUG
+#define MUSTAFA
+#define KARTHIK
+
 /*
 template <class T>
 string getBinaryStr (T val, int numBitsToWrite) {
@@ -17,8 +21,10 @@ string getBinaryStr (T val, int numBitsToWrite) {
   return oss.str() ;
 }
 */
+#ifdef MUSTAFA
 
-void GEN_INTRINSIC(precompute_masks_, PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) {
+
+void GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) {
   
   const int maskBitCnt = MAIN_TYPE_SIZE ;
 
@@ -34,14 +40,14 @@ void GEN_INTRINSIC(precompute_masks_, PRECISION)(const testcase& tc, int COLS, i
     int mOffset = (col-1) % maskBitCnt ;
     MASK_TYPE bitMask = ((MASK_TYPE)0x1) << (maskBitCnt-1-mOffset) ;
 
-    char hapChar = tc.hap[col-1] ;
+    char hapChar = ConvertChar::get(tc.hap[col-1]);
 
     if (hapChar == AMBIG_CHAR) {
       for (int ci=0; ci < NUM_DISTINCT_CHARS; ++ci) 
 	maskArr[mIndex][ci] |= bitMask ;
     } 
 
-    maskArr[mIndex][ConvertChar::get(hapChar)] |= bitMask ;
+    maskArr[mIndex][hapChar] |= bitMask ;
     // bit corresponding to col 1 will be the MSB of the mask 0
     // bit corresponding to col 2 will be the MSB-1 of the mask 0
     // ...
@@ -52,7 +58,7 @@ void GEN_INTRINSIC(precompute_masks_, PRECISION)(const testcase& tc, int COLS, i
 
 }
 
-void GEN_INTRINSIC(init_masks_for_row_, PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess) {
+void GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess) {
 
   for (int ri=0; ri < numRowsToProcess; ++ri) {
     rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ;
@@ -63,6 +69,7 @@ void GEN_INTRINSIC(init_masks_for_row_, PRECISION)(const testcase& tc, char* rsA
   }
 }
 
+
 #define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \
   MASK_TYPE __bitMask = (((MASK_TYPE)0x1) << __shiftBy) - 1 ;			\
   MASK_TYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \
@@ -71,33 +78,37 @@ void GEN_INTRINSIC(init_masks_for_row_, PRECISION)(const testcase& tc, char* rsA
 }
 
 
-void GEN_INTRINSIC(update_masks_for_cols_, PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt) {
+void GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_, SIMD_TYPE), PRECISION)(int maskIndex, BITMASK_VEC& bitMaskVec, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, int maskBitCnt) {
 
   for (int ei=0; ei < AVX_LENGTH/2; ++ei) {
-    SET_MASK_WORD(currMaskVecLow.masks[ei], maskArr[maskIndex][rsArr[ei]], 
+    SET_MASK_WORD(bitMaskVec.getLowEntry(ei), maskArr[maskIndex][rsArr[ei]], 
 		  lastMaskShiftOut[ei], ei, maskBitCnt) ;
     
     int ei2 = ei + AVX_LENGTH/2 ; // the second entry index
-    SET_MASK_WORD(currMaskVecHigh.masks[ei], maskArr[maskIndex][rsArr[ei2]], 
+    SET_MASK_WORD(bitMaskVec.getHighEntry(ei), maskArr[maskIndex][rsArr[ei2]], 
 		  lastMaskShiftOut[ei2], ei2, maskBitCnt) ;
   }
 
 }
 
-//void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen, const _256_TYPE& distmSel, int firstRowIndex, int lastRowIndex) {
 
-inline void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen) {
+//void GEN_INTRINSIC(computeDistVec, PRECISION) (BITMASK_VEC& bitMaskVec, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen, const _256_TYPE& distmSel, int firstRowIndex, int lastRowIndex) {
+
+inline void GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec, SIMD_TYPE), PRECISION) (BITMASK_VEC& bitMaskVec, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen) {
   //#define computeDistVec() {					      
 
-  _256_TYPE maskV ;
-  VEC_SSE_TO_AVX(currMaskVecLow.vecf, currMaskVecHigh.vecf, maskV) ;
+#ifdef DEBUGG
+        long long *temp1 = (long long *)(&maskV);
+        double *temp2 = (double *)(&distm);
+        double *temp3 = (double *)(&_1_distm);
+        printf("***\n%lx\n%lx\n%f\n%f\n%f\n%f\n***\n", temp1[0], temp1[1], temp2[0], temp2[1], temp3[0], temp3[1]);
+#endif
 
-  distmChosen = VEC_BLENDV(distm, _1_distm, maskV) ;
+  distmChosen = VEC_BLENDV(distm, _1_distm, bitMaskVec.getCombinedMask()) ;
 
   /*COMPARE_VECS(distmChosen, distmSel, firstRowIndex, lastRowIndex) ;*/
 
-  VEC_SHIFT_LEFT_1BIT(currMaskVecLow.vec) ;
-  VEC_SHIFT_LEFT_1BIT(currMaskVecHigh.vec) ;
+  bitMaskVec.shift_left_1bit() ;
 }								     
 
 
@@ -120,8 +131,9 @@ struct HmmData {
 
 } ;
 */
+#endif // MUSTAFA
 
-template<class NUMBER> void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context<NUMBER> ctx, testcase *tc,  _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D)
+template<class NUMBER> void GEN_INTRINSIC(GEN_INTRINSIC(initializeVectors, SIMD_TYPE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context<NUMBER> ctx, testcase *tc,  _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D)
 {
 	NUMBER zero = ctx._(0.0);
         NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
@@ -157,18 +169,14 @@ template<class NUMBER> void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS
                 *(ptr_p_GAPM+r-1) = ctx._(1.0) - ctx.ph2pr[_c];
                 *(ptr_p_MX+r-1) = ctx.ph2pr[_i];
                 *(ptr_p_XX+r-1) = ctx.ph2pr[_c];
-                *(ptr_p_MY+r-1) = ctx.ph2pr[_d];
-                *(ptr_p_YY+r-1) = ctx.ph2pr[_c];
-#ifdef DEBUG3
-		debug_dump("transitions_jni.txt",to_string(*(ptr_p_MM+r-1)  ),true);
-		debug_dump("transitions_jni.txt",to_string(*(ptr_p_GAPM+r-1)),true);
-		debug_dump("transitions_jni.txt",to_string(*(ptr_p_MX+r-1)  ),true);
-		debug_dump("transitions_jni.txt",to_string(*(ptr_p_XX+r-1)  ),true);
-		debug_dump("transitions_jni.txt",to_string(*(ptr_p_MY+r-1)  ),true);
-		debug_dump("transitions_jni.txt",to_string(*(ptr_p_YY+r-1)  ),true);
-#endif
-		//*(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
-		//*(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
+		#ifdef KARTHIK
+	                *(ptr_p_MY+r-1) = ctx.ph2pr[_d];
+        	        *(ptr_p_YY+r-1) = ctx.ph2pr[_c];			
+		#else
+ 	               *(ptr_p_MY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
+        	       *(ptr_p_YY+r-1) = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
+		#endif
+	
         }
 
         NUMBER *ptr_distm1D = (NUMBER *)distm1D;
@@ -176,14 +184,11 @@ template<class NUMBER> void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS
         {
                 int _q = tc->q[r-1] & 127;
                 ptr_distm1D[r-1] = ctx.ph2pr[_q];
-#ifdef DEBUG3
-		debug_dump("priors_jni.txt",to_string(ptr_distm1D[r-1]),true);
-#endif
         }
 }
 
 
-template<class NUMBER> inline void GEN_INTRINSIC(stripINITIALIZATION, PRECISION)(
+template<class NUMBER> inline void GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION, SIMD_TYPE), PRECISION)(
 			int stripIdx, Context<NUMBER> ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY,
 			_256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm,  _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM , 
 			_256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, 
@@ -200,19 +205,18 @@ template<class NUMBER> inline void GEN_INTRINSIC(stripINITIALIZATION, PRECISION)
 	NUMBER zero = ctx._(0.0);        
 	NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen);
 	UNION_TYPE packed1;  packed1.d = VEC_SET1_VAL(1.0);
-#define TRISTATE_CORRECTION_FACTOR 3.0
-	UNION_TYPE packed3;  packed3.d = VEC_SET1_VAL(TRISTATE_CORRECTION_FACTOR);
-	
+	UNION_TYPE packed3;  packed3.d = VEC_SET1_VAL(3.0);	
         /* compare rs and N */                                           
-        //rs = VEC_LDPOPCVT_CHAR((tc->irs+i*AVX_LENGTH));        
-        //rsN.d = VEC_CMP_EQ(N_packed256, rs);                             
-                                                                         
+#ifndef MUSTAFA
+        rs = VEC_LDPOPCVT_CHAR((tc->irs+i*AVX_LENGTH));        
+        rsN.d = VEC_CMP_EQ(N_packed256, rs);                             
+#endif                                                                         
         distm = distm1D[i];                                    
-        _1_distm = VEC_SUB(packed1.d, distm);
-#ifndef DO_NOT_USE_TRISTATE_CORRECTION
-	distm = VEC_DIV(distm, packed3.d); 
-#endif
-                                                                         
+        _1_distm = VEC_SUB(packed1.d, distm);                  
+
+	#ifdef KARTHIK
+		distm = VEC_DIV(distm, packed3.d);
+	#endif                                                                         
         /* initialize M_t_2, M_t_1, X_t_2, X_t_1, Y_t_2, Y_t_1 */        
         M_t_2.d = VEC_SET1_VAL(zero);                                    
         X_t_2.d = VEC_SET1_VAL(zero);                                    
@@ -234,7 +238,7 @@ template<class NUMBER> inline void GEN_INTRINSIC(stripINITIALIZATION, PRECISION)
 
 
 
-inline _256_TYPE GEN_INTRINSIC(computeDISTM, PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256, 
+inline _256_TYPE GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM, SIMD_TYPE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256, 
 						_256_TYPE distm, _256_TYPE _1_distm)
 {
         UNION_TYPE hapN, rshap;
@@ -263,12 +267,21 @@ inline _256_TYPE GEN_INTRINSIC(computeDISTM, PRECISION)(int d, int COLS, testcas
 }        
 
 
-inline void GEN_INTRINSIC(computeMXY, PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, 
+inline void GEN_INTRINSIC(GEN_INTRINSIC(computeMXY, SIMD_TYPE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, 
 			UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1, 
 			_256_TYPE pMM, _256_TYPE pGAPM, _256_TYPE pMX, _256_TYPE pXX, _256_TYPE pMY, _256_TYPE pYY, _256_TYPE distmSel)
 {
 	/* Compute M_t <= distm * (p_MM*M_t_2 + p_GAPM*X_t_2 + p_GAPM*Y_t_2) */
 	M_t.d = VEC_MUL(VEC_ADD(VEC_ADD(VEC_MUL(M_t_2.d, pMM), VEC_MUL(X_t_2.d, pGAPM)), VEC_MUL(Y_t_2.d, pGAPM)), distmSel);
+
+#ifdef DEBUG
+	double *temp1 = (double *)(&pGAPM);
+	double *temp2 = (double *)(&pMM);
+	double *temp3 = (double *)(&distmSel);
+	printf("%f\n%f\n%f\n%f\n%f\n%f\n", temp1[0], temp1[1], temp2[0], temp2[1], temp3[0], temp3[1]);
+	//printf("%f\n%f\n%f\n%f\n", X_t_2.f[0], X_t_2.f[1], Y_t_2.f[0], Y_t_2.f[1]);
+	printf("%f\n%f\n----------------------------------------------------------------------------\n", M_t.f[0], M_t.f[1]);
+#endif
 	M_t_y = M_t;
 
 	/* Compute X_t */
@@ -278,7 +291,7 @@ inline void GEN_INTRINSIC(computeMXY, PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_
 	Y_t.d = VEC_ADD(VEC_MUL(M_t_1_y.d, pMY) , VEC_MUL(Y_t_1.d, pYY));
 }
 
-template<class NUMBER> NUMBER GEN_INTRINSIC(compute_full_prob_avx, PRECISION) (testcase *tc, NUMBER *before_last_log = NULL)
+template<class NUMBER> NUMBER GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_,SIMD_TYPE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL)
 {
         _256_TYPE p_MM   [MAVX_COUNT], p_GAPM [MAVX_COUNT], p_MX   [MAVX_COUNT];
 	_256_TYPE p_XX   [MAVX_COUNT], p_MY   [MAVX_COUNT], p_YY   [MAVX_COUNT];
@@ -306,105 +319,114 @@ template<class NUMBER> NUMBER GEN_INTRINSIC(compute_full_prob_avx, PRECISION) (t
         int remainingRows = (ROWS-1) % AVX_LENGTH;
         int strip_cnt = ((ROWS-1) / AVX_LENGTH) + (remainingRows!=0);
 
+#ifdef MUSTAFA
 	const int maskBitCnt = MAIN_TYPE_SIZE ;
 	const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function
 
 	MASK_TYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ;
-	GEN_INTRINSIC(precompute_masks_, PRECISION)(*tc, COLS, numMaskVecs, maskArr) ;
+	GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(*tc, COLS, numMaskVecs, maskArr) ;
 
 	char rsArr[AVX_LENGTH] ;
 	MASK_TYPE lastMaskShiftOut[AVX_LENGTH] ;
-
-	GEN_INTRINSIC(initializeVectors, PRECISION)<NUMBER>(ROWS, COLS, shiftOutM, shiftOutX, shiftOutY, 
+#endif
+	GEN_INTRINSIC(GEN_INTRINSIC(initializeVectors,SIMD_TYPE), PRECISION)<NUMBER>(ROWS, COLS, shiftOutM, shiftOutX, shiftOutY, 
 							    ctx, tc, p_MM, p_GAPM, p_MX, p_XX, p_MY, p_YY, distm1D);
 
 	//for (int __ii=0; __ii < 10; ++__ii)
         for (int i=0;i<strip_cnt-1;i++)
         {
 		//STRIP_INITIALIZATION
-		GEN_INTRINSIC(stripINITIALIZATION, PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM ,
+		GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION,SIMD_TYPE), PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM ,
 							      p_MX, p_XX , p_MY, p_YY, M_t_2, X_t_2, M_t_1, X_t_1, Y_t_2, Y_t_1, M_t_1_y, shiftOutX, shiftOutM);
-
-		GEN_INTRINSIC(init_masks_for_row_, PRECISION)(*tc, rsArr, lastMaskShiftOut,
-							      i*AVX_LENGTH+1, AVX_LENGTH) ;
+#ifdef MUSTAFA
+		GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(*tc, rsArr, lastMaskShiftOut, i*AVX_LENGTH+1, AVX_LENGTH) ;
+#endif
 		// Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
-		MASK_VEC currMaskVecLow ; // corresponding to lower half
-		MASK_VEC currMaskVecHigh ; // corresponding to upper half
 
-                for (int d=1;d<COLS+AVX_LENGTH;d++)
+		BITMASK_VEC bitMaskVec ;
+
+                for (int begin_d=1;begin_d<COLS+AVX_LENGTH;begin_d+=MAIN_TYPE_SIZE)
                 {
-		        if (d % MAIN_TYPE_SIZE == 1)
-			  GEN_INTRINSIC(update_masks_for_cols_, PRECISION)((d-1)/MAIN_TYPE_SIZE, currMaskVecLow, currMaskVecHigh, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;
+		  int numMaskBitsToProcess = std::min(MAIN_TYPE_SIZE, COLS+AVX_LENGTH-begin_d) ;
+#ifdef MUSTAFA
+		  GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_,SIMD_TYPE), PRECISION)((begin_d-1)/MAIN_TYPE_SIZE, bitMaskVec, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;		  
+#endif
 
-			int ShiftIdx = d+AVX_LENGTH;
-			//distmSel = GEN_INTRINSIC(computeDISTM, PRECISION)(d, COLS, tc, hap, rs.d, rsN, N_packed256, distm, _1_distm);
+		  //		        if (d % MAIN_TYPE_SIZE == 1)
+		  
+		  for (int mbi=0; mbi < numMaskBitsToProcess; ++mbi) {
+#ifdef MUSTAFA
+		    GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec,SIMD_TYPE), PRECISION) (bitMaskVec, distm, _1_distm, distmChosen) ;
+#else
+		    distmChosen = GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM,SIMD_TYPE), PRECISION)(begin_d+mbi, COLS, tc, hap, rs.d, rsN, N_packed256, distm, _1_distm);
+#endif
+		    int ShiftIdx = begin_d + mbi + AVX_LENGTH;
 
-			//int firstRowIndex = (d < COLS) ? 0 : (d-COLS+1) ;
-			//int lastRowIndex = std::min(d-1, AVX_LENGTH-1) ;
-			//GEN_INTRINSIC(computeDistVec, PRECISION) (currMaskVecLow, currMaskVecHigh, distm, _1_distm, distmChosen, distmSel, firstRowIndex, lastRowIndex) ;
-
-			GEN_INTRINSIC(computeDistVec, PRECISION) (currMaskVecLow, currMaskVecHigh, distm, _1_distm, distmChosen) ;
-			
-
-			GEN_INTRINSIC(computeMXY, PRECISION)(M_t, X_t, Y_t, M_t_y, M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, M_t_1_y, Y_t_1, 
+		    GEN_INTRINSIC(GEN_INTRINSIC(computeMXY,SIMD_TYPE), PRECISION)(M_t, X_t, Y_t, M_t_y, M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, M_t_1_y, Y_t_1, 
 								pMM, pGAPM, pMX, pXX, pMY, pYY, distmChosen);
 
-                        GEN_INTRINSIC(_vector_shift, PRECISION)(M_t, shiftOutM[ShiftIdx], shiftOutM[d]);
+		    GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift, SIMD_TYPE), PRECISION)(M_t, shiftOutM[ShiftIdx], shiftOutM[begin_d+mbi]);
+		    
+		    GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift, SIMD_TYPE), PRECISION)(X_t, shiftOutX[ShiftIdx], shiftOutX[begin_d+mbi]);
 
-                        GEN_INTRINSIC(_vector_shift, PRECISION)(X_t, shiftOutX[ShiftIdx], shiftOutX[d]);
-
-                        GEN_INTRINSIC(_vector_shift, PRECISION)(Y_t_1, shiftOutY[ShiftIdx], shiftOutY[d]);
-
-                        M_t_2 = M_t_1; M_t_1 = M_t; X_t_2 = X_t_1; X_t_1 = X_t;
-                        Y_t_2 = Y_t_1; Y_t_1 = Y_t; M_t_1_y = M_t_y;
+		    GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift, SIMD_TYPE), PRECISION)(Y_t_1, shiftOutY[ShiftIdx], shiftOutY[begin_d+mbi]);
 
+		    M_t_2 = M_t_1; M_t_1 = M_t; X_t_2 = X_t_1; X_t_1 = X_t;
+		    Y_t_2 = Y_t_1; Y_t_1 = Y_t; M_t_1_y = M_t_y;
+		  }
 		}
 	}
-
+	
         int i = strip_cnt-1;
         {
 		//STRIP_INITIALIZATION
-		GEN_INTRINSIC(stripINITIALIZATION, PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM ,
+		GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION,SIMD_TYPE), PRECISION)(i, ctx, tc, pGAPM, pMM, pMX, pXX, pMY, pYY, rs.d, rsN, distm, _1_distm, distm1D, N_packed256, p_MM , p_GAPM ,
                         p_MX, p_XX , p_MY, p_YY, M_t_2, X_t_2, M_t_1, X_t_1, Y_t_2, Y_t_1, M_t_1_y, shiftOutX, shiftOutM);
 
                 if (remainingRows==0) remainingRows=AVX_LENGTH;
-
-		GEN_INTRINSIC(init_masks_for_row_, PRECISION)(*tc, rsArr, lastMaskShiftOut,
-							      i*AVX_LENGTH+1, remainingRows) ;
-
+#ifdef MUSTAFA
+		GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(*tc, rsArr, lastMaskShiftOut, i*AVX_LENGTH+1, remainingRows) ;
+#endif
                 _256_TYPE sumM, sumX;
                 sumM = VEC_SET1_VAL(zero);
                 sumX = VEC_SET1_VAL(zero);
 
 		// Since there are no shift intrinsics in AVX, keep the masks in 2 SSE vectors
-		MASK_VEC currMaskVecLow ; // corresponding to lower half
-		MASK_VEC currMaskVecHigh ; // corresponding to upper half
+		BITMASK_VEC bitMaskVec ;
 
-                for (int d=1;d<COLS+remainingRows-1;d++)
+                for (int begin_d=1;begin_d<COLS+remainingRows-1;begin_d+=MAIN_TYPE_SIZE)
                 {
 
-  	 	        if (d % MAIN_TYPE_SIZE == 1)
-			  GEN_INTRINSIC(update_masks_for_cols_, PRECISION)((d-1)/MAIN_TYPE_SIZE, currMaskVecLow, currMaskVecHigh, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;
+		  int numMaskBitsToProcess = std::min(MAIN_TYPE_SIZE, COLS+remainingRows-1-begin_d) ;
+#ifdef MUSTAFA
+		  GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_, SIMD_TYPE),PRECISION)((begin_d-1)/MAIN_TYPE_SIZE, bitMaskVec, maskArr, rsArr, lastMaskShiftOut, maskBitCnt) ;
+#endif
 
-			int ShiftIdx = d+AVX_LENGTH;
-			//distmSel = GEN_INTRINSIC(computeDISTM, PRECISION)(d, COLS, tc, hap, rs.d, rsN, N_packed256, distm, _1_distm);
-			GEN_INTRINSIC(computeDistVec, PRECISION) (currMaskVecLow, currMaskVecHigh, distm, _1_distm, distmChosen) ;
+		  for (int mbi=0; mbi < numMaskBitsToProcess; ++mbi) {
 
-			GEN_INTRINSIC(computeMXY, PRECISION)(M_t, X_t, Y_t, M_t_y, M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, M_t_1_y, Y_t_1,
-					                        pMM, pGAPM, pMX, pXX, pMY, pYY, distmChosen);
+#ifdef MUSTAFA
+		    GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec, SIMD_TYPE), PRECISION) (bitMaskVec, distm, _1_distm, distmChosen) ;
+#else
+		    distmChosen = GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM,SIMD_TYPE), PRECISION)(begin_d+mbi, COLS, tc, hap, rs.d, rsN, N_packed256, distm, _1_distm);
+#endif
+		    int ShiftIdx = begin_d + mbi +AVX_LENGTH;
 
-                        sumM  = VEC_ADD(sumM, M_t.d);
-                        GEN_INTRINSIC(_vector_shift_last, PRECISION)(M_t, shiftOutM[ShiftIdx]);
+		    GEN_INTRINSIC(GEN_INTRINSIC(computeMXY, SIMD_TYPE), PRECISION)(M_t, X_t, Y_t, M_t_y, M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, M_t_1_y, Y_t_1,
+										   pMM, pGAPM, pMX, pXX, pMY, pYY, distmChosen);
 
-                        sumX  = VEC_ADD(sumX, X_t.d);
-                        GEN_INTRINSIC(_vector_shift_last, PRECISION)(X_t, shiftOutX[ShiftIdx]);
+		    sumM  = VEC_ADD(sumM, M_t.d);
+		    GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION)(M_t, shiftOutM[ShiftIdx]);
 
-                        GEN_INTRINSIC(_vector_shift_last, PRECISION)(Y_t_1, shiftOutY[ShiftIdx]);
+		    sumX  = VEC_ADD(sumX, X_t.d);
+		    GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION)(X_t, shiftOutX[ShiftIdx]);
 
-                        M_t_2 = M_t_1; M_t_1 = M_t; X_t_2 = X_t_1; X_t_1 = X_t;
-                        Y_t_2 = Y_t_1; Y_t_1 = Y_t; M_t_1_y = M_t_y;
+		    GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION)(Y_t_1, shiftOutY[ShiftIdx]);
 
-                }
+		    M_t_2 = M_t_1; M_t_1 = M_t; X_t_2 = X_t_1; X_t_1 = X_t;
+		    Y_t_2 = Y_t_1; Y_t_1 = Y_t; M_t_1_y = M_t_y;
+
+		  }
+		}
                 UNION_TYPE sumMX;
                 sumMX.d = VEC_ADD(sumM, sumX);
                 result_avx2 = sumMX.f[remainingRows-1];		
diff --git a/PairHMM_JNI/pairhmm-template-main.cc b/PairHMM_JNI/pairhmm-template-main.cc
index 9aeb0dddd..0ab53d378 100644
--- a/PairHMM_JNI/pairhmm-template-main.cc
+++ b/PairHMM_JNI/pairhmm-template-main.cc
@@ -1,35 +1,57 @@
-#include <stdio.h>
-#include <immintrin.h>
-#include <emmintrin.h>
-#include <omp.h>
+#include "headers.h"
 
 #include "template.h"
+#include "vector_defs.h"
 
-#include "define-float.h"
-#include "shift_template.c"
-#include "pairhmm-template-kernel.cc"
+#define SIMD_TYPE avx
+#define SIMD_TYPE_AVX
 
-#include "define-double.h"
-#include "shift_template.c"
-#include "pairhmm-template-kernel.cc"
 
 #define BATCH_SIZE  10000
-//#define RUN_HYBRID
+#define RUN_HYBRID
 
-//uint8_t ConvertChar::conversionTable[255] ;
+double getCurrClk();
 int thread_level_parallelism_enabled = false ;
 
-double getCurrClk() {
-  struct timeval tv ;
-  gettimeofday(&tv, NULL);
-  return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
+void print128b_F(__m128 x)
+{	
+	float *p = (float *)(&x);
+	for (int i=3;i>=0;i--)
+		printf("%f ", *(p+i));
+	printf("\n");
 }
 
+void print128b_D(__m128d x)
+{
+        double *p = (double *)(&x);
+        for (int i=1;i>=0;i--)
+                printf("%f ", *(p+i));
+        printf("\n");
+}
 
 int main()
 {
-        
-	testcase tc[BATCH_SIZE];
+/*
+	IF_128f x;
+	x.f = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
+	IF_32 shiftIn, shiftOut;
+	shiftIn.f = 5.0f;
+	print128b_F(x.f);
+	GEN_INTRINSIC(_vector_shift, s)(x, shiftIn, shiftOut);
+	print128b_F(x.f);
+
+        IF_128d y;
+        y.f = _mm_set_pd(10.0, 11.0);
+        IF_64 shiftInd, shiftOutd;
+        shiftInd.f = 12.0;
+        print128b_D(y.f);
+        GEN_INTRINSIC(_vector_shift, d)(y, shiftInd, shiftOutd);
+        print128b_D(y.f);
+
+	exit(0);
+*/
+
+        testcase* tc = new testcase[BATCH_SIZE];
         float result[BATCH_SIZE], result_avxf;
         double result_avxd;
         //struct timeval start, end;
@@ -40,12 +62,12 @@ int main()
 
 	// Need to call it once to initialize the static array
 	ConvertChar::init() ;
+	
 
-
-	char* ompEnvVar = getenv("OMP_NUM_THREADS") ;
-	if (ompEnvVar != NULL && ompEnvVar != "" && ompEnvVar != "1" ) {
-	  thread_level_parallelism_enabled = true ;
-	}
+//	char* ompEnvVar = getenv("OMP_NUM_THREADS") ;
+//	if (ompEnvVar != NULL && ompEnvVar != "" && ompEnvVar != "1" ) {
+//	  thread_level_parallelism_enabled = true ;
+//	}
 
         bool noMoreData = false;
         int count =0;
@@ -55,7 +77,7 @@ int main()
  
 		lastClk = getCurrClk() ;
                 for (int b=0;b<BATCH_SIZE;b++)
-                        if (read_testcase(&tc[b], NULL)==-1)
+                        if (read_testcase(&tc[b])==-1)
                         {
                                 read_count = b;
                                 noMoreData = true;
@@ -68,16 +90,17 @@ int main()
                 //gettimeofday(&start, NULL);
 		lastClk = getCurrClk() ;
 
-#pragma omp parallel for schedule(dynamic) if(thread_level_parallelism_enabled)
+//#pragma omp parallel for schedule(dynamic) if(thread_level_parallelism_enabled)
                 for (int b=0;b<read_count;b++)
                 {
-                        result_avxf = GEN_INTRINSIC(compute_full_prob_avx, s)<float>(&tc[b]);
+                        result_avxf = GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_, SIMD_TYPE), s)<float>(&tc[b]);
 
                         #ifdef RUN_HYBRID
                                 #define MIN_ACCEPTED 1e-28f
                                 if (result_avxf < MIN_ACCEPTED) {
+				      //printf("**************** RUNNING DOUBLE ******************\n");
                                       count++;
-                                      result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&tc[b]);
+                                      result_avxd = GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_, SIMD_TYPE), d)<double>(&tc[b]);
                                       result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f));
                                 }
                                 else
@@ -95,14 +118,15 @@ int main()
 
                 //gettimeofday(&start, NULL);
 		lastClk = getCurrClk() ;
-                for (int b=0;b<read_count;b++)
-                        printf("%E\n", result[b]);
+		//for (int b=0;b<read_count;b++)
+		//printf("%E\n", result[b]);
                 //gettimeofday(&end, NULL);
                 aggregateTimeWrite += (getCurrClk() - lastClk) ;
 		//((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
 
         }
 
+	delete tc;
         printf("AVX Read Time: %.2f\n", aggregateTimeRead);
         printf("AVX Compute Time: %.2f\n", aggregateTimeCompute);
         printf("AVX Write Time: %.2f\n", aggregateTimeWrite);
diff --git a/PairHMM_JNI/run.sh b/PairHMM_JNI/run.sh
index 8b8d44b54..1e82092e8 100755
--- a/PairHMM_JNI/run.sh
+++ b/PairHMM_JNI/run.sh
@@ -1,16 +1,25 @@
 #!/bin/bash
-rm -f *.txt
-export GSA_ROOT_DIR=/home/karthikg/broad/gsa-unstable
+rm -f *.txt *.log
+GSA_ROOT_DIR=/home/karthikg/broad/gsa-unstable
 #-Djava.library.path is needed if you are using JNI_LOGLESS_CACHING, else not needed
-java  -Djava.library.path=${GSA_ROOT_DIR}/PairHMM_JNI -jar ${GSA_ROOT_DIR}/dist/GenomeAnalysisTK.jar  -T HaplotypeCaller \
--R /data/broad/samples/joint_variant_calling/broad_reference/Homo_sapiens_assembly19.fasta \
--I /data/simulated/sim1M_pairs_final.bam \
+java  -Djava.library.path=${GSA_ROOT_DIR}/PairHMM_JNI -jar $GSA_ROOT_DIR/dist/GenomeAnalysisTK.jar   -T HaplotypeCaller \
+-R /data/broad/samples/joint_variant_calling/broad_reference/human_g1k_v37_decoy.fasta \
+-I /data/broad/samples/joint_variant_calling/NA12878_low_coverage_alignment/NA12878.chrom11.ILLUMINA.bwa.CEU.low_coverage.20121211.bam \
 --dbsnp /data/broad/samples/joint_variant_calling/dbSNP/00-All.vcf \
 -stand_call_conf 50.0 \
 -stand_emit_conf 10.0 \
 --pair_hmm_implementation JNI_LOGLESS_CACHING \
 -o output.raw.snps.indels.vcf
 
+#-XL unmapped	\
 #--pair_hmm_implementation JNI_LOGLESS_CACHING \
 #-I /data/simulated/sim1M_pairs_final.bam \
 #-I /data/broad/samples/joint_variant_calling/NA12878_low_coverage_alignment/NA12878.chrom11.ILLUMINA.bwa.CEU.low_coverage.20121211.bam \
+#-I /data/broad/samples/joint_variant_calling/NA12878_high_coverage_alignment/NA12878.mapped.ILLUMINA.bwa.CEU.high_coverage_pcr_free.20130906.bam \
+#-R /data/broad/samples/joint_variant_calling/broad_reference/Homo_sapiens_assembly18.fasta \
+#-R /data/broad/samples/joint_variant_calling/broad_reference/Homo_sapiens_assembly19.fasta \
+#-R /data/broad/samples/joint_variant_calling/broad_reference/ucsc.hg19.fasta \
+#-R /opt/Genomics/ohsu/dnapipeline/humanrefgenome/human_g1k_v37.fasta \
+#-R /data/broad/samples/joint_variant_calling/broad_reference/human_g1k_v37_decoy.fasta \
+#--dbsnp /data/broad/samples/joint_variant_calling/dbSNP/00-All.vcf \
+#--dbsnp /data/broad/samples/joint_variant_calling/dbSNP/dbsnp_138.hg19.vcf \
diff --git a/PairHMM_JNI/shift_template.c b/PairHMM_JNI/shift_template.c
index 90160dcb3..db4a4f6f7 100644
--- a/PairHMM_JNI/shift_template.c
+++ b/PairHMM_JNI/shift_template.c
@@ -1,9 +1,8 @@
 #ifdef PRECISION 
 
-#include "template.h"
+#ifdef SIMD_TYPE_AVX
 
-
-inline void GEN_INTRINSIC(_vector_shift, PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut)
+inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift,SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut)
 {
 
         IF_128 xlow , xhigh;
@@ -33,7 +32,8 @@ inline void GEN_INTRINSIC(_vector_shift, PRECISION) (UNION_TYPE &x, MAIN_TYPE sh
         x.d = VEC_INSERT_VAL(x.d, xhigh.f, 1);
 }
 
-inline void GEN_INTRINSIC(_vector_shift_last, PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn)
+
+inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn)
 {
 
         IF_128 xlow , xhigh;
@@ -44,10 +44,6 @@ inline void GEN_INTRINSIC(_vector_shift_last, PRECISION) (UNION_TYPE &x, MAIN_TY
         /* extract xlow[3] */
         IF_128 shiftOutL128;
         shiftOutL128.i = _mm_srli_si128(xlow.i, SHIFT_CONST1);
-        /* extract xhigh[3] */
-//        IF_MAIN_TYPE shiftOutH;
-//        shiftOutH.i = VEC_EXTRACT_UNIT(xhigh.i, SHIFT_CONST2);
-//        shiftOut = shiftOutH.f;
         /* shift xlow */
         xlow.i = _mm_slli_si128 (xlow.i, SHIFT_CONST3);
         /* shift xhigh */
@@ -63,6 +59,32 @@ inline void GEN_INTRINSIC(_vector_shift_last, PRECISION) (UNION_TYPE &x, MAIN_TY
         x.d = VEC_INSERT_VAL(x.d, xhigh.f, 1);
 }
 
+#endif
+
+#ifdef SIMD_TYPE_SSE
+
+inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift, SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut)
+{
+	IF_MAIN_TYPE tempIn, tempOut; 
+	tempIn.f = shiftIn;
+        /* extratc H */
+        tempOut.i = VEC_EXTRACT_UNIT(x.i, SHIFT_CONST1);
+	shiftOut = tempOut.f;
+        /* shift     */
+        x.i = _mm_slli_si128(x.i, SHIFT_CONST2);
+        /* insert  L */
+        x.i = VEC_INSERT_UNIT(x.i , tempIn.i, SHIFT_CONST3);
+}
+
+inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last, SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn)
+{
+	IF_MAIN_TYPE temp; temp.f = shiftIn;
+        /* shift     */
+        x.i = _mm_slli_si128(x.i, SHIFT_CONST2);
+        /* insert  L */
+        x.i = VEC_INSERT_UNIT(x.i , temp.i, SHIFT_CONST3);
+}
 
 #endif
 
+#endif
diff --git a/PairHMM_JNI/sse_function_instantiations.cc b/PairHMM_JNI/sse_function_instantiations.cc
new file mode 100644
index 000000000..38686ada2
--- /dev/null
+++ b/PairHMM_JNI/sse_function_instantiations.cc
@@ -0,0 +1,18 @@
+#include "template.h"
+
+#undef SIMD_TYPE
+#undef SIMD_TYPE_AVX
+
+#define SIMD_TYPE sse
+#define SIMD_TYPE_SSE
+
+#include "define-sse-float.h"
+#include "shift_template.c"
+#include "pairhmm-template-kernel.cc"
+
+#include "define-sse-double.h"
+#include "shift_template.c"
+#include "pairhmm-template-kernel.cc"
+
+template double compute_full_prob_ssed<double>(testcase* tc, double* nextlog);
+template float compute_full_prob_sses<float>(testcase* tc, float* nextlog);
diff --git a/PairHMM_JNI/template.h b/PairHMM_JNI/template.h
index e70784073..ba6370de8 100644
--- a/PairHMM_JNI/template.h
+++ b/PairHMM_JNI/template.h
@@ -1,24 +1,14 @@
 #ifndef TEMPLATES_H_
 #define TEMPLATES_H_
 
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <math.h>
-#include <stdint.h>
-#include <assert.h>
-
-#include <sys/time.h>
-
-#include <immintrin.h>
-
-#include <ctype.h>
-#include <string>
-#include <iostream>
-#include <sstream>
-#include <fstream>
-#include <vector>
+#include "headers.h"
 
+#define MM 0
+#define GapM 1
+#define MX 2
+#define XX 3
+#define MY 4
+#define YY 5
 
 #define MROWS  500
 #define MCOLS  1000
@@ -35,12 +25,25 @@ typedef union __attribute__((aligned(32))) {
         ALIGNED __m256i ALIGNED i;
 } ALIGNED mix_F ALIGNED;
 
+typedef union __attribute__((aligned(32))) {
+        ALIGNED __m128 ALIGNED d;
+        ALIGNED __m64 ALIGNED s[2];
+        ALIGNED float  ALIGNED f[4];
+        ALIGNED __m128i ALIGNED i;
+} ALIGNED mix_F128 ALIGNED;
+
 typedef union ALIGNED {
   __m128i vec ;
   __m128 vecf ;
   uint32_t masks[4] ;
 } MaskVec_F ;
 
+typedef union ALIGNED {
+  __m64 vec ;
+  __m64 vecf ;
+  uint32_t masks[2] ;
+} MaskVec_F128 ;
+
 typedef union ALIGNED
 {
         ALIGNED __m128i ALIGNED i;
@@ -60,12 +63,25 @@ typedef union __attribute__((aligned(32))) {
         ALIGNED __m256i ALIGNED i;
 } ALIGNED mix_D ALIGNED;
 
+typedef union __attribute__((aligned(32))) {
+        ALIGNED __m128d ALIGNED d;
+        ALIGNED __m64 ALIGNED s[2];
+        ALIGNED double  ALIGNED f[2];
+        ALIGNED __m128i ALIGNED i;
+} ALIGNED mix_D128 ALIGNED;
+
 typedef union ALIGNED {
   __m128i vec ;
   __m128d vecf ;
   uint64_t masks[2] ;
 } MaskVec_D ;
 
+typedef union ALIGNED {
+  __m64 vec ;
+  __m64 vecf ;
+  uint64_t masks[1] ;
+} MaskVec_D128 ;
+
 typedef union ALIGNED
 {
         ALIGNED __m128i ALIGNED i;
@@ -134,30 +150,18 @@ typedef struct
 {
         int rslen, haplen;
 	/*int *q, *i, *d, *c;*/
-	int q[MROWS], i[MROWS], d[MROWS], c[MROWS];
+	/*int q[MROWS], i[MROWS], d[MROWS], c[MROWS];*/
+	char *q, *i, *d, *c;
         char *hap, *rs;
 	int *ihap;
 	int *irs;
 } testcase;
 
-
-template<class T>
-std::string to_string(T obj)
-{
-  std::stringstream ss;
-  std::string ret_string;
-  ss.clear();
-  ss << std::scientific << obj;
-  ss >> ret_string;
-  ss.clear();
-  return ret_string;
-}
-void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true);
-
 int normalize(char c);
-int read_testcase(testcase *tc, FILE* ifp);
-int read_mod_testcase(std::ifstream& fptr, testcase* tc, bool reformat=false);
+int read_testcase(testcase *tc, FILE* ifp=0);
 
+
+#define MIN_ACCEPTED 1e-28f
 #define NUM_DISTINCT_CHARS 5
 #define AMBIG_CHAR 4
 
@@ -182,7 +186,7 @@ public:
     return conversionTable[input] ;
   }
 
-} ;
+};
 
 
 #endif
diff --git a/PairHMM_JNI/convert_char.cc b/PairHMM_JNI/utils.cc
similarity index 62%
rename from PairHMM_JNI/convert_char.cc
rename to PairHMM_JNI/utils.cc
index 01c5a5137..da6509763 100644
--- a/PairHMM_JNI/convert_char.cc
+++ b/PairHMM_JNI/utils.cc
@@ -1,7 +1,62 @@
+#include "headers.h"
 #include "template.h"
+#include "utils.h"
+#include "vector_defs.h"
+
 uint8_t ConvertChar::conversionTable[255];
+float (*g_compute_full_prob_float)(testcase *tc, float* before_last_log) = 0;
+double (*g_compute_full_prob_double)(testcase *tc, double* before_last_log) = 0;
+
 using namespace std;
 
+bool is_avx_supported()
+{
+  int ecx = 0, edx = 0, ebx = 0;
+  __asm__("cpuid"
+      : "=b" (ebx),
+      "=c" (ecx),
+      "=d" (edx)
+      : "a" (1)
+      );
+  return ((ecx >> 28)&1) == 1;
+}
+
+bool is_sse42_supported()
+{
+  int ecx = 0, edx = 0, ebx = 0;
+  __asm__("cpuid"
+      : "=b" (ebx),
+      "=c" (ecx),
+      "=d" (edx)
+      : "a" (1)
+      );
+  return ((ecx >> 20)&1) == 1;
+}
+
+void initialize_function_pointers()
+{
+  //if(false)
+  if(is_avx_supported())
+  {
+    cout << "Using AVX accelerated implementation of PairHMM\n";
+    g_compute_full_prob_float = compute_full_prob_avxs<float>;
+    g_compute_full_prob_double = compute_full_prob_avxd<double>;
+  }
+  else
+    if(is_sse42_supported())
+    {
+      cout << "Using SSE4.2 accelerated implementation of PairHMM\n";
+      g_compute_full_prob_float = compute_full_prob_sses<float>;
+      g_compute_full_prob_double = compute_full_prob_ssed<double>;
+    }
+    else
+    {
+      cout << "Using un-vectorized C++ implementation of PairHMM\n";
+      g_compute_full_prob_float = compute_full_prob<float>;
+      g_compute_full_prob_double = compute_full_prob<double>;
+    }
+}
+
 int normalize(char c)
 {
 	return ((int) (c - 33));
@@ -35,10 +90,10 @@ int read_testcase(testcase *tc, FILE* ifp)
 	tc->ihap = (int *) malloc(tc->haplen*sizeof(int));
 	tc->irs = (int *) malloc(tc->rslen*sizeof(int));
 
-	//tc->q = (int *) malloc(sizeof(int) * tc->rslen);
-	//tc->i = (int *) malloc(sizeof(int) * tc->rslen);
-	//tc->d = (int *) malloc(sizeof(int) * tc->rslen);
-	//tc->c = (int *) malloc(sizeof(int) * tc->rslen);
+	tc->q = (char *) malloc(sizeof(char) * tc->rslen);
+	tc->i = (char *) malloc(sizeof(char) * tc->rslen);
+	tc->d = (char *) malloc(sizeof(char) * tc->rslen);
+	tc->c = (char *) malloc(sizeof(char) * tc->rslen);
 
 	for (x = 0; x < tc->rslen; x++)
 	{
@@ -144,18 +199,22 @@ int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat)
   memcpy(tc->hap, tokens[0].c_str(), tokens[0].size());
   tc->rs = new char[tokens[1].size()+2];
   tc->rslen = tokens[1].size();
+  tc->q = new char[tc->rslen];
+  tc->i = new char[tc->rslen];
+  tc->d = new char[tc->rslen];
+  tc->c = new char[tc->rslen];
   //cout << "Lengths "<<tc->haplen <<" "<<tc->rslen<<"\n";
   memcpy(tc->rs, tokens[1].c_str(),tokens[1].size());
   assert(tokens.size() == 2 + 4*(tc->rslen));
   assert(tc->rslen < MROWS);
   for(unsigned j=0;j<tc->rslen;++j)
-    tc->q[j] = convToInt(tokens[2+0*tc->rslen+j]);
+    tc->q[j] = (char)convToInt(tokens[2+0*tc->rslen+j]);
   for(unsigned j=0;j<tc->rslen;++j)
-    tc->i[j] = convToInt(tokens[2+1*tc->rslen+j]);
+    tc->i[j] = (char)convToInt(tokens[2+1*tc->rslen+j]);
   for(unsigned j=0;j<tc->rslen;++j)
-    tc->d[j] = convToInt(tokens[2+2*tc->rslen+j]);
+    tc->d[j] = (char)convToInt(tokens[2+2*tc->rslen+j]);
   for(unsigned j=0;j<tc->rslen;++j)
-    tc->c[j] = convToInt(tokens[2+3*tc->rslen+j]);
+    tc->c[j] = (char)convToInt(tokens[2+3*tc->rslen+j]);
  
   if(reformat)
   {
@@ -184,3 +243,20 @@ int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat)
 
   return tokens.size();
 }
+
+double getCurrClk() {
+  struct timeval tv ;
+  gettimeofday(&tv, NULL);
+  return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
+}
+
+uint64_t get_time(struct timespec* store_struct)
+{
+  static struct timespec start_time;
+  struct timespec curr_time;
+  struct timespec* ptr = (store_struct == 0) ? &curr_time : store_struct;
+  clock_gettime(CLOCK_REALTIME, ptr);
+  uint64_t diff_time = (ptr->tv_sec-start_time.tv_sec)*1000000000+(ptr->tv_nsec-start_time.tv_nsec);
+  start_time = *ptr;
+  return diff_time;
+}
diff --git a/PairHMM_JNI/utils.h b/PairHMM_JNI/utils.h
new file mode 100644
index 000000000..cc020ba40
--- /dev/null
+++ b/PairHMM_JNI/utils.h
@@ -0,0 +1,31 @@
+#ifndef PAIRHMM_UTIL_H
+#define PAIRHMM_UTIL_H
+
+#include "template.h"
+
+template<class T>
+std::string to_string(T obj)
+{
+  std::stringstream ss;
+  std::string ret_string;
+  ss.clear();
+  ss << std::scientific << obj;
+  ss >> ret_string;
+  ss.clear();
+  return ret_string;
+}
+void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true);
+
+int read_mod_testcase(std::ifstream& fptr, testcase* tc, bool reformat=false);
+
+bool is_avx_supported();
+bool is_sse42_supported();
+extern float (*g_compute_full_prob_float)(testcase *tc, float *before_last_log);
+extern double (*g_compute_full_prob_double)(testcase *tc, double* before_last_log);
+void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline);
+template<class NUMBER>
+NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log=0);
+void initialize_function_pointers();
+double getCurrClk();
+uint64_t get_time(struct timespec* x=0);
+#endif
diff --git a/PairHMM_JNI/vector_defs.h b/PairHMM_JNI/vector_defs.h
new file mode 100644
index 000000000..7958480f2
--- /dev/null
+++ b/PairHMM_JNI/vector_defs.h
@@ -0,0 +1,30 @@
+#undef SIMD_TYPE
+#undef SIMD_TYPE_AVX
+#undef SIMD_TYPE_SSE
+
+#define SIMD_TYPE avx
+#define SIMD_TYPE_AVX
+
+#include "define-float.h"
+#include "vector_function_prototypes.h"
+
+#include "define-double.h"
+#include "vector_function_prototypes.h"
+
+#undef  SIMD_TYPE
+#undef  SIMD_TYPE_AVX
+
+#define SIMD_TYPE sse
+#define SIMD_TYPE_SSE
+
+
+#include "define-sse-float.h"
+#include "vector_function_prototypes.h"
+
+#include "define-sse-double.h"
+#include "vector_function_prototypes.h"
+
+#undef SIMD_TYPE
+#undef SIMD_TYPE_AVX
+#undef SIMD_TYPE_SSE
+
diff --git a/PairHMM_JNI/vector_function_prototypes.h b/PairHMM_JNI/vector_function_prototypes.h
new file mode 100644
index 000000000..ce9cc2abc
--- /dev/null
+++ b/PairHMM_JNI/vector_function_prototypes.h
@@ -0,0 +1,19 @@
+inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift,SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut);
+inline void GEN_INTRINSIC(GEN_INTRINSIC(_vector_shift_last,SIMD_TYPE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn);
+inline void GEN_INTRINSIC(GEN_INTRINSIC(precompute_masks_,SIMD_TYPE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]);
+inline void GEN_INTRINSIC(GEN_INTRINSIC(init_masks_for_row_,SIMD_TYPE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess);
+inline void GEN_INTRINSIC(GEN_INTRINSIC(update_masks_for_cols_,SIMD_TYPE), PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt);
+inline void GEN_INTRINSIC(GEN_INTRINSIC(computeDistVec,SIMD_TYPE), PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen);
+template<class NUMBER> inline void GEN_INTRINSIC(GEN_INTRINSIC(initializeVectors,SIMD_TYPE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context<NUMBER> ctx, testcase *tc,  _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D);
+template<class NUMBER> inline void GEN_INTRINSIC(GEN_INTRINSIC(stripINITIALIZATION,SIMD_TYPE), PRECISION)(
+    int stripIdx, Context<NUMBER> ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY,
+    _256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm,  _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM , 
+    _256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, 
+    UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM);
+inline _256_TYPE GEN_INTRINSIC(GEN_INTRINSIC(computeDISTM,SIMD_TYPE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256, 
+    _256_TYPE distm, _256_TYPE _1_distm);
+inline void GEN_INTRINSIC(GEN_INTRINSIC(computeMXY,SIMD_TYPE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, 
+    UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1, 
+    _256_TYPE pMM, _256_TYPE pGAPM, _256_TYPE pMX, _256_TYPE pXX, _256_TYPE pMY, _256_TYPE pYY, _256_TYPE distmSel);
+template<class NUMBER> NUMBER GEN_INTRINSIC(GEN_INTRINSIC(compute_full_prob_,SIMD_TYPE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL);
+
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
index 82015d153..b785b8d21 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@@ -1086,7 +1086,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
         referenceConfidenceModel.close();
         //TODO remove the need to call close here for debugging, the likelihood output stream should be managed
         //TODO (open & close) at the walker, not the engine.
-        //likelihoodCalculationEngine.close();
+	likelihoodCalculationEngine.close();
         logger.info("Ran local assembly on " + result + " active regions");
     }
 
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
index 0626f2268..04e64186f 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
@@ -89,4 +89,6 @@ public interface LikelihoodCalculationEngine {
      */
     public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods(AssemblyResultSet assemblyResultSet,
                                Map<String, List<GATKSAMRecord>> perSampleReadList);
+
+    public void close();
 }
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java
index 0a82e1997..b91805a62 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java
@@ -165,8 +165,10 @@ public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculation
         }
     }
 
+    @Override
     public void close() {
         if ( likelihoodsStream != null ) likelihoodsStream.close();
+	pairHMMThreadLocal.get().close();
     }
 
     private void writeDebugLikelihoods(final GATKSAMRecord processedRead, final Haplotype haplotype, final double log10l){
@@ -330,7 +332,12 @@ public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculation
         }
 
         // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
-        pairHMMThreadLocal.get().initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
+        pairHMMThreadLocal.get().initialize(haplotypes, perSampleReadList, X_METRIC_LENGTH, Y_METRIC_LENGTH);
+    }
+
+    private void finalizePairHMM()
+    {
+	pairHMMThreadLocal.get().finalizeRegion();
     }
 
 
@@ -350,6 +357,9 @@ public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculation
             map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE);
             stratifiedReadMap.put(sampleEntry.getKey(), map);
         }
+	
+	//Used mostly by the JNI implementation(s) to free arrays
+	finalizePairHMM();
 
         return stratifiedReadMap;
     }
diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java
index 58b8fc591..6cbd3510c 100644
--- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java
+++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java
@@ -57,6 +57,7 @@ import org.broadinstitute.variant.variantcontext.Allele;
 
 import java.util.List;
 import java.util.Map;
+import java.util.HashMap;
 
 
 /**
@@ -67,12 +68,13 @@ import java.util.Map;
 public class JNILoglessPairHMM extends LoglessPairHMM {
 
     private static final boolean debug = false;	 //simulates ifdef
+    private static final boolean verify = debug || false;	 //simulates ifdef
     private static final boolean debug0_1 = false;	 //simulates ifdef
     private static final boolean debug1 = false; //simulates ifdef
     private static final boolean debug2 = false;
     private static final boolean debug3 = false;
     private int numComputeLikelihoodCalls = 0;
-
+    
     //Used to copy references to byteArrays to JNI from reads
     protected class JNIReadDataHolderClass
     {
@@ -90,6 +92,7 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
     }
     
     private native void jniGlobalInit(Class<?> readDataHolderClass, Class<?> haplotypeDataHolderClass);
+    private native void jniClose();
     
     private static boolean isJNILoglessPairHMMLibraryLoaded = false;
     
@@ -104,6 +107,13 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
 	}
     }
 
+    @Override
+    public void close()
+    {
+	jniClose();
+	debugClose();
+    }
+
     //Used to test parts of the compute kernel separately    
     private native void jniInitialize(final int readMaxLength, final int haplotypeMaxLength);
     private native static void jniInitializeProbabilities(final double[][] transition, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP);
@@ -126,7 +136,7 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
     @Override
     public void initialize(final int readMaxLength, final int haplotypeMaxLength)
     {
-      if(debug)
+      if(verify)
 	super.initialize(readMaxLength, haplotypeMaxLength);
       if(debug3)
       {
@@ -138,6 +148,42 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
 	  jniInitialize(readMaxLength, haplotypeMaxLength);
     }
 
+    private HashMap<Haplotype,Integer> haplotypeToHaplotypeListIdxMap = new HashMap<Haplotype,Integer>();
+    private native void jniInitializeHaplotypes(final int numHaplotypes,  JNIHaplotypeDataHolderClass[] haplotypeDataArray);
+    //Used to transfer data to JNI
+    //Since the haplotypes are the same for all calls to computeLikelihoods within a region, transfer the haplotypes only once to the JNI per region
+     /**
+     * {@inheritDoc}
+     */
+    @Override
+    public void initialize( final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList, final int readMaxLength, final int haplotypeMaxLength ) {
+	if(verify)
+	    super.initialize(haplotypes, perSampleReadList, readMaxLength, haplotypeMaxLength);
+	int numHaplotypes = haplotypes.size();
+	JNIHaplotypeDataHolderClass[] haplotypeDataArray = new JNIHaplotypeDataHolderClass[numHaplotypes];
+	int idx = 0;
+	for(final Haplotype currHaplotype : haplotypes)
+	{
+	    haplotypeDataArray[idx] = new JNIHaplotypeDataHolderClass();
+	    haplotypeDataArray[idx].haplotypeBases = currHaplotype.getBases();
+	    haplotypeToHaplotypeListIdxMap.put(currHaplotype, idx);
+	    ++idx;
+	}
+	jniInitializeHaplotypes(numHaplotypes, haplotypeDataArray);
+    }
+
+    private native void jniFinalizeRegion();
+    //Tell JNI to release arrays - really important if native code is directly accessing Java memory, if not
+    //accessing Java memory directly, still important to release memory from C++
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public void finalizeRegion()
+    {
+	jniFinalizeRegion();
+    }
+
     //Real compute kernel
     private native void jniComputeLikelihoods(int numReads, int numHaplotypes,
 	    JNIReadDataHolderClass[] readDataArray, JNIHaplotypeDataHolderClass[] haplotypeDataArray,
@@ -149,19 +195,20 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
     public PerReadAlleleLikelihoodMap computeLikelihoods(final List<GATKSAMRecord> reads, final Map<Allele, Haplotype> alleleHaplotypeMap, final Map<GATKSAMRecord, byte[]> GCPArrayMap)
     {
 	// (re)initialize the pairHMM only if necessary
-	final int readMaxLength = debug ? findMaxReadLength(reads) : 0;
-	final int haplotypeMaxLength = debug ? findMaxHaplotypeLength(alleleHaplotypeMap) : 0;
-	if(debug)
+	final int readMaxLength = verify ? findMaxReadLength(reads) : 0;
+	final int haplotypeMaxLength = verify ? findMaxHaplotypeLength(alleleHaplotypeMap) : 0;
+	if(verify)
 	{
 	    if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength)
 	    { initialize(readMaxLength, haplotypeMaxLength); }
 	    if ( ! initialized )
-		throw new IllegalStateException("Must call initialize before calling jniComputeLikelihoods in debug mode");
+		throw new IllegalStateException("Must call initialize before calling jniComputeLikelihoods in debug/verify mode");
 	}
 	int readListSize = reads.size();
-	int alleleHaplotypeMapSize = alleleHaplotypeMap.size();
+	int numHaplotypes = alleleHaplotypeMap.size();
+	int numTestcases = readListSize*numHaplotypes;
 	if(debug0_1)
-	    System.out.println("Java numReads "+readListSize+" numHaplotypes "+alleleHaplotypeMapSize);
+	    System.out.println("Java numReads "+readListSize+" numHaplotypes "+numHaplotypes);
 	JNIReadDataHolderClass[] readDataArray = new JNIReadDataHolderClass[readListSize];
 	int idx = 0;
 	for(GATKSAMRecord read : reads)
@@ -188,39 +235,68 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
 	    }
 	    ++idx;
 	}
-	JNIHaplotypeDataHolderClass[] haplotypeDataArray = new JNIHaplotypeDataHolderClass[alleleHaplotypeMapSize];
-	idx = 0;
-	for (Map.Entry<Allele, Haplotype>  currEntry : alleleHaplotypeMap.entrySet())	//order is important - access in same order always
+
+	JNIHaplotypeDataHolderClass[] haplotypeDataArray = new JNIHaplotypeDataHolderClass[numHaplotypes];
+	if(verify)
 	{
-	    haplotypeDataArray[idx] = new JNIHaplotypeDataHolderClass();
-	    haplotypeDataArray[idx].haplotypeBases = currEntry.getValue().getBases();
-	    if(debug0_1)
-		System.out.println("Java haplotype length "+haplotypeDataArray[idx].haplotypeBases.length);
-	    if(debug3)
+	    idx = 0;
+	    for (Map.Entry<Allele, Haplotype>  currEntry : alleleHaplotypeMap.entrySet())	//order is important - access in same order always
 	    {
-		for(int i=0;i<haplotypeDataArray[idx].haplotypeBases.length;++i)
-		    debugDump("haplotype_bases_java.txt",String.format("%d\n",(int)haplotypeDataArray[idx].haplotypeBases[i]),true);
+		haplotypeDataArray[idx] = new JNIHaplotypeDataHolderClass();
+		haplotypeDataArray[idx].haplotypeBases = currEntry.getValue().getBases();
+		if(debug0_1)
+		    System.out.println("Java haplotype length "+haplotypeDataArray[idx].haplotypeBases.length);
+		if(debug3)
+		{
+		    for(int i=0;i<haplotypeDataArray[idx].haplotypeBases.length;++i)
+			debugDump("haplotype_bases_java.txt",String.format("%d\n",(int)haplotypeDataArray[idx].haplotypeBases[i]),true);
+		}
+		++idx;
 	    }
-	    ++idx;
 	}
-	double[] likelihoodArray = new double[readListSize*alleleHaplotypeMapSize];	//to store results
+	double[] likelihoodArray = new double[readListSize*numHaplotypes];	//to store results
 	//for(reads)
 	//   for(haplotypes)
 	//       compute_full_prob()
-	jniComputeLikelihoods(readListSize, alleleHaplotypeMapSize, readDataArray, haplotypeDataArray, likelihoodArray, 12);
+	jniComputeLikelihoods(readListSize, numHaplotypes, readDataArray, haplotypeDataArray, likelihoodArray, 12);
 
 
 	//final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
 	PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
 	idx = 0;
+	int idxInsideHaplotypeList = 0;
+	int readIdx = 0;
 	for(GATKSAMRecord read : reads)
+	{
 	    for (Map.Entry<Allele, Haplotype>  currEntry : alleleHaplotypeMap.entrySet())//order is important - access in same order always
 	    {
-		likelihoodMap.add(read, currEntry.getKey(), likelihoodArray[idx]);
+		//Since the order of haplotypes in the List<Haplotype> and alleleHaplotypeMap is different,
+		//get idx of current haplotype in the list and use this idx to get the right likelihoodValue
+		idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(currEntry.getValue());
+		likelihoodMap.add(read, currEntry.getKey(), likelihoodArray[readIdx + idxInsideHaplotypeList]);
 		++idx;
 	    }
-	if(debug)
+	    readIdx += numHaplotypes;
+	}
+	if(verify)
 	{
+	    //re-order values in likelihoodArray
+	    double[] tmpArray = new double[numHaplotypes];
+	    idx = 0;
+	    idxInsideHaplotypeList = 0;
+	    readIdx = 0;
+	    for(GATKSAMRecord read : reads)
+	    {
+		for(int j=0;j<numHaplotypes;++j)
+		    tmpArray[j] = likelihoodArray[readIdx+j];
+		for (Map.Entry<Allele, Haplotype>  currEntry : alleleHaplotypeMap.entrySet())//order is important - access in same order always
+		{
+		    idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(currEntry.getValue());
+		    likelihoodArray[idx] = tmpArray[idxInsideHaplotypeList];
+		    ++idx;
+		}
+		readIdx += numHaplotypes;
+	    }
 	    //to compare values
 	    likelihoodMap = super.computeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap);
 	    //for floating point values, no exact equality
@@ -246,23 +322,23 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
 	    if(toDump)
 	    {
 		idx = 0;
-		System.out.println("Dump : Java numReads "+readListSize+" numHaplotypes "+alleleHaplotypeMapSize);
+		System.out.println("Dump : Java numReads "+readListSize+" numHaplotypes "+numHaplotypes);
 		for(int i=0;i<readDataArray.length;++i)
 		{
 		    for(int j=0;j<haplotypeDataArray.length;++j)
 		    {
-			debugDump("debug_dump.log",new String(haplotypeDataArray[j].haplotypeBases)+" ",true);
-			debugDump("debug_dump.log",new String(readDataArray[i].readBases)+" ",true);
+			debugDump("debug_dump.txt",new String(haplotypeDataArray[j].haplotypeBases)+" ",true);
+			debugDump("debug_dump.txt",new String(readDataArray[i].readBases)+" ",true);
 			for(int k=0;k<readDataArray[i].readBases.length;++k)
-			    debugDump("debug_dump.log",String.format("%d",(int)(readDataArray[i].readQuals[k]))+" ",true);
+			    debugDump("debug_dump.txt",String.format("%d",(int)(readDataArray[i].readQuals[k]))+" ",true);
 			for(int k=0;k<readDataArray[i].readBases.length;++k)
-			    debugDump("debug_dump.log",String.format("%d",(int)(readDataArray[i].insertionGOP[k]))+" ",true);
+			    debugDump("debug_dump.txt",String.format("%d",(int)(readDataArray[i].insertionGOP[k]))+" ",true);
 			for(int k=0;k<readDataArray[i].readBases.length;++k)
-			    debugDump("debug_dump.log",String.format("%d",(int)(readDataArray[i].deletionGOP[k]))+" ",true);
+			    debugDump("debug_dump.txt",String.format("%d",(int)(readDataArray[i].deletionGOP[k]))+" ",true);
 			for(int k=0;k<readDataArray[i].readBases.length;++k)
-			    debugDump("debug_dump.log",String.format("%d",(int)(readDataArray[i].overallGCP[k]))+" ",true);
-			debugDump("debug_dump.log","\n",true);
-			debugDump("debug_results.log",String.format("%e %e\n",mLikelihoodArray[idx],likelihoodArray[idx]),true);
+			    debugDump("debug_dump.txt",String.format("%d",(int)(readDataArray[i].overallGCP[k]))+" ",true);
+			debugDump("debug_dump.txt","\n",true);
+			debugDump("debug_results.txt",String.format("%e %e\n",mLikelihoodArray[idx],likelihoodArray[idx]),true);
 			++idx;
 		    }
 		}
@@ -271,10 +347,12 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
 	}
 	++numComputeLikelihoodCalls;
 	//if(numComputeLikelihoodCalls == 5)
+	//jniClose();
 	//System.exit(0);
 	return likelihoodMap;
     }
 
+
     /**
      * {@inheritDoc}
      */
@@ -296,11 +374,11 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
 	//}
 	//System.out.println("#### END STACK TRACE ####");
 	//
-      if(debug1)
-	jniSubComputeReadLikelihoodGivenHaplotypeLog10(readBases.length, haplotypeBases.length,
-	    readBases, haplotypeBases, readQuals,
-	    insertionGOP, deletionGOP, overallGCP,
-	    hapStartIndex);
+	if(debug1)
+	    jniSubComputeReadLikelihoodGivenHaplotypeLog10(readBases.length, haplotypeBases.length,
+		    readBases, haplotypeBases, readQuals,
+		    insertionGOP, deletionGOP, overallGCP,
+		    hapStartIndex);
 
 	boolean doInitialization = (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length);
 	if (doInitialization) {
@@ -355,8 +433,8 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
 	++numCalls;
 	//if(numCalls == 100)
 	//{
-	    //debugClose();
-	    //System.exit(0);
+	//debugClose();
+	//System.exit(0);
 	//}
 	return Math.log10(finalSumProbabilities) - INITIAL_CONDITION_LOG10;
     }
diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java
index 3b4498776..3a211ba2a 100644
--- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java
+++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java
@@ -100,6 +100,26 @@ public abstract class PairHMM {
         initialized = true;
     }
 
+    /**
+     * Called at the end of PairHMM for a region - mostly used by the JNI implementations
+     */
+    public void finalizeRegion()
+    {
+	;
+    }
+
+    /**
+     * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths
+     * This function is used by the JNI implementations to transfer all data once to the native code
+     * @param haplotypes the list of haplotypes
+     * @param perSampleReadList map from sample name to list of reads
+     * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM
+     * @param readMaxLength the max length of reads we want to use with this PairHMM
+     */
+    public void initialize( final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList, final int readMaxLength, final int haplotypeMaxLength ) {
+	initialize(readMaxLength, haplotypeMaxLength); 
+    }
+
     protected int findMaxReadLength(final List<GATKSAMRecord> reads) {
         int listMaxReadLength = 0;
         for(GATKSAMRecord read : reads){
@@ -280,4 +300,7 @@ public abstract class PairHMM {
 
         return Math.min(haplotype1.length, haplotype2.length);
     }
+
+    //Called at the end of all HC calls
+    public void close() { ; }
 }