Added support for dynamic selection between AVX and un-vectorized C++,

still to include SSE code from Mohammad. Debug flags turned on in this commit.
2014-01-18 11:07:23 -08:00 · 2014-01-18 11:07:23 -08:00 · 25aecb96e0
parent f1c772ceea
commit 25aecb96e0
16 changed files with 399 additions and 212 deletions
--- a/.gitignore
+++ b/.gitignore
@ -24,3 +24,16 @@ dump/
 lib/
 out/
 /atlassian-ide-plugin.xml
+kg_tmp/
+maven-ant-tasks-2.1.3.jar
+null-sequenceGraph.25.0.0.raw_readthreading_graph.dot
+null-sequenceGraph.25.0.1.cleaned_readthreading_graph.dot
+null-sequenceGraph.25.0.1.initial_seqgraph.dot
+null-sequenceGraph.3.0.0.raw_readthreading_graph.dot
+null-sequenceGraph.3.0.1.cleaned_readthreading_graph.dot
+null-sequenceGraph.3.0.1.initial_seqgraph.dot
+org/
+package-list
+resources/
+velocity.log
+
--- a/PairHMM_JNI/Makefile
+++ b/PairHMM_JNI/Makefile
@ -7,40 +7,59 @@ OMPCFLAGS=-fopenmp
 JAVA_ROOT=/opt/jdk1.7.0_25/
 JNI_COMPILATION_FLAGS=-D_REENTRANT -fPIC -I${JAVA_ROOT}/include -I${JAVA_ROOT}/include/linux

-CFLAGS=-O3 -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas -xAVX
-
-CXXFLAGS=$(CFLAGS)
+COMMON_COMPILATION_FLAGS=$(JNI_COMPILATION_FLAGS) -O3 -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas
 CC=icc
 CXX=icc

 LDFLAGS=-lm $(OMPLDFLAGS)

-#BIN:=pairhmm-1-base #pairhmm-2-omp pairhmm-3-hybrid-float-double pairhmm-4-hybrid-diagonal pairhmm-5-hybrid-diagonal-homogeneus pairhmm-6-onlythreediags pairhmm-7-presse pairhmm-8-sse #pairhmm-dev
-BIN:=libJNILoglessPairHMM.so pairhmm-template-main checker
+BIN=libJNILoglessPairHMM.so pairhmm-template-main checker
+#BIN=checker

-#SOURCES=pairhmm-1-base.cc input.cc 
-LIBSOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc convert_char.cc
-SOURCES=$(LIBSOURCES) pairhmm-template-main.cc pairhmm-1-base.cc
-LIBOBJECTS=$(LIBSOURCES:.cc=.o)
 DEPDIR=.deps
 DF=$(DEPDIR)/$(*).d

+#Common across libJNI and sandbox
+COMMON_SOURCES=utils.cc avx_function_instantiations.cc baseline.cc
+#Part of libJNI
+LIBSOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc $(COMMON_SOURCES)
+SOURCES=$(LIBSOURCES) pairhmm-template-main.cc pairhmm-1-base.cc
+LIBOBJECTS=$(LIBSOURCES:.cc=.o)
+COMMON_OBJECTS=$(COMMON_SOURCES:.cc=.o)
+
+
+#No vectorization for these files
+NO_VECTOR_SOURCES=org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc pairhmm-template-main.cc pairhmm-1-base.cc utils.cc baseline.cc
+#Use -xAVX for these files
+AVX_SOURCES=avx_function_instantiations.cc
+#Use -xSSE4.2 for these files
+SSE_SOURCES=
+
+NO_VECTOR_OBJECTS=$(NO_VECTOR_SOURCES:.cc=.o)
+AVX_OBJECTS=$(AVX_SOURCES:.cc=.o)
+SSE_OBJECTS=$(SSE_SOURCES:.cc=.o)
+$(NO_VECTOR_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS)
+$(AVX_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) -xAVX
+$(SSE_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) -xSSE4.2
+OBJECTS=$(NO_VECTOR_OBJECTS) $(AVX_OBJECTS) $(SSE_OBJECTS)
+
 all: $(BIN)

 -include $(addprefix $(DEPDIR)/,$(SOURCES:.cc=.d))

-checker: pairhmm-1-base.o convert_char.o
+checker: pairhmm-1-base.o $(COMMON_OBJECTS)
 	$(CXX) $(OMPCFLAGS) -o $@ $^ $(LDFLAGS)

-pairhmm-template-main:	pairhmm-template-main.o convert_char.o
+pairhmm-template-main:	pairhmm-template-main.o $(COMMON_OBJECTS)
 	$(CXX) $(OMPCFLAGS) -o $@ $^ $(LDFLAGS)

 libJNILoglessPairHMM.so: $(LIBOBJECTS) 
 	$(CXX) $(OMPCFLAGS) -shared -o $@ $(LIBOBJECTS) 

-%.o: %.cc
+
+$(OBJECTS): %.o: %.cc
 	@mkdir -p $(DEPDIR)
-	$(COMPILE.cpp) -MMD -MF $(DF) $(JNI_COMPILATION_FLAGS) $(CXXFLAGS) $(OUTPUT_OPTION) $<
+	$(CXX) -c -MMD -MF $(DF) $(CXXFLAGS) $(OUTPUT_OPTION) $<


 clean:
--- a/PairHMM_JNI/avx_function_instantiations.cc
+++ b/PairHMM_JNI/avx_function_instantiations.cc
@ -0,0 +1,12 @@
+#include "template.h"
+
+#include "define-float.h"
+#include "shift_template.c"
+#include "pairhmm-template-kernel.cc"
+
+#include "define-double.h"
+#include "shift_template.c"
+#include "pairhmm-template-kernel.cc"
+
+template double compute_full_prob_avxd<double>(testcase* tc, double* nextlog);
+template float compute_full_prob_avxs<float>(testcase* tc, float* nextlog);
--- a/PairHMM_JNI/avx_function_prototypes.h
+++ b/PairHMM_JNI/avx_function_prototypes.h
@ -0,0 +1,19 @@
+void GEN_INTRINSIC(_vector_shift, PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut);
+void GEN_INTRINSIC(_vector_shift_last, PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn);
+void GEN_INTRINSIC(precompute_masks_, PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]);
+void GEN_INTRINSIC(init_masks_for_row_, PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess);
+void GEN_INTRINSIC(update_masks_for_cols_, PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt);
+void GEN_INTRINSIC(computeDistVec, PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, _256_TYPE& distm, _256_TYPE& _1_distm, _256_TYPE& distmChosen);
+template<class NUMBER> void GEN_INTRINSIC(initializeVectors, PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context<NUMBER> ctx, testcase *tc,  _256_TYPE *p_MM, _256_TYPE *p_GAPM, _256_TYPE *p_MX, _256_TYPE *p_XX, _256_TYPE *p_MY, _256_TYPE *p_YY, _256_TYPE *distm1D);
+template<class NUMBER> void GEN_INTRINSIC(stripINITIALIZATION, PRECISION)(
+    int stripIdx, Context<NUMBER> ctx, testcase *tc, _256_TYPE &pGAPM, _256_TYPE &pMM, _256_TYPE &pMX, _256_TYPE &pXX, _256_TYPE &pMY, _256_TYPE &pYY,
+    _256_TYPE &rs, UNION_TYPE &rsN, _256_TYPE &distm, _256_TYPE &_1_distm,  _256_TYPE *distm1D, _256_TYPE N_packed256, _256_TYPE *p_MM , _256_TYPE *p_GAPM , 
+    _256_TYPE *p_MX, _256_TYPE *p_XX , _256_TYPE *p_MY, _256_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, 
+    UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM);
+_256_TYPE GEN_INTRINSIC(computeDISTM, PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, _256_TYPE rs, UNION_TYPE rsN, _256_TYPE N_packed256, 
+    _256_TYPE distm, _256_TYPE _1_distm);
+void GEN_INTRINSIC(computeMXY, PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, 
+    UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1, 
+    _256_TYPE pMM, _256_TYPE pGAPM, _256_TYPE pMX, _256_TYPE pXX, _256_TYPE pMY, _256_TYPE pYY, _256_TYPE distmSel);
+template<class NUMBER> NUMBER GEN_INTRINSIC(compute_full_prob_avx, PRECISION) (testcase *tc, NUMBER *before_last_log = NULL);
+
--- a/PairHMM_JNI/baseline.cc
+++ b/PairHMM_JNI/baseline.cc
@ -0,0 +1,85 @@
+#include "headers.h"
+#include "template.h"
+
+template<class NUMBER>
+NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL)
+{
+	int r, c;
+	int ROWS = tc->rslen + 1;
+	int COLS = tc->haplen + 1;
+
+	Context<NUMBER> ctx;
+
+	NUMBER M[MROWS][MCOLS];
+	NUMBER X[MROWS][MCOLS];
+	NUMBER Y[MROWS][MCOLS];
+	NUMBER p[MROWS][6];
+
+	p[0][MM] = ctx._(0.0);
+	p[0][GapM] = ctx._(0.0);
+	p[0][MX] = ctx._(0.0);
+	p[0][XX] = ctx._(0.0);
+	p[0][MY] = ctx._(0.0);
+	p[0][YY] = ctx._(0.0);
+	for (r = 1; r < ROWS; r++)
+	{
+		int _i = tc->i[r-1] & 127;
+		int _d = tc->d[r-1] & 127;
+		int _c = tc->c[r-1] & 127;
+		p[r][MM] = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127];
+		p[r][GapM] = ctx._(1.0) - ctx.ph2pr[_c];
+		p[r][MX] = ctx.ph2pr[_i];
+		p[r][XX] = ctx.ph2pr[_c];
+		p[r][MY] = ctx.ph2pr[_d];
+		p[r][YY] = ctx.ph2pr[_c];
+		//p[r][MY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
+		//p[r][YY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
+	}
+
+	for (c = 0; c < COLS; c++)
+	{
+		M[0][c] = ctx._(0.0);
+		X[0][c] = ctx._(0.0);
+		Y[0][c] = ctx.INITIAL_CONSTANT / (tc->haplen);
+	}
+
+	for (r = 1; r < ROWS; r++)
+	{
+		M[r][0] = ctx._(0.0);
+		X[r][0] = X[r-1][0] * p[r][XX];
+		Y[r][0] = ctx._(0.0);
+	}
+
+	 NUMBER result = ctx._(0.0);
+
+	for (r = 1; r < ROWS; r++)
+		for (c = 1; c < COLS; c++)
+		{
+			char _rs = tc->rs[r-1];
+			char _hap = tc->hap[c-1];
+			int _q = tc->q[r-1] & 127;
+			NUMBER distm = ctx.ph2pr[_q];
+			if (_rs == _hap || _rs == 'N' || _hap == 'N')
+				distm = ctx._(1.0) - distm;
+			else
+			  distm = distm/3;
+			M[r][c] = distm * (M[r-1][c-1] * p[r][MM] + X[r-1][c-1] * p[r][GapM] + Y[r-1][c-1] * p[r][GapM]);
+			X[r][c] = M[r-1][c] * p[r][MX] + X[r-1][c] * p[r][XX];
+			Y[r][c] = M[r][c-1] * p[r][MY] + Y[r][c-1] * p[r][YY];
+		}
+
+	for (c = 0; c < COLS; c++)
+	{
+		result += M[ROWS-1][c] + X[ROWS-1][c];
+	}
+
+	if (before_last_log != NULL)
+		*before_last_log = result;
+
+	return result;
+	//return ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT;
+}
+
+template double compute_full_prob<double>(testcase* tc, double* nextbuf);
+template float compute_full_prob<float>(testcase* tc, float* nextbuf);
+
--- a/PairHMM_JNI/headers.h
+++ b/PairHMM_JNI/headers.h
@ -0,0 +1,24 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+#include <assert.h>
+#include <ctype.h>
+
+#include <sys/time.h>
+
+#include <immintrin.h>
+#include <emmintrin.h>
+#include <omp.h>
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include <vector>
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include <cmath>
+
--- a/PairHMM_JNI/libJNILoglessPairHMM.so
+++ b/PairHMM_JNI/libJNILoglessPairHMM.so
--- a/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
+++ b/PairHMM_JNI/org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.cc
@ -1,44 +1,25 @@
+#include "headers.h"
 #include <jni.h>
-#include <assert.h>
-#include <stdio.h>
 #include "org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM.h"
-#include <vector>

-#include <cstdio>
-#include <string>
-#include <cstring>
-#include <cstdlib>
-#include <cmath>
-#include <iostream>
-#include <fstream>
-#include <sstream>
-
-
-#include <immintrin.h>
-#include <emmintrin.h>
-#include <omp.h>
-using namespace std;

 #define ENABLE_ASSERTIONS 1
-//#define DEBUG 1
+#define DO_PROFILING 1
+#define DEBUG 1
 //#define DEBUG0_1 1
 //#define DEBUG3 1

-
 #include "template.h"
+#include "utils.h"
+
+
+#include "define-float.h"
+#include "avx_function_prototypes.h"

 #include "define-double.h"
-#include "shift_template.c"
-#include "pairhmm-template-kernel.cc"
+#include "avx_function_prototypes.h"

-
-
-#define MM 0
-#define GapM 1
-#define MX 2
-#define XX 3
-#define MY 4
-#define YY 5
+using namespace std;

 class LoadTimeInitializer
 {
@ -46,6 +27,54 @@ class LoadTimeInitializer
    LoadTimeInitializer()		//will be called when library is loaded
    {
      ConvertChar::init();
+      m_sumNumReads = 0;
+      m_sumSquareNumReads = 0;
+      m_sumNumHaplotypes = 0;
+      m_sumSquareNumHaplotypes = 0;
+      m_sumNumTestcases = 0;
+      m_sumSquareNumTestcases = 0;
+      m_num_invocations = 0;
+
+      if(is_avx_supported())
+      {
+	cout << "Using AVX accelerated implementation of PairHMM\n";
+	g_compute_full_prob_float = compute_full_prob_avxs<float>;
+	g_compute_full_prob_double = compute_full_prob_avxd<double>;
+      }
+      else
+      {
+	cout << "Using un-vectorized C++ implementation of PairHMM\n";
+	g_compute_full_prob_float = compute_full_prob<float>;
+	g_compute_full_prob_double = compute_full_prob<double>;
+      }
+      cout.flush();
+      //if(is_sse42_supported())
+      //{
+      //g_compute_full_prob_float = compute_full_prob_avxs<float>;
+      //g_compute_full_prob_double = compute_full_prob_avxd<double>;
+      //}
+    }
+    void print_profiling()
+    {
+      double mean_val;
+      cout <<"Invocations : "<<m_num_invocations<<"\n";
+      cout << "term\tsum\tsumSq\tmean\tvar\n";
+      mean_val = m_sumNumReads/m_num_invocations;
+      cout << "reads\t"<<m_sumNumReads<<"\t"<<m_sumSquareNumReads<<"\t"<<mean_val<<"\t"<<
+	(m_sumSquareNumReads/m_num_invocations)-mean_val*mean_val<<"\n";
+      mean_val = m_sumNumHaplotypes/m_num_invocations;
+      cout << "haplotypes\t"<<m_sumNumHaplotypes<<"\t"<<m_sumSquareNumHaplotypes<<"\t"<<mean_val<<"\t"<<
+	(m_sumSquareNumHaplotypes/m_num_invocations)-mean_val*mean_val<<"\n";
+      mean_val = m_sumNumTestcases/m_num_invocations;
+      cout << "numtestcases\t"<<m_sumNumTestcases<<"\t"<<m_sumSquareNumTestcases<<"\t"<<mean_val<<"\t"<<
+	(m_sumSquareNumTestcases/m_num_invocations)-mean_val*mean_val<<"\n";
+      cout.flush();
+    }
+    ~LoadTimeInitializer()
+    {
+#ifdef DO_PROFILING
+      print_profiling();
+#endif
    }
    jfieldID m_readBasesFID;
    jfieldID m_readQualsFID;
@ -53,18 +82,18 @@ class LoadTimeInitializer
    jfieldID m_deletionGOPFID;
    jfieldID m_overallGCPFID;
    jfieldID m_haplotypeBasesFID;
+    //used to compute avg, variance of #testcases
+    double m_sumNumReads;
+    double m_sumSquareNumReads;
+    double m_sumNumHaplotypes;
+    double m_sumSquareNumHaplotypes;
+    double m_sumNumTestcases;
+    double m_sumSquareNumTestcases;
+    unsigned m_num_invocations;
+
 };
 LoadTimeInitializer g_load_time_initializer;

-void debug_dump(string filename, string s, bool to_append, bool add_newline)
-{
-  ofstream fptr;
-  fptr.open(filename.c_str(), to_append ? ofstream::app : ofstream::out);
-  fptr << s;
-  if(add_newline)
-    fptr << "\n";
-  fptr.close();
-}

 JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPairHMM_jniInitializeProbabilities
 (JNIEnv* env, jclass thisObject,
@ -327,8 +356,15 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai
 #pragma omp parallel for schedule (dynamic,10) private(tc_idx) num_threads(maxNumThreadsToUse) 
  for(tc_idx=0;tc_idx<numTestCases;++tc_idx)
  {
-    double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&(tc_array[tc_idx]));
-    double result = log10(result_avxd) - log10(ldexp(1.0, 1020));
+    float result_avxf = g_compute_full_prob_float(&(tc_array[tc_idx]), 0);
+    double result = 0;
+    if (result_avxf < MIN_ACCEPTED) {
+      double result_avxd = g_compute_full_prob_double(&(tc_array[tc_idx]), 0);
+      result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
+    }
+    else
+      result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
+
    likelihoodDoubleArray[tc_idx] = result;
  }
 #ifdef DEBUG
@ -353,5 +389,14 @@ JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_JNILoglessPai
    RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBasesArrayVector[j].first, haplotypeBasesArrayVector[j].second, JNI_RELEASE_MODE);
  haplotypeBasesArrayVector.clear();
  tc_array.clear();
+#ifdef DO_PROFILING
+  g_load_time_initializer.m_sumNumReads += numReads;
+  g_load_time_initializer.m_sumSquareNumReads += numReads*numReads;
+  g_load_time_initializer.m_sumNumHaplotypes += numHaplotypes;
+  g_load_time_initializer.m_sumSquareNumHaplotypes += numHaplotypes*numHaplotypes;
+  g_load_time_initializer.m_sumNumTestcases += numTestCases;
+  g_load_time_initializer.m_sumSquareNumTestcases += numTestCases*numTestCases;
+  ++(g_load_time_initializer.m_num_invocations);
+#endif
 }

--- a/PairHMM_JNI/pairhmm-1-base.cc
+++ b/PairHMM_JNI/pairhmm-1-base.cc
@ -1,29 +1,15 @@
 //#define DEBUG 1
 //#define DEBUG0_1 1
 //#define DEBUG3 1
-#define MM 0
-#define GapM 1
-#define MX 2
-#define XX 3
-#define MY 4
-#define YY 5
-
-#include <cstdio>
-#include <cstring>
-#include <cstdlib>
-#include <cmath>
-#include <omp.h>
-#include <emmintrin.h>
+#include "headers.h"
 #include "template.h"
+#include "utils.h"

-//#include "define-float.h"
-//#include "shift_template.c"
-//#include "pairhmm-template-kernel.cc"
+#include "define-float.h"
+#include "avx_function_prototypes.h"

 #include "define-double.h"
-#include "shift_template.c"
-#include "pairhmm-template-kernel.cc"
-
+#include "avx_function_prototypes.h"

 using namespace std;
 class LoadTimeInitializer
@ -36,85 +22,6 @@ class LoadTimeInitializer
 };
 LoadTimeInitializer g_load_time_initializer;

-
-template<class NUMBER>
-NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log = NULL)
-{
-	int r, c;
-	int ROWS = tc->rslen + 1;
-	int COLS = tc->haplen + 1;
-
-	Context<NUMBER> ctx;
-
-	NUMBER M[MROWS][MCOLS];
-	NUMBER X[MROWS][MCOLS];
-	NUMBER Y[MROWS][MCOLS];
-	NUMBER p[MROWS][6];
-
-	p[0][MM] = ctx._(0.0);
-	p[0][GapM] = ctx._(0.0);
-	p[0][MX] = ctx._(0.0);
-	p[0][XX] = ctx._(0.0);
-	p[0][MY] = ctx._(0.0);
-	p[0][YY] = ctx._(0.0);
-	for (r = 1; r < ROWS; r++)
-	{
-		int _i = tc->i[r-1] & 127;
-		int _d = tc->d[r-1] & 127;
-		int _c = tc->c[r-1] & 127;
-		p[r][MM] = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127];
-		p[r][GapM] = ctx._(1.0) - ctx.ph2pr[_c];
-		p[r][MX] = ctx.ph2pr[_i];
-		p[r][XX] = ctx.ph2pr[_c];
-		p[r][MY] = ctx.ph2pr[_d];
-		p[r][YY] = ctx.ph2pr[_c];
-		//p[r][MY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d];
-		//p[r][YY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c];
-	}
-
-	for (c = 0; c < COLS; c++)
-	{
-		M[0][c] = ctx._(0.0);
-		X[0][c] = ctx._(0.0);
-		Y[0][c] = ctx.INITIAL_CONSTANT / (tc->haplen);
-	}
-
-	for (r = 1; r < ROWS; r++)
-	{
-		M[r][0] = ctx._(0.0);
-		X[r][0] = X[r-1][0] * p[r][XX];
-		Y[r][0] = ctx._(0.0);
-	}
-
-	 NUMBER result = ctx._(0.0);
-
-	for (r = 1; r < ROWS; r++)
-		for (c = 1; c < COLS; c++)
-		{
-			char _rs = tc->rs[r-1];
-			char _hap = tc->hap[c-1];
-			int _q = tc->q[r-1] & 127;
-			NUMBER distm = ctx.ph2pr[_q];
-			if (_rs == _hap || _rs == 'N' || _hap == 'N')
-				distm = ctx._(1.0) - distm;
-			else
-			  distm = distm/3;
-			M[r][c] = distm * (M[r-1][c-1] * p[r][MM] + X[r-1][c-1] * p[r][GapM] + Y[r-1][c-1] * p[r][GapM]);
-			X[r][c] = M[r-1][c] * p[r][MX] + X[r-1][c] * p[r][XX];
-			Y[r][c] = M[r][c-1] * p[r][MY] + Y[r][c-1] * p[r][YY];
-		}
-
-	for (c = 0; c < COLS; c++)
-	{
-		result += M[ROWS-1][c] + X[ROWS-1][c];
-	}
-
-	if (before_last_log != NULL)
-		*before_last_log = result;
-
-	return ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT;
-}
-
 #define BATCH_SIZE  10000
 #define RUN_HYBRID

@ -129,45 +36,55 @@ int main(int argc, char** argv)
  if(argc >= 3 && string(argv[2]) == "1")
    use_old_read_testcase = true;

-  testcase tc;
-  if(use_old_read_testcase)
+  if(true)
  {
-    FILE* fptr = fopen(argv[1],"r");
-    while(!feof(fptr))
-    {
-      if(read_testcase(&tc, fptr) >= 0)
-      {
-	double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&tc);
-	double result = log10(result_avxd) - log10(ldexp(1.0, 1020));
-
-	cout << std::scientific << compute_full_prob<double>(&tc) << " "<<result<<"\n";
-	delete tc.rs;
-	delete tc.hap;
-      }
-    }
-    fclose(fptr);
+    g_compute_full_prob_double = GEN_INTRINSIC(compute_full_prob_avx, d)<double>;
+    g_compute_full_prob_float = GEN_INTRINSIC(compute_full_prob_avx, s)<float>;
+  }
+  else
+  {
+    g_compute_full_prob_double = compute_full_prob<double>;
+    g_compute_full_prob_float = compute_full_prob<float>;
+  }
+
+  std::ifstream ifptr;
+  FILE* fptr = 0;
+  if(use_old_read_testcase)
+  {
+    fptr = fopen(argv[1],"r");
+    assert(fptr);
  }
  else
  {
-    std::ifstream ifptr;
-    std::vector<std::string> tokens;
    ifptr.open(argv[1]);
    assert(ifptr.is_open());
-    while(1)
-    {
-      tokens.clear();
-      if(read_mod_testcase(ifptr, &tc, false) < 0)
-	break;
-      //double result = 0;
-      double result_avxd = GEN_INTRINSIC(compute_full_prob_avx, d)<double>(&tc);
-      double result = log10(result_avxd) - log10(ldexp(1.0, 1020));
-
-      cout << std::scientific << compute_full_prob<double>(&tc) << " "<<result<<"\n";
-      delete tc.rs;
-      delete tc.hap;
-    }
-    ifptr.close();
  }
+
+  testcase tc;
+  while(1)
+  {
+    int break_value = use_old_read_testcase ? read_testcase(&tc, fptr) : read_mod_testcase(ifptr,&tc,true);
+    if(break_value < 0)
+      break;
+    float result_avxf = g_compute_full_prob_float(&tc, 0);
+    double result = 0;
+    if (result_avxf < MIN_ACCEPTED) {
+      double result_avxd = g_compute_full_prob_double(&tc, 0);
+      result = log10(result_avxd) - log10(ldexp(1.0, 1020.0));
+    }
+    else
+      result = (double)(log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)));
+
+    double baseline_result = compute_full_prob<double>(&tc);
+    baseline_result = log10(baseline_result) - log10(ldexp(1.0, 1020.0)); 
+    cout << std::scientific << baseline_result << " "<<result<<"\n";
+    delete tc.rs;
+    delete tc.hap;
+  }
+  if(use_old_read_testcase)
+    fclose(fptr);
+  else
+    ifptr.close();
  return 0;  

 #if 0
--- a/PairHMM_JNI/pairhmm-template-kernel.cc
+++ b/PairHMM_JNI/pairhmm-template-kernel.cc
@ -34,14 +34,14 @@ void GEN_INTRINSIC(precompute_masks_, PRECISION)(const testcase& tc, int COLS, i
    int mOffset = (col-1) % maskBitCnt ;
    MASK_TYPE bitMask = ((MASK_TYPE)0x1) << (maskBitCnt-1-mOffset) ;

-    char hapChar = tc.hap[col-1] ;
+    char hapChar = ConvertChar::get(tc.hap[col-1]);

    if (hapChar == AMBIG_CHAR) {
      for (int ci=0; ci < NUM_DISTINCT_CHARS; ++ci) 
 	maskArr[mIndex][ci] |= bitMask ;
    } 

-    maskArr[mIndex][ConvertChar::get(hapChar)] |= bitMask ;
+    maskArr[mIndex][hapChar] |= bitMask ;
    // bit corresponding to col 1 will be the MSB of the mask 0
    // bit corresponding to col 2 will be the MSB-1 of the mask 0
    // ...
@ -75,11 +75,11 @@ void GEN_INTRINSIC(update_masks_for_cols_, PRECISION)(int maskIndex, MASK_VEC& c

  for (int ei=0; ei < AVX_LENGTH/2; ++ei) {
    SET_MASK_WORD(currMaskVecLow.masks[ei], maskArr[maskIndex][rsArr[ei]], 
-		  lastMaskShiftOut[ei], ei, maskBitCnt) ;
-    
+	lastMaskShiftOut[ei], ei, maskBitCnt) ;
+
    int ei2 = ei + AVX_LENGTH/2 ; // the second entry index
    SET_MASK_WORD(currMaskVecHigh.masks[ei], maskArr[maskIndex][rsArr[ei2]], 
-		  lastMaskShiftOut[ei2], ei2, maskBitCnt) ;
+	lastMaskShiftOut[ei2], ei2, maskBitCnt) ;
  }

 }
--- a/PairHMM_JNI/pairhmm-template-main.cc
+++ b/PairHMM_JNI/pairhmm-template-main.cc
@ -1,4 +1,5 @@
 #include <stdio.h>
+#include "headers.h"
 #include <immintrin.h>
 #include <emmintrin.h>
 #include <omp.h>
@ -7,11 +8,11 @@

 #include "define-float.h"
 #include "shift_template.c"
-#include "pairhmm-template-kernel.cc"
+#include "avx_function_prototypes.h"

 #include "define-double.h"
 #include "shift_template.c"
-#include "pairhmm-template-kernel.cc"
+#include "avx_function_prototypes.h"

 #define BATCH_SIZE  10000
 //#define RUN_HYBRID
--- a/PairHMM_JNI/run.sh
+++ b/PairHMM_JNI/run.sh
@ -3,8 +3,8 @@ rm -f *.txt
 export GSA_ROOT_DIR=/home/karthikg/broad/gsa-unstable
 #-Djava.library.path is needed if you are using JNI_LOGLESS_CACHING, else not needed
 java  -Djava.library.path=${GSA_ROOT_DIR}/PairHMM_JNI -jar ${GSA_ROOT_DIR}/dist/GenomeAnalysisTK.jar  -T HaplotypeCaller \
-R /data/broad/samples/joint_variant_calling/broad_reference/Homo_sapiens_assembly19.fasta \
-I /data/simulated/sim1M_pairs_final.bam \
+-R /opt/Genomics/ohsu/dnapipeline/humanrefgenome/human_g1k_v37.fasta \
+-I /data/broad/samples/joint_variant_calling/NA12878_low_coverage_alignment/NA12878.chrom11.ILLUMINA.bwa.CEU.low_coverage.20121211.bam \
 --dbsnp /data/broad/samples/joint_variant_calling/dbSNP/00-All.vcf \
 -stand_call_conf 50.0 \
 -stand_emit_conf 10.0 \
@ -14,3 +14,8 @@ java  -Djava.library.path=${GSA_ROOT_DIR}/PairHMM_JNI -jar ${GSA_ROOT_DIR}/dist/
 #--pair_hmm_implementation JNI_LOGLESS_CACHING \
 #-I /data/simulated/sim1M_pairs_final.bam \
 #-I /data/broad/samples/joint_variant_calling/NA12878_low_coverage_alignment/NA12878.chrom11.ILLUMINA.bwa.CEU.low_coverage.20121211.bam \
+#-R /data/broad/samples/joint_variant_calling/broad_reference/Homo_sapiens_assembly19.fasta \
+#-R /data/broad/samples/joint_variant_calling/broad_reference/ucsc.hg19.fasta \
+#-R /opt/Genomics/ohsu/dnapipeline/humanrefgenome/human_g1k_v37.fasta \
+#--dbsnp /data/broad/samples/joint_variant_calling/dbSNP/00-All.vcf \
+#--dbsnp /data/broad/samples/joint_variant_calling/dbSNP/dbsnp_138.hg19.vcf \
--- a/PairHMM_JNI/template.h
+++ b/PairHMM_JNI/template.h
@ -1,24 +1,14 @@
 #ifndef TEMPLATES_H_
 #define TEMPLATES_H_

-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <math.h>
-#include <stdint.h>
-#include <assert.h>
-
-#include <sys/time.h>
-
-#include <immintrin.h>
-
-#include <ctype.h>
-#include <string>
-#include <iostream>
-#include <sstream>
-#include <fstream>
-#include <vector>
+#include "headers.h"

+#define MM 0
+#define GapM 1
+#define MX 2
+#define XX 3
+#define MY 4
+#define YY 5

 #define MROWS  500
 #define MCOLS  1000
@ -130,6 +120,7 @@ struct Context<float>
 };


+
 typedef struct
 {
        int rslen, haplen;
@ -182,7 +173,8 @@ public:
    return conversionTable[input] ;
  }

-} ;
+};
+


 #endif
--- a/PairHMM_JNI/convert_char.cc
+++ b/PairHMM_JNI/convert_char.cc
@ -1,7 +1,36 @@
+#include "headers.h"
 #include "template.h"
+
 uint8_t ConvertChar::conversionTable[255];
+float (*g_compute_full_prob_float)(testcase *tc, float* before_last_log) = 0;
+double (*g_compute_full_prob_double)(testcase *tc, double* before_last_log) = 0;
+
 using namespace std;

+bool is_avx_supported()
+{
+  int ecx = 0, edx = 0, ebx = 0;
+  __asm__("cpuid"
+      : "=b" (ebx),
+      "=c" (ecx),
+      "=d" (edx)
+      : "a" (1)
+      );
+  return ((ecx >> 28)&1) == 1;
+}
+
+bool is_sse42_supported()
+{
+  int ecx = 0, edx = 0, ebx = 0;
+  __asm__("cpuid"
+      : "=b" (ebx),
+      "=c" (ecx),
+      "=d" (edx)
+      : "a" (1)
+      );
+  return ((ecx >> 20)&1) == 1;
+}
+
 int normalize(char c)
 {
 	return ((int) (c - 33));
@ -184,3 +213,13 @@ int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat)

  return tokens.size();
 }
+
+void debug_dump(string filename, string s, bool to_append, bool add_newline)
+{
+  ofstream fptr;
+  fptr.open(filename.c_str(), to_append ? ofstream::app : ofstream::out);
+  fptr << s;
+  if(add_newline)
+    fptr << "\n";
+  fptr.close();
+}
--- a/PairHMM_JNI/utils.h
+++ b/PairHMM_JNI/utils.h
@ -0,0 +1,13 @@
+#ifndef PAIRHMM_UTIL_H
+#define PAIRHMM_UTIL_H
+
+#define MIN_ACCEPTED 1e-28f
+bool is_avx_supported();
+bool is_sse42_supported();
+extern float (*g_compute_full_prob_float)(testcase *tc, float *before_last_log);
+extern double (*g_compute_full_prob_double)(testcase *tc, double* before_last_log);
+void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline);
+template<class NUMBER>
+NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log=0);
+
+#endif
--- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java
+++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java
@ -66,13 +66,13 @@ import java.util.Map;
 */
 public class JNILoglessPairHMM extends LoglessPairHMM {

-    private static final boolean debug = false;	 //simulates ifdef
+    private static final boolean debug = true;	 //simulates ifdef
    private static final boolean debug0_1 = false;	 //simulates ifdef
    private static final boolean debug1 = false; //simulates ifdef
    private static final boolean debug2 = false;
    private static final boolean debug3 = false;
    private int numComputeLikelihoodCalls = 0;
-
+    
    //Used to copy references to byteArrays to JNI from reads
    protected class JNIReadDataHolderClass
    {
@ -138,6 +138,7 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
 	  jniInitialize(readMaxLength, haplotypeMaxLength);
    }

+
    //Real compute kernel
    private native void jniComputeLikelihoods(int numReads, int numHaplotypes,
 	    JNIReadDataHolderClass[] readDataArray, JNIHaplotypeDataHolderClass[] haplotypeDataArray,
@ -160,6 +161,7 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
 	}
 	int readListSize = reads.size();
 	int alleleHaplotypeMapSize = alleleHaplotypeMap.size();
+	int numTestcases = readListSize*alleleHaplotypeMapSize;
 	if(debug0_1)
 	    System.out.println("Java numReads "+readListSize+" numHaplotypes "+alleleHaplotypeMapSize);
 	JNIReadDataHolderClass[] readDataArray = new JNIReadDataHolderClass[readListSize];
@ -275,6 +277,7 @@ public class JNILoglessPairHMM extends LoglessPairHMM {
 	return likelihoodMap;
    }

+
    /**
     * {@inheritDoc}
     */