Parallel version of the JNI for the PairHMM

The JNI treats shared memory as critical memory and doesn't allow any parallel reads or writes to it until the native code finishes. This is not a problem *per se* it is the right thing to do, but we need to enable **-nct** when running the haplotype caller and with it have multiple native PairHMM running for each map call. Move to a copy based memory sharing where the JNI simply copies the memory over to C++ and then has no blocked critical memory when running, allowing -nct to work. This version is slightly (almost unnoticeably) slower with -nct 1, but scales better with -nct 2-4 (we haven't tested anything beyond that because we know the GATK falls apart with higher levels of parallelism * Make VECTOR_LOGLESS_CACHING the default implementation for PairHMM. * Changed version number in pom.xml under public/VectorPairHMM * VectorPairHMM can now be compiled using gcc 4.8.x * Modified define-* to get rid of gcc warnings for extra tokens after #undefs * Added a Linux kernel version check for AVX - gcc's __builtin_cpu_supports function does not check whether the kernel supports AVX or not. * Updated PairHMM profiling code to update and print numbers only in single-thread mode * Edited README.md, pom.xml and Makefile for users to pass path to gcc 4.8.x if necessary * Moved all cpuid inline assembly to single function Changed info message to clog from cinfo * Modified version in pom.xml in VectorPairHMM from 3.1 to 3.2 * Deleted some unnecessary code * Modified C++ sandbox to print per interval timing
2014-03-17 11:42:19 -07:00 · 2014-03-17 11:42:19 -07:00 · f6ea25b4d1
parent 38b7cfbda9
commit f6ea25b4d1
31 changed files with 548 additions and 527 deletions
--- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@ -417,7 +417,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
     */
    @Hidden
    @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false)
-    public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING;
+    public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.VECTOR_LOGLESS_CACHING;

    @Hidden
    @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
@ -639,6 +639,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
            logger.info("Using global mismapping rate of " + phredScaledGlobalReadMismappingRate + " => " + log10GlobalReadMismappingRate + " in log10 likelihood units");
        }

+        //static member function - set number of threads
+        PairHMM.setNumberOfThreads(getToolkit().getTotalNumberOfThreads());
        // create our likelihood calculation engine
        likelihoodCalculationEngine = createLikelihoodCalculationEngine();

--- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java
@ -58,6 +58,7 @@ import java.util.HashMap;
 */
 public abstract class JNILoglessPairHMM extends LoglessPairHMM {
    public abstract HashMap<Haplotype, Integer> getHaplotypeToHaplotypeListIdxMap();
-    protected long setupTime = 0;
+    protected long threadLocalSetupTimeDiff = 0;
+    protected static long pairHMMSetupTime = 0;
    
 }
--- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/VectorLoglessPairHMM.java
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/VectorLoglessPairHMM.java
@ -228,7 +228,7 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM {

        mLikelihoodArray = new double[readListSize*numHaplotypes];      //to store results
        if(doProfiling)
-            setupTime += (System.nanoTime() - startTime);
+            threadLocalSetupTimeDiff = (System.nanoTime() - startTime);
        //for(reads)
        //   for(haplotypes)
        //       compute_full_prob()
@ -251,7 +251,14 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM {
            readIdx += numHaplotypes;
        }
        if(doProfiling)
-            computeTime += (System.nanoTime() - startTime);
+        {
+            threadLocalPairHMMComputeTimeDiff = (System.nanoTime() - startTime);
+            //synchronized(doProfiling)
+            {
+                pairHMMComputeTime += threadLocalPairHMMComputeTimeDiff;
+                pairHMMSetupTime += threadLocalSetupTimeDiff;
+            }
+        }
        return likelihoodMap;
    }
    
@ -262,7 +269,8 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM {
    @Override
    public void close()
    {
-        System.out.println("Time spent in setup for JNI call : "+(setupTime*1e-9));
+        if(doProfiling)
+            System.out.println("Time spent in setup for JNI call : "+(pairHMMSetupTime*1e-9));
        super.close();
        jniClose();
    }
--- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
+++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
@ -358,7 +358,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
        final File outputVCFNoQD = executeTest("testQualByDepth calling without QD", specNoQD).getFirst().get(0);

        final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoQD.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A QualByDepth";
-        final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("4ccdbebcfd02be87ae5b4ad94666f011"));
+        final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("214799b5d1c0809f042c557155ea6e13"));
        specAnn.disableShadowBCF();
        final File outputVCFAnn = executeTest("testQualByDepth re-annotation of QD", specAnn).getFirst().get(0);

--- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java
+++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java
@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa

    @Test
    public void testHaplotypeCallerMultiSampleComplex1() {
-        HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "65c316f1f3987d7bc94e887999920d45");
+        HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "d1626d5fff419b8a782de954224e881f");
    }

    private void HCTestSymbolicVariants(String bam, String args, String md5) {
--- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
+++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
@ -284,7 +284,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
    public void HCTestConservativePcrIndelModelWGS() {
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
-                Arrays.asList("7592274ecd2b5ef4624dd2ed659536ef"));
+                Arrays.asList("1b41bf83b9c249a9a5fffb1e308668c1"));
        executeTest("HC calling with conservative indel error modeling on WGS intervals", spec);
    }

--- a/public/VectorPairHMM/README.md
+++ b/public/VectorPairHMM/README.md
@ -44,9 +44,10 @@ end of every region (line 351 in PairHMMLikelihoodCalculationEngine).
 Note: Debug code has been moved to a separate class DebugJNILoglessPairHMM.java.

 Compiling:
-Make sure you have icc (Intel C compiler) available. Currently, gcc does not seem to support all AVX 
-intrinsics.
-This native library is called libVectorLoglessPairHMM.so
+The native library (called libVectorLoglessPairHMM.so) can be compiled with icc (Intel C compiler) 
+or gcc versions >= 4.8.1 that support AVX intrinsics. By default, the make process tries to invoke 
+icc. To use gcc, edit the file 'pom.xml' (in this directory) and enable the environment variables 
+USE_GCC,C_COMPILER and CPP_COMPILER (edit and uncomment lines 60-62).
 Using Maven:
 Type 'mvn install' in this directory - this will build the library (by invoking 'make') and copy the 
 native library to the directory 
--- a/public/VectorPairHMM/pom.xml
+++ b/public/VectorPairHMM/pom.xml
@ -5,7 +5,7 @@
    <parent>
        <groupId>org.broadinstitute.sting</groupId>
        <artifactId>sting-root</artifactId>
-        <version>2.8-SNAPSHOT</version>
+        <version>3.2-SNAPSHOT</version>
        <relativePath>../../public/sting-root</relativePath>
    </parent>

@ -57,6 +57,9 @@
                            <workingDirectory>src/main/c++</workingDirectory>
                            <environmentVariables>
                                <JRE_HOME>${java.home}</JRE_HOME>
+                                <!--<USE_GCC>1</USE_GCC>-->
+                                <!--<C_COMPILER>/opt/gcc-4.8.2/bin/gcc</C_COMPILER>-->
+                                <!--<CPP_COMPILER>/opt/gcc-4.8.2/bin/g++</CPP_COMPILER>-->
                                <OUTPUT_DIR>${project.build.directory}</OUTPUT_DIR>
                            </environmentVariables>
                        </configuration>
--- a/public/VectorPairHMM/src/main/c++/LoadTimeInitializer.cc
+++ b/public/VectorPairHMM/src/main/c++/LoadTimeInitializer.cc
@ -23,8 +23,8 @@
 */


-#include "LoadTimeInitializer.h"
 #include "utils.h"
+#include "LoadTimeInitializer.h"
 using namespace std;
 char* LoadTimeInitializerStatsNames[] = 
 {
@ -43,6 +43,10 @@ LoadTimeInitializer g_load_time_initializer;

 LoadTimeInitializer::LoadTimeInitializer()		//will be called when library is loaded
 {
+#if (defined(__GNUC__) || defined(__GNUG__)) && !defined(__INTEL_COMPILER)
+  //compiles only with gcc >= 4.8
+  __builtin_cpu_init();
+#endif
  ConvertChar::init();
 #ifndef DISABLE_FTZ
  //Very important to get good performance on Intel processors
@ -71,11 +75,6 @@ LoadTimeInitializer::LoadTimeInitializer()		//will be called when library is loa
  m_filename_to_fptr.clear();
  m_written_files_set.clear();

-  //Common buffer - 8MB
-  unsigned size = 1024*1024;
-  m_buffer = new uint64_t[size];
-  m_buffer_size = size*sizeof(uint64_t);
-  
  initialize_function_pointers();

  //Initialize static members of class
--- a/public/VectorPairHMM/src/main/c++/LoadTimeInitializer.h
+++ b/public/VectorPairHMM/src/main/c++/LoadTimeInitializer.h
@ -27,7 +27,7 @@
 #define LOAD_TIME_INITIALIZER_H
 #include "headers.h"
 #include <jni.h>
-#include "template.h"
+/*#include "template.h"*/

 enum LoadTimeInitializerStatsEnum
 {
@ -47,10 +47,6 @@ class LoadTimeInitializer
 {
  public:
    LoadTimeInitializer();		//will be called when library is loaded
-    ~LoadTimeInitializer()
-    {
-      delete[] m_buffer;
-    }
    void print_profiling();
    void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true);
    void debug_close();
@ -72,8 +68,6 @@ class LoadTimeInitializer
    uint64_t m_data_transfer_time;
    //bytes copied
    uint64_t m_bytes_copied;
-    unsigned get_buffer_size() { return m_buffer_size; }
-    char* get_buffer() { return (char*)m_buffer; }
  private:
    std::map<std::string, std::ofstream*> m_filename_to_fptr;
    std::set<std::string> m_written_files_set;
@ -83,8 +77,6 @@ class LoadTimeInitializer
    double m_sum_square_stats[TOTAL_NUMBER_STATS];
    uint64_t m_min_stats[TOTAL_NUMBER_STATS];
    uint64_t m_max_stats[TOTAL_NUMBER_STATS];
-    unsigned m_buffer_size;
-    uint64_t* m_buffer;
 };
 extern LoadTimeInitializer g_load_time_initializer;

--- a/public/VectorPairHMM/src/main/c++/Makefile
+++ b/public/VectorPairHMM/src/main/c++/Makefile
@ -32,14 +32,30 @@
 JRE_HOME?=/opt/jdk1.7.0_25/jre
 JNI_COMPILATION_FLAGS=-D_REENTRANT -fPIC -I${JRE_HOME}/../include -I${JRE_HOME}/../include/linux

-COMMON_COMPILATION_FLAGS=$(JNI_COMPILATION_FLAGS) -O3 -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas
-CC=icc
-CXX=icc
+#COMMON_COMPILATION_FLAGS=$(JNI_COMPILATION_FLAGS) -O3 -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas
+COMMON_COMPILATION_FLAGS=$(JNI_COMPILATION_FLAGS) -O3 -Wall $(OMPCFLAGS) -Wno-unknown-pragmas -Wno-write-strings -Wno-unused-variable -Wno-unused-but-set-variable
+ifdef DISABLE_FTZ
+  COMMON_COMPILATION_FLAGS+=-DDISABLE_FTZ
+endif
+
+ifdef USE_GCC
+  C_COMPILER?=gcc
+  CPP_COMPILER?=g++
+  AVX_FLAGS=-mavx
+  SSE41_FLAGS=-msse4.1
+  COMMON_COMPILATION_FLAGS+=-Wno-char-subscripts
+else
+  C_COMPILER?=icc
+  CPP_COMPILER?=icc
+  AVX_FLAGS=-xAVX
+  SSE41_FLAGS=-xSSE4.1
+  LIBFLAGS=-static-intel
+  ifdef DISABLE_FTZ
+    COMMON_COMPILATION_FLAGS+=-no-ftz
+  endif
+endif

 LDFLAGS=-lm -lrt $(OMPLDFLAGS)
-ifdef DISABLE_FTZ
-  COMMON_COMPILATION_FLAGS+=-DDISABLE_FTZ -no-ftz
-endif

 PAPI_DIR=/home/karthikg/softwares/papi-5.3.0
 ifdef USE_PAPI
@ -49,10 +65,6 @@ ifdef USE_PAPI
  endif
 endif

-ifdef DISABLE_FTZ
-  COMMON_COMPILATION_FLAGS+=-DDISABLE_FTZ -no-ftz
-endif
-
 BIN=libVectorLoglessPairHMM.so pairhmm-template-main checker
 #BIN=checker

@ -79,8 +91,8 @@ NO_VECTOR_OBJECTS=$(NO_VECTOR_SOURCES:.cc=.o)
 AVX_OBJECTS=$(AVX_SOURCES:.cc=.o)
 SSE_OBJECTS=$(SSE_SOURCES:.cc=.o)
 $(NO_VECTOR_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS)
-$(AVX_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) -xAVX
-$(SSE_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) -xSSE4.2
+$(AVX_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) $(AVX_FLAGS)
+$(SSE_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) $(SSE41_FLAGS)
 OBJECTS=$(NO_VECTOR_OBJECTS) $(AVX_OBJECTS) $(SSE_OBJECTS)

 all: $(BIN) Sandbox.class copied_lib
@ -88,18 +100,18 @@ all: $(BIN) Sandbox.class copied_lib
 -include $(addprefix $(DEPDIR)/,$(SOURCES:.cc=.d))

 checker: pairhmm-1-base.o $(COMMON_OBJECTS)
-	$(CXX) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)
+	$(CPP_COMPILER) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)

 pairhmm-template-main:	pairhmm-template-main.o $(COMMON_OBJECTS)
-	$(CXX) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)
+	$(CPP_COMPILER) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)

 libVectorLoglessPairHMM.so: $(LIBOBJECTS) 
-	$(CXX) $(OMPLFLAGS) -shared -static-intel -o $@ $(LIBOBJECTS) ${LDFLAGS}
+	$(CPP_COMPILER) $(OMPLFLAGS) -shared $(LIBFLAGS) -o $@ $(LIBOBJECTS) ${LDFLAGS}


 $(OBJECTS): %.o: %.cc
 	@mkdir -p $(DEPDIR)
-	$(CXX) -c -MMD -MF $(DF) $(CXXFLAGS) $(OUTPUT_OPTION) $<
+	$(CPP_COMPILER) -c -MMD -MF $(DF) $(CXXFLAGS) $(OUTPUT_OPTION) $<

 Sandbox.class: Sandbox.java
 	javac Sandbox.java
--- a/public/VectorPairHMM/src/main/c++/avx_function_instantiations.cc
+++ b/public/VectorPairHMM/src/main/c++/avx_function_instantiations.cc
@ -23,7 +23,6 @@
 */


-#include "template.h"

 #undef SIMD_ENGINE
 #undef SIMD_ENGINE_SSE
@ -31,6 +30,8 @@
 #define SIMD_ENGINE avx
 #define SIMD_ENGINE_AVX

+#include "template.h"
+
 #include "define-float.h"
 #include "shift_template.c"
 #include "pairhmm-template-kernel.cc"
--- a/public/VectorPairHMM/src/main/c++/baseline.cc
+++ b/public/VectorPairHMM/src/main/c++/baseline.cc
@ -24,7 +24,7 @@


 #include "headers.h"
-#include "template.h"
+#include "common_data_structure.h"
 #include "utils.h"
 #include "LoadTimeInitializer.h"
 using namespace std;
@ -48,17 +48,8 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log)
  //a straightforward array of pointers ensures that all data lies 'close' in memory, increasing
  //the chance of being stored together in the cache. Also, prefetchers can learn memory access
  //patterns for 2D arrays, not possible for array of pointers
-  //bool locally_allocated = false;
  //NUMBER* common_buffer = 0;
  NUMBER* common_buffer = new NUMBER[3*ROWS*COLS + ROWS*6];
-  //unsigned curr_size = sizeof(NUMBER)*(3*ROWS*COLS + ROWS*6);
-  //if(true)
-  //{
-  //common_buffer = new NUMBER[3*ROWS*COLS + ROWS*6];
-  //locally_allocated = true;
-  //}
-  //else
-  //common_buffer = (NUMBER*)(g_load_time_initializer.get_buffer()); 
  //pointers to within the allocated buffer
  NUMBER** common_pointer_buffer = new NUMBER*[4*ROWS];
  NUMBER* ptr = common_buffer;
@ -154,7 +145,6 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log)

 #ifndef USE_STACK_ALLOCATION
  delete[] common_pointer_buffer;
-  //if(locally_allocated)
  delete[] common_buffer;
 #endif

--- a/public/VectorPairHMM/src/main/c++/common_data_structure.h
+++ b/public/VectorPairHMM/src/main/c++/common_data_structure.h
@ -0,0 +1,215 @@
+#ifndef COMMON_DATA_STRUCTURE_H
+#define COMMON_DATA_STRUCTURE_H
+
+#include "headers.h"
+
+#define CAT(X,Y) X####Y
+#define CONCAT(X,Y) CAT(X,Y)
+
+#define MM 0
+#define GapM 1
+#define MX 2
+#define XX 3
+#define MY 4
+#define YY 5
+
+//#define MROWS  500
+//#define MCOLS  1000
+
+
+#define MAX_QUAL 254
+#define MAX_JACOBIAN_TOLERANCE 8.0
+#define JACOBIAN_LOG_TABLE_STEP 0.0001
+#define JACOBIAN_LOG_TABLE_INV_STEP (1.0 / JACOBIAN_LOG_TABLE_STEP)
+#define MAXN 70000
+#define LOG10_CACHE_SIZE  (4*MAXN)  // we need to be able to go up to 2*(2N) when calculating some of the coefficients
+#define JACOBIAN_LOG_TABLE_SIZE ((int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1)
+
+template<class NUMBER>
+struct ContextBase
+{
+  public:
+    NUMBER ph2pr[128];
+    NUMBER INITIAL_CONSTANT;
+    NUMBER LOG10_INITIAL_CONSTANT;
+    NUMBER RESULT_THRESHOLD; 
+
+    static bool staticMembersInitializedFlag;
+    static NUMBER jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE];
+    static NUMBER matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1];
+
+    static void initializeStaticMembers()
+    {
+      if(!staticMembersInitializedFlag)
+      {
+        //Order of calls important - Jacobian first, then MatchToMatch
+        initializeJacobianLogTable();
+        initializeMatchToMatchProb();
+        staticMembersInitializedFlag = true;
+      }
+    }
+
+    static void deleteStaticMembers()
+    {
+      if(staticMembersInitializedFlag)
+      {
+        staticMembersInitializedFlag = false;
+      }
+    }
+
+    //Called only once during library load - don't bother to optimize with single precision fp
+    static void initializeJacobianLogTable()
+    {
+      for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) {
+        jacobianLogTable[k] = (NUMBER)(log10(1.0 + pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP)));
+      }
+    }
+
+    //Called only once per library load - don't bother optimizing with single fp
+    static void initializeMatchToMatchProb()
+    {
+      double LN10 = log(10);
+      double INV_LN10 = 1.0/LN10;
+      for (int i = 0, offset = 0; i <= MAX_QUAL; offset += ++i)
+        for (int j = 0; j <= i; j++) {
+          double log10Sum = approximateLog10SumLog10(-0.1*i, -0.1*j);
+          double matchToMatchLog10 =
+            log1p(-std::min(1.0,pow(10,log10Sum))) * INV_LN10;
+          matchToMatchProb[offset + j] = (NUMBER)(pow(10,matchToMatchLog10));
+        }
+    }
+    //Called during computation - use single precision where possible
+    static int fastRound(NUMBER d) {
+      return (d > ((NUMBER)0.0)) ? (int) (d + ((NUMBER)0.5)) : (int) (d - ((NUMBER)0.5));
+    }
+    //Called during computation - use single precision where possible
+    static NUMBER approximateLog10SumLog10(NUMBER small, NUMBER big) {
+      // make sure small is really the smaller value
+      if (small > big) {
+        NUMBER t = big;
+        big = small;
+        small = t;
+      }
+
+      if (isinf(small) == -1 || isinf(big) == -1)
+        return big;
+
+      NUMBER diff = big - small;
+      if (diff >= ((NUMBER)MAX_JACOBIAN_TOLERANCE))
+        return big;
+
+      // OK, so |y-x| < tol: we use the following identity then:
+      // we need to compute log10(10^x + 10^y)
+      // By Jacobian logarithm identity, this is equal to
+      // max(x,y) + log10(1+10^-abs(x-y))
+      // we compute the second term as a table lookup with integer quantization
+      // we have pre-stored correction for 0,0.1,0.2,... 10.0
+      int ind = fastRound((NUMBER)(diff * ((NUMBER)JACOBIAN_LOG_TABLE_INV_STEP))); // hard rounding
+      return big + jacobianLogTable[ind];
+    }
+};
+
+template<class NUMBER>
+struct Context : public ContextBase<NUMBER>
+{};
+
+template<>
+struct Context<double> : public ContextBase<double>
+{
+  Context():ContextBase<double>()
+  {
+    for (int x = 0; x < 128; x++)
+      ph2pr[x] = pow(10.0, -((double)x) / 10.0);
+
+    INITIAL_CONSTANT = ldexp(1.0, 1020.0);
+    LOG10_INITIAL_CONSTANT = log10(INITIAL_CONSTANT);
+    RESULT_THRESHOLD = 0.0;
+  }
+
+  double LOG10(double v){ return log10(v); }
+  inline double POW(double b, double e) { return pow(b,e); }
+
+  static double _(double n){ return n; }
+  static double _(float n){ return ((double) n); }
+};
+
+template<>
+struct Context<float> : public ContextBase<float>
+{
+  Context() : ContextBase<float>()
+  {
+    for (int x = 0; x < 128; x++)
+    {
+      ph2pr[x] = powf(10.f, -((float)x) / 10.f);
+    }
+
+    INITIAL_CONSTANT = ldexpf(1.f, 120.f);
+    LOG10_INITIAL_CONSTANT = log10f(INITIAL_CONSTANT);
+    RESULT_THRESHOLD = ldexpf(1.f, -110.f);
+  }
+
+  float LOG10(float v){ return log10f(v); }
+  inline float POW(float b, float e) { return powf(b,e); }
+
+  static float _(double n){ return ((float) n); }
+  static float _(float n){ return n; }
+};
+
+#define SET_MATCH_TO_MATCH_PROB(output, insQual, delQual)                       \
+{                                                                               \
+  int minQual = delQual;                                                        \
+  int maxQual = insQual;                                                        \
+  if (insQual <= delQual)                                                       \
+  {                                                                             \
+    minQual = insQual;                                                          \
+    maxQual = delQual;                                                          \
+  }                                                                             \
+  (output) = (MAX_QUAL < maxQual) ?                                             \
+  ((NUMBER)1.0) - ctx.POW(((NUMBER)10), ctx.approximateLog10SumLog10(((NUMBER)-0.1)*minQual, ((NUMBER)-0.1)*maxQual))       \
+  : ctx.matchToMatchProb[((maxQual * (maxQual + 1)) >> 1) + minQual];           \
+}
+
+
+
+typedef struct
+{
+        int rslen, haplen;
+	/*int *q, *i, *d, *c;*/
+	/*int q[MROWS], i[MROWS], d[MROWS], c[MROWS];*/
+	char *q, *i, *d, *c;
+        char *hap, *rs;
+	int *ihap;
+	int *irs;
+} testcase;
+
+#define MIN_ACCEPTED 1e-28f
+#define NUM_DISTINCT_CHARS 5
+#define AMBIG_CHAR 4
+
+class ConvertChar {
+
+  static uint8_t conversionTable[255] ;
+
+public:
+
+  static void init() {
+    assert (NUM_DISTINCT_CHARS == 5) ;
+    assert (AMBIG_CHAR == 4) ;
+
+    conversionTable['A'] = 0 ;
+    conversionTable['C'] = 1 ;
+    conversionTable['T'] = 2 ;
+    conversionTable['G'] = 3 ;
+    conversionTable['N'] = 4 ;
+  }
+
+  static inline uint8_t get(uint8_t input) {
+    return conversionTable[input] ;
+  }
+
+};
+
+int normalize(char c);
+int read_testcase(testcase *tc, FILE* ifp=0);
+
+#endif
--- a/public/VectorPairHMM/src/main/c++/define-double.h
+++ b/public/VectorPairHMM/src/main/c++/define-double.h
@ -42,33 +42,33 @@
 #undef MASK_TYPE
 #undef MASK_ALL_ONES

-#undef SET_VEC_ZERO(__vec)
-#undef VEC_OR(__v1, __v2)
-#undef VEC_ADD(__v1, __v2)
-#undef VEC_SUB(__v1, __v2)
-#undef VEC_MUL(__v1, __v2)
-#undef VEC_DIV(__v1, __v2)
-#undef VEC_BLEND(__v1, __v2, __mask)
-#undef VEC_BLENDV(__v1, __v2, __maskV)
-#undef VEC_CAST_256_128(__v1)
-#undef VEC_EXTRACT_128(__v1, __im)
-#undef VEC_EXTRACT_UNIT(__v1, __im)
-#undef VEC_SET1_VAL128(__val)
-#undef VEC_MOVE(__v1, __val)
-#undef VEC_CAST_128_256(__v1)
-#undef VEC_INSERT_VAL(__v1, __val, __pos)
-#undef VEC_CVT_128_256(__v1)
-#undef VEC_SET1_VAL(__val)
-#undef VEC_POPCVT_CHAR(__ch)
-#undef VEC_LDPOPCVT_CHAR(__addr)
-#undef VEC_CMP_EQ(__v1, __v2)
-#undef VEC_SET_LSE(__val)
-#undef SHIFT_HAP(__v1, __val)
+#undef SET_VEC_ZERO
+#undef VEC_OR
+#undef VEC_ADD
+#undef VEC_SUB
+#undef VEC_MUL
+#undef VEC_DIV
+#undef VEC_BLEND
+#undef VEC_BLENDV
+#undef VEC_CAST_256_128
+#undef VEC_EXTRACT_128
+#undef VEC_EXTRACT_UNIT
+#undef VEC_SET1_VAL128
+#undef VEC_MOVE
+#undef VEC_CAST_128_256
+#undef VEC_INSERT_VAL
+#undef VEC_CVT_128_256
+#undef VEC_SET1_VAL
+#undef VEC_POPCVT_CHAR
+#undef VEC_LDPOPCVT_CHAR
+#undef VEC_CMP_EQ
+#undef VEC_SET_LSE
+#undef SHIFT_HAP
 #undef MASK_VEC
-#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
-#undef VEC_SHIFT_LEFT_1BIT(__vs)
+#undef VEC_SSE_TO_AVX
+#undef VEC_SHIFT_LEFT_1BIT
 #undef MASK_ALL_ONES
-#undef COMPARE_VECS(__v1, __v2)
+#undef COMPARE_VECS
 #undef _256_INT_TYPE
 #undef BITMASK_VEC
 #endif
--- a/public/VectorPairHMM/src/main/c++/define-float.h
+++ b/public/VectorPairHMM/src/main/c++/define-float.h
@ -42,33 +42,33 @@
 #undef MASK_TYPE
 #undef MASK_ALL_ONES

-#undef SET_VEC_ZERO(__vec)
-#undef VEC_OR(__v1, __v2)
-#undef VEC_ADD(__v1, __v2)
-#undef VEC_SUB(__v1, __v2)
-#undef VEC_MUL(__v1, __v2)
-#undef VEC_DIV(__v1, __v2)
-#undef VEC_BLEND(__v1, __v2, __mask)
-#undef VEC_BLENDV(__v1, __v2, __maskV)
-#undef VEC_CAST_256_128(__v1)
-#undef VEC_EXTRACT_128(__v1, __im)
-#undef VEC_EXTRACT_UNIT(__v1, __im)
-#undef VEC_SET1_VAL128(__val)
-#undef VEC_MOVE(__v1, __val)
-#undef VEC_CAST_128_256(__v1)
-#undef VEC_INSERT_VAL(__v1, __val, __pos)
-#undef VEC_CVT_128_256(__v1)
-#undef VEC_SET1_VAL(__val)
-#undef VEC_POPCVT_CHAR(__ch)
-#undef VEC_LDPOPCVT_CHAR(__addr)
-#undef VEC_CMP_EQ(__v1, __v2)
-#undef VEC_SET_LSE(__val)
-#undef SHIFT_HAP(__v1, __val)
+#undef SET_VEC_ZERO
+#undef VEC_OR
+#undef VEC_ADD
+#undef VEC_SUB
+#undef VEC_MUL
+#undef VEC_DIV
+#undef VEC_BLEND
+#undef VEC_BLENDV
+#undef VEC_CAST_256_128
+#undef VEC_EXTRACT_128
+#undef VEC_EXTRACT_UNIT
+#undef VEC_SET1_VAL128
+#undef VEC_MOVE
+#undef VEC_CAST_128_256
+#undef VEC_INSERT_VAL
+#undef VEC_CVT_128_256
+#undef VEC_SET1_VAL
+#undef VEC_POPCVT_CHAR
+#undef VEC_LDPOPCVT_CHAR
+#undef VEC_CMP_EQ
+#undef VEC_SET_LSE
+#undef SHIFT_HAP
 #undef MASK_VEC
-#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
-#undef VEC_SHIFT_LEFT_1BIT(__vs)
+#undef VEC_SSE_TO_AVX
+#undef VEC_SHIFT_LEFT_1BIT
 #undef MASK_ALL_ONES
-#undef COMPARE_VECS(__v1, __v2)
+#undef COMPARE_VECS
 #undef _256_INT_TYPE
 #undef BITMASK_VEC
 #endif
--- a/public/VectorPairHMM/src/main/c++/define-sse-double.h
+++ b/public/VectorPairHMM/src/main/c++/define-sse-double.h
@ -40,35 +40,35 @@
 #undef MASK_TYPE
 #undef MASK_ALL_ONES

-#undef VEC_EXTRACT_UNIT(__v1, __im)
-#undef VEC_INSERT_UNIT(__v1,__ins,__im)
-#undef SET_VEC_ZERO(__vec)
-#undef VEC_OR(__v1, __v2)
-#undef VEC_ADD(__v1, __v2)
-#undef VEC_SUB(__v1, __v2)
-#undef VEC_MUL(__v1, __v2)
-#undef VEC_DIV(__v1, __v2)
-#undef VEC_BLEND(__v1, __v2, __mask)
-#undef VEC_BLENDV(__v1, __v2, __maskV)
-#undef VEC_CAST_256_128(__v1)
-#undef VEC_EXTRACT_128(__v1, __im)
-#undef VEC_EXTRACT_UNIT(__v1, __im)
-#undef VEC_SET1_VAL128(__val)
-#undef VEC_MOVE(__v1, __val)
-#undef VEC_CAST_128_256(__v1)
-#undef VEC_INSERT_VAL(__v1, __val, __pos)
-#undef VEC_CVT_128_256(__v1)
-#undef VEC_SET1_VAL(__val)
-#undef VEC_POPCVT_CHAR(__ch)
-#undef VEC_LDPOPCVT_CHAR(__addr)
-#undef VEC_CMP_EQ(__v1, __v2)
-#undef VEC_SET_LSE(__val)
-#undef SHIFT_HAP(__v1, __val)
+#undef VEC_EXTRACT_UNIT
+#undef VEC_INSERT_UNIT
+#undef SET_VEC_ZERO
+#undef VEC_OR
+#undef VEC_ADD
+#undef VEC_SUB
+#undef VEC_MUL
+#undef VEC_DIV
+#undef VEC_BLEND
+#undef VEC_BLENDV
+#undef VEC_CAST_256_128
+#undef VEC_EXTRACT_128
+#undef VEC_EXTRACT_UNIT
+#undef VEC_SET1_VAL128
+#undef VEC_MOVE
+#undef VEC_CAST_128_256
+#undef VEC_INSERT_VAL
+#undef VEC_CVT_128_256
+#undef VEC_SET1_VAL
+#undef VEC_POPCVT_CHAR
+#undef VEC_LDPOPCVT_CHAR
+#undef VEC_CMP_EQ
+#undef VEC_SET_LSE
+#undef SHIFT_HAP
 #undef MASK_VEC
-#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
-#undef VEC_SHIFT_LEFT_1BIT(__vs)
+#undef VEC_SSE_TO_AVX
+#undef VEC_SHIFT_LEFT_1BIT
 #undef MASK_ALL_ONES
-#undef COMPARE_VECS(__v1, __v2)
+#undef COMPARE_VECS
 #undef _256_INT_TYPE
 #undef BITMASK_VEC
 #endif
--- a/public/VectorPairHMM/src/main/c++/define-sse-float.h
+++ b/public/VectorPairHMM/src/main/c++/define-sse-float.h
@ -40,35 +40,35 @@
 #undef MASK_TYPE
 #undef MASK_ALL_ONES

-#undef VEC_EXTRACT_UNIT(__v1, __im)
-#undef VEC_INSERT_UNIT(__v1,__ins,__im)
-#undef SET_VEC_ZERO(__vec)
-#undef VEC_OR(__v1, __v2)
-#undef VEC_ADD(__v1, __v2)
-#undef VEC_SUB(__v1, __v2)
-#undef VEC_MUL(__v1, __v2)
-#undef VEC_DIV(__v1, __v2)
-#undef VEC_BLEND(__v1, __v2, __mask)
-#undef VEC_BLENDV(__v1, __v2, __maskV)
-#undef VEC_CAST_256_128(__v1)
-#undef VEC_EXTRACT_128(__v1, __im)
-#undef VEC_EXTRACT_UNIT(__v1, __im)
-#undef VEC_SET1_VAL128(__val)
-#undef VEC_MOVE(__v1, __val)
-#undef VEC_CAST_128_256(__v1)
-#undef VEC_INSERT_VAL(__v1, __val, __pos)
-#undef VEC_CVT_128_256(__v1)
-#undef VEC_SET1_VAL(__val)
-#undef VEC_POPCVT_CHAR(__ch)
-#undef VEC_LDPOPCVT_CHAR(__addr)
-#undef VEC_CMP_EQ(__v1, __v2)
-#undef VEC_SET_LSE(__val)
-#undef SHIFT_HAP(__v1, __val)
+#undef VEC_EXTRACT_UNIT
+#undef VEC_INSERT_UNIT
+#undef SET_VEC_ZERO
+#undef VEC_OR
+#undef VEC_ADD
+#undef VEC_SUB
+#undef VEC_MUL
+#undef VEC_DIV
+#undef VEC_BLEND
+#undef VEC_BLENDV
+#undef VEC_CAST_256_128
+#undef VEC_EXTRACT_128
+#undef VEC_EXTRACT_UNIT
+#undef VEC_SET1_VAL128
+#undef VEC_MOVE
+#undef VEC_CAST_128_256
+#undef VEC_INSERT_VAL
+#undef VEC_CVT_128_256
+#undef VEC_SET1_VAL
+#undef VEC_POPCVT_CHAR
+#undef VEC_LDPOPCVT_CHAR
+#undef VEC_CMP_EQ
+#undef VEC_SET_LSE
+#undef SHIFT_HAP
 #undef MASK_VEC
-#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
-#undef VEC_SHIFT_LEFT_1BIT(__vs)
+#undef VEC_SSE_TO_AVX
+#undef VEC_SHIFT_LEFT_1BIT
 #undef MASK_ALL_ONES
-#undef COMPARE_VECS(__v1, __v2)
+#undef COMPARE_VECS
 #undef _256_INT_TYPE
 #undef BITMASK_VEC
 #endif
--- a/public/VectorPairHMM/src/main/c++/jni_common.h
+++ b/public/VectorPairHMM/src/main/c++/jni_common.h
@ -37,7 +37,7 @@
 /*#define DUMP_TO_SANDBOX 1*/


-#define DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY 1
+/*#define DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY 1*/

 #ifdef DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY
 //Gets direct access to Java arrays
--- a/public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM.cc
+++ b/public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM.cc
@ -26,7 +26,6 @@
 #include "headers.h"
 #include "jni_common.h"
 #include "org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM.h"
-#include "template.h"
 #include "utils.h"
 #include "LoadTimeInitializer.h"
 #include "jnidebug.h"
--- a/public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.cc
+++ b/public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.cc
@ -26,7 +26,7 @@
 #include "headers.h"
 #include "jni_common.h"
 #include "org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.h"
-#include "template.h"
+//#include "template.h"
 #include "utils.h"
 #include "LoadTimeInitializer.h"

--- a/public/VectorPairHMM/src/main/c++/pairhmm-1-base.cc
+++ b/public/VectorPairHMM/src/main/c++/pairhmm-1-base.cc
@ -29,7 +29,6 @@ using namespace std;

 int main(int argc, char** argv)
 {
-#define BATCH_SIZE 10000
  if(argc < 2)
  {
    cerr << "Needs path to input file as argument\n";
--- a/public/VectorPairHMM/src/main/c++/pairhmm-template-main.cc
+++ b/public/VectorPairHMM/src/main/c++/pairhmm-template-main.cc
@ -24,12 +24,11 @@


 #include "headers.h"
-#include "template.h"
-#include "vector_defs.h"

 #define SIMD_ENGINE avx
 #define SIMD_ENGINE_AVX

+#include "utils.h"

 #define BATCH_SIZE  10000
 #define RUN_HYBRID
--- a/public/VectorPairHMM/src/main/c++/sse_function_instantiations.cc
+++ b/public/VectorPairHMM/src/main/c++/sse_function_instantiations.cc
@ -23,7 +23,6 @@
 */


-#include "template.h"

 #undef SIMD_ENGINE
 #undef SIMD_ENGINE_AVX
@ -31,6 +30,8 @@
 #define SIMD_ENGINE sse
 #define SIMD_ENGINE_SSE

+#include "template.h"
+
 #include "define-sse-float.h"
 #include "shift_template.c"
 #include "pairhmm-template-kernel.cc"
--- a/public/VectorPairHMM/src/main/c++/template.h
+++ b/public/VectorPairHMM/src/main/c++/template.h
@ -28,27 +28,17 @@

 #include "headers.h"

-#define MM 0
-#define GapM 1
-#define MX 2
-#define XX 3
-#define MY 4
-#define YY 5
-
-//#define MROWS  500
-//#define MCOLS  1000
-
-#define CAT(X,Y) X####Y
-#define CONCAT(X,Y) CAT(X,Y)

 #define ALIGNED __attribute__((aligned(32)))

+#ifdef SIMD_ENGINE_AVX
 typedef union __attribute__((aligned(32))) {
        ALIGNED __m256 ALIGNED d;
        ALIGNED __m128i ALIGNED s[2];
        ALIGNED float  ALIGNED f[8];
        ALIGNED __m256i ALIGNED i;
 } ALIGNED mix_F ALIGNED;
+#endif

 typedef union __attribute__((aligned(32))) {
        ALIGNED __m128 ALIGNED d;
@ -81,12 +71,14 @@ typedef union ALIGNED
        ALIGNED float  ALIGNED f;
 } ALIGNED IF_32 ALIGNED;

+#ifdef SIMD_ENGINE_AVX
 typedef union __attribute__((aligned(32))) {
        ALIGNED __m256d ALIGNED d;
        ALIGNED __m128i ALIGNED s[2];
        ALIGNED double  ALIGNED f[4];
        ALIGNED __m256i ALIGNED i;
 } ALIGNED mix_D ALIGNED;
+#endif

 typedef union __attribute__((aligned(32))) {
        ALIGNED __m128d ALIGNED d;
@ -120,200 +112,7 @@ typedef union ALIGNED
 } ALIGNED IF_64 ALIGNED;


-#define MAX_QUAL 254
-#define MAX_JACOBIAN_TOLERANCE 8.0
-#define JACOBIAN_LOG_TABLE_STEP 0.0001
-#define JACOBIAN_LOG_TABLE_INV_STEP (1.0 / JACOBIAN_LOG_TABLE_STEP)
-#define MAXN 70000
-#define LOG10_CACHE_SIZE  (4*MAXN)  // we need to be able to go up to 2*(2N) when calculating some of the coefficients
-#define JACOBIAN_LOG_TABLE_SIZE ((int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1)
-
-template<class NUMBER>
-struct ContextBase
-{
-  public:
-    NUMBER ph2pr[128];
-    NUMBER INITIAL_CONSTANT;
-    NUMBER LOG10_INITIAL_CONSTANT;
-    NUMBER RESULT_THRESHOLD; 
-
-    static bool staticMembersInitializedFlag;
-    static NUMBER jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE];
-    static NUMBER matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1];
-
-    static void initializeStaticMembers()
-    {
-      if(!staticMembersInitializedFlag)
-      {
-        //Order of calls important - Jacobian first, then MatchToMatch
-        initializeJacobianLogTable();
-        initializeMatchToMatchProb();
-        staticMembersInitializedFlag = true;
-      }
-    }
-
-    static void deleteStaticMembers()
-    {
-      if(staticMembersInitializedFlag)
-      {
-        staticMembersInitializedFlag = false;
-      }
-    }
-
-    //Called only once during library load - don't bother to optimize with single precision fp
-    static void initializeJacobianLogTable()
-    {
-      for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) {
-        jacobianLogTable[k] = (NUMBER)(log10(1.0 + pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP)));
-      }
-    }
-
-    //Called only once per library load - don't bother optimizing with single fp
-    static void initializeMatchToMatchProb()
-    {
-      double LN10 = log(10);
-      double INV_LN10 = 1.0/LN10;
-      for (int i = 0, offset = 0; i <= MAX_QUAL; offset += ++i)
-        for (int j = 0; j <= i; j++) {
-          double log10Sum = approximateLog10SumLog10(-0.1*i, -0.1*j);
-          double matchToMatchLog10 =
-            log1p(-std::min(1.0,pow(10,log10Sum))) * INV_LN10;
-          matchToMatchProb[offset + j] = (NUMBER)(pow(10,matchToMatchLog10));
-        }
-    }
-    //Called during computation - use single precision where possible
-    static int fastRound(NUMBER d) {
-      return (d > ((NUMBER)0.0)) ? (int) (d + ((NUMBER)0.5)) : (int) (d - ((NUMBER)0.5));
-    }
-    //Called during computation - use single precision where possible
-    static NUMBER approximateLog10SumLog10(NUMBER small, NUMBER big) {
-      // make sure small is really the smaller value
-      if (small > big) {
-        NUMBER t = big;
-        big = small;
-        small = t;
-      }
-
-      if (isinf(small) == -1 || isinf(big) == -1)
-        return big;
-
-      NUMBER diff = big - small;
-      if (diff >= ((NUMBER)MAX_JACOBIAN_TOLERANCE))
-        return big;
-
-      // OK, so |y-x| < tol: we use the following identity then:
-      // we need to compute log10(10^x + 10^y)
-      // By Jacobian logarithm identity, this is equal to
-      // max(x,y) + log10(1+10^-abs(x-y))
-      // we compute the second term as a table lookup with integer quantization
-      // we have pre-stored correction for 0,0.1,0.2,... 10.0
-      int ind = fastRound((NUMBER)(diff * ((NUMBER)JACOBIAN_LOG_TABLE_INV_STEP))); // hard rounding
-      return big + jacobianLogTable[ind];
-    }
-};
-
-template<class NUMBER>
-struct Context : public ContextBase<NUMBER>
-{};
-
-template<>
-struct Context<double> : public ContextBase<double>
-{
-  Context():ContextBase<double>()
-  {
-    for (int x = 0; x < 128; x++)
-      ph2pr[x] = pow(10.0, -((double)x) / 10.0);
-
-    INITIAL_CONSTANT = ldexp(1.0, 1020.0);
-    LOG10_INITIAL_CONSTANT = log10(INITIAL_CONSTANT);
-    RESULT_THRESHOLD = 0.0;
-  }
-
-  double LOG10(double v){ return log10(v); }
-  inline double POW(double b, double e) { return pow(b,e); }
-
-  static double _(double n){ return n; }
-  static double _(float n){ return ((double) n); }
-};
-
-template<>
-struct Context<float> : public ContextBase<float>
-{
-  Context() : ContextBase<float>()
-  {
-    for (int x = 0; x < 128; x++)
-    {
-      ph2pr[x] = powf(10.f, -((float)x) / 10.f);
-    }
-
-    INITIAL_CONSTANT = ldexpf(1.f, 120.f);
-    LOG10_INITIAL_CONSTANT = log10f(INITIAL_CONSTANT);
-    RESULT_THRESHOLD = ldexpf(1.f, -110.f);
-  }
-
-  float LOG10(float v){ return log10f(v); }
-  inline float POW(float b, float e) { return powf(b,e); }
-
-  static float _(double n){ return ((float) n); }
-  static float _(float n){ return n; }
-};
-
-#define SET_MATCH_TO_MATCH_PROB(output, insQual, delQual)                       \
-{                                                                               \
-  int minQual = delQual;                                                        \
-  int maxQual = insQual;                                                        \
-  if (insQual <= delQual)                                                       \
-  {                                                                             \
-    minQual = insQual;                                                          \
-    maxQual = delQual;                                                          \
-  }                                                                             \
-  (output) = (MAX_QUAL < maxQual) ?                                             \
-  ((NUMBER)1.0) - ctx.POW(((NUMBER)10), ctx.approximateLog10SumLog10(((NUMBER)-0.1)*minQual, ((NUMBER)-0.1)*maxQual))       \
-  : ctx.matchToMatchProb[((maxQual * (maxQual + 1)) >> 1) + minQual];           \
-}
-
-typedef struct
-{
-        int rslen, haplen;
-	/*int *q, *i, *d, *c;*/
-	/*int q[MROWS], i[MROWS], d[MROWS], c[MROWS];*/
-	char *q, *i, *d, *c;
-        char *hap, *rs;
-	int *ihap;
-	int *irs;
-} testcase;
-
-int normalize(char c);
-int read_testcase(testcase *tc, FILE* ifp=0);
-
-
-#define MIN_ACCEPTED 1e-28f
-#define NUM_DISTINCT_CHARS 5
-#define AMBIG_CHAR 4
-
-class ConvertChar {
-
-  static uint8_t conversionTable[255] ;
-
-public:
-
-  static void init() {
-    assert (NUM_DISTINCT_CHARS == 5) ;
-    assert (AMBIG_CHAR == 4) ;
-
-    conversionTable['A'] = 0 ;
-    conversionTable['C'] = 1 ;
-    conversionTable['T'] = 2 ;
-    conversionTable['G'] = 3 ;
-    conversionTable['N'] = 4 ;
-  }
-
-  static inline uint8_t get(uint8_t input) {
-    return conversionTable[input] ;
-  }
-
-};
-
+#include "common_data_structure.h"

 #endif

--- a/public/VectorPairHMM/src/main/c++/utils.cc
+++ b/public/VectorPairHMM/src/main/c++/utils.cc
@ -22,11 +22,8 @@
 *THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

-
 #include "headers.h"
-#include "template.h"
 #include "utils.h"
-#include "vector_defs.h"
 #include "LoadTimeInitializer.h"
 using namespace std;

@ -36,51 +33,107 @@ uint8_t ConvertChar::conversionTable[255];
 float (*g_compute_full_prob_float)(testcase *tc, float* before_last_log) = 0;
 double (*g_compute_full_prob_double)(testcase *tc, double* before_last_log) = 0;
 //Static members in ContextBase
+template<>
 bool ContextBase<double>::staticMembersInitializedFlag = false;
-double ContextBase<double>::jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE];
-double ContextBase<double>::matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1];
+template<>
+double ContextBase<double>::jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE] = { };
+template<>
+double ContextBase<double>::matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1] = { };
+template<>
 bool ContextBase<float>::staticMembersInitializedFlag = false;
-float ContextBase<float>::jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE];
-float ContextBase<float>::matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1];
+template<>
+float ContextBase<float>::jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE] = { };
+template<>
+float ContextBase<float>::matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1] = { };


+bool search_file_for_string(string filename, string search_string)
+{
+  ifstream fptr;
+  fptr.open(filename.c_str(),ios::in);
+  if(fptr.is_open())
+  {
+    string buffer;
+    buffer.clear();
+    buffer.resize(4096);
+    bool retvalue = false;
+    while(!fptr.eof())
+    {
+      fptr.getline(&(buffer[0]), 4096);
+      if(buffer.find(search_string) != string::npos)    //found string
+      {
+        retvalue = true;
+        break;
+      }
+    }
+    buffer.clear();
+    fptr.close();
+    return retvalue;
+  }
+  else
+    return false;
+}
+
+bool is_cpuid_ecx_bit_set(int eax, int bitidx)
+{
+  int ecx = 0, edx = 0, ebx = 0;
+  __asm__ ("cpuid"
+      :"=b" (ebx),
+      "=c" (ecx),
+      "=d" (edx)
+      :"a" (eax)
+      );
+  return (((ecx >> bitidx)&1) == 1);
+}
+
 bool is_avx_supported()
 {
-  return  (_may_i_use_cpu_feature(_FEATURE_AVX) > 0);
-  //int ecx = 0, edx = 0, ebx = 0;
-  //__asm__("cpuid"
-      //: "=b" (ebx),
-      //"=c" (ecx),
-      //"=d" (edx)
-      //: "a" (1)
-      //);
-  //return ((ecx >> 28)&1) == 1;
+#ifdef __INTEL_COMPILER
+  bool use_avx = _may_i_use_cpu_feature(_FEATURE_AVX);
+  if(use_avx)
+    return true;
+  else
+  {
+    //check if core supports AVX, but kernel does not and print info message
+    if(!is_cpuid_ecx_bit_set(1, 28))  //core does not support AVX
+      return false;
+    //else fall through to end of function
+  }
+#else
+  if(!__builtin_cpu_supports("avx"))  //core does not support AVX
+    return false;
+  else
+  {
+    //core supports AVX, check if kernel supports
+    if(search_file_for_string("/proc/cpuinfo","avx"))
+      return true;
+    //else fall through to end of function
+
+  }
+#endif  //__INTEL_COMPILER
+  clog << "INFO: Your CPU supports AVX vector instructions, but your kernel does not. Try upgrading to a kernel that supports AVX.\n";
+  clog << "INFO: Your program will run correctly, but slower than the AVX version\n";
+  return false;
 }

 bool is_sse41_supported()
 {
+#ifdef __INTEL_COMPILER
  return  (_may_i_use_cpu_feature(_FEATURE_SSE4_1) > 0);
-  //int ecx = 0, edx = 0, ebx = 0;
-  //__asm__("cpuid"
-      //: "=b" (ebx),
-      //"=c" (ecx),
-      //"=d" (edx)
-      //: "a" (1)
-      //);
-  //return ((ecx >> 19)&1) == 1;
+#else
+  return  __builtin_cpu_supports("sse4.1");
+#endif
+  //return is_cpuid_ecx_bit_set(1, 19);
 }

 bool is_sse42_supported()
 {
+#ifdef __INTEL_COMPILER
  return  (_may_i_use_cpu_feature(_FEATURE_SSE4_2) > 0);
-  //int ecx = 0, edx = 0, ebx = 0;
-  //__asm__("cpuid"
-      //: "=b" (ebx),
-      //"=c" (ecx),
-      //"=d" (edx)
-      //: "a" (1)
-      //);
-  //return ((ecx >> 20)&1) == 1;
+#else
+  return  __builtin_cpu_supports("sse4.2");
+#endif
+  //return is_cpuid_ecx_bit_set(1, 20);
 }

 uint64_t get_machine_capabilities()
@ -154,10 +207,14 @@ int read_testcase(testcase *tc, FILE* ifp)

 	tc->haplen = strlen(tc->hap);
 	tc->rslen = strlen(tc->rs);
-        assert(strlen(q) == tc->rslen);
-        assert(strlen(i) == tc->rslen);
-        assert(strlen(d) == tc->rslen);
-        assert(strlen(c) == tc->rslen);
+        assert(strlen(q) == (size_t)tc->rslen);
+        assert(strlen(i) == (size_t)tc->rslen);
+        assert(strlen(d) == (size_t)tc->rslen);
+        assert(strlen(c) == (size_t)tc->rslen);
+
+        g_load_time_initializer.update_stat(READ_LENGTH_IDX, tc->rslen); 
+        g_load_time_initializer.update_stat(HAPLOTYPE_LENGTH_IDX, tc->haplen);
+        g_load_time_initializer.update_stat(PRODUCT_READ_LENGTH_HAPLOTYPE_LENGTH_IDX, tc->haplen*tc->rslen);
 	//assert(tc->rslen < MROWS);
        //tc->ihap = (int *) malloc(tc->haplen*sizeof(int));
        //tc->irs = (int *) malloc(tc->rslen*sizeof(int));
@ -279,15 +336,15 @@ int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat)
  tc->c = new char[tc->rslen];
  //cout << "Lengths "<<tc->haplen <<" "<<tc->rslen<<"\n";
  memcpy(tc->rs, tokens[1].c_str(),tokens[1].size());
-  assert(tokens.size() == 2 + 4*(tc->rslen));
+  assert(tokens.size() == (size_t)(2 + 4*(tc->rslen)));
  //assert(tc->rslen < MROWS);
-  for(unsigned j=0;j<tc->rslen;++j)
+  for(int j=0;j<tc->rslen;++j)
    tc->q[j] = (char)convToInt(tokens[2+0*tc->rslen+j]);
-  for(unsigned j=0;j<tc->rslen;++j)
+  for(int j=0;j<tc->rslen;++j)
    tc->i[j] = (char)convToInt(tokens[2+1*tc->rslen+j]);
-  for(unsigned j=0;j<tc->rslen;++j)
+  for(int j=0;j<tc->rslen;++j)
    tc->d[j] = (char)convToInt(tokens[2+2*tc->rslen+j]);
-  for(unsigned j=0;j<tc->rslen;++j)
+  for(int j=0;j<tc->rslen;++j)
    tc->c[j] = (char)convToInt(tokens[2+3*tc->rslen+j]);
 
  if(reformat)
@ -297,16 +354,16 @@ int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat)
    assert(ofptr.is_open());
    ofptr << tokens[0] << " ";
    ofptr << tokens[1] << " ";
-    for(unsigned j=0;j<tc->rslen;++j)
+    for(int j=0;j<tc->rslen;++j)
      ofptr << ((char)(tc->q[j]+33));
    ofptr << " ";
-    for(unsigned j=0;j<tc->rslen;++j)
+    for(int j=0;j<tc->rslen;++j)
      ofptr << ((char)(tc->i[j]+33));
    ofptr << " ";
-    for(unsigned j=0;j<tc->rslen;++j)
+    for(int j=0;j<tc->rslen;++j)
      ofptr << ((char)(tc->d[j]+33));
    ofptr << " ";
-    for(unsigned j=0;j<tc->rslen;++j)
+    for(int j=0;j<tc->rslen;++j)
      ofptr << ((char)(tc->c[j]+33));
    ofptr << " 0 false\n";

@ -363,6 +420,10 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
    ifptr.open(filename);
    assert(ifptr.is_open());
  }
+#ifdef PRINT_PER_INTERVAL_TIMINGS
+  ofstream times_fptr;
+  times_fptr.open("native_timed_intervals.csv",ios::out);
+#endif
  vector<testcase> tc_vector;
  tc_vector.clear();
  testcase tc;
@ -377,11 +438,11 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
  uint32_t no_kernel_mask = (1 << 17);    //bit 16 user mode, bit 17 kernel mode
  PAPI_num_counters();
  int events[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
-  char* eventnames[NUM_PAPI_COUNTERS]=  { "cycles", "itlb_walk_cycles", "dtlb_load_walk_cycles", "dtlb_store_walk_cycles" };
+  char* eventnames[NUM_PAPI_COUNTERS]=  { "cycles", "l1_pending_miss", "lfb_hit", "l2_hit" };
  assert(PAPI_event_name_to_code("UNHALTED_REFERENCE_CYCLES:u=1:k=1",&(events[0])) == PAPI_OK);
-  assert(PAPI_event_name_to_code("ITLB_MISSES:WALK_DURATION",   &(events[1])) == PAPI_OK);
-  assert(PAPI_event_name_to_code("DTLB_LOAD_MISSES:WALK_DURATION",   &(events[2])) == PAPI_OK);
-  assert(PAPI_event_name_to_code("DTLB_STORE_MISSES:WALK_DURATION",   &(events[3])) == PAPI_OK);
+  assert(PAPI_event_name_to_code("L1D_PEND_MISS:OCCURRENCES",   &(events[1])) == PAPI_OK);
+  assert(PAPI_event_name_to_code("MEM_LOAD_UOPS_RETIRED:HIT_LFB",   &(events[2])) == PAPI_OK);
+  assert(PAPI_event_name_to_code("MEM_LOAD_UOPS_RETIRED:L2_HIT",   &(events[3])) == PAPI_OK);
  long long values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
  long long accum_values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
 #endif
@ -398,6 +459,9 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
      baseline_results_vec.clear();
      results_vec.resize(tc_vector.size());
      baseline_results_vec.resize(tc_vector.size());
+      g_load_time_initializer.update_stat(NUM_TESTCASES_IDX, tc_vector.size()); 
+      g_load_time_initializer.update_stat(NUM_READS_IDX, tc_vector.size()); 
+      g_load_time_initializer.update_stat(NUM_HAPLOTYPES_IDX, tc_vector.size()); 
      struct timespec start_time;
 #ifdef USE_PAPI
      assert(PAPI_start_counters(events, NUM_PAPI_COUNTERS) == PAPI_OK);
@ -429,7 +493,11 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
 #ifdef USE_PAPI
      assert(PAPI_stop_counters(values, NUM_PAPI_COUNTERS) == PAPI_OK);
 #endif
-      vector_compute_time +=  diff_time(start_time);
+      uint64_t curr_interval = diff_time(start_time);
+#ifdef PRINT_PER_INTERVAL_TIMINGS
+      times_fptr << curr_interval << "\n";
+#endif
+      vector_compute_time +=  curr_interval;
 #ifdef USE_PAPI
      for(unsigned k=0;k<NUM_PAPI_COUNTERS;++k)
        accum_values[k] += values[k];
@ -488,9 +556,12 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
  for(unsigned i=0;i<NUM_PAPI_COUNTERS;++i)
    cout << eventnames[i] << " : "<<accum_values[i]<<"\n";
 #endif
-
+#ifdef PRINT_PER_INTERVAL_TIMINGS
+  times_fptr.close();
+#endif
  if(use_old_read_testcase)
    fclose(fptr);
  else
    ifptr.close();
+  g_load_time_initializer.print_profiling();
 }
--- a/public/VectorPairHMM/src/main/c++/utils.h
+++ b/public/VectorPairHMM/src/main/c++/utils.h
@ -26,7 +26,7 @@
 #ifndef PAIRHMM_UTIL_H
 #define PAIRHMM_UTIL_H

-#include "template.h"
+#include "common_data_structure.h"

 template<class T>
 std::string to_string(T obj)
@ -50,6 +50,15 @@ extern double (*g_compute_full_prob_double)(testcase *tc, double* before_last_lo
 void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline);
 template<class NUMBER>
 NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log=0);
+template<class NUMBER>
+NUMBER compute_full_prob_avxd(testcase *tc, NUMBER *before_last_log=0);
+template<class NUMBER>
+NUMBER compute_full_prob_avxs(testcase *tc, NUMBER *before_last_log=0);
+template<class NUMBER>
+NUMBER compute_full_prob_ssed(testcase *tc, NUMBER *before_last_log=0);
+template<class NUMBER>
+NUMBER compute_full_prob_sses(testcase *tc, NUMBER *before_last_log=0);
+
 double getCurrClk();
 void get_time(struct timespec* x);
 uint64_t diff_time(struct timespec& prev_time);
@ -71,5 +80,6 @@ void do_compute(char* filename, bool use_old_read_testcase=true, unsigned chunk_
 /*#define DUMP_COMPUTE_VALUES 1*/
 #define BATCH_SIZE  10000
 #define RUN_HYBRID
+/*#define PRINT_PER_INTERVAL_TIMINGS 1*/

 #endif
--- a/public/VectorPairHMM/src/main/c++/vector_defs.h
+++ b/public/VectorPairHMM/src/main/c++/vector_defs.h
@ -1,55 +0,0 @@
-/*Copyright (c) 2012 The Broad Institute
-
-*Permission is hereby granted, free of charge, to any person
-*obtaining a copy of this software and associated documentation
-*files (the "Software"), to deal in the Software without
-*restriction, including without limitation the rights to use,
-*copy, modify, merge, publish, distribute, sublicense, and/or sell
-*copies of the Software, and to permit persons to whom the
-*Software is furnished to do so, subject to the following
-*conditions:
-
-*The above copyright notice and this permission notice shall be
-*included in all copies or substantial portions of the Software.
-
-*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
-*THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-
-#undef SIMD_ENGINE
-#undef SIMD_ENGINE_AVX
-#undef SIMD_ENGINE_SSE
-
-#define SIMD_ENGINE avx
-#define SIMD_ENGINE_AVX
-
-#include "define-float.h"
-#include "vector_function_prototypes.h"
-
-#include "define-double.h"
-#include "vector_function_prototypes.h"
-
-#undef  SIMD_ENGINE
-#undef  SIMD_ENGINE_AVX
-
-#define SIMD_ENGINE sse
-#define SIMD_ENGINE_SSE
-
-
-#include "define-sse-float.h"
-#include "vector_function_prototypes.h"
-
-#include "define-sse-double.h"
-#include "vector_function_prototypes.h"
-
-#undef SIMD_ENGINE
-#undef SIMD_ENGINE_AVX
-#undef SIMD_ENGINE_SSE
-
--- a/public/VectorPairHMM/src/main/c++/vector_function_prototypes.h
+++ b/public/VectorPairHMM/src/main/c++/vector_function_prototypes.h
@ -1,44 +0,0 @@
-/*Copyright (c) 2012 The Broad Institute
-
-*Permission is hereby granted, free of charge, to any person
-*obtaining a copy of this software and associated documentation
-*files (the "Software"), to deal in the Software without
-*restriction, including without limitation the rights to use,
-*copy, modify, merge, publish, distribute, sublicense, and/or sell
-*copies of the Software, and to permit persons to whom the
-*Software is furnished to do so, subject to the following
-*conditions:
-
-*The above copyright notice and this permission notice shall be
-*included in all copies or substantial portions of the Software.
-
-*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
-*THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-
-inline void CONCAT(CONCAT(_vector_shift,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut);
-inline void CONCAT(CONCAT(_vector_shift_last,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn);
-inline void CONCAT(CONCAT(precompute_masks_,SIMD_ENGINE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]);
-inline void CONCAT(CONCAT(init_masks_for_row_,SIMD_ENGINE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess);
-inline void CONCAT(CONCAT(update_masks_for_cols_,SIMD_ENGINE), PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt);
-inline void CONCAT(CONCAT(computeDistVec,SIMD_ENGINE), PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, SIMD_TYPE& distm, SIMD_TYPE& _1_distm, SIMD_TYPE& distmChosen);
-template<class NUMBER> inline void CONCAT(CONCAT(initializeVectors,SIMD_ENGINE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context<NUMBER> ctx, testcase *tc,  SIMD_TYPE *p_MM, SIMD_TYPE *p_GAPM, SIMD_TYPE *p_MX, SIMD_TYPE *p_XX, SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, SIMD_TYPE *distm1D);
-template<class NUMBER> inline void CONCAT(CONCAT(stripINITIALIZATION,SIMD_ENGINE), PRECISION)(
-    int stripIdx, Context<NUMBER> ctx, testcase *tc, SIMD_TYPE &pGAPM, SIMD_TYPE &pMM, SIMD_TYPE &pMX, SIMD_TYPE &pXX, SIMD_TYPE &pMY, SIMD_TYPE &pYY,
-    SIMD_TYPE &rs, UNION_TYPE &rsN, SIMD_TYPE &distm, SIMD_TYPE &_1_distm,  SIMD_TYPE *distm1D, SIMD_TYPE N_packed256, SIMD_TYPE *p_MM , SIMD_TYPE *p_GAPM , 
-    SIMD_TYPE *p_MX, SIMD_TYPE *p_XX , SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, 
-    UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM);
-inline SIMD_TYPE CONCAT(CONCAT(computeDISTM,SIMD_ENGINE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, SIMD_TYPE rs, UNION_TYPE rsN, SIMD_TYPE N_packed256, 
-    SIMD_TYPE distm, SIMD_TYPE _1_distm);
-inline void CONCAT(CONCAT(computeMXY,SIMD_ENGINE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, 
-    UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1, 
-    SIMD_TYPE pMM, SIMD_TYPE pGAPM, SIMD_TYPE pMX, SIMD_TYPE pXX, SIMD_TYPE pMY, SIMD_TYPE pYY, SIMD_TYPE distmSel);
-template<class NUMBER> NUMBER CONCAT(CONCAT(compute_full_prob_,SIMD_ENGINE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL);
-
--- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/PairHMM.java
+++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/PairHMM.java
@ -78,8 +78,9 @@ public abstract class PairHMM {
    protected double[] mLikelihoodArray;

    //profiling information
-    protected static final boolean doProfiling = true;
-    protected long computeTime = 0;
+    protected static Boolean doProfiling = true;
+    protected static long pairHMMComputeTime = 0;
+    protected long threadLocalPairHMMComputeTimeDiff = 0;
    protected long startTime = 0;

    /**
@ -207,7 +208,13 @@ public abstract class PairHMM {
            }
        }
        if(doProfiling)
-            computeTime += (System.nanoTime() - startTime);
+        {
+            threadLocalPairHMMComputeTimeDiff = (System.nanoTime() - startTime);
+            //synchronized(doProfiling)
+            {
+                pairHMMComputeTime += threadLocalPairHMMComputeTimeDiff;
+            }
+        }
        return likelihoodMap;
    }

@ -314,6 +321,17 @@ public abstract class PairHMM {
        return Math.min(haplotype1.length, haplotype2.length);
    }

+    /**
+     * Use number of threads to set doProfiling flag - doProfiling iff numThreads == 1
+     * This function should be called only during initialization phase - single thread phase of HC
+     */
+    public static void setNumberOfThreads(final int numThreads)
+    {
+        doProfiling = (numThreads == 1);
+        if(numThreads > 1)
+            logger.info("Performance profiling for PairHMM is disabled because HaplotypeCaller is being run with multiple threads (-nct>1) option\nProfiling is enabled only when running in single thread mode\n");
+    }
+
    /**
     * Return the results of the computeLikelihoods function
     */
@ -324,6 +342,6 @@ public abstract class PairHMM {
    public void close()
    {
        if(doProfiling)
-            System.out.println("Total compute time in PairHMM computeLikelihoods() : "+(computeTime*1e-9));
+            System.out.println("Total compute time in PairHMM computeLikelihoods() : "+(pairHMMComputeTime*1e-9));
    }
 }
--- a/public/sting-utils/src/main/resources/org/broadinstitute/sting/utils/pairhmm/libVectorLoglessPairHMM.so
+++ b/public/sting-utils/src/main/resources/org/broadinstitute/sting/utils/pairhmm/libVectorLoglessPairHMM.so