Merge pull request #573 from broadinstitute/intel_pairhmm

Intel pairhmm
2014-05-05 16:27:04 -04:00 · 2014-05-05 16:27:04 -04:00 · 587e81fbd9
parent 38b7cfbda9 f6ea25b4d1
commit 587e81fbd9
31 changed files with 548 additions and 527 deletions
--- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@ -417,7 +417,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
     */
    @Hidden
    @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false)
-    public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING;
+    public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.VECTOR_LOGLESS_CACHING;

    @Hidden
    @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
@ -639,6 +639,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
            logger.info("Using global mismapping rate of " + phredScaledGlobalReadMismappingRate + " => " + log10GlobalReadMismappingRate + " in log10 likelihood units");
        }

+        //static member function - set number of threads
+        PairHMM.setNumberOfThreads(getToolkit().getTotalNumberOfThreads());
        // create our likelihood calculation engine
        likelihoodCalculationEngine = createLikelihoodCalculationEngine();

--- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java
@ -58,6 +58,7 @@ import java.util.HashMap;
 */
 public abstract class JNILoglessPairHMM extends LoglessPairHMM {
    public abstract HashMap<Haplotype, Integer> getHaplotypeToHaplotypeListIdxMap();
-    protected long setupTime = 0;
+    protected long threadLocalSetupTimeDiff = 0;
+    protected static long pairHMMSetupTime = 0;
    
 }
--- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/VectorLoglessPairHMM.java
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/VectorLoglessPairHMM.java
@ -228,7 +228,7 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM {

        mLikelihoodArray = new double[readListSize*numHaplotypes];      //to store results
        if(doProfiling)
-            setupTime += (System.nanoTime() - startTime);
+            threadLocalSetupTimeDiff = (System.nanoTime() - startTime);
        //for(reads)
        //   for(haplotypes)
        //       compute_full_prob()
@ -251,7 +251,14 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM {
            readIdx += numHaplotypes;
        }
        if(doProfiling)
-            computeTime += (System.nanoTime() - startTime);
+        {
+            threadLocalPairHMMComputeTimeDiff = (System.nanoTime() - startTime);
+            //synchronized(doProfiling)
+            {
+                pairHMMComputeTime += threadLocalPairHMMComputeTimeDiff;
+                pairHMMSetupTime += threadLocalSetupTimeDiff;
+            }
+        }
        return likelihoodMap;
    }
    
@ -262,7 +269,8 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM {
    @Override
    public void close()
    {
-        System.out.println("Time spent in setup for JNI call : "+(setupTime*1e-9));
+        if(doProfiling)
+            System.out.println("Time spent in setup for JNI call : "+(pairHMMSetupTime*1e-9));
        super.close();
        jniClose();
    }
--- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
+++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
@ -358,7 +358,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
        final File outputVCFNoQD = executeTest("testQualByDepth calling without QD", specNoQD).getFirst().get(0);

        final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoQD.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A QualByDepth";
-        final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("4ccdbebcfd02be87ae5b4ad94666f011"));
+        final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("214799b5d1c0809f042c557155ea6e13"));
        specAnn.disableShadowBCF();
        final File outputVCFAnn = executeTest("testQualByDepth re-annotation of QD", specAnn).getFirst().get(0);

--- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java
+++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java
@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa

    @Test
    public void testHaplotypeCallerMultiSampleComplex1() {
-        HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "65c316f1f3987d7bc94e887999920d45");
+        HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "d1626d5fff419b8a782de954224e881f");
    }

    private void HCTestSymbolicVariants(String bam, String args, String md5) {
--- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
+++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
@ -284,7 +284,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
    public void HCTestConservativePcrIndelModelWGS() {
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
-                Arrays.asList("7592274ecd2b5ef4624dd2ed659536ef"));
+                Arrays.asList("1b41bf83b9c249a9a5fffb1e308668c1"));
        executeTest("HC calling with conservative indel error modeling on WGS intervals", spec);
    }

--- a/public/VectorPairHMM/README.md
+++ b/public/VectorPairHMM/README.md
@ -44,9 +44,10 @@ end of every region (line 351 in PairHMMLikelihoodCalculationEngine).
 Note: Debug code has been moved to a separate class DebugJNILoglessPairHMM.java.

 Compiling:
-Make sure you have icc (Intel C compiler) available. Currently, gcc does not seem to support all AVX 
-intrinsics.
-This native library is called libVectorLoglessPairHMM.so
+The native library (called libVectorLoglessPairHMM.so) can be compiled with icc (Intel C compiler) 
+or gcc versions >= 4.8.1 that support AVX intrinsics. By default, the make process tries to invoke 
+icc. To use gcc, edit the file 'pom.xml' (in this directory) and enable the environment variables 
+USE_GCC,C_COMPILER and CPP_COMPILER (edit and uncomment lines 60-62).
 Using Maven:
 Type 'mvn install' in this directory - this will build the library (by invoking 'make') and copy the 
 native library to the directory 
--- a/public/VectorPairHMM/pom.xml
+++ b/public/VectorPairHMM/pom.xml
@ -5,7 +5,7 @@
    <parent>
        <groupId>org.broadinstitute.sting</groupId>
        <artifactId>sting-root</artifactId>
-        <version>2.8-SNAPSHOT</version>
+        <version>3.2-SNAPSHOT</version>
        <relativePath>../../public/sting-root</relativePath>
    </parent>

@ -57,6 +57,9 @@
                            <workingDirectory>src/main/c++</workingDirectory>
                            <environmentVariables>
                                <JRE_HOME>${java.home}</JRE_HOME>
+                                <!--<USE_GCC>1</USE_GCC>-->
+                                <!--<C_COMPILER>/opt/gcc-4.8.2/bin/gcc</C_COMPILER>-->
+                                <!--<CPP_COMPILER>/opt/gcc-4.8.2/bin/g++</CPP_COMPILER>-->
                                <OUTPUT_DIR>${project.build.directory}</OUTPUT_DIR>
                            </environmentVariables>
                        </configuration>
--- a/public/VectorPairHMM/src/main/c++/LoadTimeInitializer.cc
+++ b/public/VectorPairHMM/src/main/c++/LoadTimeInitializer.cc
@ -23,8 +23,8 @@
 */


-#include "LoadTimeInitializer.h"
 #include "utils.h"
+#include "LoadTimeInitializer.h"
 using namespace std;
 char* LoadTimeInitializerStatsNames[] = 
 {
@ -43,6 +43,10 @@ LoadTimeInitializer g_load_time_initializer;

 LoadTimeInitializer::LoadTimeInitializer()		//will be called when library is loaded
 {
+#if (defined(__GNUC__) || defined(__GNUG__)) && !defined(__INTEL_COMPILER)
+  //compiles only with gcc >= 4.8
+  __builtin_cpu_init();
+#endif
  ConvertChar::init();
 #ifndef DISABLE_FTZ
  //Very important to get good performance on Intel processors
@ -71,11 +75,6 @@ LoadTimeInitializer::LoadTimeInitializer()		//will be called when library is loa
  m_filename_to_fptr.clear();
  m_written_files_set.clear();

-  //Common buffer - 8MB
-  unsigned size = 1024*1024;
-  m_buffer = new uint64_t[size];
-  m_buffer_size = size*sizeof(uint64_t);
-  
  initialize_function_pointers();

  //Initialize static members of class
--- a/public/VectorPairHMM/src/main/c++/LoadTimeInitializer.h
+++ b/public/VectorPairHMM/src/main/c++/LoadTimeInitializer.h
@ -27,7 +27,7 @@
 #define LOAD_TIME_INITIALIZER_H
 #include "headers.h"
 #include <jni.h>
-#include "template.h"
+/*#include "template.h"*/

 enum LoadTimeInitializerStatsEnum
 {
@ -47,10 +47,6 @@ class LoadTimeInitializer
 {
  public:
    LoadTimeInitializer();		//will be called when library is loaded
-    ~LoadTimeInitializer()
-    {
-      delete[] m_buffer;
-    }
    void print_profiling();
    void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true);
    void debug_close();
@ -72,8 +68,6 @@ class LoadTimeInitializer
    uint64_t m_data_transfer_time;
    //bytes copied
    uint64_t m_bytes_copied;
-    unsigned get_buffer_size() { return m_buffer_size; }
-    char* get_buffer() { return (char*)m_buffer; }
  private:
    std::map<std::string, std::ofstream*> m_filename_to_fptr;
    std::set<std::string> m_written_files_set;
@ -83,8 +77,6 @@ class LoadTimeInitializer
    double m_sum_square_stats[TOTAL_NUMBER_STATS];
    uint64_t m_min_stats[TOTAL_NUMBER_STATS];
    uint64_t m_max_stats[TOTAL_NUMBER_STATS];
-    unsigned m_buffer_size;
-    uint64_t* m_buffer;
 };
 extern LoadTimeInitializer g_load_time_initializer;

--- a/public/VectorPairHMM/src/main/c++/Makefile
+++ b/public/VectorPairHMM/src/main/c++/Makefile
@ -32,14 +32,30 @@
 JRE_HOME?=/opt/jdk1.7.0_25/jre
 JNI_COMPILATION_FLAGS=-D_REENTRANT -fPIC -I${JRE_HOME}/../include -I${JRE_HOME}/../include/linux

-COMMON_COMPILATION_FLAGS=$(JNI_COMPILATION_FLAGS) -O3 -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas
-CC=icc
-CXX=icc
+#COMMON_COMPILATION_FLAGS=$(JNI_COMPILATION_FLAGS) -O3 -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas
+COMMON_COMPILATION_FLAGS=$(JNI_COMPILATION_FLAGS) -O3 -Wall $(OMPCFLAGS) -Wno-unknown-pragmas -Wno-write-strings -Wno-unused-variable -Wno-unused-but-set-variable
+ifdef DISABLE_FTZ
+  COMMON_COMPILATION_FLAGS+=-DDISABLE_FTZ
+endif
+
+ifdef USE_GCC
+  C_COMPILER?=gcc
+  CPP_COMPILER?=g++
+  AVX_FLAGS=-mavx
+  SSE41_FLAGS=-msse4.1
+  COMMON_COMPILATION_FLAGS+=-Wno-char-subscripts
+else
+  C_COMPILER?=icc
+  CPP_COMPILER?=icc
+  AVX_FLAGS=-xAVX
+  SSE41_FLAGS=-xSSE4.1
+  LIBFLAGS=-static-intel
+  ifdef DISABLE_FTZ
+    COMMON_COMPILATION_FLAGS+=-no-ftz
+  endif
+endif

 LDFLAGS=-lm -lrt $(OMPLDFLAGS)
-ifdef DISABLE_FTZ
-  COMMON_COMPILATION_FLAGS+=-DDISABLE_FTZ -no-ftz
-endif

 PAPI_DIR=/home/karthikg/softwares/papi-5.3.0
 ifdef USE_PAPI
@ -49,10 +65,6 @@ ifdef USE_PAPI
  endif
 endif

-ifdef DISABLE_FTZ
-  COMMON_COMPILATION_FLAGS+=-DDISABLE_FTZ -no-ftz
-endif
-
 BIN=libVectorLoglessPairHMM.so pairhmm-template-main checker
 #BIN=checker

@ -79,8 +91,8 @@ NO_VECTOR_OBJECTS=$(NO_VECTOR_SOURCES:.cc=.o)
 AVX_OBJECTS=$(AVX_SOURCES:.cc=.o)
 SSE_OBJECTS=$(SSE_SOURCES:.cc=.o)
 $(NO_VECTOR_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS)
-$(AVX_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) -xAVX
-$(SSE_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) -xSSE4.2
+$(AVX_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) $(AVX_FLAGS)
+$(SSE_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) $(SSE41_FLAGS)
 OBJECTS=$(NO_VECTOR_OBJECTS) $(AVX_OBJECTS) $(SSE_OBJECTS)

 all: $(BIN) Sandbox.class copied_lib
@ -88,18 +100,18 @@ all: $(BIN) Sandbox.class copied_lib
 -include $(addprefix $(DEPDIR)/,$(SOURCES:.cc=.d))

 checker: pairhmm-1-base.o $(COMMON_OBJECTS)
-	$(CXX) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)
+	$(CPP_COMPILER) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)

 pairhmm-template-main:	pairhmm-template-main.o $(COMMON_OBJECTS)
-	$(CXX) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)
+	$(CPP_COMPILER) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)

 libVectorLoglessPairHMM.so: $(LIBOBJECTS) 
-	$(CXX) $(OMPLFLAGS) -shared -static-intel -o $@ $(LIBOBJECTS) ${LDFLAGS}
+	$(CPP_COMPILER) $(OMPLFLAGS) -shared $(LIBFLAGS) -o $@ $(LIBOBJECTS) ${LDFLAGS}


 $(OBJECTS): %.o: %.cc
 	@mkdir -p $(DEPDIR)
-	$(CXX) -c -MMD -MF $(DF) $(CXXFLAGS) $(OUTPUT_OPTION) $<
+	$(CPP_COMPILER) -c -MMD -MF $(DF) $(CXXFLAGS) $(OUTPUT_OPTION) $<

 Sandbox.class: Sandbox.java
 	javac Sandbox.java
--- a/public/VectorPairHMM/src/main/c++/avx_function_instantiations.cc
+++ b/public/VectorPairHMM/src/main/c++/avx_function_instantiations.cc
@ -23,7 +23,6 @@
 */


-#include "template.h"

 #undef SIMD_ENGINE
 #undef SIMD_ENGINE_SSE
@ -31,6 +30,8 @@
 #define SIMD_ENGINE avx
 #define SIMD_ENGINE_AVX

+#include "template.h"
+
 #include "define-float.h"
 #include "shift_template.c"
 #include "pairhmm-template-kernel.cc"
--- a/public/VectorPairHMM/src/main/c++/baseline.cc
+++ b/public/VectorPairHMM/src/main/c++/baseline.cc
@ -24,7 +24,7 @@


 #include "headers.h"
-#include "template.h"
+#include "common_data_structure.h"
 #include "utils.h"
 #include "LoadTimeInitializer.h"
 using namespace std;
@ -48,17 +48,8 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log)
  //a straightforward array of pointers ensures that all data lies 'close' in memory, increasing
  //the chance of being stored together in the cache. Also, prefetchers can learn memory access
  //patterns for 2D arrays, not possible for array of pointers
-  //bool locally_allocated = false;
  //NUMBER* common_buffer = 0;
  NUMBER* common_buffer = new NUMBER[3*ROWS*COLS + ROWS*6];
-  //unsigned curr_size = sizeof(NUMBER)*(3*ROWS*COLS + ROWS*6);
-  //if(true)
-  //{
-  //common_buffer = new NUMBER[3*ROWS*COLS + ROWS*6];
-  //locally_allocated = true;
-  //}
-  //else
-  //common_buffer = (NUMBER*)(g_load_time_initializer.get_buffer()); 
  //pointers to within the allocated buffer
  NUMBER** common_pointer_buffer = new NUMBER*[4*ROWS];
  NUMBER* ptr = common_buffer;
@ -154,7 +145,6 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log)

 #ifndef USE_STACK_ALLOCATION
  delete[] common_pointer_buffer;
-  //if(locally_allocated)
  delete[] common_buffer;
 #endif

--- a/public/VectorPairHMM/src/main/c++/common_data_structure.h
+++ b/public/VectorPairHMM/src/main/c++/common_data_structure.h
@ -0,0 +1,215 @@
+#ifndef COMMON_DATA_STRUCTURE_H
+#define COMMON_DATA_STRUCTURE_H
+
+#include "headers.h"
+
+#define CAT(X,Y) X####Y
+#define CONCAT(X,Y) CAT(X,Y)
+
+#define MM 0
+#define GapM 1
+#define MX 2
+#define XX 3
+#define MY 4
+#define YY 5
+
+//#define MROWS  500
+//#define MCOLS  1000
+
+
+#define MAX_QUAL 254
+#define MAX_JACOBIAN_TOLERANCE 8.0
+#define JACOBIAN_LOG_TABLE_STEP 0.0001
+#define JACOBIAN_LOG_TABLE_INV_STEP (1.0 / JACOBIAN_LOG_TABLE_STEP)
+#define MAXN 70000
+#define LOG10_CACHE_SIZE  (4*MAXN)  // we need to be able to go up to 2*(2N) when calculating some of the coefficients
+#define JACOBIAN_LOG_TABLE_SIZE ((int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1)
+
+template<class NUMBER>
+struct ContextBase
+{
+  public:
+    NUMBER ph2pr[128];
+    NUMBER INITIAL_CONSTANT;
+    NUMBER LOG10_INITIAL_CONSTANT;
+    NUMBER RESULT_THRESHOLD; 
+
+    static bool staticMembersInitializedFlag;
+    static NUMBER jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE];
+    static NUMBER matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1];
+
+    static void initializeStaticMembers()
+    {
+      if(!staticMembersInitializedFlag)
+      {
+        //Order of calls important - Jacobian first, then MatchToMatch
+        initializeJacobianLogTable();
+        initializeMatchToMatchProb();
+        staticMembersInitializedFlag = true;
+      }
+    }
+
+    static void deleteStaticMembers()
+    {
+      if(staticMembersInitializedFlag)
+      {
+        staticMembersInitializedFlag = false;
+      }
+    }
+
+    //Called only once during library load - don't bother to optimize with single precision fp
+    static void initializeJacobianLogTable()
+    {
+      for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) {
+        jacobianLogTable[k] = (NUMBER)(log10(1.0 + pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP)));
+      }
+    }
+
+    //Called only once per library load - don't bother optimizing with single fp
+    static void initializeMatchToMatchProb()
+    {
+      double LN10 = log(10);
+      double INV_LN10 = 1.0/LN10;
+      for (int i = 0, offset = 0; i <= MAX_QUAL; offset += ++i)
+        for (int j = 0; j <= i; j++) {
+          double log10Sum = approximateLog10SumLog10(-0.1*i, -0.1*j);
+          double matchToMatchLog10 =
+            log1p(-std::min(1.0,pow(10,log10Sum))) * INV_LN10;
+          matchToMatchProb[offset + j] = (NUMBER)(pow(10,matchToMatchLog10));
+        }
+    }
+    //Called during computation - use single precision where possible
+    static int fastRound(NUMBER d) {
+      return (d > ((NUMBER)0.0)) ? (int) (d + ((NUMBER)0.5)) : (int) (d - ((NUMBER)0.5));
+    }
+    //Called during computation - use single precision where possible
+    static NUMBER approximateLog10SumLog10(NUMBER small, NUMBER big) {
+      // make sure small is really the smaller value
+      if (small > big) {
+        NUMBER t = big;
+        big = small;
+        small = t;
+      }
+
+      if (isinf(small) == -1 || isinf(big) == -1)
+        return big;
+
+      NUMBER diff = big - small;
+      if (diff >= ((NUMBER)MAX_JACOBIAN_TOLERANCE))
+        return big;
+
+      // OK, so |y-x| < tol: we use the following identity then:
+      // we need to compute log10(10^x + 10^y)
+      // By Jacobian logarithm identity, this is equal to
+      // max(x,y) + log10(1+10^-abs(x-y))
+      // we compute the second term as a table lookup with integer quantization
+      // we have pre-stored correction for 0,0.1,0.2,... 10.0
+      int ind = fastRound((NUMBER)(diff * ((NUMBER)JACOBIAN_LOG_TABLE_INV_STEP))); // hard rounding
+      return big + jacobianLogTable[ind];
+    }
+};
+
+template<class NUMBER>
+struct Context : public ContextBase<NUMBER>
+{};
+
+template<>
+struct Context<double> : public ContextBase<double>
+{
+  Context():ContextBase<double>()
+  {
+    for (int x = 0; x < 128; x++)
+      ph2pr[x] = pow(10.0, -((double)x) / 10.0);
+
+    INITIAL_CONSTANT = ldexp(1.0, 1020.0);
+    LOG10_INITIAL_CONSTANT = log10(INITIAL_CONSTANT);
+    RESULT_THRESHOLD = 0.0;
+  }
+
+  double LOG10(double v){ return log10(v); }
+  inline double POW(double b, double e) { return pow(b,e); }
+
+  static double _(double n){ return n; }
+  static double _(float n){ return ((double) n); }
+};
+
+template<>
+struct Context<float> : public ContextBase<float>
+{
+  Context() : ContextBase<float>()
+  {
+    for (int x = 0; x < 128; x++)
+    {
+      ph2pr[x] = powf(10.f, -((float)x) / 10.f);
+    }
+
+    INITIAL_CONSTANT = ldexpf(1.f, 120.f);
+    LOG10_INITIAL_CONSTANT = log10f(INITIAL_CONSTANT);
+    RESULT_THRESHOLD = ldexpf(1.f, -110.f);
+  }
+
+  float LOG10(float v){ return log10f(v); }
+  inline float POW(float b, float e) { return powf(b,e); }
+
+  static float _(double n){ return ((float) n); }
+  static float _(float n){ return n; }
+};
+
+#define SET_MATCH_TO_MATCH_PROB(output, insQual, delQual)                       \
+{                                                                               \
+  int minQual = delQual;                                                        \
+  int maxQual = insQual;                                                        \
+  if (insQual <= delQual)                                                       \
+  {                                                                             \
+    minQual = insQual;                                                          \
+    maxQual = delQual;                                                          \
+  }                                                                             \
+  (output) = (MAX_QUAL < maxQual) ?                                             \
+  ((NUMBER)1.0) - ctx.POW(((NUMBER)10), ctx.approximateLog10SumLog10(((NUMBER)-0.1)*minQual, ((NUMBER)-0.1)*maxQual))       \
+  : ctx.matchToMatchProb[((maxQual * (maxQual + 1)) >> 1) + minQual];           \
+}
+
+
+
+typedef struct
+{
+        int rslen, haplen;
+	/*int *q, *i, *d, *c;*/
+	/*int q[MROWS], i[MROWS], d[MROWS], c[MROWS];*/
+	char *q, *i, *d, *c;
+        char *hap, *rs;
+	int *ihap;
+	int *irs;
+} testcase;
+
+#define MIN_ACCEPTED 1e-28f
+#define NUM_DISTINCT_CHARS 5
+#define AMBIG_CHAR 4
+
+class ConvertChar {
+
+  static uint8_t conversionTable[255] ;
+
+public:
+
+  static void init() {
+    assert (NUM_DISTINCT_CHARS == 5) ;
+    assert (AMBIG_CHAR == 4) ;
+
+    conversionTable['A'] = 0 ;
+    conversionTable['C'] = 1 ;
+    conversionTable['T'] = 2 ;
+    conversionTable['G'] = 3 ;
+    conversionTable['N'] = 4 ;
+  }
+
+  static inline uint8_t get(uint8_t input) {
+    return conversionTable[input] ;
+  }
+
+};
+
+int normalize(char c);
+int read_testcase(testcase *tc, FILE* ifp=0);
+
+#endif
--- a/public/VectorPairHMM/src/main/c++/define-double.h
+++ b/public/VectorPairHMM/src/main/c++/define-double.h
@ -42,33 +42,33 @@
 #undef MASK_TYPE
 #undef MASK_ALL_ONES

-#undef SET_VEC_ZERO(__vec)
-#undef VEC_OR(__v1, __v2)
-#undef VEC_ADD(__v1, __v2)
-#undef VEC_SUB(__v1, __v2)
-#undef VEC_MUL(__v1, __v2)
-#undef VEC_DIV(__v1, __v2)
-#undef VEC_BLEND(__v1, __v2, __mask)
-#undef VEC_BLENDV(__v1, __v2, __maskV)
-#undef VEC_CAST_256_128(__v1)
-#undef VEC_EXTRACT_128(__v1, __im)
-#undef VEC_EXTRACT_UNIT(__v1, __im)
-#undef VEC_SET1_VAL128(__val)
-#undef VEC_MOVE(__v1, __val)
-#undef VEC_CAST_128_256(__v1)
-#undef VEC_INSERT_VAL(__v1, __val, __pos)
-#undef VEC_CVT_128_256(__v1)
-#undef VEC_SET1_VAL(__val)
-#undef VEC_POPCVT_CHAR(__ch)
-#undef VEC_LDPOPCVT_CHAR(__addr)
-#undef VEC_CMP_EQ(__v1, __v2)
-#undef VEC_SET_LSE(__val)
-#undef SHIFT_HAP(__v1, __val)
+#undef SET_VEC_ZERO
+#undef VEC_OR
+#undef VEC_ADD
+#undef VEC_SUB
+#undef VEC_MUL
+#undef VEC_DIV
+#undef VEC_BLEND
+#undef VEC_BLENDV
+#undef VEC_CAST_256_128
+#undef VEC_EXTRACT_128
+#undef VEC_EXTRACT_UNIT
+#undef VEC_SET1_VAL128
+#undef VEC_MOVE
+#undef VEC_CAST_128_256
+#undef VEC_INSERT_VAL
+#undef VEC_CVT_128_256
+#undef VEC_SET1_VAL
+#undef VEC_POPCVT_CHAR
+#undef VEC_LDPOPCVT_CHAR
+#undef VEC_CMP_EQ
+#undef VEC_SET_LSE
+#undef SHIFT_HAP
 #undef MASK_VEC
-#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
-#undef VEC_SHIFT_LEFT_1BIT(__vs)
+#undef VEC_SSE_TO_AVX
+#undef VEC_SHIFT_LEFT_1BIT
 #undef MASK_ALL_ONES
-#undef COMPARE_VECS(__v1, __v2)
+#undef COMPARE_VECS
 #undef _256_INT_TYPE
 #undef BITMASK_VEC
 #endif
--- a/public/VectorPairHMM/src/main/c++/define-float.h
+++ b/public/VectorPairHMM/src/main/c++/define-float.h
@ -42,33 +42,33 @@
 #undef MASK_TYPE
 #undef MASK_ALL_ONES

-#undef SET_VEC_ZERO(__vec)
-#undef VEC_OR(__v1, __v2)
-#undef VEC_ADD(__v1, __v2)
-#undef VEC_SUB(__v1, __v2)
-#undef VEC_MUL(__v1, __v2)
-#undef VEC_DIV(__v1, __v2)
-#undef VEC_BLEND(__v1, __v2, __mask)
-#undef VEC_BLENDV(__v1, __v2, __maskV)
-#undef VEC_CAST_256_128(__v1)
-#undef VEC_EXTRACT_128(__v1, __im)
-#undef VEC_EXTRACT_UNIT(__v1, __im)
-#undef VEC_SET1_VAL128(__val)
-#undef VEC_MOVE(__v1, __val)
-#undef VEC_CAST_128_256(__v1)
-#undef VEC_INSERT_VAL(__v1, __val, __pos)
-#undef VEC_CVT_128_256(__v1)
-#undef VEC_SET1_VAL(__val)
-#undef VEC_POPCVT_CHAR(__ch)
-#undef VEC_LDPOPCVT_CHAR(__addr)
-#undef VEC_CMP_EQ(__v1, __v2)
-#undef VEC_SET_LSE(__val)
-#undef SHIFT_HAP(__v1, __val)
+#undef SET_VEC_ZERO
+#undef VEC_OR
+#undef VEC_ADD
+#undef VEC_SUB
+#undef VEC_MUL
+#undef VEC_DIV
+#undef VEC_BLEND
+#undef VEC_BLENDV
+#undef VEC_CAST_256_128
+#undef VEC_EXTRACT_128
+#undef VEC_EXTRACT_UNIT
+#undef VEC_SET1_VAL128
+#undef VEC_MOVE
+#undef VEC_CAST_128_256
+#undef VEC_INSERT_VAL
+#undef VEC_CVT_128_256
+#undef VEC_SET1_VAL
+#undef VEC_POPCVT_CHAR
+#undef VEC_LDPOPCVT_CHAR
+#undef VEC_CMP_EQ
+#undef VEC_SET_LSE
+#undef SHIFT_HAP
 #undef MASK_VEC
-#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
-#undef VEC_SHIFT_LEFT_1BIT(__vs)
+#undef VEC_SSE_TO_AVX
+#undef VEC_SHIFT_LEFT_1BIT
 #undef MASK_ALL_ONES
-#undef COMPARE_VECS(__v1, __v2)
+#undef COMPARE_VECS
 #undef _256_INT_TYPE
 #undef BITMASK_VEC
 #endif
--- a/public/VectorPairHMM/src/main/c++/define-sse-double.h
+++ b/public/VectorPairHMM/src/main/c++/define-sse-double.h
@ -40,35 +40,35 @@
 #undef MASK_TYPE
 #undef MASK_ALL_ONES

-#undef VEC_EXTRACT_UNIT(__v1, __im)
-#undef VEC_INSERT_UNIT(__v1,__ins,__im)
-#undef SET_VEC_ZERO(__vec)
-#undef VEC_OR(__v1, __v2)
-#undef VEC_ADD(__v1, __v2)
-#undef VEC_SUB(__v1, __v2)
-#undef VEC_MUL(__v1, __v2)
-#undef VEC_DIV(__v1, __v2)
-#undef VEC_BLEND(__v1, __v2, __mask)
-#undef VEC_BLENDV(__v1, __v2, __maskV)
-#undef VEC_CAST_256_128(__v1)
-#undef VEC_EXTRACT_128(__v1, __im)
-#undef VEC_EXTRACT_UNIT(__v1, __im)
-#undef VEC_SET1_VAL128(__val)
-#undef VEC_MOVE(__v1, __val)
-#undef VEC_CAST_128_256(__v1)
-#undef VEC_INSERT_VAL(__v1, __val, __pos)
-#undef VEC_CVT_128_256(__v1)
-#undef VEC_SET1_VAL(__val)
-#undef VEC_POPCVT_CHAR(__ch)
-#undef VEC_LDPOPCVT_CHAR(__addr)
-#undef VEC_CMP_EQ(__v1, __v2)
-#undef VEC_SET_LSE(__val)
-#undef SHIFT_HAP(__v1, __val)
+#undef VEC_EXTRACT_UNIT
+#undef VEC_INSERT_UNIT
+#undef SET_VEC_ZERO
+#undef VEC_OR
+#undef VEC_ADD
+#undef VEC_SUB
+#undef VEC_MUL
+#undef VEC_DIV
+#undef VEC_BLEND
+#undef VEC_BLENDV
+#undef VEC_CAST_256_128
+#undef VEC_EXTRACT_128
+#undef VEC_EXTRACT_UNIT
+#undef VEC_SET1_VAL128
+#undef VEC_MOVE
+#undef VEC_CAST_128_256
+#undef VEC_INSERT_VAL
+#undef VEC_CVT_128_256
+#undef VEC_SET1_VAL
+#undef VEC_POPCVT_CHAR
+#undef VEC_LDPOPCVT_CHAR
+#undef VEC_CMP_EQ
+#undef VEC_SET_LSE
+#undef SHIFT_HAP
 #undef MASK_VEC
-#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
-#undef VEC_SHIFT_LEFT_1BIT(__vs)
+#undef VEC_SSE_TO_AVX
+#undef VEC_SHIFT_LEFT_1BIT
 #undef MASK_ALL_ONES
-#undef COMPARE_VECS(__v1, __v2)
+#undef COMPARE_VECS
 #undef _256_INT_TYPE
 #undef BITMASK_VEC
 #endif
--- a/public/VectorPairHMM/src/main/c++/define-sse-float.h
+++ b/public/VectorPairHMM/src/main/c++/define-sse-float.h
@ -40,35 +40,35 @@
 #undef MASK_TYPE
 #undef MASK_ALL_ONES

-#undef VEC_EXTRACT_UNIT(__v1, __im)
-#undef VEC_INSERT_UNIT(__v1,__ins,__im)
-#undef SET_VEC_ZERO(__vec)
-#undef VEC_OR(__v1, __v2)
-#undef VEC_ADD(__v1, __v2)
-#undef VEC_SUB(__v1, __v2)
-#undef VEC_MUL(__v1, __v2)
-#undef VEC_DIV(__v1, __v2)
-#undef VEC_BLEND(__v1, __v2, __mask)
-#undef VEC_BLENDV(__v1, __v2, __maskV)
-#undef VEC_CAST_256_128(__v1)
-#undef VEC_EXTRACT_128(__v1, __im)
-#undef VEC_EXTRACT_UNIT(__v1, __im)
-#undef VEC_SET1_VAL128(__val)
-#undef VEC_MOVE(__v1, __val)
-#undef VEC_CAST_128_256(__v1)
-#undef VEC_INSERT_VAL(__v1, __val, __pos)
-#undef VEC_CVT_128_256(__v1)
-#undef VEC_SET1_VAL(__val)
-#undef VEC_POPCVT_CHAR(__ch)
-#undef VEC_LDPOPCVT_CHAR(__addr)
-#undef VEC_CMP_EQ(__v1, __v2)
-#undef VEC_SET_LSE(__val)
-#undef SHIFT_HAP(__v1, __val)
+#undef VEC_EXTRACT_UNIT
+#undef VEC_INSERT_UNIT
+#undef SET_VEC_ZERO
+#undef VEC_OR
+#undef VEC_ADD
+#undef VEC_SUB
+#undef VEC_MUL
+#undef VEC_DIV
+#undef VEC_BLEND
+#undef VEC_BLENDV
+#undef VEC_CAST_256_128
+#undef VEC_EXTRACT_128
+#undef VEC_EXTRACT_UNIT
+#undef VEC_SET1_VAL128
+#undef VEC_MOVE
+#undef VEC_CAST_128_256
+#undef VEC_INSERT_VAL
+#undef VEC_CVT_128_256
+#undef VEC_SET1_VAL
+#undef VEC_POPCVT_CHAR
+#undef VEC_LDPOPCVT_CHAR
+#undef VEC_CMP_EQ
+#undef VEC_SET_LSE
+#undef SHIFT_HAP
 #undef MASK_VEC
-#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
-#undef VEC_SHIFT_LEFT_1BIT(__vs)
+#undef VEC_SSE_TO_AVX
+#undef VEC_SHIFT_LEFT_1BIT
 #undef MASK_ALL_ONES
-#undef COMPARE_VECS(__v1, __v2)
+#undef COMPARE_VECS
 #undef _256_INT_TYPE
 #undef BITMASK_VEC
 #endif
--- a/public/VectorPairHMM/src/main/c++/jni_common.h
+++ b/public/VectorPairHMM/src/main/c++/jni_common.h
@ -37,7 +37,7 @@
 /*#define DUMP_TO_SANDBOX 1*/


-#define DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY 1
+/*#define DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY 1*/

 #ifdef DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY
 //Gets direct access to Java arrays
--- a/public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM.cc
+++ b/public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM.cc
@ -26,7 +26,6 @@
 #include "headers.h"
 #include "jni_common.h"
 #include "org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM.h"
-#include "template.h"
 #include "utils.h"
 #include "LoadTimeInitializer.h"
 #include "jnidebug.h"
--- a/public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.cc
+++ b/public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.cc
@ -26,7 +26,7 @@
 #include "headers.h"
 #include "jni_common.h"
 #include "org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.h"
-#include "template.h"
+//#include "template.h"
 #include "utils.h"
 #include "LoadTimeInitializer.h"

--- a/public/VectorPairHMM/src/main/c++/pairhmm-1-base.cc
+++ b/public/VectorPairHMM/src/main/c++/pairhmm-1-base.cc
@ -29,7 +29,6 @@ using namespace std;

 int main(int argc, char** argv)
 {
-#define BATCH_SIZE 10000
  if(argc < 2)
  {
    cerr << "Needs path to input file as argument\n";
--- a/public/VectorPairHMM/src/main/c++/pairhmm-template-main.cc
+++ b/public/VectorPairHMM/src/main/c++/pairhmm-template-main.cc
@ -24,12 +24,11 @@


 #include "headers.h"
-#include "template.h"
-#include "vector_defs.h"

 #define SIMD_ENGINE avx
 #define SIMD_ENGINE_AVX

+#include "utils.h"

 #define BATCH_SIZE  10000
 #define RUN_HYBRID
--- a/public/VectorPairHMM/src/main/c++/sse_function_instantiations.cc
+++ b/public/VectorPairHMM/src/main/c++/sse_function_instantiations.cc
@ -23,7 +23,6 @@
 */


-#include "template.h"

 #undef SIMD_ENGINE
 #undef SIMD_ENGINE_AVX
@ -31,6 +30,8 @@
 #define SIMD_ENGINE sse
 #define SIMD_ENGINE_SSE

+#include "template.h"
+
 #include "define-sse-float.h"
 #include "shift_template.c"
 #include "pairhmm-template-kernel.cc"
--- a/public/VectorPairHMM/src/main/c++/template.h
+++ b/public/VectorPairHMM/src/main/c++/template.h
@ -28,27 +28,17 @@

 #include "headers.h"

-#define MM 0
-#define GapM 1
-#define MX 2
-#define XX 3
-#define MY 4
-#define YY 5
-
-//#define MROWS  500
-//#define MCOLS  1000
-
-#define CAT(X,Y) X####Y
-#define CONCAT(X,Y) CAT(X,Y)

 #define ALIGNED __attribute__((aligned(32)))

+#ifdef SIMD_ENGINE_AVX
 typedef union __attribute__((aligned(32))) {
        ALIGNED __m256 ALIGNED d;
        ALIGNED __m128i ALIGNED s[2];
        ALIGNED float  ALIGNED f[8];
        ALIGNED __m256i ALIGNED i;
 } ALIGNED mix_F ALIGNED;
+#endif

 typedef union __attribute__((aligned(32))) {
        ALIGNED __m128 ALIGNED d;
@ -81,12 +71,14 @@ typedef union ALIGNED
        ALIGNED float  ALIGNED f;
 } ALIGNED IF_32 ALIGNED;

+#ifdef SIMD_ENGINE_AVX
 typedef union __attribute__((aligned(32))) {
        ALIGNED __m256d ALIGNED d;
        ALIGNED __m128i ALIGNED s[2];
        ALIGNED double  ALIGNED f[4];
        ALIGNED __m256i ALIGNED i;
 } ALIGNED mix_D ALIGNED;
+#endif

 typedef union __attribute__((aligned(32))) {
        ALIGNED __m128d ALIGNED d;
@ -120,200 +112,7 @@ typedef union ALIGNED
 } ALIGNED IF_64 ALIGNED;


-#define MAX_QUAL 254
-#define MAX_JACOBIAN_TOLERANCE 8.0
-#define JACOBIAN_LOG_TABLE_STEP 0.0001
-#define JACOBIAN_LOG_TABLE_INV_STEP (1.0 / JACOBIAN_LOG_TABLE_STEP)
-#define MAXN 70000
-#define LOG10_CACHE_SIZE  (4*MAXN)  // we need to be able to go up to 2*(2N) when calculating some of the coefficients
-#define JACOBIAN_LOG_TABLE_SIZE ((int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1)
-
-template<class NUMBER>
-struct ContextBase
-{
-  public:
-    NUMBER ph2pr[128];
-    NUMBER INITIAL_CONSTANT;
-    NUMBER LOG10_INITIAL_CONSTANT;
-    NUMBER RESULT_THRESHOLD; 
-
-    static bool staticMembersInitializedFlag;
-    static NUMBER jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE];
-    static NUMBER matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1];
-
-    static void initializeStaticMembers()
-    {
-      if(!staticMembersInitializedFlag)
-      {
-        //Order of calls important - Jacobian first, then MatchToMatch
-        initializeJacobianLogTable();
-        initializeMatchToMatchProb();
-        staticMembersInitializedFlag = true;
-      }
-    }
-
-    static void deleteStaticMembers()
-    {
-      if(staticMembersInitializedFlag)
-      {
-        staticMembersInitializedFlag = false;
-      }
-    }
-
-    //Called only once during library load - don't bother to optimize with single precision fp
-    static void initializeJacobianLogTable()
-    {
-      for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) {
-        jacobianLogTable[k] = (NUMBER)(log10(1.0 + pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP)));
-      }
-    }
-
-    //Called only once per library load - don't bother optimizing with single fp
-    static void initializeMatchToMatchProb()
-    {
-      double LN10 = log(10);
-      double INV_LN10 = 1.0/LN10;
-      for (int i = 0, offset = 0; i <= MAX_QUAL; offset += ++i)
-        for (int j = 0; j <= i; j++) {
-          double log10Sum = approximateLog10SumLog10(-0.1*i, -0.1*j);
-          double matchToMatchLog10 =
-            log1p(-std::min(1.0,pow(10,log10Sum))) * INV_LN10;
-          matchToMatchProb[offset + j] = (NUMBER)(pow(10,matchToMatchLog10));
-        }
-    }
-    //Called during computation - use single precision where possible
-    static int fastRound(NUMBER d) {
-      return (d > ((NUMBER)0.0)) ? (int) (d + ((NUMBER)0.5)) : (int) (d - ((NUMBER)0.5));
-    }
-    //Called during computation - use single precision where possible
-    static NUMBER approximateLog10SumLog10(NUMBER small, NUMBER big) {
-      // make sure small is really the smaller value
-      if (small > big) {
-        NUMBER t = big;
-        big = small;
-        small = t;
-      }
-
-      if (isinf(small) == -1 || isinf(big) == -1)
-        return big;
-
-      NUMBER diff = big - small;
-      if (diff >= ((NUMBER)MAX_JACOBIAN_TOLERANCE))
-        return big;
-
-      // OK, so |y-x| < tol: we use the following identity then:
-      // we need to compute log10(10^x + 10^y)
-      // By Jacobian logarithm identity, this is equal to
-      // max(x,y) + log10(1+10^-abs(x-y))
-      // we compute the second term as a table lookup with integer quantization
-      // we have pre-stored correction for 0,0.1,0.2,... 10.0
-      int ind = fastRound((NUMBER)(diff * ((NUMBER)JACOBIAN_LOG_TABLE_INV_STEP))); // hard rounding
-      return big + jacobianLogTable[ind];
-    }
-};
-
-template<class NUMBER>
-struct Context : public ContextBase<NUMBER>
-{};
-
-template<>
-struct Context<double> : public ContextBase<double>
-{
-  Context():ContextBase<double>()
-  {
-    for (int x = 0; x < 128; x++)
-      ph2pr[x] = pow(10.0, -((double)x) / 10.0);
-
-    INITIAL_CONSTANT = ldexp(1.0, 1020.0);
-    LOG10_INITIAL_CONSTANT = log10(INITIAL_CONSTANT);
-    RESULT_THRESHOLD = 0.0;
-  }
-
-  double LOG10(double v){ return log10(v); }
-  inline double POW(double b, double e) { return pow(b,e); }
-
-  static double _(double n){ return n; }
-  static double _(float n){ return ((double) n); }
-};
-
-template<>
-struct Context<float> : public ContextBase<float>
-{
-  Context() : ContextBase<float>()
-  {
-    for (int x = 0; x < 128; x++)
-    {
-      ph2pr[x] = powf(10.f, -((float)x) / 10.f);
-    }
-
-    INITIAL_CONSTANT = ldexpf(1.f, 120.f);
-    LOG10_INITIAL_CONSTANT = log10f(INITIAL_CONSTANT);
-    RESULT_THRESHOLD = ldexpf(1.f, -110.f);
-  }
-
-  float LOG10(float v){ return log10f(v); }
-  inline float POW(float b, float e) { return powf(b,e); }
-
-  static float _(double n){ return ((float) n); }
-  static float _(float n){ return n; }
-};
-
-#define SET_MATCH_TO_MATCH_PROB(output, insQual, delQual)                       \
-{                                                                               \
-  int minQual = delQual;                                                        \
-  int maxQual = insQual;                                                        \
-  if (insQual <= delQual)                                                       \
-  {                                                                             \
-    minQual = insQual;                                                          \
-    maxQual = delQual;                                                          \
-  }                                                                             \
-  (output) = (MAX_QUAL < maxQual) ?                                             \
-  ((NUMBER)1.0) - ctx.POW(((NUMBER)10), ctx.approximateLog10SumLog10(((NUMBER)-0.1)*minQual, ((NUMBER)-0.1)*maxQual))       \
-  : ctx.matchToMatchProb[((maxQual * (maxQual + 1)) >> 1) + minQual];           \
-}
-
-typedef struct
-{
-        int rslen, haplen;
-	/*int *q, *i, *d, *c;*/
-	/*int q[MROWS], i[MROWS], d[MROWS], c[MROWS];*/
-	char *q, *i, *d, *c;
-        char *hap, *rs;
-	int *ihap;
-	int *irs;
-} testcase;
-
-int normalize(char c);
-int read_testcase(testcase *tc, FILE* ifp=0);
-
-
-#define MIN_ACCEPTED 1e-28f
-#define NUM_DISTINCT_CHARS 5
-#define AMBIG_CHAR 4
-
-class ConvertChar {
-
-  static uint8_t conversionTable[255] ;
-
-public:
-
-  static void init() {
-    assert (NUM_DISTINCT_CHARS == 5) ;
-    assert (AMBIG_CHAR == 4) ;
-
-    conversionTable['A'] = 0 ;
-    conversionTable['C'] = 1 ;
-    conversionTable['T'] = 2 ;
-    conversionTable['G'] = 3 ;
-    conversionTable['N'] = 4 ;
-  }
-
-  static inline uint8_t get(uint8_t input) {
-    return conversionTable[input] ;
-  }
-
-};
-
+#include "common_data_structure.h"

 #endif

--- a/public/VectorPairHMM/src/main/c++/utils.cc
+++ b/public/VectorPairHMM/src/main/c++/utils.cc
@ -22,11 +22,8 @@
 *THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

-
 #include "headers.h"
-#include "template.h"
 #include "utils.h"
-#include "vector_defs.h"
 #include "LoadTimeInitializer.h"
 using namespace std;

@ -36,51 +33,107 @@ uint8_t ConvertChar::conversionTable[255];
 float (*g_compute_full_prob_float)(testcase *tc, float* before_last_log) = 0;
 double (*g_compute_full_prob_double)(testcase *tc, double* before_last_log) = 0;
 //Static members in ContextBase
+template<>
 bool ContextBase<double>::staticMembersInitializedFlag = false;
-double ContextBase<double>::jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE];
-double ContextBase<double>::matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1];
+template<>
+double ContextBase<double>::jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE] = { };
+template<>
+double ContextBase<double>::matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1] = { };
+template<>
 bool ContextBase<float>::staticMembersInitializedFlag = false;
-float ContextBase<float>::jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE];
-float ContextBase<float>::matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1];
+template<>
+float ContextBase<float>::jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE] = { };
+template<>
+float ContextBase<float>::matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1] = { };


+bool search_file_for_string(string filename, string search_string)
+{
+  ifstream fptr;
+  fptr.open(filename.c_str(),ios::in);
+  if(fptr.is_open())
+  {
+    string buffer;
+    buffer.clear();
+    buffer.resize(4096);
+    bool retvalue = false;
+    while(!fptr.eof())
+    {
+      fptr.getline(&(buffer[0]), 4096);
+      if(buffer.find(search_string) != string::npos)    //found string
+      {
+        retvalue = true;
+        break;
+      }
+    }
+    buffer.clear();
+    fptr.close();
+    return retvalue;
+  }
+  else
+    return false;
+}
+
+bool is_cpuid_ecx_bit_set(int eax, int bitidx)
+{
+  int ecx = 0, edx = 0, ebx = 0;
+  __asm__ ("cpuid"
+      :"=b" (ebx),
+      "=c" (ecx),
+      "=d" (edx)
+      :"a" (eax)
+      );
+  return (((ecx >> bitidx)&1) == 1);
+}
+
 bool is_avx_supported()
 {
-  return  (_may_i_use_cpu_feature(_FEATURE_AVX) > 0);
-  //int ecx = 0, edx = 0, ebx = 0;
-  //__asm__("cpuid"
-      //: "=b" (ebx),
-      //"=c" (ecx),
-      //"=d" (edx)
-      //: "a" (1)
-      //);
-  //return ((ecx >> 28)&1) == 1;
+#ifdef __INTEL_COMPILER
+  bool use_avx = _may_i_use_cpu_feature(_FEATURE_AVX);
+  if(use_avx)
+    return true;
+  else
+  {
+    //check if core supports AVX, but kernel does not and print info message
+    if(!is_cpuid_ecx_bit_set(1, 28))  //core does not support AVX
+      return false;
+    //else fall through to end of function
+  }
+#else
+  if(!__builtin_cpu_supports("avx"))  //core does not support AVX
+    return false;
+  else
+  {
+    //core supports AVX, check if kernel supports
+    if(search_file_for_string("/proc/cpuinfo","avx"))
+      return true;
+    //else fall through to end of function
+
+  }
+#endif  //__INTEL_COMPILER
+  clog << "INFO: Your CPU supports AVX vector instructions, but your kernel does not. Try upgrading to a kernel that supports AVX.\n";
+  clog << "INFO: Your program will run correctly, but slower than the AVX version\n";
+  return false;
 }

 bool is_sse41_supported()
 {
+#ifdef __INTEL_COMPILER
  return  (_may_i_use_cpu_feature(_FEATURE_SSE4_1) > 0);
-  //int ecx = 0, edx = 0, ebx = 0;
-  //__asm__("cpuid"
-      //: "=b" (ebx),
-      //"=c" (ecx),
-      //"=d" (edx)
-      //: "a" (1)
-      //);
-  //return ((ecx >> 19)&1) == 1;
+#else
+  return  __builtin_cpu_supports("sse4.1");
+#endif
+  //return is_cpuid_ecx_bit_set(1, 19);
 }

 bool is_sse42_supported()
 {
+#ifdef __INTEL_COMPILER
  return  (_may_i_use_cpu_feature(_FEATURE_SSE4_2) > 0);
-  //int ecx = 0, edx = 0, ebx = 0;
-  //__asm__("cpuid"
-      //: "=b" (ebx),
-      //"=c" (ecx),
-      //"=d" (edx)
-      //: "a" (1)
-      //);
-  //return ((ecx >> 20)&1) == 1;
+#else
+  return  __builtin_cpu_supports("sse4.2");
+#endif
+  //return is_cpuid_ecx_bit_set(1, 20);
 }

 uint64_t get_machine_capabilities()
@ -154,10 +207,14 @@ int read_testcase(testcase *tc, FILE* ifp)

 	tc->haplen = strlen(tc->hap);
 	tc->rslen = strlen(tc->rs);
-        assert(strlen(q) == tc->rslen);
-        assert(strlen(i) == tc->rslen);
-        assert(strlen(d) == tc->rslen);
-        assert(strlen(c) == tc->rslen);
+        assert(strlen(q) == (size_t)tc->rslen);
+        assert(strlen(i) == (size_t)tc->rslen);
+        assert(strlen(d) == (size_t)tc->rslen);
+        assert(strlen(c) == (size_t)tc->rslen);
+
+        g_load_time_initializer.update_stat(READ_LENGTH_IDX, tc->rslen); 
+        g_load_time_initializer.update_stat(HAPLOTYPE_LENGTH_IDX, tc->haplen);
+        g_load_time_initializer.update_stat(PRODUCT_READ_LENGTH_HAPLOTYPE_LENGTH_IDX, tc->haplen*tc->rslen);
 	//assert(tc->rslen < MROWS);
        //tc->ihap = (int *) malloc(tc->haplen*sizeof(int));
        //tc->irs = (int *) malloc(tc->rslen*sizeof(int));
@ -279,15 +336,15 @@ int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat)
  tc->c = new char[tc->rslen];
  //cout << "Lengths "<<tc->haplen <<" "<<tc->rslen<<"\n";
  memcpy(tc->rs, tokens[1].c_str(),tokens[1].size());
-  assert(tokens.size() == 2 + 4*(tc->rslen));
+  assert(tokens.size() == (size_t)(2 + 4*(tc->rslen)));
  //assert(tc->rslen < MROWS);
-  for(unsigned j=0;j<tc->rslen;++j)
+  for(int j=0;j<tc->rslen;++j)
    tc->q[j] = (char)convToInt(tokens[2+0*tc->rslen+j]);
-  for(unsigned j=0;j<tc->rslen;++j)
+  for(int j=0;j<tc->rslen;++j)
    tc->i[j] = (char)convToInt(tokens[2+1*tc->rslen+j]);
-  for(unsigned j=0;j<tc->rslen;++j)
+  for(int j=0;j<tc->rslen;++j)
    tc->d[j] = (char)convToInt(tokens[2+2*tc->rslen+j]);
-  for(unsigned j=0;j<tc->rslen;++j)
+  for(int j=0;j<tc->rslen;++j)
    tc->c[j] = (char)convToInt(tokens[2+3*tc->rslen+j]);
 
  if(reformat)
@ -297,16 +354,16 @@ int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat)
    assert(ofptr.is_open());
    ofptr << tokens[0] << " ";
    ofptr << tokens[1] << " ";
-    for(unsigned j=0;j<tc->rslen;++j)
+    for(int j=0;j<tc->rslen;++j)
      ofptr << ((char)(tc->q[j]+33));
    ofptr << " ";
-    for(unsigned j=0;j<tc->rslen;++j)
+    for(int j=0;j<tc->rslen;++j)
      ofptr << ((char)(tc->i[j]+33));
    ofptr << " ";
-    for(unsigned j=0;j<tc->rslen;++j)
+    for(int j=0;j<tc->rslen;++j)
      ofptr << ((char)(tc->d[j]+33));
    ofptr << " ";
-    for(unsigned j=0;j<tc->rslen;++j)
+    for(int j=0;j<tc->rslen;++j)
      ofptr << ((char)(tc->c[j]+33));
    ofptr << " 0 false\n";

@ -363,6 +420,10 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
    ifptr.open(filename);
    assert(ifptr.is_open());
  }
+#ifdef PRINT_PER_INTERVAL_TIMINGS
+  ofstream times_fptr;
+  times_fptr.open("native_timed_intervals.csv",ios::out);
+#endif
  vector<testcase> tc_vector;
  tc_vector.clear();
  testcase tc;
@ -377,11 +438,11 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
  uint32_t no_kernel_mask = (1 << 17);    //bit 16 user mode, bit 17 kernel mode
  PAPI_num_counters();
  int events[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
-  char* eventnames[NUM_PAPI_COUNTERS]=  { "cycles", "itlb_walk_cycles", "dtlb_load_walk_cycles", "dtlb_store_walk_cycles" };
+  char* eventnames[NUM_PAPI_COUNTERS]=  { "cycles", "l1_pending_miss", "lfb_hit", "l2_hit" };
  assert(PAPI_event_name_to_code("UNHALTED_REFERENCE_CYCLES:u=1:k=1",&(events[0])) == PAPI_OK);
-  assert(PAPI_event_name_to_code("ITLB_MISSES:WALK_DURATION",   &(events[1])) == PAPI_OK);
-  assert(PAPI_event_name_to_code("DTLB_LOAD_MISSES:WALK_DURATION",   &(events[2])) == PAPI_OK);
-  assert(PAPI_event_name_to_code("DTLB_STORE_MISSES:WALK_DURATION",   &(events[3])) == PAPI_OK);
+  assert(PAPI_event_name_to_code("L1D_PEND_MISS:OCCURRENCES",   &(events[1])) == PAPI_OK);
+  assert(PAPI_event_name_to_code("MEM_LOAD_UOPS_RETIRED:HIT_LFB",   &(events[2])) == PAPI_OK);
+  assert(PAPI_event_name_to_code("MEM_LOAD_UOPS_RETIRED:L2_HIT",   &(events[3])) == PAPI_OK);
  long long values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
  long long accum_values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
 #endif
@ -398,6 +459,9 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
      baseline_results_vec.clear();
      results_vec.resize(tc_vector.size());
      baseline_results_vec.resize(tc_vector.size());
+      g_load_time_initializer.update_stat(NUM_TESTCASES_IDX, tc_vector.size()); 
+      g_load_time_initializer.update_stat(NUM_READS_IDX, tc_vector.size()); 
+      g_load_time_initializer.update_stat(NUM_HAPLOTYPES_IDX, tc_vector.size()); 
      struct timespec start_time;
 #ifdef USE_PAPI
      assert(PAPI_start_counters(events, NUM_PAPI_COUNTERS) == PAPI_OK);
@ -429,7 +493,11 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
 #ifdef USE_PAPI
      assert(PAPI_stop_counters(values, NUM_PAPI_COUNTERS) == PAPI_OK);
 #endif
-      vector_compute_time +=  diff_time(start_time);
+      uint64_t curr_interval = diff_time(start_time);
+#ifdef PRINT_PER_INTERVAL_TIMINGS
+      times_fptr << curr_interval << "\n";
+#endif
+      vector_compute_time +=  curr_interval;
 #ifdef USE_PAPI
      for(unsigned k=0;k<NUM_PAPI_COUNTERS;++k)
        accum_values[k] += values[k];
@ -488,9 +556,12 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
  for(unsigned i=0;i<NUM_PAPI_COUNTERS;++i)
    cout << eventnames[i] << " : "<<accum_values[i]<<"\n";
 #endif
-
+#ifdef PRINT_PER_INTERVAL_TIMINGS
+  times_fptr.close();
+#endif
  if(use_old_read_testcase)
    fclose(fptr);
  else
    ifptr.close();
+  g_load_time_initializer.print_profiling();
 }
--- a/public/VectorPairHMM/src/main/c++/utils.h
+++ b/public/VectorPairHMM/src/main/c++/utils.h
@ -26,7 +26,7 @@
 #ifndef PAIRHMM_UTIL_H
 #define PAIRHMM_UTIL_H

-#include "template.h"
+#include "common_data_structure.h"

 template<class T>
 std::string to_string(T obj)
@ -50,6 +50,15 @@ extern double (*g_compute_full_prob_double)(testcase *tc, double* before_last_lo
 void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline);
 template<class NUMBER>
 NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log=0);
+template<class NUMBER>
+NUMBER compute_full_prob_avxd(testcase *tc, NUMBER *before_last_log=0);
+template<class NUMBER>
+NUMBER compute_full_prob_avxs(testcase *tc, NUMBER *before_last_log=0);
+template<class NUMBER>
+NUMBER compute_full_prob_ssed(testcase *tc, NUMBER *before_last_log=0);
+template<class NUMBER>
+NUMBER compute_full_prob_sses(testcase *tc, NUMBER *before_last_log=0);
+
 double getCurrClk();
 void get_time(struct timespec* x);
 uint64_t diff_time(struct timespec& prev_time);
@ -71,5 +80,6 @@ void do_compute(char* filename, bool use_old_read_testcase=true, unsigned chunk_
 /*#define DUMP_COMPUTE_VALUES 1*/
 #define BATCH_SIZE  10000
 #define RUN_HYBRID
+/*#define PRINT_PER_INTERVAL_TIMINGS 1*/

 #endif
--- a/public/VectorPairHMM/src/main/c++/vector_defs.h
+++ b/public/VectorPairHMM/src/main/c++/vector_defs.h
@ -1,55 +0,0 @@
-/*Copyright (c) 2012 The Broad Institute
-
-*Permission is hereby granted, free of charge, to any person
-*obtaining a copy of this software and associated documentation
-*files (the "Software"), to deal in the Software without
-*restriction, including without limitation the rights to use,
-*copy, modify, merge, publish, distribute, sublicense, and/or sell
-*copies of the Software, and to permit persons to whom the
-*Software is furnished to do so, subject to the following
-*conditions:
-
-*The above copyright notice and this permission notice shall be
-*included in all copies or substantial portions of the Software.
-
-*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
-*THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-
-#undef SIMD_ENGINE
-#undef SIMD_ENGINE_AVX
-#undef SIMD_ENGINE_SSE
-
-#define SIMD_ENGINE avx
-#define SIMD_ENGINE_AVX
-
-#include "define-float.h"
-#include "vector_function_prototypes.h"
-
-#include "define-double.h"
-#include "vector_function_prototypes.h"
-
-#undef  SIMD_ENGINE
-#undef  SIMD_ENGINE_AVX
-
-#define SIMD_ENGINE sse
-#define SIMD_ENGINE_SSE
-
-
-#include "define-sse-float.h"
-#include "vector_function_prototypes.h"
-
-#include "define-sse-double.h"
-#include "vector_function_prototypes.h"
-
-#undef SIMD_ENGINE
-#undef SIMD_ENGINE_AVX
-#undef SIMD_ENGINE_SSE
-
--- a/public/VectorPairHMM/src/main/c++/vector_function_prototypes.h
+++ b/public/VectorPairHMM/src/main/c++/vector_function_prototypes.h
@ -1,44 +0,0 @@
-/*Copyright (c) 2012 The Broad Institute
-
-*Permission is hereby granted, free of charge, to any person
-*obtaining a copy of this software and associated documentation
-*files (the "Software"), to deal in the Software without
-*restriction, including without limitation the rights to use,
-*copy, modify, merge, publish, distribute, sublicense, and/or sell
-*copies of the Software, and to permit persons to whom the
-*Software is furnished to do so, subject to the following
-*conditions:
-
-*The above copyright notice and this permission notice shall be
-*included in all copies or substantial portions of the Software.
-
-*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
-*THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-
-inline void CONCAT(CONCAT(_vector_shift,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut);
-inline void CONCAT(CONCAT(_vector_shift_last,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn);
-inline void CONCAT(CONCAT(precompute_masks_,SIMD_ENGINE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]);
-inline void CONCAT(CONCAT(init_masks_for_row_,SIMD_ENGINE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess);
-inline void CONCAT(CONCAT(update_masks_for_cols_,SIMD_ENGINE), PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt);
-inline void CONCAT(CONCAT(computeDistVec,SIMD_ENGINE), PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, SIMD_TYPE& distm, SIMD_TYPE& _1_distm, SIMD_TYPE& distmChosen);
-template<class NUMBER> inline void CONCAT(CONCAT(initializeVectors,SIMD_ENGINE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context<NUMBER> ctx, testcase *tc,  SIMD_TYPE *p_MM, SIMD_TYPE *p_GAPM, SIMD_TYPE *p_MX, SIMD_TYPE *p_XX, SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, SIMD_TYPE *distm1D);
-template<class NUMBER> inline void CONCAT(CONCAT(stripINITIALIZATION,SIMD_ENGINE), PRECISION)(
-    int stripIdx, Context<NUMBER> ctx, testcase *tc, SIMD_TYPE &pGAPM, SIMD_TYPE &pMM, SIMD_TYPE &pMX, SIMD_TYPE &pXX, SIMD_TYPE &pMY, SIMD_TYPE &pYY,
-    SIMD_TYPE &rs, UNION_TYPE &rsN, SIMD_TYPE &distm, SIMD_TYPE &_1_distm,  SIMD_TYPE *distm1D, SIMD_TYPE N_packed256, SIMD_TYPE *p_MM , SIMD_TYPE *p_GAPM , 
-    SIMD_TYPE *p_MX, SIMD_TYPE *p_XX , SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, 
-    UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM);
-inline SIMD_TYPE CONCAT(CONCAT(computeDISTM,SIMD_ENGINE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, SIMD_TYPE rs, UNION_TYPE rsN, SIMD_TYPE N_packed256, 
-    SIMD_TYPE distm, SIMD_TYPE _1_distm);
-inline void CONCAT(CONCAT(computeMXY,SIMD_ENGINE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, 
-    UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1, 
-    SIMD_TYPE pMM, SIMD_TYPE pGAPM, SIMD_TYPE pMX, SIMD_TYPE pXX, SIMD_TYPE pMY, SIMD_TYPE pYY, SIMD_TYPE distmSel);
-template<class NUMBER> NUMBER CONCAT(CONCAT(compute_full_prob_,SIMD_ENGINE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL);
-
--- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/PairHMM.java
+++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pairhmm/PairHMM.java
@ -78,8 +78,9 @@ public abstract class PairHMM {
    protected double[] mLikelihoodArray;

    //profiling information
-    protected static final boolean doProfiling = true;
-    protected long computeTime = 0;
+    protected static Boolean doProfiling = true;
+    protected static long pairHMMComputeTime = 0;
+    protected long threadLocalPairHMMComputeTimeDiff = 0;
    protected long startTime = 0;

    /**
@ -207,7 +208,13 @@ public abstract class PairHMM {
            }
        }
        if(doProfiling)
-            computeTime += (System.nanoTime() - startTime);
+        {
+            threadLocalPairHMMComputeTimeDiff = (System.nanoTime() - startTime);
+            //synchronized(doProfiling)
+            {
+                pairHMMComputeTime += threadLocalPairHMMComputeTimeDiff;
+            }
+        }
        return likelihoodMap;
    }

@ -314,6 +321,17 @@ public abstract class PairHMM {
        return Math.min(haplotype1.length, haplotype2.length);
    }

+    /**
+     * Use number of threads to set doProfiling flag - doProfiling iff numThreads == 1
+     * This function should be called only during initialization phase - single thread phase of HC
+     */
+    public static void setNumberOfThreads(final int numThreads)
+    {
+        doProfiling = (numThreads == 1);
+        if(numThreads > 1)
+            logger.info("Performance profiling for PairHMM is disabled because HaplotypeCaller is being run with multiple threads (-nct>1) option\nProfiling is enabled only when running in single thread mode\n");
+    }
+
    /**
     * Return the results of the computeLikelihoods function
     */
@ -324,6 +342,6 @@ public abstract class PairHMM {
    public void close()
    {
        if(doProfiling)
-            System.out.println("Total compute time in PairHMM computeLikelihoods() : "+(computeTime*1e-9));
+            System.out.println("Total compute time in PairHMM computeLikelihoods() : "+(pairHMMComputeTime*1e-9));
    }
 }
--- a/public/sting-utils/src/main/resources/org/broadinstitute/sting/utils/pairhmm/libVectorLoglessPairHMM.so
+++ b/public/sting-utils/src/main/resources/org/broadinstitute/sting/utils/pairhmm/libVectorLoglessPairHMM.so