Parallel version of the JNI for the PairHMM

The JNI treats shared memory as critical memory and doesn't allow any
parallel reads or writes to it until the native code finishes. This is
not a problem *per se* it is the right thing to do, but we need to
enable **-nct** when running the haplotype caller and with it have
multiple native PairHMM running for each map call.

Move to a copy based memory sharing where the JNI simply copies the
memory over to C++ and then has no blocked critical memory when running,
allowing -nct to work.

This version is slightly (almost unnoticeably) slower with -nct 1, but
scales better with -nct 2-4 (we haven't tested anything beyond that
because we know the GATK falls apart with higher levels of parallelism

* Make VECTOR_LOGLESS_CACHING the default implementation for PairHMM.
* Changed version number in pom.xml under public/VectorPairHMM
* VectorPairHMM can now be compiled using gcc 4.8.x
* Modified define-* to get rid of gcc warnings for extra tokens after #undefs
* Added a Linux kernel version check for AVX - gcc's __builtin_cpu_supports function does not check whether the kernel supports AVX or not.
* Updated PairHMM profiling code to update and print numbers only in single-thread mode
* Edited README.md, pom.xml and Makefile for users to pass path to gcc 4.8.x if necessary
* Moved all cpuid inline assembly to single function Changed info message to clog from cinfo
* Modified version in pom.xml in VectorPairHMM from 3.1 to 3.2
* Deleted some unnecessary code
* Modified C++ sandbox to print per interval timing
This commit is contained in:
Karthik Gururaj 2014-03-17 11:42:19 -07:00 committed by Mauricio Carneiro
parent 38b7cfbda9
commit f6ea25b4d1
31 changed files with 548 additions and 527 deletions

View File

@ -417,7 +417,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
*/
@Hidden
@Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false)
public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING;
public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.VECTOR_LOGLESS_CACHING;
@Hidden
@Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
@ -639,6 +639,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
logger.info("Using global mismapping rate of " + phredScaledGlobalReadMismappingRate + " => " + log10GlobalReadMismappingRate + " in log10 likelihood units");
}
//static member function - set number of threads
PairHMM.setNumberOfThreads(getToolkit().getTotalNumberOfThreads());
// create our likelihood calculation engine
likelihoodCalculationEngine = createLikelihoodCalculationEngine();

View File

@ -58,6 +58,7 @@ import java.util.HashMap;
*/
public abstract class JNILoglessPairHMM extends LoglessPairHMM {
public abstract HashMap<Haplotype, Integer> getHaplotypeToHaplotypeListIdxMap();
protected long setupTime = 0;
protected long threadLocalSetupTimeDiff = 0;
protected static long pairHMMSetupTime = 0;
}

View File

@ -228,7 +228,7 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM {
mLikelihoodArray = new double[readListSize*numHaplotypes]; //to store results
if(doProfiling)
setupTime += (System.nanoTime() - startTime);
threadLocalSetupTimeDiff = (System.nanoTime() - startTime);
//for(reads)
// for(haplotypes)
// compute_full_prob()
@ -251,7 +251,14 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM {
readIdx += numHaplotypes;
}
if(doProfiling)
computeTime += (System.nanoTime() - startTime);
{
threadLocalPairHMMComputeTimeDiff = (System.nanoTime() - startTime);
//synchronized(doProfiling)
{
pairHMMComputeTime += threadLocalPairHMMComputeTimeDiff;
pairHMMSetupTime += threadLocalSetupTimeDiff;
}
}
return likelihoodMap;
}
@ -262,7 +269,8 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM {
@Override
public void close()
{
System.out.println("Time spent in setup for JNI call : "+(setupTime*1e-9));
if(doProfiling)
System.out.println("Time spent in setup for JNI call : "+(pairHMMSetupTime*1e-9));
super.close();
jniClose();
}

View File

@ -358,7 +358,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
final File outputVCFNoQD = executeTest("testQualByDepth calling without QD", specNoQD).getFirst().get(0);
final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoQD.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A QualByDepth";
final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("4ccdbebcfd02be87ae5b4ad94666f011"));
final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("214799b5d1c0809f042c557155ea6e13"));
specAnn.disableShadowBCF();
final File outputVCFAnn = executeTest("testQualByDepth re-annotation of QD", specAnn).getFirst().get(0);

View File

@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleComplex1() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "65c316f1f3987d7bc94e887999920d45");
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "d1626d5fff419b8a782de954224e881f");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {

View File

@ -284,7 +284,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestConservativePcrIndelModelWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
Arrays.asList("7592274ecd2b5ef4624dd2ed659536ef"));
Arrays.asList("1b41bf83b9c249a9a5fffb1e308668c1"));
executeTest("HC calling with conservative indel error modeling on WGS intervals", spec);
}

View File

@ -44,9 +44,10 @@ end of every region (line 351 in PairHMMLikelihoodCalculationEngine).
Note: Debug code has been moved to a separate class DebugJNILoglessPairHMM.java.
Compiling:
Make sure you have icc (Intel C compiler) available. Currently, gcc does not seem to support all AVX
intrinsics.
This native library is called libVectorLoglessPairHMM.so
The native library (called libVectorLoglessPairHMM.so) can be compiled with icc (Intel C compiler)
or gcc versions >= 4.8.1 that support AVX intrinsics. By default, the make process tries to invoke
icc. To use gcc, edit the file 'pom.xml' (in this directory) and enable the environment variables
USE_GCC,C_COMPILER and CPP_COMPILER (edit and uncomment lines 60-62).
Using Maven:
Type 'mvn install' in this directory - this will build the library (by invoking 'make') and copy the
native library to the directory

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.sting</groupId>
<artifactId>sting-root</artifactId>
<version>2.8-SNAPSHOT</version>
<version>3.2-SNAPSHOT</version>
<relativePath>../../public/sting-root</relativePath>
</parent>
@ -57,6 +57,9 @@
<workingDirectory>src/main/c++</workingDirectory>
<environmentVariables>
<JRE_HOME>${java.home}</JRE_HOME>
<!--<USE_GCC>1</USE_GCC>-->
<!--<C_COMPILER>/opt/gcc-4.8.2/bin/gcc</C_COMPILER>-->
<!--<CPP_COMPILER>/opt/gcc-4.8.2/bin/g++</CPP_COMPILER>-->
<OUTPUT_DIR>${project.build.directory}</OUTPUT_DIR>
</environmentVariables>
</configuration>

View File

@ -23,8 +23,8 @@
*/
#include "LoadTimeInitializer.h"
#include "utils.h"
#include "LoadTimeInitializer.h"
using namespace std;
char* LoadTimeInitializerStatsNames[] =
{
@ -43,6 +43,10 @@ LoadTimeInitializer g_load_time_initializer;
LoadTimeInitializer::LoadTimeInitializer() //will be called when library is loaded
{
#if (defined(__GNUC__) || defined(__GNUG__)) && !defined(__INTEL_COMPILER)
//compiles only with gcc >= 4.8
__builtin_cpu_init();
#endif
ConvertChar::init();
#ifndef DISABLE_FTZ
//Very important to get good performance on Intel processors
@ -71,11 +75,6 @@ LoadTimeInitializer::LoadTimeInitializer() //will be called when library is loa
m_filename_to_fptr.clear();
m_written_files_set.clear();
//Common buffer - 8MB
unsigned size = 1024*1024;
m_buffer = new uint64_t[size];
m_buffer_size = size*sizeof(uint64_t);
initialize_function_pointers();
//Initialize static members of class

View File

@ -27,7 +27,7 @@
#define LOAD_TIME_INITIALIZER_H
#include "headers.h"
#include <jni.h>
#include "template.h"
/*#include "template.h"*/
enum LoadTimeInitializerStatsEnum
{
@ -47,10 +47,6 @@ class LoadTimeInitializer
{
public:
LoadTimeInitializer(); //will be called when library is loaded
~LoadTimeInitializer()
{
delete[] m_buffer;
}
void print_profiling();
void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true);
void debug_close();
@ -72,8 +68,6 @@ class LoadTimeInitializer
uint64_t m_data_transfer_time;
//bytes copied
uint64_t m_bytes_copied;
unsigned get_buffer_size() { return m_buffer_size; }
char* get_buffer() { return (char*)m_buffer; }
private:
std::map<std::string, std::ofstream*> m_filename_to_fptr;
std::set<std::string> m_written_files_set;
@ -83,8 +77,6 @@ class LoadTimeInitializer
double m_sum_square_stats[TOTAL_NUMBER_STATS];
uint64_t m_min_stats[TOTAL_NUMBER_STATS];
uint64_t m_max_stats[TOTAL_NUMBER_STATS];
unsigned m_buffer_size;
uint64_t* m_buffer;
};
extern LoadTimeInitializer g_load_time_initializer;

View File

@ -32,14 +32,30 @@
JRE_HOME?=/opt/jdk1.7.0_25/jre
JNI_COMPILATION_FLAGS=-D_REENTRANT -fPIC -I${JRE_HOME}/../include -I${JRE_HOME}/../include/linux
COMMON_COMPILATION_FLAGS=$(JNI_COMPILATION_FLAGS) -O3 -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas
CC=icc
CXX=icc
#COMMON_COMPILATION_FLAGS=$(JNI_COMPILATION_FLAGS) -O3 -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas
COMMON_COMPILATION_FLAGS=$(JNI_COMPILATION_FLAGS) -O3 -Wall $(OMPCFLAGS) -Wno-unknown-pragmas -Wno-write-strings -Wno-unused-variable -Wno-unused-but-set-variable
ifdef DISABLE_FTZ
COMMON_COMPILATION_FLAGS+=-DDISABLE_FTZ
endif
ifdef USE_GCC
C_COMPILER?=gcc
CPP_COMPILER?=g++
AVX_FLAGS=-mavx
SSE41_FLAGS=-msse4.1
COMMON_COMPILATION_FLAGS+=-Wno-char-subscripts
else
C_COMPILER?=icc
CPP_COMPILER?=icc
AVX_FLAGS=-xAVX
SSE41_FLAGS=-xSSE4.1
LIBFLAGS=-static-intel
ifdef DISABLE_FTZ
COMMON_COMPILATION_FLAGS+=-no-ftz
endif
endif
LDFLAGS=-lm -lrt $(OMPLDFLAGS)
ifdef DISABLE_FTZ
COMMON_COMPILATION_FLAGS+=-DDISABLE_FTZ -no-ftz
endif
PAPI_DIR=/home/karthikg/softwares/papi-5.3.0
ifdef USE_PAPI
@ -49,10 +65,6 @@ ifdef USE_PAPI
endif
endif
ifdef DISABLE_FTZ
COMMON_COMPILATION_FLAGS+=-DDISABLE_FTZ -no-ftz
endif
BIN=libVectorLoglessPairHMM.so pairhmm-template-main checker
#BIN=checker
@ -79,8 +91,8 @@ NO_VECTOR_OBJECTS=$(NO_VECTOR_SOURCES:.cc=.o)
AVX_OBJECTS=$(AVX_SOURCES:.cc=.o)
SSE_OBJECTS=$(SSE_SOURCES:.cc=.o)
$(NO_VECTOR_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS)
$(AVX_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) -xAVX
$(SSE_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) -xSSE4.2
$(AVX_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) $(AVX_FLAGS)
$(SSE_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) $(SSE41_FLAGS)
OBJECTS=$(NO_VECTOR_OBJECTS) $(AVX_OBJECTS) $(SSE_OBJECTS)
all: $(BIN) Sandbox.class copied_lib
@ -88,18 +100,18 @@ all: $(BIN) Sandbox.class copied_lib
-include $(addprefix $(DEPDIR)/,$(SOURCES:.cc=.d))
checker: pairhmm-1-base.o $(COMMON_OBJECTS)
$(CXX) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)
$(CPP_COMPILER) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)
pairhmm-template-main: pairhmm-template-main.o $(COMMON_OBJECTS)
$(CXX) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)
$(CPP_COMPILER) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)
libVectorLoglessPairHMM.so: $(LIBOBJECTS)
$(CXX) $(OMPLFLAGS) -shared -static-intel -o $@ $(LIBOBJECTS) ${LDFLAGS}
$(CPP_COMPILER) $(OMPLFLAGS) -shared $(LIBFLAGS) -o $@ $(LIBOBJECTS) ${LDFLAGS}
$(OBJECTS): %.o: %.cc
@mkdir -p $(DEPDIR)
$(CXX) -c -MMD -MF $(DF) $(CXXFLAGS) $(OUTPUT_OPTION) $<
$(CPP_COMPILER) -c -MMD -MF $(DF) $(CXXFLAGS) $(OUTPUT_OPTION) $<
Sandbox.class: Sandbox.java
javac Sandbox.java

View File

@ -23,7 +23,6 @@
*/
#include "template.h"
#undef SIMD_ENGINE
#undef SIMD_ENGINE_SSE
@ -31,6 +30,8 @@
#define SIMD_ENGINE avx
#define SIMD_ENGINE_AVX
#include "template.h"
#include "define-float.h"
#include "shift_template.c"
#include "pairhmm-template-kernel.cc"

View File

@ -24,7 +24,7 @@
#include "headers.h"
#include "template.h"
#include "common_data_structure.h"
#include "utils.h"
#include "LoadTimeInitializer.h"
using namespace std;
@ -48,17 +48,8 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log)
//a straightforward array of pointers ensures that all data lies 'close' in memory, increasing
//the chance of being stored together in the cache. Also, prefetchers can learn memory access
//patterns for 2D arrays, not possible for array of pointers
//bool locally_allocated = false;
//NUMBER* common_buffer = 0;
NUMBER* common_buffer = new NUMBER[3*ROWS*COLS + ROWS*6];
//unsigned curr_size = sizeof(NUMBER)*(3*ROWS*COLS + ROWS*6);
//if(true)
//{
//common_buffer = new NUMBER[3*ROWS*COLS + ROWS*6];
//locally_allocated = true;
//}
//else
//common_buffer = (NUMBER*)(g_load_time_initializer.get_buffer());
//pointers to within the allocated buffer
NUMBER** common_pointer_buffer = new NUMBER*[4*ROWS];
NUMBER* ptr = common_buffer;
@ -154,7 +145,6 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log)
#ifndef USE_STACK_ALLOCATION
delete[] common_pointer_buffer;
//if(locally_allocated)
delete[] common_buffer;
#endif

View File

@ -0,0 +1,215 @@
#ifndef COMMON_DATA_STRUCTURE_H
#define COMMON_DATA_STRUCTURE_H
#include "headers.h"
#define CAT(X,Y) X####Y
#define CONCAT(X,Y) CAT(X,Y)
#define MM 0
#define GapM 1
#define MX 2
#define XX 3
#define MY 4
#define YY 5
//#define MROWS 500
//#define MCOLS 1000
#define MAX_QUAL 254
#define MAX_JACOBIAN_TOLERANCE 8.0
#define JACOBIAN_LOG_TABLE_STEP 0.0001
#define JACOBIAN_LOG_TABLE_INV_STEP (1.0 / JACOBIAN_LOG_TABLE_STEP)
#define MAXN 70000
#define LOG10_CACHE_SIZE (4*MAXN) // we need to be able to go up to 2*(2N) when calculating some of the coefficients
#define JACOBIAN_LOG_TABLE_SIZE ((int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1)
template<class NUMBER>
struct ContextBase
{
public:
NUMBER ph2pr[128];
NUMBER INITIAL_CONSTANT;
NUMBER LOG10_INITIAL_CONSTANT;
NUMBER RESULT_THRESHOLD;
static bool staticMembersInitializedFlag;
static NUMBER jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE];
static NUMBER matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1];
static void initializeStaticMembers()
{
if(!staticMembersInitializedFlag)
{
//Order of calls important - Jacobian first, then MatchToMatch
initializeJacobianLogTable();
initializeMatchToMatchProb();
staticMembersInitializedFlag = true;
}
}
static void deleteStaticMembers()
{
if(staticMembersInitializedFlag)
{
staticMembersInitializedFlag = false;
}
}
//Called only once during library load - don't bother to optimize with single precision fp
static void initializeJacobianLogTable()
{
for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) {
jacobianLogTable[k] = (NUMBER)(log10(1.0 + pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP)));
}
}
//Called only once per library load - don't bother optimizing with single fp
static void initializeMatchToMatchProb()
{
double LN10 = log(10);
double INV_LN10 = 1.0/LN10;
for (int i = 0, offset = 0; i <= MAX_QUAL; offset += ++i)
for (int j = 0; j <= i; j++) {
double log10Sum = approximateLog10SumLog10(-0.1*i, -0.1*j);
double matchToMatchLog10 =
log1p(-std::min(1.0,pow(10,log10Sum))) * INV_LN10;
matchToMatchProb[offset + j] = (NUMBER)(pow(10,matchToMatchLog10));
}
}
//Called during computation - use single precision where possible
static int fastRound(NUMBER d) {
return (d > ((NUMBER)0.0)) ? (int) (d + ((NUMBER)0.5)) : (int) (d - ((NUMBER)0.5));
}
//Called during computation - use single precision where possible
static NUMBER approximateLog10SumLog10(NUMBER small, NUMBER big) {
// make sure small is really the smaller value
if (small > big) {
NUMBER t = big;
big = small;
small = t;
}
if (isinf(small) == -1 || isinf(big) == -1)
return big;
NUMBER diff = big - small;
if (diff >= ((NUMBER)MAX_JACOBIAN_TOLERANCE))
return big;
// OK, so |y-x| < tol: we use the following identity then:
// we need to compute log10(10^x + 10^y)
// By Jacobian logarithm identity, this is equal to
// max(x,y) + log10(1+10^-abs(x-y))
// we compute the second term as a table lookup with integer quantization
// we have pre-stored correction for 0,0.1,0.2,... 10.0
int ind = fastRound((NUMBER)(diff * ((NUMBER)JACOBIAN_LOG_TABLE_INV_STEP))); // hard rounding
return big + jacobianLogTable[ind];
}
};
template<class NUMBER>
struct Context : public ContextBase<NUMBER>
{};
template<>
struct Context<double> : public ContextBase<double>
{
Context():ContextBase<double>()
{
for (int x = 0; x < 128; x++)
ph2pr[x] = pow(10.0, -((double)x) / 10.0);
INITIAL_CONSTANT = ldexp(1.0, 1020.0);
LOG10_INITIAL_CONSTANT = log10(INITIAL_CONSTANT);
RESULT_THRESHOLD = 0.0;
}
double LOG10(double v){ return log10(v); }
inline double POW(double b, double e) { return pow(b,e); }
static double _(double n){ return n; }
static double _(float n){ return ((double) n); }
};
template<>
struct Context<float> : public ContextBase<float>
{
Context() : ContextBase<float>()
{
for (int x = 0; x < 128; x++)
{
ph2pr[x] = powf(10.f, -((float)x) / 10.f);
}
INITIAL_CONSTANT = ldexpf(1.f, 120.f);
LOG10_INITIAL_CONSTANT = log10f(INITIAL_CONSTANT);
RESULT_THRESHOLD = ldexpf(1.f, -110.f);
}
float LOG10(float v){ return log10f(v); }
inline float POW(float b, float e) { return powf(b,e); }
static float _(double n){ return ((float) n); }
static float _(float n){ return n; }
};
#define SET_MATCH_TO_MATCH_PROB(output, insQual, delQual) \
{ \
int minQual = delQual; \
int maxQual = insQual; \
if (insQual <= delQual) \
{ \
minQual = insQual; \
maxQual = delQual; \
} \
(output) = (MAX_QUAL < maxQual) ? \
((NUMBER)1.0) - ctx.POW(((NUMBER)10), ctx.approximateLog10SumLog10(((NUMBER)-0.1)*minQual, ((NUMBER)-0.1)*maxQual)) \
: ctx.matchToMatchProb[((maxQual * (maxQual + 1)) >> 1) + minQual]; \
}
typedef struct
{
int rslen, haplen;
/*int *q, *i, *d, *c;*/
/*int q[MROWS], i[MROWS], d[MROWS], c[MROWS];*/
char *q, *i, *d, *c;
char *hap, *rs;
int *ihap;
int *irs;
} testcase;
#define MIN_ACCEPTED 1e-28f
#define NUM_DISTINCT_CHARS 5
#define AMBIG_CHAR 4
class ConvertChar {
static uint8_t conversionTable[255] ;
public:
static void init() {
assert (NUM_DISTINCT_CHARS == 5) ;
assert (AMBIG_CHAR == 4) ;
conversionTable['A'] = 0 ;
conversionTable['C'] = 1 ;
conversionTable['T'] = 2 ;
conversionTable['G'] = 3 ;
conversionTable['N'] = 4 ;
}
static inline uint8_t get(uint8_t input) {
return conversionTable[input] ;
}
};
int normalize(char c);
int read_testcase(testcase *tc, FILE* ifp=0);
#endif

View File

@ -42,33 +42,33 @@
#undef MASK_TYPE
#undef MASK_ALL_ONES
#undef SET_VEC_ZERO(__vec)
#undef VEC_OR(__v1, __v2)
#undef VEC_ADD(__v1, __v2)
#undef VEC_SUB(__v1, __v2)
#undef VEC_MUL(__v1, __v2)
#undef VEC_DIV(__v1, __v2)
#undef VEC_BLEND(__v1, __v2, __mask)
#undef VEC_BLENDV(__v1, __v2, __maskV)
#undef VEC_CAST_256_128(__v1)
#undef VEC_EXTRACT_128(__v1, __im)
#undef VEC_EXTRACT_UNIT(__v1, __im)
#undef VEC_SET1_VAL128(__val)
#undef VEC_MOVE(__v1, __val)
#undef VEC_CAST_128_256(__v1)
#undef VEC_INSERT_VAL(__v1, __val, __pos)
#undef VEC_CVT_128_256(__v1)
#undef VEC_SET1_VAL(__val)
#undef VEC_POPCVT_CHAR(__ch)
#undef VEC_LDPOPCVT_CHAR(__addr)
#undef VEC_CMP_EQ(__v1, __v2)
#undef VEC_SET_LSE(__val)
#undef SHIFT_HAP(__v1, __val)
#undef SET_VEC_ZERO
#undef VEC_OR
#undef VEC_ADD
#undef VEC_SUB
#undef VEC_MUL
#undef VEC_DIV
#undef VEC_BLEND
#undef VEC_BLENDV
#undef VEC_CAST_256_128
#undef VEC_EXTRACT_128
#undef VEC_EXTRACT_UNIT
#undef VEC_SET1_VAL128
#undef VEC_MOVE
#undef VEC_CAST_128_256
#undef VEC_INSERT_VAL
#undef VEC_CVT_128_256
#undef VEC_SET1_VAL
#undef VEC_POPCVT_CHAR
#undef VEC_LDPOPCVT_CHAR
#undef VEC_CMP_EQ
#undef VEC_SET_LSE
#undef SHIFT_HAP
#undef MASK_VEC
#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
#undef VEC_SHIFT_LEFT_1BIT(__vs)
#undef VEC_SSE_TO_AVX
#undef VEC_SHIFT_LEFT_1BIT
#undef MASK_ALL_ONES
#undef COMPARE_VECS(__v1, __v2)
#undef COMPARE_VECS
#undef _256_INT_TYPE
#undef BITMASK_VEC
#endif

View File

@ -42,33 +42,33 @@
#undef MASK_TYPE
#undef MASK_ALL_ONES
#undef SET_VEC_ZERO(__vec)
#undef VEC_OR(__v1, __v2)
#undef VEC_ADD(__v1, __v2)
#undef VEC_SUB(__v1, __v2)
#undef VEC_MUL(__v1, __v2)
#undef VEC_DIV(__v1, __v2)
#undef VEC_BLEND(__v1, __v2, __mask)
#undef VEC_BLENDV(__v1, __v2, __maskV)
#undef VEC_CAST_256_128(__v1)
#undef VEC_EXTRACT_128(__v1, __im)
#undef VEC_EXTRACT_UNIT(__v1, __im)
#undef VEC_SET1_VAL128(__val)
#undef VEC_MOVE(__v1, __val)
#undef VEC_CAST_128_256(__v1)
#undef VEC_INSERT_VAL(__v1, __val, __pos)
#undef VEC_CVT_128_256(__v1)
#undef VEC_SET1_VAL(__val)
#undef VEC_POPCVT_CHAR(__ch)
#undef VEC_LDPOPCVT_CHAR(__addr)
#undef VEC_CMP_EQ(__v1, __v2)
#undef VEC_SET_LSE(__val)
#undef SHIFT_HAP(__v1, __val)
#undef SET_VEC_ZERO
#undef VEC_OR
#undef VEC_ADD
#undef VEC_SUB
#undef VEC_MUL
#undef VEC_DIV
#undef VEC_BLEND
#undef VEC_BLENDV
#undef VEC_CAST_256_128
#undef VEC_EXTRACT_128
#undef VEC_EXTRACT_UNIT
#undef VEC_SET1_VAL128
#undef VEC_MOVE
#undef VEC_CAST_128_256
#undef VEC_INSERT_VAL
#undef VEC_CVT_128_256
#undef VEC_SET1_VAL
#undef VEC_POPCVT_CHAR
#undef VEC_LDPOPCVT_CHAR
#undef VEC_CMP_EQ
#undef VEC_SET_LSE
#undef SHIFT_HAP
#undef MASK_VEC
#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
#undef VEC_SHIFT_LEFT_1BIT(__vs)
#undef VEC_SSE_TO_AVX
#undef VEC_SHIFT_LEFT_1BIT
#undef MASK_ALL_ONES
#undef COMPARE_VECS(__v1, __v2)
#undef COMPARE_VECS
#undef _256_INT_TYPE
#undef BITMASK_VEC
#endif

View File

@ -40,35 +40,35 @@
#undef MASK_TYPE
#undef MASK_ALL_ONES
#undef VEC_EXTRACT_UNIT(__v1, __im)
#undef VEC_INSERT_UNIT(__v1,__ins,__im)
#undef SET_VEC_ZERO(__vec)
#undef VEC_OR(__v1, __v2)
#undef VEC_ADD(__v1, __v2)
#undef VEC_SUB(__v1, __v2)
#undef VEC_MUL(__v1, __v2)
#undef VEC_DIV(__v1, __v2)
#undef VEC_BLEND(__v1, __v2, __mask)
#undef VEC_BLENDV(__v1, __v2, __maskV)
#undef VEC_CAST_256_128(__v1)
#undef VEC_EXTRACT_128(__v1, __im)
#undef VEC_EXTRACT_UNIT(__v1, __im)
#undef VEC_SET1_VAL128(__val)
#undef VEC_MOVE(__v1, __val)
#undef VEC_CAST_128_256(__v1)
#undef VEC_INSERT_VAL(__v1, __val, __pos)
#undef VEC_CVT_128_256(__v1)
#undef VEC_SET1_VAL(__val)
#undef VEC_POPCVT_CHAR(__ch)
#undef VEC_LDPOPCVT_CHAR(__addr)
#undef VEC_CMP_EQ(__v1, __v2)
#undef VEC_SET_LSE(__val)
#undef SHIFT_HAP(__v1, __val)
#undef VEC_EXTRACT_UNIT
#undef VEC_INSERT_UNIT
#undef SET_VEC_ZERO
#undef VEC_OR
#undef VEC_ADD
#undef VEC_SUB
#undef VEC_MUL
#undef VEC_DIV
#undef VEC_BLEND
#undef VEC_BLENDV
#undef VEC_CAST_256_128
#undef VEC_EXTRACT_128
#undef VEC_EXTRACT_UNIT
#undef VEC_SET1_VAL128
#undef VEC_MOVE
#undef VEC_CAST_128_256
#undef VEC_INSERT_VAL
#undef VEC_CVT_128_256
#undef VEC_SET1_VAL
#undef VEC_POPCVT_CHAR
#undef VEC_LDPOPCVT_CHAR
#undef VEC_CMP_EQ
#undef VEC_SET_LSE
#undef SHIFT_HAP
#undef MASK_VEC
#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
#undef VEC_SHIFT_LEFT_1BIT(__vs)
#undef VEC_SSE_TO_AVX
#undef VEC_SHIFT_LEFT_1BIT
#undef MASK_ALL_ONES
#undef COMPARE_VECS(__v1, __v2)
#undef COMPARE_VECS
#undef _256_INT_TYPE
#undef BITMASK_VEC
#endif

View File

@ -40,35 +40,35 @@
#undef MASK_TYPE
#undef MASK_ALL_ONES
#undef VEC_EXTRACT_UNIT(__v1, __im)
#undef VEC_INSERT_UNIT(__v1,__ins,__im)
#undef SET_VEC_ZERO(__vec)
#undef VEC_OR(__v1, __v2)
#undef VEC_ADD(__v1, __v2)
#undef VEC_SUB(__v1, __v2)
#undef VEC_MUL(__v1, __v2)
#undef VEC_DIV(__v1, __v2)
#undef VEC_BLEND(__v1, __v2, __mask)
#undef VEC_BLENDV(__v1, __v2, __maskV)
#undef VEC_CAST_256_128(__v1)
#undef VEC_EXTRACT_128(__v1, __im)
#undef VEC_EXTRACT_UNIT(__v1, __im)
#undef VEC_SET1_VAL128(__val)
#undef VEC_MOVE(__v1, __val)
#undef VEC_CAST_128_256(__v1)
#undef VEC_INSERT_VAL(__v1, __val, __pos)
#undef VEC_CVT_128_256(__v1)
#undef VEC_SET1_VAL(__val)
#undef VEC_POPCVT_CHAR(__ch)
#undef VEC_LDPOPCVT_CHAR(__addr)
#undef VEC_CMP_EQ(__v1, __v2)
#undef VEC_SET_LSE(__val)
#undef SHIFT_HAP(__v1, __val)
#undef VEC_EXTRACT_UNIT
#undef VEC_INSERT_UNIT
#undef SET_VEC_ZERO
#undef VEC_OR
#undef VEC_ADD
#undef VEC_SUB
#undef VEC_MUL
#undef VEC_DIV
#undef VEC_BLEND
#undef VEC_BLENDV
#undef VEC_CAST_256_128
#undef VEC_EXTRACT_128
#undef VEC_EXTRACT_UNIT
#undef VEC_SET1_VAL128
#undef VEC_MOVE
#undef VEC_CAST_128_256
#undef VEC_INSERT_VAL
#undef VEC_CVT_128_256
#undef VEC_SET1_VAL
#undef VEC_POPCVT_CHAR
#undef VEC_LDPOPCVT_CHAR
#undef VEC_CMP_EQ
#undef VEC_SET_LSE
#undef SHIFT_HAP
#undef MASK_VEC
#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
#undef VEC_SHIFT_LEFT_1BIT(__vs)
#undef VEC_SSE_TO_AVX
#undef VEC_SHIFT_LEFT_1BIT
#undef MASK_ALL_ONES
#undef COMPARE_VECS(__v1, __v2)
#undef COMPARE_VECS
#undef _256_INT_TYPE
#undef BITMASK_VEC
#endif

View File

@ -37,7 +37,7 @@
/*#define DUMP_TO_SANDBOX 1*/
#define DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY 1
/*#define DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY 1*/
#ifdef DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY
//Gets direct access to Java arrays

View File

@ -26,7 +26,6 @@
#include "headers.h"
#include "jni_common.h"
#include "org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM.h"
#include "template.h"
#include "utils.h"
#include "LoadTimeInitializer.h"
#include "jnidebug.h"

View File

@ -26,7 +26,7 @@
#include "headers.h"
#include "jni_common.h"
#include "org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.h"
#include "template.h"
//#include "template.h"
#include "utils.h"
#include "LoadTimeInitializer.h"

View File

@ -29,7 +29,6 @@ using namespace std;
int main(int argc, char** argv)
{
#define BATCH_SIZE 10000
if(argc < 2)
{
cerr << "Needs path to input file as argument\n";

View File

@ -24,12 +24,11 @@
#include "headers.h"
#include "template.h"
#include "vector_defs.h"
#define SIMD_ENGINE avx
#define SIMD_ENGINE_AVX
#include "utils.h"
#define BATCH_SIZE 10000
#define RUN_HYBRID

View File

@ -23,7 +23,6 @@
*/
#include "template.h"
#undef SIMD_ENGINE
#undef SIMD_ENGINE_AVX
@ -31,6 +30,8 @@
#define SIMD_ENGINE sse
#define SIMD_ENGINE_SSE
#include "template.h"
#include "define-sse-float.h"
#include "shift_template.c"
#include "pairhmm-template-kernel.cc"

View File

@ -28,27 +28,17 @@
#include "headers.h"
#define MM 0
#define GapM 1
#define MX 2
#define XX 3
#define MY 4
#define YY 5
//#define MROWS 500
//#define MCOLS 1000
#define CAT(X,Y) X####Y
#define CONCAT(X,Y) CAT(X,Y)
#define ALIGNED __attribute__((aligned(32)))
#ifdef SIMD_ENGINE_AVX
typedef union __attribute__((aligned(32))) {
ALIGNED __m256 ALIGNED d;
ALIGNED __m128i ALIGNED s[2];
ALIGNED float ALIGNED f[8];
ALIGNED __m256i ALIGNED i;
} ALIGNED mix_F ALIGNED;
#endif
typedef union __attribute__((aligned(32))) {
ALIGNED __m128 ALIGNED d;
@ -81,12 +71,14 @@ typedef union ALIGNED
ALIGNED float ALIGNED f;
} ALIGNED IF_32 ALIGNED;
#ifdef SIMD_ENGINE_AVX
typedef union __attribute__((aligned(32))) {
ALIGNED __m256d ALIGNED d;
ALIGNED __m128i ALIGNED s[2];
ALIGNED double ALIGNED f[4];
ALIGNED __m256i ALIGNED i;
} ALIGNED mix_D ALIGNED;
#endif
typedef union __attribute__((aligned(32))) {
ALIGNED __m128d ALIGNED d;
@ -120,200 +112,7 @@ typedef union ALIGNED
} ALIGNED IF_64 ALIGNED;
#define MAX_QUAL 254
#define MAX_JACOBIAN_TOLERANCE 8.0
#define JACOBIAN_LOG_TABLE_STEP 0.0001
#define JACOBIAN_LOG_TABLE_INV_STEP (1.0 / JACOBIAN_LOG_TABLE_STEP)
#define MAXN 70000
#define LOG10_CACHE_SIZE (4*MAXN) // we need to be able to go up to 2*(2N) when calculating some of the coefficients
#define JACOBIAN_LOG_TABLE_SIZE ((int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1)
template<class NUMBER>
struct ContextBase
{
public:
NUMBER ph2pr[128];
NUMBER INITIAL_CONSTANT;
NUMBER LOG10_INITIAL_CONSTANT;
NUMBER RESULT_THRESHOLD;
static bool staticMembersInitializedFlag;
static NUMBER jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE];
static NUMBER matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1];
static void initializeStaticMembers()
{
if(!staticMembersInitializedFlag)
{
//Order of calls important - Jacobian first, then MatchToMatch
initializeJacobianLogTable();
initializeMatchToMatchProb();
staticMembersInitializedFlag = true;
}
}
static void deleteStaticMembers()
{
if(staticMembersInitializedFlag)
{
staticMembersInitializedFlag = false;
}
}
//Called only once during library load - don't bother to optimize with single precision fp
static void initializeJacobianLogTable()
{
for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) {
jacobianLogTable[k] = (NUMBER)(log10(1.0 + pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP)));
}
}
//Called only once per library load - don't bother optimizing with single fp
static void initializeMatchToMatchProb()
{
double LN10 = log(10);
double INV_LN10 = 1.0/LN10;
for (int i = 0, offset = 0; i <= MAX_QUAL; offset += ++i)
for (int j = 0; j <= i; j++) {
double log10Sum = approximateLog10SumLog10(-0.1*i, -0.1*j);
double matchToMatchLog10 =
log1p(-std::min(1.0,pow(10,log10Sum))) * INV_LN10;
matchToMatchProb[offset + j] = (NUMBER)(pow(10,matchToMatchLog10));
}
}
//Called during computation - use single precision where possible
static int fastRound(NUMBER d) {
return (d > ((NUMBER)0.0)) ? (int) (d + ((NUMBER)0.5)) : (int) (d - ((NUMBER)0.5));
}
//Called during computation - use single precision where possible
static NUMBER approximateLog10SumLog10(NUMBER small, NUMBER big) {
// make sure small is really the smaller value
if (small > big) {
NUMBER t = big;
big = small;
small = t;
}
if (isinf(small) == -1 || isinf(big) == -1)
return big;
NUMBER diff = big - small;
if (diff >= ((NUMBER)MAX_JACOBIAN_TOLERANCE))
return big;
// OK, so |y-x| < tol: we use the following identity then:
// we need to compute log10(10^x + 10^y)
// By Jacobian logarithm identity, this is equal to
// max(x,y) + log10(1+10^-abs(x-y))
// we compute the second term as a table lookup with integer quantization
// we have pre-stored correction for 0,0.1,0.2,... 10.0
int ind = fastRound((NUMBER)(diff * ((NUMBER)JACOBIAN_LOG_TABLE_INV_STEP))); // hard rounding
return big + jacobianLogTable[ind];
}
};
template<class NUMBER>
struct Context : public ContextBase<NUMBER>
{};
template<>
struct Context<double> : public ContextBase<double>
{
Context():ContextBase<double>()
{
for (int x = 0; x < 128; x++)
ph2pr[x] = pow(10.0, -((double)x) / 10.0);
INITIAL_CONSTANT = ldexp(1.0, 1020.0);
LOG10_INITIAL_CONSTANT = log10(INITIAL_CONSTANT);
RESULT_THRESHOLD = 0.0;
}
double LOG10(double v){ return log10(v); }
inline double POW(double b, double e) { return pow(b,e); }
static double _(double n){ return n; }
static double _(float n){ return ((double) n); }
};
template<>
struct Context<float> : public ContextBase<float>
{
Context() : ContextBase<float>()
{
for (int x = 0; x < 128; x++)
{
ph2pr[x] = powf(10.f, -((float)x) / 10.f);
}
INITIAL_CONSTANT = ldexpf(1.f, 120.f);
LOG10_INITIAL_CONSTANT = log10f(INITIAL_CONSTANT);
RESULT_THRESHOLD = ldexpf(1.f, -110.f);
}
float LOG10(float v){ return log10f(v); }
inline float POW(float b, float e) { return powf(b,e); }
static float _(double n){ return ((float) n); }
static float _(float n){ return n; }
};
#define SET_MATCH_TO_MATCH_PROB(output, insQual, delQual) \
{ \
int minQual = delQual; \
int maxQual = insQual; \
if (insQual <= delQual) \
{ \
minQual = insQual; \
maxQual = delQual; \
} \
(output) = (MAX_QUAL < maxQual) ? \
((NUMBER)1.0) - ctx.POW(((NUMBER)10), ctx.approximateLog10SumLog10(((NUMBER)-0.1)*minQual, ((NUMBER)-0.1)*maxQual)) \
: ctx.matchToMatchProb[((maxQual * (maxQual + 1)) >> 1) + minQual]; \
}
typedef struct
{
int rslen, haplen;
/*int *q, *i, *d, *c;*/
/*int q[MROWS], i[MROWS], d[MROWS], c[MROWS];*/
char *q, *i, *d, *c;
char *hap, *rs;
int *ihap;
int *irs;
} testcase;
int normalize(char c);
int read_testcase(testcase *tc, FILE* ifp=0);
#define MIN_ACCEPTED 1e-28f
#define NUM_DISTINCT_CHARS 5
#define AMBIG_CHAR 4
class ConvertChar {
static uint8_t conversionTable[255] ;
public:
static void init() {
assert (NUM_DISTINCT_CHARS == 5) ;
assert (AMBIG_CHAR == 4) ;
conversionTable['A'] = 0 ;
conversionTable['C'] = 1 ;
conversionTable['T'] = 2 ;
conversionTable['G'] = 3 ;
conversionTable['N'] = 4 ;
}
static inline uint8_t get(uint8_t input) {
return conversionTable[input] ;
}
};
#include "common_data_structure.h"
#endif

View File

@ -22,11 +22,8 @@
*THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "headers.h"
#include "template.h"
#include "utils.h"
#include "vector_defs.h"
#include "LoadTimeInitializer.h"
using namespace std;
@ -36,51 +33,107 @@ uint8_t ConvertChar::conversionTable[255];
float (*g_compute_full_prob_float)(testcase *tc, float* before_last_log) = 0;
double (*g_compute_full_prob_double)(testcase *tc, double* before_last_log) = 0;
//Static members in ContextBase
template<>
bool ContextBase<double>::staticMembersInitializedFlag = false;
double ContextBase<double>::jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE];
double ContextBase<double>::matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1];
template<>
double ContextBase<double>::jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE] = { };
template<>
double ContextBase<double>::matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1] = { };
template<>
bool ContextBase<float>::staticMembersInitializedFlag = false;
float ContextBase<float>::jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE];
float ContextBase<float>::matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1];
template<>
float ContextBase<float>::jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE] = { };
template<>
float ContextBase<float>::matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1] = { };
bool search_file_for_string(string filename, string search_string)
{
ifstream fptr;
fptr.open(filename.c_str(),ios::in);
if(fptr.is_open())
{
string buffer;
buffer.clear();
buffer.resize(4096);
bool retvalue = false;
while(!fptr.eof())
{
fptr.getline(&(buffer[0]), 4096);
if(buffer.find(search_string) != string::npos) //found string
{
retvalue = true;
break;
}
}
buffer.clear();
fptr.close();
return retvalue;
}
else
return false;
}
bool is_cpuid_ecx_bit_set(int eax, int bitidx)
{
int ecx = 0, edx = 0, ebx = 0;
__asm__ ("cpuid"
:"=b" (ebx),
"=c" (ecx),
"=d" (edx)
:"a" (eax)
);
return (((ecx >> bitidx)&1) == 1);
}
bool is_avx_supported()
{
return (_may_i_use_cpu_feature(_FEATURE_AVX) > 0);
//int ecx = 0, edx = 0, ebx = 0;
//__asm__("cpuid"
//: "=b" (ebx),
//"=c" (ecx),
//"=d" (edx)
//: "a" (1)
//);
//return ((ecx >> 28)&1) == 1;
#ifdef __INTEL_COMPILER
bool use_avx = _may_i_use_cpu_feature(_FEATURE_AVX);
if(use_avx)
return true;
else
{
//check if core supports AVX, but kernel does not and print info message
if(!is_cpuid_ecx_bit_set(1, 28)) //core does not support AVX
return false;
//else fall through to end of function
}
#else
if(!__builtin_cpu_supports("avx")) //core does not support AVX
return false;
else
{
//core supports AVX, check if kernel supports
if(search_file_for_string("/proc/cpuinfo","avx"))
return true;
//else fall through to end of function
}
#endif //__INTEL_COMPILER
clog << "INFO: Your CPU supports AVX vector instructions, but your kernel does not. Try upgrading to a kernel that supports AVX.\n";
clog << "INFO: Your program will run correctly, but slower than the AVX version\n";
return false;
}
bool is_sse41_supported()
{
#ifdef __INTEL_COMPILER
return (_may_i_use_cpu_feature(_FEATURE_SSE4_1) > 0);
//int ecx = 0, edx = 0, ebx = 0;
//__asm__("cpuid"
//: "=b" (ebx),
//"=c" (ecx),
//"=d" (edx)
//: "a" (1)
//);
//return ((ecx >> 19)&1) == 1;
#else
return __builtin_cpu_supports("sse4.1");
#endif
//return is_cpuid_ecx_bit_set(1, 19);
}
bool is_sse42_supported()
{
#ifdef __INTEL_COMPILER
return (_may_i_use_cpu_feature(_FEATURE_SSE4_2) > 0);
//int ecx = 0, edx = 0, ebx = 0;
//__asm__("cpuid"
//: "=b" (ebx),
//"=c" (ecx),
//"=d" (edx)
//: "a" (1)
//);
//return ((ecx >> 20)&1) == 1;
#else
return __builtin_cpu_supports("sse4.2");
#endif
//return is_cpuid_ecx_bit_set(1, 20);
}
uint64_t get_machine_capabilities()
@ -154,10 +207,14 @@ int read_testcase(testcase *tc, FILE* ifp)
tc->haplen = strlen(tc->hap);
tc->rslen = strlen(tc->rs);
assert(strlen(q) == tc->rslen);
assert(strlen(i) == tc->rslen);
assert(strlen(d) == tc->rslen);
assert(strlen(c) == tc->rslen);
assert(strlen(q) == (size_t)tc->rslen);
assert(strlen(i) == (size_t)tc->rslen);
assert(strlen(d) == (size_t)tc->rslen);
assert(strlen(c) == (size_t)tc->rslen);
g_load_time_initializer.update_stat(READ_LENGTH_IDX, tc->rslen);
g_load_time_initializer.update_stat(HAPLOTYPE_LENGTH_IDX, tc->haplen);
g_load_time_initializer.update_stat(PRODUCT_READ_LENGTH_HAPLOTYPE_LENGTH_IDX, tc->haplen*tc->rslen);
//assert(tc->rslen < MROWS);
//tc->ihap = (int *) malloc(tc->haplen*sizeof(int));
//tc->irs = (int *) malloc(tc->rslen*sizeof(int));
@ -279,15 +336,15 @@ int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat)
tc->c = new char[tc->rslen];
//cout << "Lengths "<<tc->haplen <<" "<<tc->rslen<<"\n";
memcpy(tc->rs, tokens[1].c_str(),tokens[1].size());
assert(tokens.size() == 2 + 4*(tc->rslen));
assert(tokens.size() == (size_t)(2 + 4*(tc->rslen)));
//assert(tc->rslen < MROWS);
for(unsigned j=0;j<tc->rslen;++j)
for(int j=0;j<tc->rslen;++j)
tc->q[j] = (char)convToInt(tokens[2+0*tc->rslen+j]);
for(unsigned j=0;j<tc->rslen;++j)
for(int j=0;j<tc->rslen;++j)
tc->i[j] = (char)convToInt(tokens[2+1*tc->rslen+j]);
for(unsigned j=0;j<tc->rslen;++j)
for(int j=0;j<tc->rslen;++j)
tc->d[j] = (char)convToInt(tokens[2+2*tc->rslen+j]);
for(unsigned j=0;j<tc->rslen;++j)
for(int j=0;j<tc->rslen;++j)
tc->c[j] = (char)convToInt(tokens[2+3*tc->rslen+j]);
if(reformat)
@ -297,16 +354,16 @@ int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat)
assert(ofptr.is_open());
ofptr << tokens[0] << " ";
ofptr << tokens[1] << " ";
for(unsigned j=0;j<tc->rslen;++j)
for(int j=0;j<tc->rslen;++j)
ofptr << ((char)(tc->q[j]+33));
ofptr << " ";
for(unsigned j=0;j<tc->rslen;++j)
for(int j=0;j<tc->rslen;++j)
ofptr << ((char)(tc->i[j]+33));
ofptr << " ";
for(unsigned j=0;j<tc->rslen;++j)
for(int j=0;j<tc->rslen;++j)
ofptr << ((char)(tc->d[j]+33));
ofptr << " ";
for(unsigned j=0;j<tc->rslen;++j)
for(int j=0;j<tc->rslen;++j)
ofptr << ((char)(tc->c[j]+33));
ofptr << " 0 false\n";
@ -363,6 +420,10 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
ifptr.open(filename);
assert(ifptr.is_open());
}
#ifdef PRINT_PER_INTERVAL_TIMINGS
ofstream times_fptr;
times_fptr.open("native_timed_intervals.csv",ios::out);
#endif
vector<testcase> tc_vector;
tc_vector.clear();
testcase tc;
@ -377,11 +438,11 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
uint32_t no_kernel_mask = (1 << 17); //bit 16 user mode, bit 17 kernel mode
PAPI_num_counters();
int events[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
char* eventnames[NUM_PAPI_COUNTERS]= { "cycles", "itlb_walk_cycles", "dtlb_load_walk_cycles", "dtlb_store_walk_cycles" };
char* eventnames[NUM_PAPI_COUNTERS]= { "cycles", "l1_pending_miss", "lfb_hit", "l2_hit" };
assert(PAPI_event_name_to_code("UNHALTED_REFERENCE_CYCLES:u=1:k=1",&(events[0])) == PAPI_OK);
assert(PAPI_event_name_to_code("ITLB_MISSES:WALK_DURATION", &(events[1])) == PAPI_OK);
assert(PAPI_event_name_to_code("DTLB_LOAD_MISSES:WALK_DURATION", &(events[2])) == PAPI_OK);
assert(PAPI_event_name_to_code("DTLB_STORE_MISSES:WALK_DURATION", &(events[3])) == PAPI_OK);
assert(PAPI_event_name_to_code("L1D_PEND_MISS:OCCURRENCES", &(events[1])) == PAPI_OK);
assert(PAPI_event_name_to_code("MEM_LOAD_UOPS_RETIRED:HIT_LFB", &(events[2])) == PAPI_OK);
assert(PAPI_event_name_to_code("MEM_LOAD_UOPS_RETIRED:L2_HIT", &(events[3])) == PAPI_OK);
long long values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
long long accum_values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
#endif
@ -398,6 +459,9 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
baseline_results_vec.clear();
results_vec.resize(tc_vector.size());
baseline_results_vec.resize(tc_vector.size());
g_load_time_initializer.update_stat(NUM_TESTCASES_IDX, tc_vector.size());
g_load_time_initializer.update_stat(NUM_READS_IDX, tc_vector.size());
g_load_time_initializer.update_stat(NUM_HAPLOTYPES_IDX, tc_vector.size());
struct timespec start_time;
#ifdef USE_PAPI
assert(PAPI_start_counters(events, NUM_PAPI_COUNTERS) == PAPI_OK);
@ -429,7 +493,11 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
#ifdef USE_PAPI
assert(PAPI_stop_counters(values, NUM_PAPI_COUNTERS) == PAPI_OK);
#endif
vector_compute_time += diff_time(start_time);
uint64_t curr_interval = diff_time(start_time);
#ifdef PRINT_PER_INTERVAL_TIMINGS
times_fptr << curr_interval << "\n";
#endif
vector_compute_time += curr_interval;
#ifdef USE_PAPI
for(unsigned k=0;k<NUM_PAPI_COUNTERS;++k)
accum_values[k] += values[k];
@ -488,9 +556,12 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
for(unsigned i=0;i<NUM_PAPI_COUNTERS;++i)
cout << eventnames[i] << " : "<<accum_values[i]<<"\n";
#endif
#ifdef PRINT_PER_INTERVAL_TIMINGS
times_fptr.close();
#endif
if(use_old_read_testcase)
fclose(fptr);
else
ifptr.close();
g_load_time_initializer.print_profiling();
}

View File

@ -26,7 +26,7 @@
#ifndef PAIRHMM_UTIL_H
#define PAIRHMM_UTIL_H
#include "template.h"
#include "common_data_structure.h"
template<class T>
std::string to_string(T obj)
@ -50,6 +50,15 @@ extern double (*g_compute_full_prob_double)(testcase *tc, double* before_last_lo
void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline);
template<class NUMBER>
NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log=0);
template<class NUMBER>
NUMBER compute_full_prob_avxd(testcase *tc, NUMBER *before_last_log=0);
template<class NUMBER>
NUMBER compute_full_prob_avxs(testcase *tc, NUMBER *before_last_log=0);
template<class NUMBER>
NUMBER compute_full_prob_ssed(testcase *tc, NUMBER *before_last_log=0);
template<class NUMBER>
NUMBER compute_full_prob_sses(testcase *tc, NUMBER *before_last_log=0);
double getCurrClk();
void get_time(struct timespec* x);
uint64_t diff_time(struct timespec& prev_time);
@ -71,5 +80,6 @@ void do_compute(char* filename, bool use_old_read_testcase=true, unsigned chunk_
/*#define DUMP_COMPUTE_VALUES 1*/
#define BATCH_SIZE 10000
#define RUN_HYBRID
/*#define PRINT_PER_INTERVAL_TIMINGS 1*/
#endif

View File

@ -1,55 +0,0 @@
/*Copyright (c) 2012 The Broad Institute
*Permission is hereby granted, free of charge, to any person
*obtaining a copy of this software and associated documentation
*files (the "Software"), to deal in the Software without
*restriction, including without limitation the rights to use,
*copy, modify, merge, publish, distribute, sublicense, and/or sell
*copies of the Software, and to permit persons to whom the
*Software is furnished to do so, subject to the following
*conditions:
*The above copyright notice and this permission notice shall be
*included in all copies or substantial portions of the Software.
*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
*THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#undef SIMD_ENGINE
#undef SIMD_ENGINE_AVX
#undef SIMD_ENGINE_SSE
#define SIMD_ENGINE avx
#define SIMD_ENGINE_AVX
#include "define-float.h"
#include "vector_function_prototypes.h"
#include "define-double.h"
#include "vector_function_prototypes.h"
#undef SIMD_ENGINE
#undef SIMD_ENGINE_AVX
#define SIMD_ENGINE sse
#define SIMD_ENGINE_SSE
#include "define-sse-float.h"
#include "vector_function_prototypes.h"
#include "define-sse-double.h"
#include "vector_function_prototypes.h"
#undef SIMD_ENGINE
#undef SIMD_ENGINE_AVX
#undef SIMD_ENGINE_SSE

View File

@ -1,44 +0,0 @@
/*Copyright (c) 2012 The Broad Institute
*Permission is hereby granted, free of charge, to any person
*obtaining a copy of this software and associated documentation
*files (the "Software"), to deal in the Software without
*restriction, including without limitation the rights to use,
*copy, modify, merge, publish, distribute, sublicense, and/or sell
*copies of the Software, and to permit persons to whom the
*Software is furnished to do so, subject to the following
*conditions:
*The above copyright notice and this permission notice shall be
*included in all copies or substantial portions of the Software.
*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
*THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
inline void CONCAT(CONCAT(_vector_shift,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut);
inline void CONCAT(CONCAT(_vector_shift_last,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn);
inline void CONCAT(CONCAT(precompute_masks_,SIMD_ENGINE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]);
inline void CONCAT(CONCAT(init_masks_for_row_,SIMD_ENGINE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess);
inline void CONCAT(CONCAT(update_masks_for_cols_,SIMD_ENGINE), PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt);
inline void CONCAT(CONCAT(computeDistVec,SIMD_ENGINE), PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, SIMD_TYPE& distm, SIMD_TYPE& _1_distm, SIMD_TYPE& distmChosen);
template<class NUMBER> inline void CONCAT(CONCAT(initializeVectors,SIMD_ENGINE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context<NUMBER> ctx, testcase *tc, SIMD_TYPE *p_MM, SIMD_TYPE *p_GAPM, SIMD_TYPE *p_MX, SIMD_TYPE *p_XX, SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, SIMD_TYPE *distm1D);
template<class NUMBER> inline void CONCAT(CONCAT(stripINITIALIZATION,SIMD_ENGINE), PRECISION)(
int stripIdx, Context<NUMBER> ctx, testcase *tc, SIMD_TYPE &pGAPM, SIMD_TYPE &pMM, SIMD_TYPE &pMX, SIMD_TYPE &pXX, SIMD_TYPE &pMY, SIMD_TYPE &pYY,
SIMD_TYPE &rs, UNION_TYPE &rsN, SIMD_TYPE &distm, SIMD_TYPE &_1_distm, SIMD_TYPE *distm1D, SIMD_TYPE N_packed256, SIMD_TYPE *p_MM , SIMD_TYPE *p_GAPM ,
SIMD_TYPE *p_MX, SIMD_TYPE *p_XX , SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1,
UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM);
inline SIMD_TYPE CONCAT(CONCAT(computeDISTM,SIMD_ENGINE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, SIMD_TYPE rs, UNION_TYPE rsN, SIMD_TYPE N_packed256,
SIMD_TYPE distm, SIMD_TYPE _1_distm);
inline void CONCAT(CONCAT(computeMXY,SIMD_ENGINE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y,
UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1,
SIMD_TYPE pMM, SIMD_TYPE pGAPM, SIMD_TYPE pMX, SIMD_TYPE pXX, SIMD_TYPE pMY, SIMD_TYPE pYY, SIMD_TYPE distmSel);
template<class NUMBER> NUMBER CONCAT(CONCAT(compute_full_prob_,SIMD_ENGINE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL);

View File

@ -78,8 +78,9 @@ public abstract class PairHMM {
protected double[] mLikelihoodArray;
//profiling information
protected static final boolean doProfiling = true;
protected long computeTime = 0;
protected static Boolean doProfiling = true;
protected static long pairHMMComputeTime = 0;
protected long threadLocalPairHMMComputeTimeDiff = 0;
protected long startTime = 0;
/**
@ -207,7 +208,13 @@ public abstract class PairHMM {
}
}
if(doProfiling)
computeTime += (System.nanoTime() - startTime);
{
threadLocalPairHMMComputeTimeDiff = (System.nanoTime() - startTime);
//synchronized(doProfiling)
{
pairHMMComputeTime += threadLocalPairHMMComputeTimeDiff;
}
}
return likelihoodMap;
}
@ -314,6 +321,17 @@ public abstract class PairHMM {
return Math.min(haplotype1.length, haplotype2.length);
}
/**
* Use number of threads to set doProfiling flag - doProfiling iff numThreads == 1
* This function should be called only during initialization phase - single thread phase of HC
*/
public static void setNumberOfThreads(final int numThreads)
{
doProfiling = (numThreads == 1);
if(numThreads > 1)
logger.info("Performance profiling for PairHMM is disabled because HaplotypeCaller is being run with multiple threads (-nct>1) option\nProfiling is enabled only when running in single thread mode\n");
}
/**
* Return the results of the computeLikelihoods function
*/
@ -324,6 +342,6 @@ public abstract class PairHMM {
public void close()
{
if(doProfiling)
System.out.println("Total compute time in PairHMM computeLikelihoods() : "+(computeTime*1e-9));
System.out.println("Total compute time in PairHMM computeLikelihoods() : "+(pairHMMComputeTime*1e-9));
}
}