Merge pull request #573 from broadinstitute/intel_pairhmm

Intel pairhmm
This commit is contained in:
MauricioCarneiro 2014-05-05 16:27:04 -04:00
commit 587e81fbd9
31 changed files with 548 additions and 527 deletions

View File

@ -417,7 +417,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
*/
@Hidden
@Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false)
public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING;
public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.VECTOR_LOGLESS_CACHING;
@Hidden
@Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
@ -639,6 +639,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
logger.info("Using global mismapping rate of " + phredScaledGlobalReadMismappingRate + " => " + log10GlobalReadMismappingRate + " in log10 likelihood units");
}
//static member function - set number of threads
PairHMM.setNumberOfThreads(getToolkit().getTotalNumberOfThreads());
// create our likelihood calculation engine
likelihoodCalculationEngine = createLikelihoodCalculationEngine();

View File

@ -58,6 +58,7 @@ import java.util.HashMap;
*/
public abstract class JNILoglessPairHMM extends LoglessPairHMM {
public abstract HashMap<Haplotype, Integer> getHaplotypeToHaplotypeListIdxMap();
protected long setupTime = 0;
protected long threadLocalSetupTimeDiff = 0;
protected static long pairHMMSetupTime = 0;
}

View File

@ -228,7 +228,7 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM {
mLikelihoodArray = new double[readListSize*numHaplotypes]; //to store results
if(doProfiling)
setupTime += (System.nanoTime() - startTime);
threadLocalSetupTimeDiff = (System.nanoTime() - startTime);
//for(reads)
// for(haplotypes)
// compute_full_prob()
@ -251,7 +251,14 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM {
readIdx += numHaplotypes;
}
if(doProfiling)
computeTime += (System.nanoTime() - startTime);
{
threadLocalPairHMMComputeTimeDiff = (System.nanoTime() - startTime);
//synchronized(doProfiling)
{
pairHMMComputeTime += threadLocalPairHMMComputeTimeDiff;
pairHMMSetupTime += threadLocalSetupTimeDiff;
}
}
return likelihoodMap;
}
@ -262,7 +269,8 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM {
@Override
public void close()
{
System.out.println("Time spent in setup for JNI call : "+(setupTime*1e-9));
if(doProfiling)
System.out.println("Time spent in setup for JNI call : "+(pairHMMSetupTime*1e-9));
super.close();
jniClose();
}

View File

@ -358,7 +358,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
final File outputVCFNoQD = executeTest("testQualByDepth calling without QD", specNoQD).getFirst().get(0);
final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoQD.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A QualByDepth";
final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("4ccdbebcfd02be87ae5b4ad94666f011"));
final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("214799b5d1c0809f042c557155ea6e13"));
specAnn.disableShadowBCF();
final File outputVCFAnn = executeTest("testQualByDepth re-annotation of QD", specAnn).getFirst().get(0);

View File

@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleComplex1() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "65c316f1f3987d7bc94e887999920d45");
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "d1626d5fff419b8a782de954224e881f");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {

View File

@ -284,7 +284,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestConservativePcrIndelModelWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
Arrays.asList("7592274ecd2b5ef4624dd2ed659536ef"));
Arrays.asList("1b41bf83b9c249a9a5fffb1e308668c1"));
executeTest("HC calling with conservative indel error modeling on WGS intervals", spec);
}

View File

@ -44,9 +44,10 @@ end of every region (line 351 in PairHMMLikelihoodCalculationEngine).
Note: Debug code has been moved to a separate class DebugJNILoglessPairHMM.java.
Compiling:
Make sure you have icc (Intel C compiler) available. Currently, gcc does not seem to support all AVX
intrinsics.
This native library is called libVectorLoglessPairHMM.so
The native library (called libVectorLoglessPairHMM.so) can be compiled with icc (Intel C compiler)
or gcc versions >= 4.8.1 that support AVX intrinsics. By default, the make process tries to invoke
icc. To use gcc, edit the file 'pom.xml' (in this directory) and enable the environment variables
USE_GCC,C_COMPILER and CPP_COMPILER (edit and uncomment lines 60-62).
Using Maven:
Type 'mvn install' in this directory - this will build the library (by invoking 'make') and copy the
native library to the directory

View File

@ -5,7 +5,7 @@
<parent>
<groupId>org.broadinstitute.sting</groupId>
<artifactId>sting-root</artifactId>
<version>2.8-SNAPSHOT</version>
<version>3.2-SNAPSHOT</version>
<relativePath>../../public/sting-root</relativePath>
</parent>
@ -57,6 +57,9 @@
<workingDirectory>src/main/c++</workingDirectory>
<environmentVariables>
<JRE_HOME>${java.home}</JRE_HOME>
<!--<USE_GCC>1</USE_GCC>-->
<!--<C_COMPILER>/opt/gcc-4.8.2/bin/gcc</C_COMPILER>-->
<!--<CPP_COMPILER>/opt/gcc-4.8.2/bin/g++</CPP_COMPILER>-->
<OUTPUT_DIR>${project.build.directory}</OUTPUT_DIR>
</environmentVariables>
</configuration>

View File

@ -23,8 +23,8 @@
*/
#include "LoadTimeInitializer.h"
#include "utils.h"
#include "LoadTimeInitializer.h"
using namespace std;
char* LoadTimeInitializerStatsNames[] =
{
@ -43,6 +43,10 @@ LoadTimeInitializer g_load_time_initializer;
LoadTimeInitializer::LoadTimeInitializer() //will be called when library is loaded
{
#if (defined(__GNUC__) || defined(__GNUG__)) && !defined(__INTEL_COMPILER)
//compiles only with gcc >= 4.8
__builtin_cpu_init();
#endif
ConvertChar::init();
#ifndef DISABLE_FTZ
//Very important to get good performance on Intel processors
@ -71,11 +75,6 @@ LoadTimeInitializer::LoadTimeInitializer() //will be called when library is loa
m_filename_to_fptr.clear();
m_written_files_set.clear();
//Common buffer - 8MB
unsigned size = 1024*1024;
m_buffer = new uint64_t[size];
m_buffer_size = size*sizeof(uint64_t);
initialize_function_pointers();
//Initialize static members of class

View File

@ -27,7 +27,7 @@
#define LOAD_TIME_INITIALIZER_H
#include "headers.h"
#include <jni.h>
#include "template.h"
/*#include "template.h"*/
enum LoadTimeInitializerStatsEnum
{
@ -47,10 +47,6 @@ class LoadTimeInitializer
{
public:
LoadTimeInitializer(); //will be called when library is loaded
~LoadTimeInitializer()
{
delete[] m_buffer;
}
void print_profiling();
void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true);
void debug_close();
@ -72,8 +68,6 @@ class LoadTimeInitializer
uint64_t m_data_transfer_time;
//bytes copied
uint64_t m_bytes_copied;
unsigned get_buffer_size() { return m_buffer_size; }
char* get_buffer() { return (char*)m_buffer; }
private:
std::map<std::string, std::ofstream*> m_filename_to_fptr;
std::set<std::string> m_written_files_set;
@ -83,8 +77,6 @@ class LoadTimeInitializer
double m_sum_square_stats[TOTAL_NUMBER_STATS];
uint64_t m_min_stats[TOTAL_NUMBER_STATS];
uint64_t m_max_stats[TOTAL_NUMBER_STATS];
unsigned m_buffer_size;
uint64_t* m_buffer;
};
extern LoadTimeInitializer g_load_time_initializer;

View File

@ -32,14 +32,30 @@
JRE_HOME?=/opt/jdk1.7.0_25/jre
JNI_COMPILATION_FLAGS=-D_REENTRANT -fPIC -I${JRE_HOME}/../include -I${JRE_HOME}/../include/linux
COMMON_COMPILATION_FLAGS=$(JNI_COMPILATION_FLAGS) -O3 -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas
CC=icc
CXX=icc
#COMMON_COMPILATION_FLAGS=$(JNI_COMPILATION_FLAGS) -O3 -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas
COMMON_COMPILATION_FLAGS=$(JNI_COMPILATION_FLAGS) -O3 -Wall $(OMPCFLAGS) -Wno-unknown-pragmas -Wno-write-strings -Wno-unused-variable -Wno-unused-but-set-variable
ifdef DISABLE_FTZ
COMMON_COMPILATION_FLAGS+=-DDISABLE_FTZ
endif
ifdef USE_GCC
C_COMPILER?=gcc
CPP_COMPILER?=g++
AVX_FLAGS=-mavx
SSE41_FLAGS=-msse4.1
COMMON_COMPILATION_FLAGS+=-Wno-char-subscripts
else
C_COMPILER?=icc
CPP_COMPILER?=icc
AVX_FLAGS=-xAVX
SSE41_FLAGS=-xSSE4.1
LIBFLAGS=-static-intel
ifdef DISABLE_FTZ
COMMON_COMPILATION_FLAGS+=-no-ftz
endif
endif
LDFLAGS=-lm -lrt $(OMPLDFLAGS)
ifdef DISABLE_FTZ
COMMON_COMPILATION_FLAGS+=-DDISABLE_FTZ -no-ftz
endif
PAPI_DIR=/home/karthikg/softwares/papi-5.3.0
ifdef USE_PAPI
@ -49,10 +65,6 @@ ifdef USE_PAPI
endif
endif
ifdef DISABLE_FTZ
COMMON_COMPILATION_FLAGS+=-DDISABLE_FTZ -no-ftz
endif
BIN=libVectorLoglessPairHMM.so pairhmm-template-main checker
#BIN=checker
@ -79,8 +91,8 @@ NO_VECTOR_OBJECTS=$(NO_VECTOR_SOURCES:.cc=.o)
AVX_OBJECTS=$(AVX_SOURCES:.cc=.o)
SSE_OBJECTS=$(SSE_SOURCES:.cc=.o)
$(NO_VECTOR_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS)
$(AVX_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) -xAVX
$(SSE_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) -xSSE4.2
$(AVX_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) $(AVX_FLAGS)
$(SSE_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) $(SSE41_FLAGS)
OBJECTS=$(NO_VECTOR_OBJECTS) $(AVX_OBJECTS) $(SSE_OBJECTS)
all: $(BIN) Sandbox.class copied_lib
@ -88,18 +100,18 @@ all: $(BIN) Sandbox.class copied_lib
-include $(addprefix $(DEPDIR)/,$(SOURCES:.cc=.d))
checker: pairhmm-1-base.o $(COMMON_OBJECTS)
$(CXX) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)
$(CPP_COMPILER) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)
pairhmm-template-main: pairhmm-template-main.o $(COMMON_OBJECTS)
$(CXX) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)
$(CPP_COMPILER) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS)
libVectorLoglessPairHMM.so: $(LIBOBJECTS)
$(CXX) $(OMPLFLAGS) -shared -static-intel -o $@ $(LIBOBJECTS) ${LDFLAGS}
$(CPP_COMPILER) $(OMPLFLAGS) -shared $(LIBFLAGS) -o $@ $(LIBOBJECTS) ${LDFLAGS}
$(OBJECTS): %.o: %.cc
@mkdir -p $(DEPDIR)
$(CXX) -c -MMD -MF $(DF) $(CXXFLAGS) $(OUTPUT_OPTION) $<
$(CPP_COMPILER) -c -MMD -MF $(DF) $(CXXFLAGS) $(OUTPUT_OPTION) $<
Sandbox.class: Sandbox.java
javac Sandbox.java

View File

@ -23,7 +23,6 @@
*/
#include "template.h"
#undef SIMD_ENGINE
#undef SIMD_ENGINE_SSE
@ -31,6 +30,8 @@
#define SIMD_ENGINE avx
#define SIMD_ENGINE_AVX
#include "template.h"
#include "define-float.h"
#include "shift_template.c"
#include "pairhmm-template-kernel.cc"

View File

@ -24,7 +24,7 @@
#include "headers.h"
#include "template.h"
#include "common_data_structure.h"
#include "utils.h"
#include "LoadTimeInitializer.h"
using namespace std;
@ -48,17 +48,8 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log)
//a straightforward array of pointers ensures that all data lies 'close' in memory, increasing
//the chance of being stored together in the cache. Also, prefetchers can learn memory access
//patterns for 2D arrays, not possible for array of pointers
//bool locally_allocated = false;
//NUMBER* common_buffer = 0;
NUMBER* common_buffer = new NUMBER[3*ROWS*COLS + ROWS*6];
//unsigned curr_size = sizeof(NUMBER)*(3*ROWS*COLS + ROWS*6);
//if(true)
//{
//common_buffer = new NUMBER[3*ROWS*COLS + ROWS*6];
//locally_allocated = true;
//}
//else
//common_buffer = (NUMBER*)(g_load_time_initializer.get_buffer());
//pointers to within the allocated buffer
NUMBER** common_pointer_buffer = new NUMBER*[4*ROWS];
NUMBER* ptr = common_buffer;
@ -154,7 +145,6 @@ NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log)
#ifndef USE_STACK_ALLOCATION
delete[] common_pointer_buffer;
//if(locally_allocated)
delete[] common_buffer;
#endif

View File

@ -0,0 +1,215 @@
#ifndef COMMON_DATA_STRUCTURE_H
#define COMMON_DATA_STRUCTURE_H
#include "headers.h"
#define CAT(X,Y) X####Y
#define CONCAT(X,Y) CAT(X,Y)
#define MM 0
#define GapM 1
#define MX 2
#define XX 3
#define MY 4
#define YY 5
//#define MROWS 500
//#define MCOLS 1000
#define MAX_QUAL 254
#define MAX_JACOBIAN_TOLERANCE 8.0
#define JACOBIAN_LOG_TABLE_STEP 0.0001
#define JACOBIAN_LOG_TABLE_INV_STEP (1.0 / JACOBIAN_LOG_TABLE_STEP)
#define MAXN 70000
#define LOG10_CACHE_SIZE (4*MAXN) // we need to be able to go up to 2*(2N) when calculating some of the coefficients
#define JACOBIAN_LOG_TABLE_SIZE ((int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1)
template<class NUMBER>
struct ContextBase
{
public:
NUMBER ph2pr[128];
NUMBER INITIAL_CONSTANT;
NUMBER LOG10_INITIAL_CONSTANT;
NUMBER RESULT_THRESHOLD;
static bool staticMembersInitializedFlag;
static NUMBER jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE];
static NUMBER matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1];
static void initializeStaticMembers()
{
if(!staticMembersInitializedFlag)
{
//Order of calls important - Jacobian first, then MatchToMatch
initializeJacobianLogTable();
initializeMatchToMatchProb();
staticMembersInitializedFlag = true;
}
}
static void deleteStaticMembers()
{
if(staticMembersInitializedFlag)
{
staticMembersInitializedFlag = false;
}
}
//Called only once during library load - don't bother to optimize with single precision fp
static void initializeJacobianLogTable()
{
for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) {
jacobianLogTable[k] = (NUMBER)(log10(1.0 + pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP)));
}
}
//Called only once per library load - don't bother optimizing with single fp
static void initializeMatchToMatchProb()
{
double LN10 = log(10);
double INV_LN10 = 1.0/LN10;
for (int i = 0, offset = 0; i <= MAX_QUAL; offset += ++i)
for (int j = 0; j <= i; j++) {
double log10Sum = approximateLog10SumLog10(-0.1*i, -0.1*j);
double matchToMatchLog10 =
log1p(-std::min(1.0,pow(10,log10Sum))) * INV_LN10;
matchToMatchProb[offset + j] = (NUMBER)(pow(10,matchToMatchLog10));
}
}
//Called during computation - use single precision where possible
static int fastRound(NUMBER d) {
return (d > ((NUMBER)0.0)) ? (int) (d + ((NUMBER)0.5)) : (int) (d - ((NUMBER)0.5));
}
//Called during computation - use single precision where possible
static NUMBER approximateLog10SumLog10(NUMBER small, NUMBER big) {
// make sure small is really the smaller value
if (small > big) {
NUMBER t = big;
big = small;
small = t;
}
if (isinf(small) == -1 || isinf(big) == -1)
return big;
NUMBER diff = big - small;
if (diff >= ((NUMBER)MAX_JACOBIAN_TOLERANCE))
return big;
// OK, so |y-x| < tol: we use the following identity then:
// we need to compute log10(10^x + 10^y)
// By Jacobian logarithm identity, this is equal to
// max(x,y) + log10(1+10^-abs(x-y))
// we compute the second term as a table lookup with integer quantization
// we have pre-stored correction for 0,0.1,0.2,... 10.0
int ind = fastRound((NUMBER)(diff * ((NUMBER)JACOBIAN_LOG_TABLE_INV_STEP))); // hard rounding
return big + jacobianLogTable[ind];
}
};
template<class NUMBER>
struct Context : public ContextBase<NUMBER>
{};
template<>
struct Context<double> : public ContextBase<double>
{
Context():ContextBase<double>()
{
for (int x = 0; x < 128; x++)
ph2pr[x] = pow(10.0, -((double)x) / 10.0);
INITIAL_CONSTANT = ldexp(1.0, 1020.0);
LOG10_INITIAL_CONSTANT = log10(INITIAL_CONSTANT);
RESULT_THRESHOLD = 0.0;
}
double LOG10(double v){ return log10(v); }
inline double POW(double b, double e) { return pow(b,e); }
static double _(double n){ return n; }
static double _(float n){ return ((double) n); }
};
template<>
struct Context<float> : public ContextBase<float>
{
Context() : ContextBase<float>()
{
for (int x = 0; x < 128; x++)
{
ph2pr[x] = powf(10.f, -((float)x) / 10.f);
}
INITIAL_CONSTANT = ldexpf(1.f, 120.f);
LOG10_INITIAL_CONSTANT = log10f(INITIAL_CONSTANT);
RESULT_THRESHOLD = ldexpf(1.f, -110.f);
}
float LOG10(float v){ return log10f(v); }
inline float POW(float b, float e) { return powf(b,e); }
static float _(double n){ return ((float) n); }
static float _(float n){ return n; }
};
#define SET_MATCH_TO_MATCH_PROB(output, insQual, delQual) \
{ \
int minQual = delQual; \
int maxQual = insQual; \
if (insQual <= delQual) \
{ \
minQual = insQual; \
maxQual = delQual; \
} \
(output) = (MAX_QUAL < maxQual) ? \
((NUMBER)1.0) - ctx.POW(((NUMBER)10), ctx.approximateLog10SumLog10(((NUMBER)-0.1)*minQual, ((NUMBER)-0.1)*maxQual)) \
: ctx.matchToMatchProb[((maxQual * (maxQual + 1)) >> 1) + minQual]; \
}
typedef struct
{
int rslen, haplen;
/*int *q, *i, *d, *c;*/
/*int q[MROWS], i[MROWS], d[MROWS], c[MROWS];*/
char *q, *i, *d, *c;
char *hap, *rs;
int *ihap;
int *irs;
} testcase;
#define MIN_ACCEPTED 1e-28f
#define NUM_DISTINCT_CHARS 5
#define AMBIG_CHAR 4
class ConvertChar {
static uint8_t conversionTable[255] ;
public:
static void init() {
assert (NUM_DISTINCT_CHARS == 5) ;
assert (AMBIG_CHAR == 4) ;
conversionTable['A'] = 0 ;
conversionTable['C'] = 1 ;
conversionTable['T'] = 2 ;
conversionTable['G'] = 3 ;
conversionTable['N'] = 4 ;
}
static inline uint8_t get(uint8_t input) {
return conversionTable[input] ;
}
};
int normalize(char c);
int read_testcase(testcase *tc, FILE* ifp=0);
#endif

View File

@ -42,33 +42,33 @@
#undef MASK_TYPE
#undef MASK_ALL_ONES
#undef SET_VEC_ZERO(__vec)
#undef VEC_OR(__v1, __v2)
#undef VEC_ADD(__v1, __v2)
#undef VEC_SUB(__v1, __v2)
#undef VEC_MUL(__v1, __v2)
#undef VEC_DIV(__v1, __v2)
#undef VEC_BLEND(__v1, __v2, __mask)
#undef VEC_BLENDV(__v1, __v2, __maskV)
#undef VEC_CAST_256_128(__v1)
#undef VEC_EXTRACT_128(__v1, __im)
#undef VEC_EXTRACT_UNIT(__v1, __im)
#undef VEC_SET1_VAL128(__val)
#undef VEC_MOVE(__v1, __val)
#undef VEC_CAST_128_256(__v1)
#undef VEC_INSERT_VAL(__v1, __val, __pos)
#undef VEC_CVT_128_256(__v1)
#undef VEC_SET1_VAL(__val)
#undef VEC_POPCVT_CHAR(__ch)
#undef VEC_LDPOPCVT_CHAR(__addr)
#undef VEC_CMP_EQ(__v1, __v2)
#undef VEC_SET_LSE(__val)
#undef SHIFT_HAP(__v1, __val)
#undef SET_VEC_ZERO
#undef VEC_OR
#undef VEC_ADD
#undef VEC_SUB
#undef VEC_MUL
#undef VEC_DIV
#undef VEC_BLEND
#undef VEC_BLENDV
#undef VEC_CAST_256_128
#undef VEC_EXTRACT_128
#undef VEC_EXTRACT_UNIT
#undef VEC_SET1_VAL128
#undef VEC_MOVE
#undef VEC_CAST_128_256
#undef VEC_INSERT_VAL
#undef VEC_CVT_128_256
#undef VEC_SET1_VAL
#undef VEC_POPCVT_CHAR
#undef VEC_LDPOPCVT_CHAR
#undef VEC_CMP_EQ
#undef VEC_SET_LSE
#undef SHIFT_HAP
#undef MASK_VEC
#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
#undef VEC_SHIFT_LEFT_1BIT(__vs)
#undef VEC_SSE_TO_AVX
#undef VEC_SHIFT_LEFT_1BIT
#undef MASK_ALL_ONES
#undef COMPARE_VECS(__v1, __v2)
#undef COMPARE_VECS
#undef _256_INT_TYPE
#undef BITMASK_VEC
#endif

View File

@ -42,33 +42,33 @@
#undef MASK_TYPE
#undef MASK_ALL_ONES
#undef SET_VEC_ZERO(__vec)
#undef VEC_OR(__v1, __v2)
#undef VEC_ADD(__v1, __v2)
#undef VEC_SUB(__v1, __v2)
#undef VEC_MUL(__v1, __v2)
#undef VEC_DIV(__v1, __v2)
#undef VEC_BLEND(__v1, __v2, __mask)
#undef VEC_BLENDV(__v1, __v2, __maskV)
#undef VEC_CAST_256_128(__v1)
#undef VEC_EXTRACT_128(__v1, __im)
#undef VEC_EXTRACT_UNIT(__v1, __im)
#undef VEC_SET1_VAL128(__val)
#undef VEC_MOVE(__v1, __val)
#undef VEC_CAST_128_256(__v1)
#undef VEC_INSERT_VAL(__v1, __val, __pos)
#undef VEC_CVT_128_256(__v1)
#undef VEC_SET1_VAL(__val)
#undef VEC_POPCVT_CHAR(__ch)
#undef VEC_LDPOPCVT_CHAR(__addr)
#undef VEC_CMP_EQ(__v1, __v2)
#undef VEC_SET_LSE(__val)
#undef SHIFT_HAP(__v1, __val)
#undef SET_VEC_ZERO
#undef VEC_OR
#undef VEC_ADD
#undef VEC_SUB
#undef VEC_MUL
#undef VEC_DIV
#undef VEC_BLEND
#undef VEC_BLENDV
#undef VEC_CAST_256_128
#undef VEC_EXTRACT_128
#undef VEC_EXTRACT_UNIT
#undef VEC_SET1_VAL128
#undef VEC_MOVE
#undef VEC_CAST_128_256
#undef VEC_INSERT_VAL
#undef VEC_CVT_128_256
#undef VEC_SET1_VAL
#undef VEC_POPCVT_CHAR
#undef VEC_LDPOPCVT_CHAR
#undef VEC_CMP_EQ
#undef VEC_SET_LSE
#undef SHIFT_HAP
#undef MASK_VEC
#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
#undef VEC_SHIFT_LEFT_1BIT(__vs)
#undef VEC_SSE_TO_AVX
#undef VEC_SHIFT_LEFT_1BIT
#undef MASK_ALL_ONES
#undef COMPARE_VECS(__v1, __v2)
#undef COMPARE_VECS
#undef _256_INT_TYPE
#undef BITMASK_VEC
#endif

View File

@ -40,35 +40,35 @@
#undef MASK_TYPE
#undef MASK_ALL_ONES
#undef VEC_EXTRACT_UNIT(__v1, __im)
#undef VEC_INSERT_UNIT(__v1,__ins,__im)
#undef SET_VEC_ZERO(__vec)
#undef VEC_OR(__v1, __v2)
#undef VEC_ADD(__v1, __v2)
#undef VEC_SUB(__v1, __v2)
#undef VEC_MUL(__v1, __v2)
#undef VEC_DIV(__v1, __v2)
#undef VEC_BLEND(__v1, __v2, __mask)
#undef VEC_BLENDV(__v1, __v2, __maskV)
#undef VEC_CAST_256_128(__v1)
#undef VEC_EXTRACT_128(__v1, __im)
#undef VEC_EXTRACT_UNIT(__v1, __im)
#undef VEC_SET1_VAL128(__val)
#undef VEC_MOVE(__v1, __val)
#undef VEC_CAST_128_256(__v1)
#undef VEC_INSERT_VAL(__v1, __val, __pos)
#undef VEC_CVT_128_256(__v1)
#undef VEC_SET1_VAL(__val)
#undef VEC_POPCVT_CHAR(__ch)
#undef VEC_LDPOPCVT_CHAR(__addr)
#undef VEC_CMP_EQ(__v1, __v2)
#undef VEC_SET_LSE(__val)
#undef SHIFT_HAP(__v1, __val)
#undef VEC_EXTRACT_UNIT
#undef VEC_INSERT_UNIT
#undef SET_VEC_ZERO
#undef VEC_OR
#undef VEC_ADD
#undef VEC_SUB
#undef VEC_MUL
#undef VEC_DIV
#undef VEC_BLEND
#undef VEC_BLENDV
#undef VEC_CAST_256_128
#undef VEC_EXTRACT_128
#undef VEC_EXTRACT_UNIT
#undef VEC_SET1_VAL128
#undef VEC_MOVE
#undef VEC_CAST_128_256
#undef VEC_INSERT_VAL
#undef VEC_CVT_128_256
#undef VEC_SET1_VAL
#undef VEC_POPCVT_CHAR
#undef VEC_LDPOPCVT_CHAR
#undef VEC_CMP_EQ
#undef VEC_SET_LSE
#undef SHIFT_HAP
#undef MASK_VEC
#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
#undef VEC_SHIFT_LEFT_1BIT(__vs)
#undef VEC_SSE_TO_AVX
#undef VEC_SHIFT_LEFT_1BIT
#undef MASK_ALL_ONES
#undef COMPARE_VECS(__v1, __v2)
#undef COMPARE_VECS
#undef _256_INT_TYPE
#undef BITMASK_VEC
#endif

View File

@ -40,35 +40,35 @@
#undef MASK_TYPE
#undef MASK_ALL_ONES
#undef VEC_EXTRACT_UNIT(__v1, __im)
#undef VEC_INSERT_UNIT(__v1,__ins,__im)
#undef SET_VEC_ZERO(__vec)
#undef VEC_OR(__v1, __v2)
#undef VEC_ADD(__v1, __v2)
#undef VEC_SUB(__v1, __v2)
#undef VEC_MUL(__v1, __v2)
#undef VEC_DIV(__v1, __v2)
#undef VEC_BLEND(__v1, __v2, __mask)
#undef VEC_BLENDV(__v1, __v2, __maskV)
#undef VEC_CAST_256_128(__v1)
#undef VEC_EXTRACT_128(__v1, __im)
#undef VEC_EXTRACT_UNIT(__v1, __im)
#undef VEC_SET1_VAL128(__val)
#undef VEC_MOVE(__v1, __val)
#undef VEC_CAST_128_256(__v1)
#undef VEC_INSERT_VAL(__v1, __val, __pos)
#undef VEC_CVT_128_256(__v1)
#undef VEC_SET1_VAL(__val)
#undef VEC_POPCVT_CHAR(__ch)
#undef VEC_LDPOPCVT_CHAR(__addr)
#undef VEC_CMP_EQ(__v1, __v2)
#undef VEC_SET_LSE(__val)
#undef SHIFT_HAP(__v1, __val)
#undef VEC_EXTRACT_UNIT
#undef VEC_INSERT_UNIT
#undef SET_VEC_ZERO
#undef VEC_OR
#undef VEC_ADD
#undef VEC_SUB
#undef VEC_MUL
#undef VEC_DIV
#undef VEC_BLEND
#undef VEC_BLENDV
#undef VEC_CAST_256_128
#undef VEC_EXTRACT_128
#undef VEC_EXTRACT_UNIT
#undef VEC_SET1_VAL128
#undef VEC_MOVE
#undef VEC_CAST_128_256
#undef VEC_INSERT_VAL
#undef VEC_CVT_128_256
#undef VEC_SET1_VAL
#undef VEC_POPCVT_CHAR
#undef VEC_LDPOPCVT_CHAR
#undef VEC_CMP_EQ
#undef VEC_SET_LSE
#undef SHIFT_HAP
#undef MASK_VEC
#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst)
#undef VEC_SHIFT_LEFT_1BIT(__vs)
#undef VEC_SSE_TO_AVX
#undef VEC_SHIFT_LEFT_1BIT
#undef MASK_ALL_ONES
#undef COMPARE_VECS(__v1, __v2)
#undef COMPARE_VECS
#undef _256_INT_TYPE
#undef BITMASK_VEC
#endif

View File

@ -37,7 +37,7 @@
/*#define DUMP_TO_SANDBOX 1*/
#define DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY 1
/*#define DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY 1*/
#ifdef DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY
//Gets direct access to Java arrays

View File

@ -26,7 +26,6 @@
#include "headers.h"
#include "jni_common.h"
#include "org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM.h"
#include "template.h"
#include "utils.h"
#include "LoadTimeInitializer.h"
#include "jnidebug.h"

View File

@ -26,7 +26,7 @@
#include "headers.h"
#include "jni_common.h"
#include "org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.h"
#include "template.h"
//#include "template.h"
#include "utils.h"
#include "LoadTimeInitializer.h"

View File

@ -29,7 +29,6 @@ using namespace std;
int main(int argc, char** argv)
{
#define BATCH_SIZE 10000
if(argc < 2)
{
cerr << "Needs path to input file as argument\n";

View File

@ -24,12 +24,11 @@
#include "headers.h"
#include "template.h"
#include "vector_defs.h"
#define SIMD_ENGINE avx
#define SIMD_ENGINE_AVX
#include "utils.h"
#define BATCH_SIZE 10000
#define RUN_HYBRID

View File

@ -23,7 +23,6 @@
*/
#include "template.h"
#undef SIMD_ENGINE
#undef SIMD_ENGINE_AVX
@ -31,6 +30,8 @@
#define SIMD_ENGINE sse
#define SIMD_ENGINE_SSE
#include "template.h"
#include "define-sse-float.h"
#include "shift_template.c"
#include "pairhmm-template-kernel.cc"

View File

@ -28,27 +28,17 @@
#include "headers.h"
#define MM 0
#define GapM 1
#define MX 2
#define XX 3
#define MY 4
#define YY 5
//#define MROWS 500
//#define MCOLS 1000
#define CAT(X,Y) X####Y
#define CONCAT(X,Y) CAT(X,Y)
#define ALIGNED __attribute__((aligned(32)))
#ifdef SIMD_ENGINE_AVX
typedef union __attribute__((aligned(32))) {
ALIGNED __m256 ALIGNED d;
ALIGNED __m128i ALIGNED s[2];
ALIGNED float ALIGNED f[8];
ALIGNED __m256i ALIGNED i;
} ALIGNED mix_F ALIGNED;
#endif
typedef union __attribute__((aligned(32))) {
ALIGNED __m128 ALIGNED d;
@ -81,12 +71,14 @@ typedef union ALIGNED
ALIGNED float ALIGNED f;
} ALIGNED IF_32 ALIGNED;
#ifdef SIMD_ENGINE_AVX
typedef union __attribute__((aligned(32))) {
ALIGNED __m256d ALIGNED d;
ALIGNED __m128i ALIGNED s[2];
ALIGNED double ALIGNED f[4];
ALIGNED __m256i ALIGNED i;
} ALIGNED mix_D ALIGNED;
#endif
typedef union __attribute__((aligned(32))) {
ALIGNED __m128d ALIGNED d;
@ -120,200 +112,7 @@ typedef union ALIGNED
} ALIGNED IF_64 ALIGNED;
#define MAX_QUAL 254
#define MAX_JACOBIAN_TOLERANCE 8.0
#define JACOBIAN_LOG_TABLE_STEP 0.0001
#define JACOBIAN_LOG_TABLE_INV_STEP (1.0 / JACOBIAN_LOG_TABLE_STEP)
#define MAXN 70000
#define LOG10_CACHE_SIZE (4*MAXN) // we need to be able to go up to 2*(2N) when calculating some of the coefficients
#define JACOBIAN_LOG_TABLE_SIZE ((int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1)
template<class NUMBER>
struct ContextBase
{
public:
NUMBER ph2pr[128];
NUMBER INITIAL_CONSTANT;
NUMBER LOG10_INITIAL_CONSTANT;
NUMBER RESULT_THRESHOLD;
static bool staticMembersInitializedFlag;
static NUMBER jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE];
static NUMBER matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1];
static void initializeStaticMembers()
{
if(!staticMembersInitializedFlag)
{
//Order of calls important - Jacobian first, then MatchToMatch
initializeJacobianLogTable();
initializeMatchToMatchProb();
staticMembersInitializedFlag = true;
}
}
static void deleteStaticMembers()
{
if(staticMembersInitializedFlag)
{
staticMembersInitializedFlag = false;
}
}
//Called only once during library load - don't bother to optimize with single precision fp
static void initializeJacobianLogTable()
{
for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) {
jacobianLogTable[k] = (NUMBER)(log10(1.0 + pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP)));
}
}
//Called only once per library load - don't bother optimizing with single fp
static void initializeMatchToMatchProb()
{
double LN10 = log(10);
double INV_LN10 = 1.0/LN10;
for (int i = 0, offset = 0; i <= MAX_QUAL; offset += ++i)
for (int j = 0; j <= i; j++) {
double log10Sum = approximateLog10SumLog10(-0.1*i, -0.1*j);
double matchToMatchLog10 =
log1p(-std::min(1.0,pow(10,log10Sum))) * INV_LN10;
matchToMatchProb[offset + j] = (NUMBER)(pow(10,matchToMatchLog10));
}
}
//Called during computation - use single precision where possible
static int fastRound(NUMBER d) {
return (d > ((NUMBER)0.0)) ? (int) (d + ((NUMBER)0.5)) : (int) (d - ((NUMBER)0.5));
}
//Called during computation - use single precision where possible
static NUMBER approximateLog10SumLog10(NUMBER small, NUMBER big) {
// make sure small is really the smaller value
if (small > big) {
NUMBER t = big;
big = small;
small = t;
}
if (isinf(small) == -1 || isinf(big) == -1)
return big;
NUMBER diff = big - small;
if (diff >= ((NUMBER)MAX_JACOBIAN_TOLERANCE))
return big;
// OK, so |y-x| < tol: we use the following identity then:
// we need to compute log10(10^x + 10^y)
// By Jacobian logarithm identity, this is equal to
// max(x,y) + log10(1+10^-abs(x-y))
// we compute the second term as a table lookup with integer quantization
// we have pre-stored correction for 0,0.1,0.2,... 10.0
int ind = fastRound((NUMBER)(diff * ((NUMBER)JACOBIAN_LOG_TABLE_INV_STEP))); // hard rounding
return big + jacobianLogTable[ind];
}
};
template<class NUMBER>
struct Context : public ContextBase<NUMBER>
{};
template<>
struct Context<double> : public ContextBase<double>
{
Context():ContextBase<double>()
{
for (int x = 0; x < 128; x++)
ph2pr[x] = pow(10.0, -((double)x) / 10.0);
INITIAL_CONSTANT = ldexp(1.0, 1020.0);
LOG10_INITIAL_CONSTANT = log10(INITIAL_CONSTANT);
RESULT_THRESHOLD = 0.0;
}
double LOG10(double v){ return log10(v); }
inline double POW(double b, double e) { return pow(b,e); }
static double _(double n){ return n; }
static double _(float n){ return ((double) n); }
};
template<>
struct Context<float> : public ContextBase<float>
{
Context() : ContextBase<float>()
{
for (int x = 0; x < 128; x++)
{
ph2pr[x] = powf(10.f, -((float)x) / 10.f);
}
INITIAL_CONSTANT = ldexpf(1.f, 120.f);
LOG10_INITIAL_CONSTANT = log10f(INITIAL_CONSTANT);
RESULT_THRESHOLD = ldexpf(1.f, -110.f);
}
float LOG10(float v){ return log10f(v); }
inline float POW(float b, float e) { return powf(b,e); }
static float _(double n){ return ((float) n); }
static float _(float n){ return n; }
};
#define SET_MATCH_TO_MATCH_PROB(output, insQual, delQual) \
{ \
int minQual = delQual; \
int maxQual = insQual; \
if (insQual <= delQual) \
{ \
minQual = insQual; \
maxQual = delQual; \
} \
(output) = (MAX_QUAL < maxQual) ? \
((NUMBER)1.0) - ctx.POW(((NUMBER)10), ctx.approximateLog10SumLog10(((NUMBER)-0.1)*minQual, ((NUMBER)-0.1)*maxQual)) \
: ctx.matchToMatchProb[((maxQual * (maxQual + 1)) >> 1) + minQual]; \
}
typedef struct
{
int rslen, haplen;
/*int *q, *i, *d, *c;*/
/*int q[MROWS], i[MROWS], d[MROWS], c[MROWS];*/
char *q, *i, *d, *c;
char *hap, *rs;
int *ihap;
int *irs;
} testcase;
int normalize(char c);
int read_testcase(testcase *tc, FILE* ifp=0);
#define MIN_ACCEPTED 1e-28f
#define NUM_DISTINCT_CHARS 5
#define AMBIG_CHAR 4
class ConvertChar {
static uint8_t conversionTable[255] ;
public:
static void init() {
assert (NUM_DISTINCT_CHARS == 5) ;
assert (AMBIG_CHAR == 4) ;
conversionTable['A'] = 0 ;
conversionTable['C'] = 1 ;
conversionTable['T'] = 2 ;
conversionTable['G'] = 3 ;
conversionTable['N'] = 4 ;
}
static inline uint8_t get(uint8_t input) {
return conversionTable[input] ;
}
};
#include "common_data_structure.h"
#endif

View File

@ -22,11 +22,8 @@
*THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "headers.h"
#include "template.h"
#include "utils.h"
#include "vector_defs.h"
#include "LoadTimeInitializer.h"
using namespace std;
@ -36,51 +33,107 @@ uint8_t ConvertChar::conversionTable[255];
float (*g_compute_full_prob_float)(testcase *tc, float* before_last_log) = 0;
double (*g_compute_full_prob_double)(testcase *tc, double* before_last_log) = 0;
//Static members in ContextBase
template<>
bool ContextBase<double>::staticMembersInitializedFlag = false;
double ContextBase<double>::jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE];
double ContextBase<double>::matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1];
template<>
double ContextBase<double>::jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE] = { };
template<>
double ContextBase<double>::matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1] = { };
template<>
bool ContextBase<float>::staticMembersInitializedFlag = false;
float ContextBase<float>::jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE];
float ContextBase<float>::matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1];
template<>
float ContextBase<float>::jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE] = { };
template<>
float ContextBase<float>::matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1] = { };
bool search_file_for_string(string filename, string search_string)
{
ifstream fptr;
fptr.open(filename.c_str(),ios::in);
if(fptr.is_open())
{
string buffer;
buffer.clear();
buffer.resize(4096);
bool retvalue = false;
while(!fptr.eof())
{
fptr.getline(&(buffer[0]), 4096);
if(buffer.find(search_string) != string::npos) //found string
{
retvalue = true;
break;
}
}
buffer.clear();
fptr.close();
return retvalue;
}
else
return false;
}
bool is_cpuid_ecx_bit_set(int eax, int bitidx)
{
int ecx = 0, edx = 0, ebx = 0;
__asm__ ("cpuid"
:"=b" (ebx),
"=c" (ecx),
"=d" (edx)
:"a" (eax)
);
return (((ecx >> bitidx)&1) == 1);
}
bool is_avx_supported()
{
return (_may_i_use_cpu_feature(_FEATURE_AVX) > 0);
//int ecx = 0, edx = 0, ebx = 0;
//__asm__("cpuid"
//: "=b" (ebx),
//"=c" (ecx),
//"=d" (edx)
//: "a" (1)
//);
//return ((ecx >> 28)&1) == 1;
#ifdef __INTEL_COMPILER
bool use_avx = _may_i_use_cpu_feature(_FEATURE_AVX);
if(use_avx)
return true;
else
{
//check if core supports AVX, but kernel does not and print info message
if(!is_cpuid_ecx_bit_set(1, 28)) //core does not support AVX
return false;
//else fall through to end of function
}
#else
if(!__builtin_cpu_supports("avx")) //core does not support AVX
return false;
else
{
//core supports AVX, check if kernel supports
if(search_file_for_string("/proc/cpuinfo","avx"))
return true;
//else fall through to end of function
}
#endif //__INTEL_COMPILER
clog << "INFO: Your CPU supports AVX vector instructions, but your kernel does not. Try upgrading to a kernel that supports AVX.\n";
clog << "INFO: Your program will run correctly, but slower than the AVX version\n";
return false;
}
bool is_sse41_supported()
{
#ifdef __INTEL_COMPILER
return (_may_i_use_cpu_feature(_FEATURE_SSE4_1) > 0);
//int ecx = 0, edx = 0, ebx = 0;
//__asm__("cpuid"
//: "=b" (ebx),
//"=c" (ecx),
//"=d" (edx)
//: "a" (1)
//);
//return ((ecx >> 19)&1) == 1;
#else
return __builtin_cpu_supports("sse4.1");
#endif
//return is_cpuid_ecx_bit_set(1, 19);
}
bool is_sse42_supported()
{
#ifdef __INTEL_COMPILER
return (_may_i_use_cpu_feature(_FEATURE_SSE4_2) > 0);
//int ecx = 0, edx = 0, ebx = 0;
//__asm__("cpuid"
//: "=b" (ebx),
//"=c" (ecx),
//"=d" (edx)
//: "a" (1)
//);
//return ((ecx >> 20)&1) == 1;
#else
return __builtin_cpu_supports("sse4.2");
#endif
//return is_cpuid_ecx_bit_set(1, 20);
}
uint64_t get_machine_capabilities()
@ -154,10 +207,14 @@ int read_testcase(testcase *tc, FILE* ifp)
tc->haplen = strlen(tc->hap);
tc->rslen = strlen(tc->rs);
assert(strlen(q) == tc->rslen);
assert(strlen(i) == tc->rslen);
assert(strlen(d) == tc->rslen);
assert(strlen(c) == tc->rslen);
assert(strlen(q) == (size_t)tc->rslen);
assert(strlen(i) == (size_t)tc->rslen);
assert(strlen(d) == (size_t)tc->rslen);
assert(strlen(c) == (size_t)tc->rslen);
g_load_time_initializer.update_stat(READ_LENGTH_IDX, tc->rslen);
g_load_time_initializer.update_stat(HAPLOTYPE_LENGTH_IDX, tc->haplen);
g_load_time_initializer.update_stat(PRODUCT_READ_LENGTH_HAPLOTYPE_LENGTH_IDX, tc->haplen*tc->rslen);
//assert(tc->rslen < MROWS);
//tc->ihap = (int *) malloc(tc->haplen*sizeof(int));
//tc->irs = (int *) malloc(tc->rslen*sizeof(int));
@ -279,15 +336,15 @@ int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat)
tc->c = new char[tc->rslen];
//cout << "Lengths "<<tc->haplen <<" "<<tc->rslen<<"\n";
memcpy(tc->rs, tokens[1].c_str(),tokens[1].size());
assert(tokens.size() == 2 + 4*(tc->rslen));
assert(tokens.size() == (size_t)(2 + 4*(tc->rslen)));
//assert(tc->rslen < MROWS);
for(unsigned j=0;j<tc->rslen;++j)
for(int j=0;j<tc->rslen;++j)
tc->q[j] = (char)convToInt(tokens[2+0*tc->rslen+j]);
for(unsigned j=0;j<tc->rslen;++j)
for(int j=0;j<tc->rslen;++j)
tc->i[j] = (char)convToInt(tokens[2+1*tc->rslen+j]);
for(unsigned j=0;j<tc->rslen;++j)
for(int j=0;j<tc->rslen;++j)
tc->d[j] = (char)convToInt(tokens[2+2*tc->rslen+j]);
for(unsigned j=0;j<tc->rslen;++j)
for(int j=0;j<tc->rslen;++j)
tc->c[j] = (char)convToInt(tokens[2+3*tc->rslen+j]);
if(reformat)
@ -297,16 +354,16 @@ int read_mod_testcase(ifstream& fptr, testcase* tc, bool reformat)
assert(ofptr.is_open());
ofptr << tokens[0] << " ";
ofptr << tokens[1] << " ";
for(unsigned j=0;j<tc->rslen;++j)
for(int j=0;j<tc->rslen;++j)
ofptr << ((char)(tc->q[j]+33));
ofptr << " ";
for(unsigned j=0;j<tc->rslen;++j)
for(int j=0;j<tc->rslen;++j)
ofptr << ((char)(tc->i[j]+33));
ofptr << " ";
for(unsigned j=0;j<tc->rslen;++j)
for(int j=0;j<tc->rslen;++j)
ofptr << ((char)(tc->d[j]+33));
ofptr << " ";
for(unsigned j=0;j<tc->rslen;++j)
for(int j=0;j<tc->rslen;++j)
ofptr << ((char)(tc->c[j]+33));
ofptr << " 0 false\n";
@ -363,6 +420,10 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
ifptr.open(filename);
assert(ifptr.is_open());
}
#ifdef PRINT_PER_INTERVAL_TIMINGS
ofstream times_fptr;
times_fptr.open("native_timed_intervals.csv",ios::out);
#endif
vector<testcase> tc_vector;
tc_vector.clear();
testcase tc;
@ -377,11 +438,11 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
uint32_t no_kernel_mask = (1 << 17); //bit 16 user mode, bit 17 kernel mode
PAPI_num_counters();
int events[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
char* eventnames[NUM_PAPI_COUNTERS]= { "cycles", "itlb_walk_cycles", "dtlb_load_walk_cycles", "dtlb_store_walk_cycles" };
char* eventnames[NUM_PAPI_COUNTERS]= { "cycles", "l1_pending_miss", "lfb_hit", "l2_hit" };
assert(PAPI_event_name_to_code("UNHALTED_REFERENCE_CYCLES:u=1:k=1",&(events[0])) == PAPI_OK);
assert(PAPI_event_name_to_code("ITLB_MISSES:WALK_DURATION", &(events[1])) == PAPI_OK);
assert(PAPI_event_name_to_code("DTLB_LOAD_MISSES:WALK_DURATION", &(events[2])) == PAPI_OK);
assert(PAPI_event_name_to_code("DTLB_STORE_MISSES:WALK_DURATION", &(events[3])) == PAPI_OK);
assert(PAPI_event_name_to_code("L1D_PEND_MISS:OCCURRENCES", &(events[1])) == PAPI_OK);
assert(PAPI_event_name_to_code("MEM_LOAD_UOPS_RETIRED:HIT_LFB", &(events[2])) == PAPI_OK);
assert(PAPI_event_name_to_code("MEM_LOAD_UOPS_RETIRED:L2_HIT", &(events[3])) == PAPI_OK);
long long values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
long long accum_values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 };
#endif
@ -398,6 +459,9 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
baseline_results_vec.clear();
results_vec.resize(tc_vector.size());
baseline_results_vec.resize(tc_vector.size());
g_load_time_initializer.update_stat(NUM_TESTCASES_IDX, tc_vector.size());
g_load_time_initializer.update_stat(NUM_READS_IDX, tc_vector.size());
g_load_time_initializer.update_stat(NUM_HAPLOTYPES_IDX, tc_vector.size());
struct timespec start_time;
#ifdef USE_PAPI
assert(PAPI_start_counters(events, NUM_PAPI_COUNTERS) == PAPI_OK);
@ -429,7 +493,11 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
#ifdef USE_PAPI
assert(PAPI_stop_counters(values, NUM_PAPI_COUNTERS) == PAPI_OK);
#endif
vector_compute_time += diff_time(start_time);
uint64_t curr_interval = diff_time(start_time);
#ifdef PRINT_PER_INTERVAL_TIMINGS
times_fptr << curr_interval << "\n";
#endif
vector_compute_time += curr_interval;
#ifdef USE_PAPI
for(unsigned k=0;k<NUM_PAPI_COUNTERS;++k)
accum_values[k] += values[k];
@ -488,9 +556,12 @@ void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size,
for(unsigned i=0;i<NUM_PAPI_COUNTERS;++i)
cout << eventnames[i] << " : "<<accum_values[i]<<"\n";
#endif
#ifdef PRINT_PER_INTERVAL_TIMINGS
times_fptr.close();
#endif
if(use_old_read_testcase)
fclose(fptr);
else
ifptr.close();
g_load_time_initializer.print_profiling();
}

View File

@ -26,7 +26,7 @@
#ifndef PAIRHMM_UTIL_H
#define PAIRHMM_UTIL_H
#include "template.h"
#include "common_data_structure.h"
template<class T>
std::string to_string(T obj)
@ -50,6 +50,15 @@ extern double (*g_compute_full_prob_double)(testcase *tc, double* before_last_lo
void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline);
template<class NUMBER>
NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log=0);
template<class NUMBER>
NUMBER compute_full_prob_avxd(testcase *tc, NUMBER *before_last_log=0);
template<class NUMBER>
NUMBER compute_full_prob_avxs(testcase *tc, NUMBER *before_last_log=0);
template<class NUMBER>
NUMBER compute_full_prob_ssed(testcase *tc, NUMBER *before_last_log=0);
template<class NUMBER>
NUMBER compute_full_prob_sses(testcase *tc, NUMBER *before_last_log=0);
double getCurrClk();
void get_time(struct timespec* x);
uint64_t diff_time(struct timespec& prev_time);
@ -71,5 +80,6 @@ void do_compute(char* filename, bool use_old_read_testcase=true, unsigned chunk_
/*#define DUMP_COMPUTE_VALUES 1*/
#define BATCH_SIZE 10000
#define RUN_HYBRID
/*#define PRINT_PER_INTERVAL_TIMINGS 1*/
#endif

View File

@ -1,55 +0,0 @@
/*Copyright (c) 2012 The Broad Institute
*Permission is hereby granted, free of charge, to any person
*obtaining a copy of this software and associated documentation
*files (the "Software"), to deal in the Software without
*restriction, including without limitation the rights to use,
*copy, modify, merge, publish, distribute, sublicense, and/or sell
*copies of the Software, and to permit persons to whom the
*Software is furnished to do so, subject to the following
*conditions:
*The above copyright notice and this permission notice shall be
*included in all copies or substantial portions of the Software.
*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
*THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#undef SIMD_ENGINE
#undef SIMD_ENGINE_AVX
#undef SIMD_ENGINE_SSE
#define SIMD_ENGINE avx
#define SIMD_ENGINE_AVX
#include "define-float.h"
#include "vector_function_prototypes.h"
#include "define-double.h"
#include "vector_function_prototypes.h"
#undef SIMD_ENGINE
#undef SIMD_ENGINE_AVX
#define SIMD_ENGINE sse
#define SIMD_ENGINE_SSE
#include "define-sse-float.h"
#include "vector_function_prototypes.h"
#include "define-sse-double.h"
#include "vector_function_prototypes.h"
#undef SIMD_ENGINE
#undef SIMD_ENGINE_AVX
#undef SIMD_ENGINE_SSE

View File

@ -1,44 +0,0 @@
/*Copyright (c) 2012 The Broad Institute
*Permission is hereby granted, free of charge, to any person
*obtaining a copy of this software and associated documentation
*files (the "Software"), to deal in the Software without
*restriction, including without limitation the rights to use,
*copy, modify, merge, publish, distribute, sublicense, and/or sell
*copies of the Software, and to permit persons to whom the
*Software is furnished to do so, subject to the following
*conditions:
*The above copyright notice and this permission notice shall be
*included in all copies or substantial portions of the Software.
*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
*THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
inline void CONCAT(CONCAT(_vector_shift,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut);
inline void CONCAT(CONCAT(_vector_shift_last,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn);
inline void CONCAT(CONCAT(precompute_masks_,SIMD_ENGINE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]);
inline void CONCAT(CONCAT(init_masks_for_row_,SIMD_ENGINE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess);
inline void CONCAT(CONCAT(update_masks_for_cols_,SIMD_ENGINE), PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt);
inline void CONCAT(CONCAT(computeDistVec,SIMD_ENGINE), PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, SIMD_TYPE& distm, SIMD_TYPE& _1_distm, SIMD_TYPE& distmChosen);
template<class NUMBER> inline void CONCAT(CONCAT(initializeVectors,SIMD_ENGINE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context<NUMBER> ctx, testcase *tc, SIMD_TYPE *p_MM, SIMD_TYPE *p_GAPM, SIMD_TYPE *p_MX, SIMD_TYPE *p_XX, SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, SIMD_TYPE *distm1D);
template<class NUMBER> inline void CONCAT(CONCAT(stripINITIALIZATION,SIMD_ENGINE), PRECISION)(
int stripIdx, Context<NUMBER> ctx, testcase *tc, SIMD_TYPE &pGAPM, SIMD_TYPE &pMM, SIMD_TYPE &pMX, SIMD_TYPE &pXX, SIMD_TYPE &pMY, SIMD_TYPE &pYY,
SIMD_TYPE &rs, UNION_TYPE &rsN, SIMD_TYPE &distm, SIMD_TYPE &_1_distm, SIMD_TYPE *distm1D, SIMD_TYPE N_packed256, SIMD_TYPE *p_MM , SIMD_TYPE *p_GAPM ,
SIMD_TYPE *p_MX, SIMD_TYPE *p_XX , SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1,
UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM);
inline SIMD_TYPE CONCAT(CONCAT(computeDISTM,SIMD_ENGINE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, SIMD_TYPE rs, UNION_TYPE rsN, SIMD_TYPE N_packed256,
SIMD_TYPE distm, SIMD_TYPE _1_distm);
inline void CONCAT(CONCAT(computeMXY,SIMD_ENGINE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y,
UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1,
SIMD_TYPE pMM, SIMD_TYPE pGAPM, SIMD_TYPE pMX, SIMD_TYPE pXX, SIMD_TYPE pMY, SIMD_TYPE pYY, SIMD_TYPE distmSel);
template<class NUMBER> NUMBER CONCAT(CONCAT(compute_full_prob_,SIMD_ENGINE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL);

View File

@ -78,8 +78,9 @@ public abstract class PairHMM {
protected double[] mLikelihoodArray;
//profiling information
protected static final boolean doProfiling = true;
protected long computeTime = 0;
protected static Boolean doProfiling = true;
protected static long pairHMMComputeTime = 0;
protected long threadLocalPairHMMComputeTimeDiff = 0;
protected long startTime = 0;
/**
@ -207,7 +208,13 @@ public abstract class PairHMM {
}
}
if(doProfiling)
computeTime += (System.nanoTime() - startTime);
{
threadLocalPairHMMComputeTimeDiff = (System.nanoTime() - startTime);
//synchronized(doProfiling)
{
pairHMMComputeTime += threadLocalPairHMMComputeTimeDiff;
}
}
return likelihoodMap;
}
@ -314,6 +321,17 @@ public abstract class PairHMM {
return Math.min(haplotype1.length, haplotype2.length);
}
/**
* Use number of threads to set doProfiling flag - doProfiling iff numThreads == 1
* This function should be called only during initialization phase - single thread phase of HC
*/
public static void setNumberOfThreads(final int numThreads)
{
doProfiling = (numThreads == 1);
if(numThreads > 1)
logger.info("Performance profiling for PairHMM is disabled because HaplotypeCaller is being run with multiple threads (-nct>1) option\nProfiling is enabled only when running in single thread mode\n");
}
/**
* Return the results of the computeLikelihoods function
*/
@ -324,6 +342,6 @@ public abstract class PairHMM {
public void close()
{
if(doProfiling)
System.out.println("Total compute time in PairHMM computeLikelihoods() : "+(computeTime*1e-9));
System.out.println("Total compute time in PairHMM computeLikelihoods() : "+(pairHMMComputeTime*1e-9));
}
}