/* gatk-3.8/public/VectorPairHMM/src/main/c++/define-sse-float.h */
/*Copyright (c) 2012 The Broad Institute
*Permission is hereby granted, free of charge, to any person
*obtaining a copy of this software and associated documentation
*files (the "Software"), to deal in the Software without
*restriction, including without limitation the rights to use,
*copy, modify, merge, publish, distribute, sublicense, and/or sell
*copies of the Software, and to permit persons to whom the
*Software is furnished to do so, subject to the following
*conditions:
*The above copyright notice and this permission notice shall be
*included in all copies or substantial portions of the Software.
*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
*THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/*
 * If another define-<isa>-<precision>.h header was included earlier in this
 * translation unit, PRECISION (and the rest of the per-precision macro set)
 * is still defined.  Undefine the entire set before redefining it for
 * SSE/float below, so these headers can be included repeatedly and in any
 * order by the templated PairHMM sources.
 */
#ifdef PRECISION
#undef PRECISION
#undef MAIN_TYPE
#undef MAIN_TYPE_SIZE
#undef UNION_TYPE
#undef IF_128
#undef IF_MAIN_TYPE
#undef SHIFT_CONST1
#undef SHIFT_CONST2
#undef SHIFT_CONST3
#undef _128_TYPE
#undef SIMD_TYPE
#undef AVX_LENGTH
#undef HAP_TYPE
#undef MASK_TYPE
#undef MASK_ALL_ONES
#undef VEC_EXTRACT_UNIT
#undef VEC_INSERT_UNIT
#undef SET_VEC_ZERO
#undef VEC_OR
#undef VEC_ADD
#undef VEC_SUB
#undef VEC_MUL
#undef VEC_DIV
#undef VEC_BLEND
#undef VEC_BLENDV
#undef VEC_CAST_256_128
#undef VEC_EXTRACT_128
#undef VEC_SET1_VAL128
#undef VEC_MOVE
#undef VEC_CAST_128_256
#undef VEC_INSERT_VAL
#undef VEC_CVT_128_256
#undef VEC_SET1_VAL
#undef VEC_POPCVT_CHAR
#undef VEC_LDPOPCVT_CHAR
#undef VEC_CMP_EQ
#undef VEC_SET_LSE
#undef SHIFT_HAP
#undef MASK_VEC
#undef VEC_SSE_TO_AVX
#undef VEC_SHIFT_LEFT_1BIT
#undef COMPARE_VECS
#undef _256_INT_TYPE
#undef BITMASK_VEC
#endif
/*
 * SSE (128-bit), single-precision instantiation of the PairHMM vector macro
 * set.  Sibling headers (define-avx-float.h, define-sse-double.h, ...) define
 * the same macro names for other ISA/precision combinations; the shared
 * PairHMM kernel is written purely in terms of these macros.
 */
#define SSE
#define PRECISION s
#define MAIN_TYPE float
#define MAIN_TYPE_SIZE 32
#define UNION_TYPE mix_F128
#define IF_128 IF_128f
#define IF_MAIN_TYPE IF_32
/* Shift amounts used when sliding haplotype bases through the vector. */
#define SHIFT_CONST1 3
#define SHIFT_CONST2 4
#define SHIFT_CONST3 0
#define _128_TYPE __m128
#define SIMD_TYPE __m128
/* Name kept as _256_INT_TYPE for parity with the AVX headers; here it is a
 * 128-bit integer vector. */
#define _256_INT_TYPE __m128i
/* Number of float lanes per SIMD vector: 4 x 32-bit in 128 bits.  (Name kept
 * as AVX_LENGTH for parity with the AVX headers.) */
#define AVX_LENGTH 4
//#define MAVX_COUNT (MROWS+3)/AVX_LENGTH
#define HAP_TYPE UNION_TYPE
#define MASK_TYPE uint32_t
#define MASK_ALL_ONES 0xFFFFFFFF
#define MASK_VEC MaskVec_F
/* Lane-wise extract/insert of one 32-bit unit from/into an integer vector. */
#define VEC_EXTRACT_UNIT(__v1, __im) \
    _mm_extract_epi32(__v1, __im)
#define VEC_INSERT_UNIT(__v1,__ins,__im) \
    _mm_insert_epi32(__v1,__ins,__im)
/* Element-wise single-precision arithmetic/logic wrappers. */
#define VEC_OR(__v1, __v2) \
    _mm_or_ps(__v1, __v2)
#define VEC_ADD(__v1, __v2) \
    _mm_add_ps(__v1, __v2)
#define VEC_SUB(__v1, __v2) \
    _mm_sub_ps(__v1, __v2)
#define VEC_MUL(__v1, __v2) \
    _mm_mul_ps(__v1, __v2)
#define VEC_DIV(__v1, __v2) \
    _mm_div_ps(__v1, __v2)
#define VEC_CMP_EQ(__v1, __v2) \
    _mm_cmpeq_ps(__v1, __v2)
/* Blend with an immediate mask vs. a variable mask vector. */
#define VEC_BLEND(__v1, __v2, __mask) \
    _mm_blend_ps(__v1, __v2, __mask)
#define VEC_BLENDV(__v1, __v2, __maskV) \
    _mm_blendv_ps(__v1, __v2, __maskV)
/* Shift the haplotype vector by one base, injecting __val at the open lane
 * (helper defined elsewhere in the project). */
#define SHIFT_HAP(__v1, __val) \
    _vector_shift_lastsses(__v1, __val.f)
/* Convert packed 32-bit ints to packed floats (name kept for AVX parity). */
#define VEC_CVT_128_256(__v1) \
    _mm_cvtepi32_ps(__v1.i)
#define VEC_SET1_VAL(__val) \
    _mm_set1_ps(__val)
/* Broadcast a char (as int) into all lanes, then convert to float. */
#define VEC_POPCVT_CHAR(__ch) \
    _mm_cvtepi32_ps(_mm_set1_epi32(__ch))
/* Place __val in the lowest lane, zero elsewhere.  NOTE(review): the trailing
 * semicolon is in the upstream file and is kept; it means this macro can only
 * be used in statement position — confirm before reusing in expressions. */
#define VEC_SET_LSE(__val) \
    _mm_set_ps(zero, zero, zero, __val);
/* Load 4 ints from (possibly unaligned) __addr and convert to floats. */
#define VEC_LDPOPCVT_CHAR(__addr) \
    _mm_cvtepi32_ps(_mm_loadu_si128((__m128i const *)__addr))
/* Combine two pairs of 32-bit ints into one float vector. */
#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \
    __vdst = _mm_cvtpi32x2_ps(__vsLow, __vsHigh)
/* Lane-wise left shift of the mask vector by one bit. */
#define VEC_SHIFT_LEFT_1BIT(__vs) \
    __vs = _mm_slli_epi32(__vs, 1)
class BitMaskVec_sse_float {
2014-01-27 03:36:06 +08:00
MASK_VEC combined_ ;
public:
inline MASK_TYPE& getLowEntry(int index) {
return combined_.masks[index] ;
}
inline MASK_TYPE& getHighEntry(int index) {
return combined_.masks[AVX_LENGTH/2+index] ;
}
inline const SIMD_TYPE& getCombinedMask() {
return combined_.vecf ;
}
inline void shift_left_1bit() {
VEC_SHIFT_LEFT_1BIT(combined_.vec) ;
}
} ;
#define BITMASK_VEC BitMaskVec_sse_float