From 3c997ca0166f05e626a826002b10bb8455567884 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 3 Sep 2017 14:29:49 -0400 Subject: [PATCH 1/4] r337: support CPU dispatch for gcc-4.8+ using __builtin_cpu_supports() --- Makefile | 31 ++++++++++++++++++++++++++++--- ksw2.h | 1 - ksw2_dispatch.c | 43 +++++++++++++++++++++++++++++++++++++++++++ ksw2_extd2_sse.c | 10 ++++++++++ ksw2_exts2_sse.c | 10 ++++++++++ ksw2_extz2_sse.c | 8 ++++++++ main.c | 2 +- 7 files changed, 100 insertions(+), 5 deletions(-) create mode 100644 ksw2_dispatch.c diff --git a/Makefile b/Makefile index 6ed7bae..66b37ce 100644 --- a/Makefile +++ b/Makefile @@ -1,16 +1,20 @@ CC= gcc CFLAGS= -g -Wall -O2 -Wc++-compat CPPFLAGS= -DHAVE_KALLOC -INCLUDES= -I. -OBJS= kthread.o kalloc.o ksw2_extz2_sse.o ksw2_extd2_sse.o ksw2_exts2_sse.o ksw2_ll_sse.o \ - misc.o bseq.o sketch.o sdust.o index.o chain.o align.o hit.o map.o format.o +INCLUDES= +OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o index.o chain.o align.o hit.o map.o format.o ksw2_ll_sse.o PROG= minimap2 PROG_EXTRA= sdust minimap2-lite LIBS= -lm -lz -lpthread +ifneq ($(cpu_dispatch),) + OBJS+=ksw2_extz2_sse41.o ksw2_extd2_sse41.o ksw2_exts2_sse41.o ksw2_extz2_sse2.o ksw2_extd2_sse2.o ksw2_exts2_sse2.o ksw2_dispatch.o +else ifeq ($(sse2only),) CFLAGS+=-msse4 endif + OBJS+=ksw2_extz2_sse.o ksw2_extd2_sse.o ksw2_exts2_sse.o +endif .SUFFIXES:.c .o @@ -33,6 +37,27 @@ libminimap2.a:$(OBJS) sdust:sdust.c getopt.o kalloc.o kalloc.h kdq.h kvec.h kseq.h sdust.h $(CC) -D_SDUST_MAIN $(CFLAGS) $< getopt.o kalloc.o -o $@ -lz +ksw2_extz2_sse41.o:ksw2_extz2_sse.c ksw2.h kalloc.h + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) -msse4 $< -o $@ + +ksw2_extz2_sse2.o:ksw2_extz2_sse.c ksw2.h kalloc.h + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) -mno-sse4 -msse2 $< -o $@ + +ksw2_extd2_sse41.o:ksw2_extd2_sse.c ksw2.h kalloc.h + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) -msse4 $< -o $@ + 
+ksw2_extd2_sse2.o:ksw2_extd2_sse.c ksw2.h kalloc.h + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) -mno-sse4 -msse2 $< -o $@ + +ksw2_exts2_sse41.o:ksw2_exts2_sse.c ksw2.h kalloc.h + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) -msse4 $< -o $@ + +ksw2_exts2_sse2.o:ksw2_exts2_sse.c ksw2.h kalloc.h + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) -mno-sse4 -msse2 $< -o $@ + +ksw2_dispatch.o:ksw2_dispatch.c ksw2.h + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) $< -o $@ + clean: rm -fr gmon.out *.o a.out $(PROG) $(PROG_EXTRA) *~ *.a *.dSYM session* diff --git a/ksw2.h b/ksw2.h index 5e43970..fa22fe6 100644 --- a/ksw2.h +++ b/ksw2.h @@ -169,5 +169,4 @@ static inline int ksw_apply_zdrop(ksw_extz_t *ez, int is_rot, int32_t H, int a, } return 0; } - #endif diff --git a/ksw2_dispatch.c b/ksw2_dispatch.c new file mode 100644 index 0000000..681460e --- /dev/null +++ b/ksw2_dispatch.c @@ -0,0 +1,43 @@ +#ifdef KSW_CPU_DISPATCH +#include +#include "ksw2.h" + +void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez) +{ + extern void ksw_extz2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez); + extern void ksw_extz2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez); + if (__builtin_cpu_supports("sse4.1")) + ksw_extz2_sse41(km, qlen, query, tlen, target, m, mat, q, e, w, zdrop, flag, ez); + else if (__builtin_cpu_supports("sse2")) + ksw_extz2_sse2(km, qlen, query, tlen, target, m, mat, q, e, w, zdrop, flag, ez); + else abort(); +} + +void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const 
int8_t *mat, + int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez) +{ + extern void ksw_extd2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez); + extern void ksw_extd2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez); + if (__builtin_cpu_supports("sse4.1")) + ksw_extd2_sse41(km, qlen, query, tlen, target, m, mat, q, e, q2, e2, w, zdrop, flag, ez); + else if (__builtin_cpu_supports("sse2")) + ksw_extd2_sse2(km, qlen, query, tlen, target, m, mat, q, e, q2, e2, w, zdrop, flag, ez); + else abort(); +} + +void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez) +{ + extern void ksw_exts2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez); + extern void ksw_exts2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez); + if (__builtin_cpu_supports("sse4.1")) + ksw_exts2_sse41(km, qlen, query, tlen, target, m, mat, q, e, q2, noncan, zdrop, flag, ez); + else if (__builtin_cpu_supports("sse2")) + ksw_exts2_sse2(km, qlen, query, tlen, target, m, mat, q, e, q2, noncan, zdrop, flag, ez); + else abort(); +} +#endif diff --git a/ksw2_extd2_sse.c b/ksw2_extd2_sse.c index 56cd8cd..9a64e49 100644 --- a/ksw2_extd2_sse.c +++ b/ksw2_extd2_sse.c @@ -10,8 +10,18 @@ #include #endif +#ifdef KSW_CPU_DISPATCH +#ifdef __SSE4_1__ +void 
ksw_extd2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez) +#else +void ksw_extd2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez) +#endif +#else void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez) +#endif // ~KSW_CPU_DISPATCH { #define __dp_code_block1 \ z = _mm_load_si128(&s[t]); \ diff --git a/ksw2_exts2_sse.c b/ksw2_exts2_sse.c index 6decd2d..7a64905 100644 --- a/ksw2_exts2_sse.c +++ b/ksw2_exts2_sse.c @@ -10,8 +10,18 @@ #include #endif +#ifdef KSW_CPU_DISPATCH +#ifdef __SSE4_1__ +void ksw_exts2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez) +#else +void ksw_exts2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez) +#endif +#else void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez) +#endif // ~KSW_CPU_DISPATCH { #define __dp_code_block1 \ z = _mm_load_si128(&s[t]); \ diff --git a/ksw2_extz2_sse.c b/ksw2_extz2_sse.c index f21f184..18a3d2b 100644 --- a/ksw2_extz2_sse.c +++ b/ksw2_extz2_sse.c @@ -9,7 +9,15 @@ #include #endif +#ifdef KSW_CPU_DISPATCH +#ifdef __SSE4_1__ +void ksw_extz2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const 
int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez) +#else +void ksw_extz2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez) +#endif +#else void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez) +#endif // ~KSW_CPU_DISPATCH { #define __dp_code_block1 \ z = _mm_add_epi8(_mm_load_si128(&s[t]), qe2_); \ diff --git a/main.c b/main.c index 720e743..e838560 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.1-r335-dirty" +#define MM_VERSION "2.1-r337-dirty" #ifdef __linux__ #include From 46e8b6a4f95d6776e4c1c20832d9bccd1802b632 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 3 Sep 2017 20:29:24 -0400 Subject: [PATCH 2/4] r338: portable CPU dispatch, which is the default working with gcc, icc, clang and msvc. 
--- Makefile | 17 +++++++---------- ksw2_dispatch.c | 48 ++++++++++++++++++++++++++++++++++++++++++------ ksw2_extd2_sse.c | 4 ++++ ksw2_exts2_sse.c | 4 ++++ ksw2_extz2_sse.c | 4 ++++ main.c | 2 +- 6 files changed, 62 insertions(+), 17 deletions(-) diff --git a/Makefile b/Makefile index 66b37ce..85d5ea5 100644 --- a/Makefile +++ b/Makefile @@ -7,12 +7,9 @@ PROG= minimap2 PROG_EXTRA= sdust minimap2-lite LIBS= -lm -lz -lpthread -ifneq ($(cpu_dispatch),) +ifeq ($(sse2only),) OBJS+=ksw2_extz2_sse41.o ksw2_extd2_sse41.o ksw2_exts2_sse41.o ksw2_extz2_sse2.o ksw2_extd2_sse2.o ksw2_exts2_sse2.o ksw2_dispatch.o else -ifeq ($(sse2only),) - CFLAGS+=-msse4 -endif OBJS+=ksw2_extz2_sse.o ksw2_extd2_sse.o ksw2_exts2_sse.o endif @@ -38,22 +35,22 @@ sdust:sdust.c getopt.o kalloc.o kalloc.h kdq.h kvec.h kseq.h sdust.h $(CC) -D_SDUST_MAIN $(CFLAGS) $< getopt.o kalloc.o -o $@ -lz ksw2_extz2_sse41.o:ksw2_extz2_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) -msse4 $< -o $@ + $(CC) -c -msse4 $(CFLAGS) $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) $< -o $@ ksw2_extz2_sse2.o:ksw2_extz2_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) -mno-sse4 -msse2 $< -o $@ + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_CPU_DISPATCH -DKSW_SSE2_ONLY $(INCLUDES) $< -o $@ ksw2_extd2_sse41.o:ksw2_extd2_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) -msse4 $< -o $@ + $(CC) -c -msse4 $(CFLAGS) $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) $< -o $@ ksw2_extd2_sse2.o:ksw2_extd2_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) -mno-sse4 -msse2 $< -o $@ + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_CPU_DISPATCH -DKSW_SSE2_ONLY $(INCLUDES) $< -o $@ ksw2_exts2_sse41.o:ksw2_exts2_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) -msse4 $< -o $@ + $(CC) -c -msse4 $(CFLAGS) $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) $< -o $@ 
ksw2_exts2_sse2.o:ksw2_exts2_sse.c ksw2.h kalloc.h - $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) -mno-sse4 -msse2 $< -o $@ + $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_CPU_DISPATCH -DKSW_SSE2_ONLY $(INCLUDES) $< -o $@ ksw2_dispatch.o:ksw2_dispatch.c ksw2.h $(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) $< -o $@ diff --git a/ksw2_dispatch.c b/ksw2_dispatch.c index 681460e..99de9a0 100644 --- a/ksw2_dispatch.c +++ b/ksw2_dispatch.c @@ -2,13 +2,45 @@ #include #include "ksw2.h" +#define SIMD_SSE 0x1 +#define SIMD_SSE2 0x2 +#define SIMD_SSE3 0x4 +#define SIMD_SSE4_1 0x8 +#define SIMD_SSE4_2 0x10 +#define SIMD_AVX 0x20 +#define SIMD_AVX2 0x40 +#define SIMD_AVX512F 0x80 + +unsigned x86_simd(void) +{ + unsigned eax, ebx, ecx, edx, flag = 0; +#ifdef _MSC_VER + int cpuid[4]; + __cpuid(cpuid, 1); + eax = cpuid[0], ebx = cpuid[1], ecx = cpuid[2], edx = cpuid[3]; +#else + asm volatile("cpuid" : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "a" (1)); +#endif + if (edx>>25&1) flag |= SIMD_SSE; + if (edx>>26&1) flag |= SIMD_SSE2; + if (ecx>>0 &1) flag |= SIMD_SSE3; + if (ecx>>19&1) flag |= SIMD_SSE4_1; + if (ecx>>20&1) flag |= SIMD_SSE4_2; + if (ecx>>28&1) flag |= SIMD_AVX; + if (ebx>>5 &1) flag |= SIMD_AVX2; + if (ebx>>16&1) flag |= SIMD_AVX512F; + return flag; +} + void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez) { extern void ksw_extz2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez); extern void ksw_extz2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez); - if (__builtin_cpu_supports("sse4.1")) + unsigned simd; + simd = x86_simd(); + if (simd & SIMD_SSE4_1) 
ksw_extz2_sse41(km, qlen, query, tlen, target, m, mat, q, e, w, zdrop, flag, ez); - else if (__builtin_cpu_supports("sse2")) + else if (simd & SIMD_SSE2) ksw_extz2_sse2(km, qlen, query, tlen, target, m, mat, q, e, w, zdrop, flag, ez); else abort(); } @@ -20,9 +52,11 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez); extern void ksw_extd2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int flag, ksw_extz_t *ez); - if (__builtin_cpu_supports("sse4.1")) + unsigned simd; + simd = x86_simd(); + if (simd & SIMD_SSE4_1) ksw_extd2_sse41(km, qlen, query, tlen, target, m, mat, q, e, q2, e2, w, zdrop, flag, ez); - else if (__builtin_cpu_supports("sse2")) + else if (simd & SIMD_SSE2) ksw_extd2_sse2(km, qlen, query, tlen, target, m, mat, q, e, q2, e2, w, zdrop, flag, ez); else abort(); } @@ -34,9 +68,11 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez); extern void ksw_exts2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int flag, ksw_extz_t *ez); - if (__builtin_cpu_supports("sse4.1")) + unsigned simd; + simd = x86_simd(); + if (simd & SIMD_SSE4_1) ksw_exts2_sse41(km, qlen, query, tlen, target, m, mat, q, e, q2, noncan, zdrop, flag, ez); - else if (__builtin_cpu_supports("sse2")) + else if (simd & SIMD_SSE2) ksw_exts2_sse2(km, qlen, query, tlen, target, m, mat, q, e, q2, noncan, zdrop, flag, ez); else abort(); } diff --git a/ksw2_extd2_sse.c b/ksw2_extd2_sse.c index 9a64e49..ca4af4d 100644 --- a/ksw2_extd2_sse.c +++ b/ksw2_extd2_sse.c @@ -6,6 +6,10 @@ #ifdef __SSE2__ #include +#ifdef KSW_SSE2_ONLY +#undef 
__SSE4_1__ +#endif + #ifdef __SSE4_1__ #include #endif diff --git a/ksw2_exts2_sse.c b/ksw2_exts2_sse.c index 7a64905..66149d2 100644 --- a/ksw2_exts2_sse.c +++ b/ksw2_exts2_sse.c @@ -6,6 +6,10 @@ #ifdef __SSE2__ #include +#ifdef KSW_SSE2_ONLY +#undef __SSE4_1__ +#endif + #ifdef __SSE4_1__ #include #endif diff --git a/ksw2_extz2_sse.c b/ksw2_extz2_sse.c index 18a3d2b..3889955 100644 --- a/ksw2_extz2_sse.c +++ b/ksw2_extz2_sse.c @@ -5,6 +5,10 @@ #ifdef __SSE2__ #include +#ifdef KSW_SSE2_ONLY +#undef __SSE4_1__ +#endif + #ifdef __SSE4_1__ #include #endif diff --git a/main.c b/main.c index e838560..03493ab 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.1-r337-dirty" +#define MM_VERSION "2.1-r338-dirty" #ifdef __linux__ #include From 8b9f2aaf04ca782e911333efccf3ffd2b708e1fb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Sep 2017 13:10:30 -0400 Subject: [PATCH 3/4] r339: improved SIMD detection old code does not check AVX2 --- ksw2_dispatch.c | 60 ++++++++++++++++++++++++++++++++----------------- main.c | 2 +- 2 files changed, 40 insertions(+), 22 deletions(-) diff --git a/ksw2_dispatch.c b/ksw2_dispatch.c index 99de9a0..b125392 100644 --- a/ksw2_dispatch.c +++ b/ksw2_dispatch.c @@ -5,30 +5,48 @@ #define SIMD_SSE 0x1 #define SIMD_SSE2 0x2 #define SIMD_SSE3 0x4 -#define SIMD_SSE4_1 0x8 -#define SIMD_SSE4_2 0x10 -#define SIMD_AVX 0x20 -#define SIMD_AVX2 0x40 -#define SIMD_AVX512F 0x80 +#define SIMD_SSSE3 0x8 +#define SIMD_SSE4_1 0x10 +#define SIMD_SSE4_2 0x20 +#define SIMD_AVX 0x40 +#define SIMD_AVX2 0x80 +#define SIMD_AVX512F 0x100 -unsigned x86_simd(void) +#ifndef _MSC_VER +// adapted from https://github.com/01org/linux-sgx/blob/master/common/inc/internal/linux/cpuid_gnu.h +void __cpuidex(int cpuid[4], int func_id, int subfunc_id) { - unsigned eax, ebx, ecx, edx, flag = 0; -#ifdef _MSC_VER - int cpuid[4]; - __cpuid(cpuid, 1); - eax = cpuid[0], ebx = cpuid[1], ecx = cpuid[2], edx = cpuid[3]; -#else - 
asm volatile("cpuid" : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "a" (1)); +#if defined(__x86_64__) + asm volatile ("cpuid" + : "=a" (cpuid[0]), "=b" (cpuid[1]), "=c" (cpuid[2]), "=d" (cpuid[3]) + : "0" (func_id), "2" (subfunc_id)); +#else // on 32bit, ebx can NOT be used as PIC code + asm volatile ("xchgl %%ebx, %1; cpuid; xchgl %%ebx, %1" + : "=a" (cpuid[0]), "=r" (cpuid[1]), "=c" (cpuid[2]), "=d" (cpuid[3]) + : "0" (func_id), "2" (subfunc_id)); #endif - if (edx>>25&1) flag |= SIMD_SSE; - if (edx>>26&1) flag |= SIMD_SSE2; - if (ecx>>0 &1) flag |= SIMD_SSE3; - if (ecx>>19&1) flag |= SIMD_SSE4_1; - if (ecx>>20&1) flag |= SIMD_SSE4_2; - if (ecx>>28&1) flag |= SIMD_AVX; - if (ebx>>5 &1) flag |= SIMD_AVX2; - if (ebx>>16&1) flag |= SIMD_AVX512F; +} +#endif + +int x86_simd(void) +{ + int flag = 0, cpuid[4], max_id; + __cpuidex(cpuid, 0, 0); + max_id = cpuid[0]; + if (max_id == 0) return 0; + __cpuidex(cpuid, 1, 0); + if (cpuid[3]>>25&1) flag |= SIMD_SSE; + if (cpuid[3]>>26&1) flag |= SIMD_SSE2; + if (cpuid[2]>>0 &1) flag |= SIMD_SSE3; + if (cpuid[2]>>9 &1) flag |= SIMD_SSSE3; + if (cpuid[2]>>19&1) flag |= SIMD_SSE4_1; + if (cpuid[2]>>20&1) flag |= SIMD_SSE4_2; + if (cpuid[2]>>28&1) flag |= SIMD_AVX; + if (max_id >= 7) { + __cpuidex(cpuid, 7, 0); + if (cpuid[1]>>5 &1) flag |= SIMD_AVX2; + if (cpuid[1]>>16&1) flag |= SIMD_AVX512F; + } return flag; } diff --git a/main.c b/main.c index 03493ab..2e25625 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.1-r338-dirty" +#define MM_VERSION "2.1-r339-dirty" #ifdef __linux__ #include From ef3f7ea2f28423776364ba6a30e182ba317efb6a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 6 Sep 2017 13:46:51 -0400 Subject: [PATCH 4/4] Release minimap2-2.1.1 (r341) --- NEWS.md | 23 +++++++++++++++++++++++ main.c | 2 +- minimap2.1 | 2 +- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5bc0d76..7b04ab6 100644 --- a/NEWS.md +++ b/NEWS.md 
@@ -1,3 +1,26 @@ +Release 2.1.1-r341 (6 September 2017) +------------------------------------- + +This is a maintenance release that is expected to output identical alignments to +v2.1. Detailed changes include: + + * Support CPU dispatch. By default, minimap2 is compiled with both SSE2- and + SSE4-based implementations of alignment and automatically chooses the right + one at runtime. This avoids unexpected errors on older CPUs (#21). + + * Improved Windows support as requested by Oxford Nanopore (#19). Minimap2 + now avoids variable-length stack arrays, eliminates alloca(), ships with + getopt_long() and provides timing functions implemented with Windows APIs. + + * Fixed a potential segmentation fault when specifying -k/-w/-H with + multi-part index (#23). + + * Fixed two memory leaks in example.c. + +(2.1.1: 6 September 2017, r341) + + + Release 2.1-r311 (25 August 2017) --------------------------------- diff --git a/main.c b/main.c index 2e25625..90ad92a 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.1-r339-dirty" +#define MM_VERSION "2.1.1-r341" #ifdef __linux__ #include diff --git a/minimap2.1 b/minimap2.1 index 61a0bdb..6cef237 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "25 August 2017" "minimap2-2.1-r311" "Bioinformatics tools" +.TH minimap2 1 "6 September 2017" "minimap2-2.1.1-r341" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences