/*
 * Copyright (c) 2017-2023 Genome Research Ltd.
 * Author(s): James Bonfield
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *    1. Redistributions of source code must retain the above copyright notice,
 *       this list of conditions and the following disclaimer.
 *
 *    2. Redistributions in binary form must reproduce the above
 *       copyright notice, this list of conditions and the following
 *       disclaimer in the documentation and/or other materials provided
 *       with the distribution.
 *
 *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
 *       Institute nor the names of its contributors may be used to endorse
 *       or promote products derived from this software without specific
 *       prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This is an AVX512 implementation of the 32-way interleaved 16-bit rANS,
 * containing both order-0 and order-1 codecs.
 */

#include "config.h"

#if defined(__x86_64__) && defined(HAVE_AVX512)

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <x86intrin.h>

#include "rANS_word.h"
#include "rANS_static4x16.h"
#define ROT32_SIMD
#include "rANS_static16_int.h"
#include "varint.h"
#include "utils.h"

#ifndef USE_GATHER
// Speeds with Downfall mitigation Off/On and Zen4.
//          <--------- AVX512 --------->   <---------- AVX2 --------->
// -o4:     IntelOff IntelOn  AMDZen4      IntelOff IntelOn  AMDZen4
// gcc7     544/1673 562/1711 448/1818     649/1515 645/1525 875/1624
// gcc13    557/1672 576/1711 582/1761     630/1623 629/1652 866/1762
// clang10  541/1547 564/1630 807/1912     620/1456 637/1481 837/1606
// clang16  533/1431 555/1510 890/1611     629/1370 627/1406 996/1432
//
// Zen4 encode is particularly slow with gcc, but AVX2 encode is
// faster and we use that already.
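// Emulated 16-lane 32-bit gather, used when USE_GATHER is not defined.
// Hardware vpgatherdd can be slow on CPUs carrying the Downfall microcode
// mitigation (see table above), so we spill the indices to a scalar array
// and issue 16 ordinary loads instead.  The scalar semantics are simply:
//     for (z = 0; z < 16; z++)
//         res[z] = b[idx[z]];      // b viewed as a uint32_t array
// "size" is accepted only for signature compatibility with the real
// gather and is assumed to be 4 here.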
static inline __m512i _mm512_i32gather_epi32x(__m512i idx, void *v, int size) {
    uint32_t *b = (uint32_t *)v;
#ifndef __clang__
    // volatile discourages gcc from reconstituting a hardware gather
    volatile
#endif
    int c[16] __attribute__((aligned(32)));

    //_mm512_store_si512((__m512i *)c, idx); // equivalent, but slower
    _mm256_store_si256((__m256i *)(c),   _mm512_castsi512_si256(idx));
    _mm256_store_si256((__m256i *)(c+8), _mm512_extracti64x4_epi64(idx, 1));

    __m128i x0 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[0]]), b[c[1]], 1);
    __m128i x1 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[2]]), b[c[3]], 1);
    __m128i x2 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[4]]), b[c[5]], 1);
    __m128i x3 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[6]]), b[c[7]], 1);

    __m128i x01 = _mm_unpacklo_epi64(x0, x1);
    __m128i x23 = _mm_unpacklo_epi64(x2, x3);
    __m256i y0 = _mm256_castsi128_si256(x01);

    __m128i x4 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[ 8]]), b[c[ 9]], 1);
    __m128i x5 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[10]]), b[c[11]], 1);
    __m128i x6 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[12]]), b[c[13]], 1);
    __m128i x7 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[14]]), b[c[15]], 1);

    __m128i x45 = _mm_unpacklo_epi64(x4, x5);
    __m128i x67 = _mm_unpacklo_epi64(x6, x7);
    __m256i y1 = _mm256_castsi128_si256(x45);

    y0 = _mm256_inserti128_si256(y0, x23, 1);
    y1 = _mm256_inserti128_si256(y1, x67, 1);

    return _mm512_inserti64x4(_mm512_castsi256_si512(y0), y1, 1);
}

// 32-bit indices, 8-bit quantities into 32-bit lanes
static inline __m512i _mm512_i32gather_epi32x1(__m512i idx, void *v) {
    uint8_t *b = (uint8_t *)v;
    volatile int c[16] __attribute__((aligned(32)));

    //_mm512_store_si512((__m512i *)c, idx); // equivalent, but slower
    _mm256_store_si256((__m256i *)(c),   _mm512_castsi512_si256(idx));
    _mm256_store_si256((__m256i *)(c+8), _mm512_extracti64x4_epi64(idx, 1));

    return _mm512_set_epi32(b[c[15]], b[c[14]], b[c[13]], b[c[12]],
                            b[c[11]], b[c[10]], b[c[ 9]], b[c[ 8]],
                            b[c[ 7]], b[c[ 6]], b[c[ 5]], b[c[ 4]],
                            b[c[ 3]], b[c[ 2]], b[c[ 1]], b[c[ 0]]);
}
#else // real gathers
#define _mm512_i32gather_epi32x _mm512_i32gather_epi32
// Assumed byte-indexed fallback so the USE_GATHER build compiles too: a
// real 32-bit gather masked down to the byte we want.
#define _mm512_i32gather_epi32x1(idx, b)                        \
    _mm512_and_si512(_mm512_i32gather_epi32(idx, b, 1),         \
                     _mm512_set1_epi32(0xff))
#endif

unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in,
                                             unsigned int in_size,
                                             unsigned char *out,
                                             unsigned int *out_size) {
    unsigned char *cp, *out_end;
    RansEncSymbol syms[256];
    RansState ransN[32] __attribute__((aligned(64)));
    uint8_t* ptr;
    uint32_t F[256+MAGIC] = {0};
    int i, j, tab_size = 0, x, z;
    // -20 for order/size/meta
    uint32_t bound = rans_compress_bound_4x16(in_size,0)-20;

    if (!out) {
        *out_size = bound;
        out = malloc(*out_size);
    }
    if (!out || bound > *out_size)
        return NULL;

    // If "out" isn't word aligned, tweak out_end/ptr to ensure it is.
    // We already added more room in bound to allow for this.
    if (((size_t)out)&1)
        bound--;
    ptr = out_end = out + bound;

    if (in_size == 0)
        goto empty;

    // Compute statistics
    if (hist8(in, in_size, F) < 0)
        return NULL;

    // Normalise so frequencies sum to a power of 2
    uint32_t fsum = in_size;
    uint32_t max_val = round2(fsum);
    if (max_val > TOTFREQ)
        max_val = TOTFREQ;

    if (normalise_freq(F, fsum, max_val) < 0)
        return NULL;
    fsum = max_val;

    cp = out;
    cp += encode_freq(cp, F);
    tab_size = cp-out;
    //write(2, out+4, cp-(out+4));

    if (normalise_freq(F, fsum, TOTFREQ) < 0)
        return NULL;

    // Encode statistics and build lookup tables for SIMD encoding.
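    // Table roles, per symbol j:
    //   SB = x_max     (renormalisation threshold)
    //   SA = rcp_freq  (fixed-point reciprocal of the frequency)
    //   SD = cmpl_freq | (rcp_shift-32)<<16  (packed so one gather serves)
    //   SC = bias
    // These drive the SIMD form of the scalar encode step:
    //   if (R >= x_max) { *--ptr16 = R & 0xffff; R >>= 16; }      // renorm
    //   q = (uint32_t)(((uint64_t)R * rcp_freq) >> rcp_shift);
    //   R += bias + q * cmpl_freq;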
    uint32_t SB[256], SA[256], SD[256], SC[256];
    for (x = j = 0; j < 256; j++) {
        if (F[j]) {
            RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT);
            SB[j] = syms[j].x_max;
            SA[j] = syms[j].rcp_freq;
            SD[j] = (syms[j].cmpl_freq<<0) | ((syms[j].rcp_shift-32)<<16);
            SC[j] = syms[j].bias;
            x += F[j];
        }
    }

    for (z = 0; z < 32; z++)
        RansEncInit(&ransN[z]);

    z = i = in_size&(32-1);
    while (z-- > 0)
        RansEncPutSymbol(&ransN[z], &ptr, &syms[in[in_size-(i-z)]]);

#define LOAD512(a,b)                                        \
    __m512i a##1 = _mm512_load_si512((__m512i *)&b[0]);     \
    __m512i a##2 = _mm512_load_si512((__m512i *)&b[16]);

#define STORE512(a,b)                                       \
    _mm512_store_si512((__m512i *)&b[0],  a##1);            \
    _mm512_store_si512((__m512i *)&b[16], a##2);

    LOAD512(Rv, ransN);

    uint16_t *ptr16 = (uint16_t *)ptr;
    for (i=(in_size &~(32-1)); i>0; i-=32) {
        uint8_t *c = &in[i-32];

        // GATHER versions
        // Much faster now we have an efficient loadu mechanism in place,
        // BUT...
        // Try this for avx2 variant too?  Better way to populate the mm256
        // regs for mix of avx2 and avx512 opcodes.
        __m256i c12 = _mm256_loadu_si256((__m256i const *)c);
        __m512i c1 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(c12,0));
        __m512i c2 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(c12,1));

#define SET512(a,b)                                         \
        __m512i a##1 = _mm512_i32gather_epi32x(c1, b, 4);   \
        __m512i a##2 = _mm512_i32gather_epi32x(c2, b, 4)

        SET512(xmax, SB);

        uint16_t gt_mask1 = _mm512_cmpgt_epi32_mask(Rv1, xmax1);
        int pc1 = _mm_popcnt_u32(gt_mask1);
        __m512i Rp1 = _mm512_and_si512(Rv1, _mm512_set1_epi32(0xffff));
        __m512i Rp2 = _mm512_and_si512(Rv2, _mm512_set1_epi32(0xffff));
        uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask(Rv2, xmax2);
        SET512(SDv, SD);
        int pc2 = _mm_popcnt_u32(gt_mask2);

        Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1);
        Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2);

        _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc2, (1<<pc2)-1, Rp2);
        ptr16 -= pc2;
        _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc1, (1<<pc1)-1, Rp1);
        ptr16 -= pc1;

        Rv1 = _mm512_mask_srli_epi32(Rv1, gt_mask1, Rv1, 16);
        Rv2 = _mm512_mask_srli_epi32(Rv2, gt_mask2, Rv2, 16);

        // q = (uint32_t)(((uint64_t)R * rcp_freq) >> rcp_shift)
        // There is no 32-bit mulhi, so do two 32x32->64 multiplies on the
        // odd and even lanes and recombine the high halves.
        SET512(rfv, SA);

        __m512i rf1_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv1, 32),
                                          _mm512_srli_epi64(rfv1, 32));
        __m512i rf2_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv2, 32),
                                          _mm512_srli_epi64(rfv2, 32));
        __m512i rf1_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv1, rfv1), 32);
        __m512i rf2_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv2, rfv2), 32);

        const __m512i top32 = _mm512_set1_epi64((uint64_t)0xffffffff00000000);
        rf1_hm = _mm512_and_epi32(rf1_hm, top32);
        rf2_hm = _mm512_and_epi32(rf2_hm, top32);
        rfv1 = _mm512_or_epi32(rf1_lm, rf1_hm);
        rfv2 = _mm512_or_epi32(rf2_lm, rf2_hm);

        SET512(biasv, SC);

        // SD already holds rcp_shift-32, matching the mulhi's implicit >>32
        __m512i shiftv1 = _mm512_srli_epi32(SDv1, 16);
        __m512i shiftv2 = _mm512_srli_epi32(SDv2, 16);

        __m512i qv1 = _mm512_srlv_epi32(rfv1, shiftv1);
        __m512i qv2 = _mm512_srlv_epi32(rfv2, shiftv2);

        const __m512i bot16 = _mm512_set1_epi32(0xffff);
        qv1 = _mm512_mullo_epi32(qv1, _mm512_and_si512(SDv1, bot16));
        qv2 = _mm512_mullo_epi32(qv2, _mm512_and_si512(SDv2, bot16));

        qv1 = _mm512_add_epi32(qv1, biasv1);
        Rv1 = _mm512_add_epi32(Rv1, qv1);
        qv2 = _mm512_add_epi32(qv2, biasv2);
        Rv2 = _mm512_add_epi32(Rv2, qv2);
    }

    STORE512(Rv, ransN);

    ptr = (uint8_t *)ptr16;
    for (z = 32-1; z >= 0; z--)
        RansEncFlush(&ransN[z], &ptr);

 empty:
    // Finalise block size and return it
    *out_size = (out_end - ptr) + tab_size;

    memmove(out + tab_size, ptr, out_end-ptr);

    return out;
}

unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in,
                                               unsigned int in_size,
                                               unsigned char *out,
                                               unsigned int out_sz) {
    if (in_size < 32*4) // 32-states at least
        return NULL;

    if (out_sz >= INT_MAX)
        return NULL; // protect against some overflow cases

    /* Load in the static tables */
    unsigned char *cp = in, *out_free = NULL;
    unsigned char *cp_end = in + in_size;
    int i;
    uint32_t s3[TOTFREQ] __attribute__((aligned(64))); // For TF_SHIFT <= 12

    if (!out)
        out_free = out = malloc(out_sz);
    if (!out)
        return NULL;

    // Precompute reverse lookup of frequency.
    uint32_t F[256] = {0}, fsum;
    int fsz = decode_freq(cp, cp_end, F, &fsum);
    if (!fsz)
        goto err;
    cp += fsz;

    normalise_freq_shift(F, fsum, TOTFREQ);

    // Build symbols; fixme, do as part of decode, see the _d variant
    if (rans_F_to_s3(F, TF_SHIFT, s3))
        goto err;

    if (cp_end - cp < 32 * 4)
        goto err;

    int z;
    RansState Rv[32] __attribute__((aligned(64)));
    for (z = 0; z < 32; z++) {
        RansDecInit(&Rv[z], &cp);
        if (Rv[z] < RANS_BYTE_L)
            goto err;
    }

    uint16_t *sp = (uint16_t *)cp;

    int out_end = (out_sz&~(32-1));
    const uint32_t mask = (1u << TF_SHIFT)-1;

    __m512i maskv = _mm512_set1_epi32(mask); // set mask in all lanes
    __m512i R1 = _mm512_load_epi32(&Rv[0]);
    __m512i R2 = _mm512_load_epi32(&Rv[16]);
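    // Scalar reference for one decode step per lane, including the
    // branchless renorm used below (RansDecRenorm semantics):
    //     S = s3[R & mask];
    //     R = (S>>(TF_SHIFT+8)) * (R >> TF_SHIFT) + ((S>>8) & mask);
    //     if (R < RANS_BYTE_L) R = (R << 16) | *sp++;
    // The SIMD loop loads 16 renorm words speculatively and uses
    // expand + popcnt so only lanes that renormalise consume words.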
    // Start of the first loop iteration, duplicated here from the end of
    // the loop body so we can hide some of the gather's instruction latency.
    __m512i masked1 = _mm512_and_epi32(R1, maskv);
    __m512i masked2 = _mm512_and_epi32(R2, maskv);

    __m512i S1 = _mm512_i32gather_epi32x(masked1, (int *)s3, sizeof(*s3));
    __m512i S2 = _mm512_i32gather_epi32x(masked2, (int *)s3, sizeof(*s3));

    uint8_t overflow[64+64] = {0};
    for (i=0; i < out_end; i+=32) {
        //for (z = 0; z < 16; z++) {

        // Protect against running off the end of in buffer.
        // We copy it to a worst-case local buffer when near the end.
        if ((uint8_t *)sp+64 > cp_end) {
            memmove(overflow, sp, cp_end - (uint8_t *)sp);
            sp = (uint16_t *)overflow;
            cp_end = overflow + sizeof(overflow);
        }

        //uint32_t S = s3[R[z] & mask];
        __m512i renorm_words1 = _mm512_cvtepu16_epi32(
            _mm256_loadu_si256((const __m256i *)sp)); // next 16 words

        //uint16_t f = S>>(TF_SHIFT+8), b = (S>>8) & mask;
        __m512i f1 = _mm512_srli_epi32(S1, TF_SHIFT+8);
        __m512i f2 = _mm512_srli_epi32(S2, TF_SHIFT+8);
        __m512i b1 = _mm512_and_epi32(_mm512_srli_epi32(S1, 8), maskv);
        __m512i b2 = _mm512_and_epi32(_mm512_srli_epi32(S2, 8), maskv);

        //R[z] = f * (R[z] >> TF_SHIFT) + b;
        // approx 10 cycle latency on mullo.
        R1 = _mm512_add_epi32(
                 _mm512_mullo_epi32(
                     _mm512_srli_epi32(R1, TF_SHIFT), f1), b1);

        __mmask16 renorm_mask1, renorm_mask2;
        renorm_mask1 = _mm512_cmplt_epu32_mask(R1,
                           _mm512_set1_epi32(RANS_BYTE_L));
        R2 = _mm512_add_epi32(
                 _mm512_mullo_epi32(
                     _mm512_srli_epi32(R2, TF_SHIFT), f2), b2);

        // renorm. this is the interesting part:
        renorm_mask2 = _mm512_cmplt_epu32_mask(R2,
                           _mm512_set1_epi32(RANS_BYTE_L));

        // advance by however many words we actually read
        sp += _mm_popcnt_u32(renorm_mask1);
        __m512i renorm_words2 = _mm512_cvtepu16_epi32(
            _mm256_loadu_si256((const __m256i *)sp));

        // select masked only
        __m512i renorm_vals1, renorm_vals2;
        renorm_vals1 = _mm512_mask_expand_epi32(R1, renorm_mask1,
                                                renorm_words1);
        renorm_vals2 = _mm512_mask_expand_epi32(R2, renorm_mask2,
                                                renorm_words2);

        // For start of next loop iteration.  This has been moved here
        // (and duplicated to before the loop starts) so we can do something
        // with the latency period of gather, such as finishing up the
        // renorm offset and writing the results.
        __m512i S1_ = S1; // temporary copy for use in out[]=S later
        __m512i S2_ = S2;

        masked1 = _mm512_and_epi32(renorm_vals1, maskv);
        S1 = _mm512_i32gather_epi32x(masked1, (int *)s3, sizeof(*s3));
        masked2 = _mm512_and_epi32(renorm_vals2, maskv);
        S2 = _mm512_i32gather_epi32x(masked2, (int *)s3, sizeof(*s3));

        R1 = _mm512_mask_slli_epi32(R1, renorm_mask1, R1, 16);
        R2 = _mm512_mask_slli_epi32(R2, renorm_mask2, R2, 16);

        __m512i m16 = _mm512_set1_epi32(0xffff);
        renorm_vals1 = _mm512_maskz_and_epi32(renorm_mask1, renorm_vals1,
                                              m16);
        renorm_vals2 = _mm512_maskz_and_epi32(renorm_mask2, renorm_vals2,
                                              m16);

        // advance by however many words we actually read
        sp += _mm_popcnt_u32(renorm_mask2);

        R1 = _mm512_add_epi32(R1, renorm_vals1);
        R2 = _mm512_add_epi32(R2, renorm_vals2);

        //out[i+z] = S;
        _mm_storeu_si128((__m128i *)(out+i),    _mm512_cvtepi32_epi8(S1_));
        _mm_storeu_si128((__m128i *)(out+i+16), _mm512_cvtepi32_epi8(S2_));
    }

    _mm512_store_epi32(&Rv[ 0], R1);
    _mm512_store_epi32(&Rv[16], R2);

    for (z = out_sz & (32-1); z-- > 0; )
        out[out_end + z] = s3[Rv[z] & mask];

    return out;

 err:
    free(out_free);
    return NULL;
}

#define TBUF8
#ifdef TBUF8
// 15% quicker overall O1 decode now due to rot32_simd below.
// NB: This uses AVX2 though and we could rewrite using AVX512 for
// further speed gains.
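// O1 decode walks 32 interleaved output streams in lockstep, producing one
// byte per stream per pass.  Bytes are buffered as t[pass][stream] and,
// every 32 passes, the 32x32 block is transposed to stream-major order and
// appended to each stream's output position iN[z].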
static inline void transpose_and_copy(uint8_t *out, int iN[32],
                                      uint8_t t[32][32]) {
//  int z;
//  for (z = 0; z < 32; z++) {
//      int k;
//      for (k = 0; k < 32; k++)
//          out[iN[z]+k] = t[k][z];
//      iN[z] += 32;
//  }

    rot32_simd(t, out, iN);
}
#else
// Implemented using AVX512 gathers.
// This is faster than a naive scalar implementation, but doesn't beat the
// AVX2 vectorised 32x32 transpose function.
static inline void transpose_and_copy_avx512(uint8_t *out, int iN[32],
                                             uint32_t t32[32][32]) {
    int z;
//  for (z = 0; z < 32; z++) {
//      int k;
//      for (k = 0; k < 32; k++)
//          out[iN[z]+k] = t32[k][z];
//      iN[z] += 32;
//  }

    __m512i v1 = _mm512_set_epi32(15, 14, 13, 12, 11, 10,  9,  8,
                                   7,  6,  5,  4,  3,  2,  1,  0);
    v1 = _mm512_slli_epi32(v1, 5);

    for (z = 0; z < 32; z++) {
        __m512i t1 = _mm512_i32gather_epi32x(v1, &t32[ 0][z], 4);
        __m512i t2 = _mm512_i32gather_epi32x(v1, &t32[16][z], 4);
        _mm_storeu_si128((__m128i*)(&out[iN[z]   ]),
                         _mm512_cvtepi32_epi8(t1));
        _mm_storeu_si128((__m128i*)(&out[iN[z]+16]),
                         _mm512_cvtepi32_epi8(t2));
        iN[z] += 32;
    }
}
#endif // TBUF8

unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
                                             unsigned int in_size,
                                             unsigned char *out,
                                             unsigned int *out_size) {
    unsigned char *cp, *out_end, *out_free = NULL;
    unsigned int tab_size;
    uint32_t bound = rans_compress_bound_4x16(in_size,1)-20;
    int z;
    RansState ransN[32] __attribute__((aligned(64)));

    if (in_size < 32) // force O0 instead
        return NULL;

    if (!out) {
        *out_size = bound;
        out_free = out = malloc(*out_size);
    }
    if (!out || bound > *out_size)
        return NULL;

    if (((size_t)out)&1)
        bound--;
    out_end = out + bound;

    RansEncSymbol (*syms)[256] = htscodecs_tls_alloc(256 * (sizeof(*syms)));
    if (!syms) {
        free(out_free);
        return NULL;
    }

    cp = out;
    int shift = encode_freq1(in, in_size, 32, syms, &cp);
    if (shift < 0) {
        free(out_free);
        htscodecs_tls_free(syms);
        return NULL;
    }
    tab_size = cp - out;

    for (z = 0; z < 32; z++)
        RansEncInit(&ransN[z]);

    uint8_t* ptr = out_end;

    int iN[32] __attribute__((aligned(64)));
    int isz4 = in_size/32;
    for (z = 0; z < 32; z++)
        iN[z] = (z+1)*isz4-2;

    uint32_t lN[32] __attribute__((aligned(64)));
    for (z = 0; z < 32; z++)
        lN[z] = in[iN[z]+1];

    // Deal with the remainder
    z = 32-1;
    lN[z] = in[in_size-1];
    for (iN[z] = in_size-2; iN[z] > 32*isz4-2; iN[z]--) {
        unsigned char c = in[iN[z]];
        RansEncPutSymbol(&ransN[z], &ptr, &syms[c][lN[z]]);
        lN[z] = c;
    }

    LOAD512(Rv, ransN);

    uint16_t *ptr16 = (uint16_t *)ptr;

    LOAD512(iN, iN);
    LOAD512(last, lN);

    __m512i c1 = _mm512_i32gather_epi32x1(iN1, in);
    __m512i c2 = _mm512_i32gather_epi32x1(iN2, in);

    // We cache the next 64-bytes locally and transpose.
    // This means we can load 32 ints from t32[x] with load instructions
    // instead of gathers.  The copy, transpose and expand is easier done
    // in scalar code.
#define BATCH 64
    uint8_t t32[BATCH][32] __attribute__((aligned(64)));
    int next_batch;
    if (iN[0] > BATCH) {
        int i, j;
        for (i = 0; i < BATCH; i++)
            // memcpy(c[i], &in[iN[i]-32], 32); fast mode
            for (j = 0; j < 32; j++)
                t32[BATCH-1-i][j] = in[iN[j]-1-i];
        next_batch = BATCH;
    } else {
        next_batch = -1;
    }

    for (; iN[0] >= 0; iN[0]--) {
        // Note, consider doing the same approach for the AVX2 encoder.
        // Maybe we can also get gather working well there?
        //
        // Gather here is still a major latency bottleneck, consuming
        // around 40% of CPU cycles overall.

        // FIXME: maybe we need to cope with in[31] read over-flow on loop
        // cycles 0, 1, 2 where the gather reads 32-bits instead of 8 bits.
        // Use set instead there on c2?
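        // The symbol table is addressed as 32-bit words: vidx[z] below is
        // (c[z]*256 + last[z]) * 4, and with a gather scale of 4 that is
        // (c*256+last) * 16 bytes, i.e. &syms[c][last] given
        // sizeof(RansEncSymbol) == 16.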
        // index into syms[0][0] array, used for x_max, rcp_freq, and bias
        __m512i vidx1 = _mm512_slli_epi32(c1, 8);
        __m512i vidx2 = _mm512_slli_epi32(c2, 8);
        vidx1 = _mm512_add_epi32(vidx1, last1);
        vidx2 = _mm512_add_epi32(vidx2, last2);
        vidx1 = _mm512_slli_epi32(vidx1, 2);
        vidx2 = _mm512_slli_epi32(vidx2, 2);

        // ------------------------------------------------------------
        //  for (z = NX-1; z >= 0; z--) {
        //      if (ransN[z] >= x_max[z]) {
        //          *--ptr16 = ransN[z] & 0xffff;
        //          ransN[z] >>= 16;
        //      }
        //  }
#define SET512x(a,x)                                                     \
        __m512i a##1 = _mm512_i32gather_epi32x(vidx1, &syms[0][0].x, 4); \
        __m512i a##2 = _mm512_i32gather_epi32x(vidx2, &syms[0][0].x, 4)

        // Start of next loop, moved here to remove latency.
        //   last[z] = c[z]
        //   iN[z]--
        //   c[z] = in[iN[z]]
        last1 = c1;
        last2 = c2;
        iN1 = _mm512_sub_epi32(iN1, _mm512_set1_epi32(1));
        iN2 = _mm512_sub_epi32(iN2, _mm512_set1_epi32(1));

        // Code below is equivalent to this:
        //   c1 = _mm512_i32gather_epi32(iN1, in, 1);
        //   c2 = _mm512_i32gather_epi32(iN2, in, 1);

        // Better when we have a power of 2
        if (next_batch >= 0) {
            if (--next_batch < 0 && iN[0] > BATCH) {
                // Load 32 BATCH blocks of data.
                // Executed once every BATCH cycles
                int i, j;
                uint8_t c[32][BATCH];
                iN[0] += BATCH;
                for (j = 0; j < 32; j++) {
                    iN[j] -= BATCH;
                    memcpy(c[j], &in[iN[j]-BATCH], BATCH);
                }

                // transpose matrix from 32xBATCH to BATCHx32
                for (j = 0; j < 32; j++) {
                    for (i = 0; i < BATCH; i+=16) {
                        int z;
                        for (z = 0; z < 16; z++)
                            t32[i+z][j] = c[j][i+z];
                    }
                }
                next_batch = BATCH-1;
            }

            if (next_batch >= 0) {
                // Copy from one of our pre-loaded BATCHx32 tables.
                // Executed on every cycle while batched data remains.
                __m128i c1_ = _mm_load_si128((__m128i *)&t32[next_batch][0]);
                __m128i c2_ = _mm_load_si128((__m128i *)&t32[next_batch][16]);
                c1 = _mm512_cvtepu8_epi32(c1_);
                c2 = _mm512_cvtepu8_epi32(c2_);
            }
        }

        if (next_batch < 0 && iN[0]) {
            // no pre-loaded data as within BATCHx32 of input end
            c1 = _mm512_i32gather_epi32x1(iN1, in);
            c2 = _mm512_i32gather_epi32x1(iN2, in);

            // Speeds up clang, even though not needed any more.
            // Harmless to leave here.
            c1 = _mm512_and_si512(c1, _mm512_set1_epi32(0xff));
            c2 = _mm512_and_si512(c2, _mm512_set1_epi32(0xff));
        }
        // End of "equivalent to" code block

        SET512x(xmax, x_max); // high latency

        uint16_t gt_mask1 = _mm512_cmpgt_epi32_mask(Rv1, xmax1);
        int pc1 = _mm_popcnt_u32(gt_mask1);
        __m512i Rp1 = _mm512_and_si512(Rv1, _mm512_set1_epi32(0xffff));
        __m512i Rp2 = _mm512_and_si512(Rv2, _mm512_set1_epi32(0xffff));
        uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask(Rv2, xmax2);
        SET512x(SDv, cmpl_freq); // good
        int pc2 = _mm_popcnt_u32(gt_mask2);

        Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1);
        Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2);

        _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc2, (1<<pc2)-1, Rp2);
        ptr16 -= pc2;
        _mm512_mask_cvtepi32_storeu_epi16(ptr16-pc1, (1<<pc1)-1, Rp1);
        ptr16 -= pc1;

        Rv1 = _mm512_mask_srli_epi32(Rv1, gt_mask1, Rv1, 16);
        Rv2 = _mm512_mask_srli_epi32(Rv2, gt_mask2, Rv2, 16);

        // ------------------------------------------------------------
        // uint32_t q = (uint32_t) (((uint64_t)ransN[z] * rcp_freq[z])
        //                          >> rcp_shift[z]);
        // ransN[z] = ransN[z] + bias[z] + q * cmpl_freq[z];
        SET512x(rfv, rcp_freq); // good-ish

        __m512i rf1_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv1, 32),
                                          _mm512_srli_epi64(rfv1, 32));
        __m512i rf2_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv2, 32),
                                          _mm512_srli_epi64(rfv2, 32));
        __m512i rf1_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv1, rfv1), 32);
        __m512i rf2_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv2, rfv2), 32);

        const __m512i top32 = _mm512_set1_epi64((uint64_t)0xffffffff00000000);
        rf1_hm = _mm512_and_epi32(rf1_hm, top32);
        rf2_hm = _mm512_and_epi32(rf2_hm, top32);
        rfv1 = _mm512_or_epi32(rf1_lm, rf1_hm);
        rfv2 = _mm512_or_epi32(rf2_lm, rf2_hm);

        SET512x(biasv, bias); // good

        __m512i shiftv1 = _mm512_srli_epi32(SDv1, 16);
        __m512i shiftv2 = _mm512_srli_epi32(SDv2, 16);

        shiftv1 = _mm512_sub_epi32(shiftv1, _mm512_set1_epi32(32));
        shiftv2 = _mm512_sub_epi32(shiftv2, _mm512_set1_epi32(32));

        __m512i qv1 = _mm512_srlv_epi32(rfv1, shiftv1);
        __m512i qv2 = _mm512_srlv_epi32(rfv2, shiftv2);

        const __m512i bot16 = _mm512_set1_epi32(0xffff);
        qv1 = _mm512_mullo_epi32(qv1, _mm512_and_si512(SDv1, bot16));
        qv2 = _mm512_mullo_epi32(qv2, _mm512_and_si512(SDv2, bot16));

        qv1 = _mm512_add_epi32(qv1, biasv1);
        Rv1 = _mm512_add_epi32(Rv1, qv1);
        qv2 = _mm512_add_epi32(qv2, biasv2);
        Rv2 = _mm512_add_epi32(Rv2, qv2);
    }

    STORE512(Rv, ransN);
    STORE512(last, lN);

    ptr = (uint8_t *)ptr16;

    for (z = 32-1; z >= 0; z--)
        RansEncPutSymbol(&ransN[z], &ptr, &syms[0][lN[z]]);

    for (z = 32-1; z >= 0; z--)
        RansEncFlush(&ransN[z], &ptr);

    // Finalise block size and return it
    *out_size = (out_end - ptr) + tab_size;

//  cp = out;
//  *cp++ = (in_size>> 0) & 0xff;
//  *cp++ = (in_size>> 8) & 0xff;
//  *cp++ = (in_size>>16) & 0xff;
//  *cp++ = (in_size>>24) & 0xff;

    memmove(out + tab_size, ptr, out_end-ptr);

    htscodecs_tls_free(syms);
    return out;
}

#define NX 32
unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
                                               unsigned int in_size,
                                               unsigned char *out,
                                               unsigned int out_sz) {
    if (in_size < NX*4) // NX states, 4 bytes each, at least
        return NULL;

    if (out_sz >= INT_MAX)
        return NULL; // protect against some overflow cases

    /* Load in the static tables */
    unsigned char *cp = in, *cp_end = in+in_size, *out_free = NULL;
    unsigned char *c_freq = NULL;

    uint32_t (*s3)[TOTFREQ_O1] = htscodecs_tls_alloc(256*TOTFREQ_O1*4);
    if (!s3)
        return NULL;
    uint32_t (*s3F)[TOTFREQ_O1_FAST] = (uint32_t (*)[TOTFREQ_O1_FAST])s3;

    if (!out)
        out_free = out = malloc(out_sz);
    if (!out)
        goto err;

    //fprintf(stderr, "out_sz=%d\n", out_sz);
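    // Each s3[last][slot] entry packs freq (12 bits), bias (12 bits) and
    // symbol (8 bits) into one 32-bit word, so a single gather fetches all
    // three.  s3F is the same memory viewed with the narrower
    // TOTFREQ_O1_FAST stride used by the 10-bit fast mode.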
    // compressed header? If so uncompress it
    unsigned char *tab_end = NULL;
    unsigned char *c_freq_end = cp_end;
    unsigned int shift = *cp >> 4;
    if (*cp++ & 1) {
        uint32_t u_freq_sz, c_freq_sz;
        cp += var_get_u32(cp, cp_end, &u_freq_sz);
        cp += var_get_u32(cp, cp_end, &c_freq_sz);
        if (c_freq_sz > cp_end - cp)
            goto err;
        tab_end = cp + c_freq_sz;
        if (!(c_freq = rans_uncompress_O0_4x16(cp, c_freq_sz, NULL,
                                               u_freq_sz)))
            goto err;
        cp = c_freq;
        c_freq_end = c_freq + u_freq_sz;
    }

    // Decode order-0 symbol list; avoids needing in order-1 tables
    cp += decode_freq1(cp, c_freq_end, shift, s3, s3F, NULL, NULL);

    if (tab_end)
        cp = tab_end;
    free(c_freq);
    c_freq = NULL;

    if (cp_end - cp < NX * 4)
        goto err;

    RansState R[NX] __attribute__((aligned(64)));
    uint8_t *ptr = cp, *ptr_end = in + in_size;
    int z;
    for (z = 0; z < NX; z++) {
        RansDecInit(&R[z], &ptr);
        if (R[z] < RANS_BYTE_L)
            goto err;
    }

    int isz4 = out_sz/NX;
    int iN[NX], lN[NX] __attribute__((aligned(64))) = {0};
    for (z = 0; z < NX; z++)
        iN[z] = z*isz4;

    uint16_t *sp = (uint16_t *)ptr;
    const uint32_t mask = (1u << shift)-1;

    __m512i _maskv = _mm512_set1_epi32(mask);
    LOAD512(_Rv, R);
    LOAD512(_Lv, lN);

#ifdef TBUF8
    union {
        unsigned char tbuf[32][32];
        uint64_t tbuf64[32][4];
    } u __attribute__((aligned(32)));
#else
    uint32_t tbuf[32][32];
#endif
    unsigned int tidx = 0;

    if (shift == TF_SHIFT_O1) {
        isz4 -= 64;
        for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) {
            // m[z] = R[z] & mask;
            __m512i _masked1 = _mm512_and_si512(_Rv1, _maskv);
            __m512i _masked2 = _mm512_and_si512(_Rv2, _maskv);

            //  S[z] = s3[lN[z]][m[z]];
            _Lv1 = _mm512_slli_epi32(_Lv1, TF_SHIFT_O1);
            _Lv2 = _mm512_slli_epi32(_Lv2, TF_SHIFT_O1);
            _masked1 = _mm512_add_epi32(_masked1, _Lv1);
            _masked2 = _mm512_add_epi32(_masked2, _Lv2);

            // This is the biggest bottleneck
            __m512i _Sv1 = _mm512_i32gather_epi32x(_masked1,
                               (int *)&s3F[0][0], sizeof(s3F[0][0]));
            __m512i _Sv2 = _mm512_i32gather_epi32x(_masked2,
                               (int *)&s3F[0][0], sizeof(s3F[0][0]));

            // f[z] = S[z]>>(TF_SHIFT_O1+8);
            __m512i _fv1 = _mm512_srli_epi32(_Sv1, TF_SHIFT_O1+8);
            __m512i _fv2 = _mm512_srli_epi32(_Sv2, TF_SHIFT_O1+8);

            // b[z] = (S[z]>>8) & mask;
            __m512i _bv1 = _mm512_and_si512(_mm512_srli_epi32(_Sv1,8), _maskv);
            __m512i _bv2 = _mm512_and_si512(_mm512_srli_epi32(_Sv2,8), _maskv);

            // s[z] = S[z] & 0xff;
            __m512i _sv1 = _mm512_and_si512(_Sv1, _mm512_set1_epi32(0xff));
            __m512i _sv2 = _mm512_and_si512(_Sv2, _mm512_set1_epi32(0xff));

            // A maximum frequency of 4096 doesn't fit in our s3 array,
            // as it's 12 bit + 12 bit + 8 bit.  It wraps around to zero.
            // (We don't have this issue for TOTFREQ_O1_FAST.)
            //
            // Solution 1 is to change the spec to forbid freq of 4096.
            // Easy hack is to add an extra symbol so it sums correctly.
            // => 572 MB/s on q40 (deskpro).
            //
            // Solution 2 implemented here is to look for the wrap around
            // and fix it.
            // => 556 MB/s on q40
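            // Scalar equivalent of the fix below:
            //     f = S >> (TF_SHIFT_O1+8);
            //     if (!f) f = TOTFREQ_O1;  // 4096 wrapped to 0 in 12 bits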
            // Cope with max freq of 4096: only a ~3% speed hit.
            __m512i max_freq = _mm512_set1_epi32(TOTFREQ_O1);
            __m512i zero = _mm512_setzero_si512();
            __mmask16 cmp1 = _mm512_cmpeq_epi32_mask(_fv1, zero);
            __mmask16 cmp2 = _mm512_cmpeq_epi32_mask(_fv2, zero);
            _fv1 = _mm512_mask_blend_epi32(cmp1, _fv1, max_freq);
            _fv2 = _mm512_mask_blend_epi32(cmp2, _fv2, max_freq);

            // R[z] = f[z] * (R[z] >> TF_SHIFT_O1) + b[z];
            _Rv1 = _mm512_add_epi32(
                       _mm512_mullo_epi32(
                           _mm512_srli_epi32(_Rv1,TF_SHIFT_O1), _fv1), _bv1);
            _Rv2 = _mm512_add_epi32(
                       _mm512_mullo_epi32(
                           _mm512_srli_epi32(_Rv2,TF_SHIFT_O1), _fv2), _bv2);

            //for (z = 0; z < NX; z++) lN[z] = c[z];
            _Lv1 = _sv1;
            _Lv2 = _sv2;

            // RansDecRenorm(&R[z], &ptr);
            // Unsigned R < RANS_BYTE_L done as a signed compare after
            // flipping the sign bit of both sides.
            __m512i _renorm_mask1 = _mm512_xor_si512(_Rv1,
                                        _mm512_set1_epi32(0x80000000));
            __m512i _renorm_mask2 = _mm512_xor_si512(_Rv2,
                                        _mm512_set1_epi32(0x80000000));
            int _imask1 = _mm512_cmpgt_epi32_mask(
                    _mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask1);
            int _imask2 = _mm512_cmpgt_epi32_mask(
                    _mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask2);

            __m512i renorm_words1 = _mm512_cvtepu16_epi32(
                _mm256_loadu_si256((const __m256i *)sp));
            sp += _mm_popcnt_u32(_imask1);
            __m512i renorm_words2 = _mm512_cvtepu16_epi32(
                _mm256_loadu_si256((const __m256i *)sp));
            sp += _mm_popcnt_u32(_imask2);

            __m512i _renorm_vals1 =
                _mm512_maskz_expand_epi32(_imask1, renorm_words1);
            __m512i _renorm_vals2 =
                _mm512_maskz_expand_epi32(_imask2, renorm_words2);

            _Rv1 = _mm512_mask_slli_epi32(_Rv1, _imask1, _Rv1, 16);
            _Rv2 = _mm512_mask_slli_epi32(_Rv2, _imask2, _Rv2, 16);

            _Rv1 = _mm512_add_epi32(_Rv1, _renorm_vals1);
            _Rv2 = _mm512_add_epi32(_Rv2, _renorm_vals2);

#ifdef TBUF8
            _mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][0]),
                             _mm512_cvtepi32_epi8(_Sv1)); // or _sv1?
            _mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][2]),
                             _mm512_cvtepi32_epi8(_Sv2));
#else
            _mm512_storeu_si512((__m512i *)(&tbuf[tidx][ 0]), _sv1);
            _mm512_storeu_si512((__m512i *)(&tbuf[tidx][16]), _sv2);
#endif

            iN[0]++;
            if (++tidx == 32) {
                iN[0] -= 32;

                // We have tbuf[x][y] which we want to store in
                // memory as out[y][x] instead.  This is an unrolled
                // transposition.
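                // (Each tbuf row holds one byte per stream; after the
                // transpose each stream appends 32 consecutive bytes at
                // iN[z], which then advances by 32.)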
#ifdef TBUF8
                transpose_and_copy(out, iN, u.tbuf);
#else
                transpose_and_copy_avx512(out, iN, tbuf);
#endif
                tidx = 0;
            }
        }
        isz4 += 64;

        STORE512(_Rv, R);
        STORE512(_Lv, lN);
        ptr = (uint8_t *)sp;

        if (1) {
            iN[0] -= tidx;
            int T;
            for (z = 0; z < NX; z++)
                for (T = 0; T < tidx; T++)
#ifdef TBUF8
                    out[iN[z]++] = u.tbuf[T][z];
#else
                    out[iN[z]++] = tbuf[T][z];
#endif
        }

        // Scalar version for close to the end of in[] array so we don't
        // do SIMD loads beyond the end of the buffer
        for (; iN[0] < isz4;) {
            for (z = 0; z < NX; z++) {
                uint32_t m = R[z] & ((1u<<TF_SHIFT_O1)-1);
                uint32_t S = s3[lN[z]][m];
                unsigned char c = S & 0xff;
                out[iN[z]++] = c;
                uint32_t F = S>>(TF_SHIFT_O1+8);
                R[z] = (F?F:4096) * (R[z]>>TF_SHIFT_O1) +
                    ((S>>8) & ((1u<<TF_SHIFT_O1)-1));
                RansDecRenormSafe(&R[z], &ptr, ptr_end);
                lN[z] = c;
            }
        }

        // Remainder
        z = NX-1;
        for (; iN[z] < out_sz; ) {
            uint32_t m = R[z] & ((1u<<TF_SHIFT_O1)-1);
            uint32_t S = s3[lN[z]][m];
            unsigned char c = S & 0xff;
            out[iN[z]++] = c;
            uint32_t F = S>>(TF_SHIFT_O1+8);
            R[z] = (F?F:4096) * (R[z]>>TF_SHIFT_O1) +
                ((S>>8) & ((1u<<TF_SHIFT_O1)-1));
            RansDecRenormSafe(&R[z], &ptr, ptr_end);
            lN[z] = c;
        }
    } else {
        // TF_SHIFT_O1_FAST
        isz4 -= 64;
        for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) {
            // m[z] = R[z] & mask;
            __m512i _masked1 = _mm512_and_si512(_Rv1, _maskv);
            __m512i _masked2 = _mm512_and_si512(_Rv2, _maskv);

            //  S[z] = s3[lN[z]][m[z]];
            _Lv1 = _mm512_slli_epi32(_Lv1, TF_SHIFT_O1_FAST);
            _Lv2 = _mm512_slli_epi32(_Lv2, TF_SHIFT_O1_FAST);
            _masked1 = _mm512_add_epi32(_masked1, _Lv1);
            _masked2 = _mm512_add_epi32(_masked2, _Lv2);

            // This is the biggest bottleneck
            __m512i _Sv1 = _mm512_i32gather_epi32x(_masked1,
                               (int *)&s3F[0][0], sizeof(s3F[0][0]));
            __m512i _Sv2 = _mm512_i32gather_epi32x(_masked2,
                               (int *)&s3F[0][0], sizeof(s3F[0][0]));

            // f[z] = S[z]>>(TF_SHIFT_O1_FAST+8);
            __m512i _fv1 = _mm512_srli_epi32(_Sv1, TF_SHIFT_O1_FAST+8);
            __m512i _fv2 = _mm512_srli_epi32(_Sv2, TF_SHIFT_O1_FAST+8);

            // b[z] = (S[z]>>8) & mask;
            __m512i _bv1 = _mm512_and_si512(_mm512_srli_epi32(_Sv1,8), _maskv);
            __m512i _bv2 = _mm512_and_si512(_mm512_srli_epi32(_Sv2,8), _maskv);

            // s[z] = S[z] & 0xff;
            __m512i _sv1 = _mm512_and_si512(_Sv1, _mm512_set1_epi32(0xff));
            __m512i _sv2 = _mm512_and_si512(_Sv2, _mm512_set1_epi32(0xff));

            // R[z] = f[z] * (R[z] >> TF_SHIFT_O1) + b[z];
            _Rv1 = _mm512_add_epi32(
                       _mm512_mullo_epi32(
                           _mm512_srli_epi32(_Rv1,TF_SHIFT_O1_FAST), _fv1),
                       _bv1);
            _Rv2 = _mm512_add_epi32(
                       _mm512_mullo_epi32(
                           _mm512_srli_epi32(_Rv2,TF_SHIFT_O1_FAST), _fv2),
                       _bv2);

            //for (z = 0; z < NX; z++) lN[z] = c[z];
            _Lv1 = _sv1;
            _Lv2 = _sv2;

            // RansDecRenorm(&R[z], &ptr);
            __m512i _renorm_mask1 = _mm512_xor_si512(_Rv1,
                                        _mm512_set1_epi32(0x80000000));
            __m512i _renorm_mask2 = _mm512_xor_si512(_Rv2,
                                        _mm512_set1_epi32(0x80000000));
            int _imask1 = _mm512_cmpgt_epi32_mask(
                    _mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask1);
            int _imask2 = _mm512_cmpgt_epi32_mask(
                    _mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask2);

            __m512i renorm_words1 = _mm512_cvtepu16_epi32(
                _mm256_loadu_si256((const __m256i *)sp));
            sp += _mm_popcnt_u32(_imask1);
            __m512i renorm_words2 = _mm512_cvtepu16_epi32(
                _mm256_loadu_si256((const __m256i *)sp));
            sp += _mm_popcnt_u32(_imask2);

            __m512i _renorm_vals1 =
                _mm512_maskz_expand_epi32(_imask1, renorm_words1);
            __m512i _renorm_vals2 =
                _mm512_maskz_expand_epi32(_imask2, renorm_words2);

            _Rv1 = _mm512_mask_slli_epi32(_Rv1, _imask1, _Rv1, 16);
            _Rv2 = _mm512_mask_slli_epi32(_Rv2, _imask2, _Rv2, 16);

            _Rv1 = _mm512_add_epi32(_Rv1, _renorm_vals1);
            _Rv2 = _mm512_add_epi32(_Rv2, _renorm_vals2);

#ifdef TBUF8
            _mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][0]),
                             _mm512_cvtepi32_epi8(_Sv1)); // or _sv1?
            _mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][2]),
                             _mm512_cvtepi32_epi8(_Sv2));
#else
            _mm512_storeu_si512((__m512i *)(&tbuf[tidx][ 0]), _sv1);
            _mm512_storeu_si512((__m512i *)(&tbuf[tidx][16]), _sv2);
#endif

            iN[0]++;
            if (++tidx == 32) {
                iN[0] -= 32;
#ifdef TBUF8
                transpose_and_copy(out, iN, u.tbuf);
#else
                transpose_and_copy_avx512(out, iN, tbuf);
#endif
                tidx = 0;
            }
        }
        isz4 += 64;

        STORE512(_Rv, R);
        STORE512(_Lv, lN);
        ptr = (uint8_t *)sp;

        if (1) {
            iN[0] -= tidx;
            int T;
            for (z = 0; z < NX; z++)
                for (T = 0; T < tidx; T++)
#ifdef TBUF8
                    out[iN[z]++] = u.tbuf[T][z];
#else
                    out[iN[z]++] = tbuf[T][z];
#endif
        }

        // Scalar version for close to the end of in[] array so we don't
        // do SIMD loads beyond the end of the buffer
        for (; iN[0] < isz4;) {
            for (z = 0; z < NX; z++) {
                uint32_t m = R[z] & ((1u<<TF_SHIFT_O1_FAST)-1);
                uint32_t S = s3F[lN[z]][m];
                unsigned char c = S & 0xff;
                out[iN[z]++] = c;
                R[z] = (S>>(TF_SHIFT_O1_FAST+8)) * (R[z]>>TF_SHIFT_O1_FAST) +
                    ((S>>8) & ((1u<<TF_SHIFT_O1_FAST)-1));
                RansDecRenormSafe(&R[z], &ptr, ptr_end);
                lN[z] = c;
            }
        }

        // Remainder
        z = NX-1;
        for (; iN[z] < out_sz; ) {
            uint32_t m = R[z] & ((1u<<TF_SHIFT_O1_FAST)-1);
            uint32_t S = s3F[lN[z]][m];
            unsigned char c = S & 0xff;
            out[iN[z]++] = c;
            R[z] = (S>>(TF_SHIFT_O1_FAST+8)) * (R[z]>>TF_SHIFT_O1_FAST) +
                ((S>>8) & ((1u<<TF_SHIFT_O1_FAST)-1));
            RansDecRenormSafe(&R[z], &ptr, ptr_end);
            lN[z] = c;
        }
    }

    htscodecs_tls_free(s3);
    return out;

 err:
    htscodecs_tls_free(s3);
    free(out_free);
    free(c_freq);
    return NULL;
}

#endif // defined(__x86_64__) && defined(HAVE_AVX512)