// FastBQSR/ext/htslib/htscodecs/htscodecs/rANS_static32x16pr_avx512.c
/*
* Copyright (c) 2017-2023 Genome Research Ltd.
* Author(s): James Bonfield
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
* Institute nor the names of its contributors may be used to endorse
* or promote products derived from this software without specific
* prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
* IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
* LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
 * This is an AVX512 implementation of the 32-way interleaved 16-bit rANS.
 * It contains both order-0 and order-1 implementations; the AVX2 code
 * may still be preferred where it is faster (e.g. Zen4 encoding).
 */
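/*
 * For reference, each of the 32 interleaved states follows the usual
 * scalar 16-bit rANS recurrence (a sketch only; the canonical routines
 * live in rANS_word.h):
 *
 *     // Decode one symbol from state R.  s3[] packs the frequency,
 *     // the offset within the symbol's frequency range, and the
 *     // symbol byte into a single 32-bit entry.
 *     uint32_t S = s3[R & ((1u<<TF_SHIFT)-1)];
 *     uint32_t f = S >> (TF_SHIFT+8), b = (S >> 8) & ((1u<<TF_SHIFT)-1);
 *     out[i] = S & 0xff;
 *     R = f * (R >> TF_SHIFT) + b;
 *     if (R < RANS_BYTE_L)            // renormalise from the
 *         R = (R << 16) | *sp++;      // 16-bit word stream
 *
 * The SIMD code below evaluates this recurrence over two 16-lane
 * vectors at a time.
 */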
#include "config.h"
#if defined(__x86_64__) && defined(HAVE_AVX512)
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <x86intrin.h>
#include "rANS_word.h"
#include "rANS_static4x16.h"
#define ROT32_SIMD
#include "rANS_static16_int.h"
#include "varint.h"
#include "utils.h"
#ifndef USE_GATHER
// Speeds with Downfall mitigation Off/On (Intel) and on AMD Zen4,
// as encode/decode pairs for -o4:
//
//          <------------ AVX512 ----------->   <------------- AVX2 ------------>
//          IntelOff  IntelOn   AMDZen4         IntelOff  IntelOn   AMDZen4
// gcc7     544/1673  562/1711  448/1818        649/1515  645/1525  875/1624
// gcc13    557/1672  576/1711  582/1761        630/1623  629/1652  866/1762
// clang10  541/1547  564/1630  807/1912        620/1456  637/1481  837/1606
// clang16  533/1431  555/1510  890/1611        629/1370  627/1406  996/1432
//
// Zen4 encode is particularly slow with gcc, but AVX2 encode is
// faster and we use that already.
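// The emulation below is a drop-in replacement for
// _mm512_i32gather_epi32, taking the same (index vector, base pointer,
// scale) arguments; the size argument is unused, as every call site
// gathers 32-bit elements.  Using sixteen scalar loads avoids the
// hardware gather instruction, which the Downfall microcode mitigation
// slows down considerably (hence the Off/On gaps in the table above).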
static inline __m512i _mm512_i32gather_epi32x(__m512i idx, void *v, int size) {
uint32_t *b = (uint32_t *)v;
#ifndef __clang__
volatile
#endif
int c[16] __attribute__((aligned(32)));
//_mm512_store_si512((__m512i *)c, idx); // equivalent, but slower
_mm256_store_si256((__m256i *)(c), _mm512_castsi512_si256(idx));
_mm256_store_si256((__m256i *)(c+8), _mm512_extracti64x4_epi64(idx, 1));
__m128i x0 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[0]]), b[c[1]], 1);
__m128i x1 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[2]]), b[c[3]], 1);
__m128i x2 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[4]]), b[c[5]], 1);
__m128i x3 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[6]]), b[c[7]], 1);
__m128i x01 = _mm_unpacklo_epi64(x0, x1);
__m128i x23 = _mm_unpacklo_epi64(x2, x3);
__m256i y0 =_mm256_castsi128_si256(x01);
__m128i x4 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[ 8]]), b[c[ 9]], 1);
__m128i x5 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[10]]), b[c[11]], 1);
__m128i x6 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[12]]), b[c[13]], 1);
__m128i x7 = _mm_insert_epi32(_mm_cvtsi32_si128(b[c[14]]), b[c[15]], 1);
__m128i x45 = _mm_unpacklo_epi64(x4, x5);
__m128i x67 = _mm_unpacklo_epi64(x6, x7);
__m256i y1 =_mm256_castsi128_si256(x45);
y0 = _mm256_inserti128_si256(y0, x23, 1);
y1 = _mm256_inserti128_si256(y1, x67, 1);
return _mm512_inserti64x4(_mm512_castsi256_si512(y0), y1, 1);
}
// 32-bit indices, 8-bit quantities into 32-bit lanes
static inline __m512i _mm512_i32gather_epi32x1(__m512i idx, void *v) {
uint8_t *b = (uint8_t *)v;
volatile int c[16] __attribute__((aligned(32)));
//_mm512_store_si512((__m512i *)c, idx); // equivalent, but slower
_mm256_store_si256((__m256i *)(c), _mm512_castsi512_si256(idx));
_mm256_store_si256((__m256i *)(c+8), _mm512_extracti64x4_epi64(idx, 1));
return _mm512_set_epi32(b[c[15]], b[c[14]], b[c[13]], b[c[12]],
b[c[11]], b[c[10]], b[c[9]], b[c[8]],
b[c[7]], b[c[6]], b[c[5]], b[c[4]],
b[c[3]], b[c[2]], b[c[1]], b[c[0]]);
}
#else
// real gathers
#define _mm512_i32gather_epi32x _mm512_i32gather_epi32
#endif
unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in,
unsigned int in_size,
unsigned char *out,
unsigned int *out_size) {
unsigned char *cp, *out_end;
RansEncSymbol syms[256];
RansState ransN[32] __attribute__((aligned(64)));
uint8_t* ptr;
uint32_t F[256+MAGIC] = {0};
int i, j, tab_size = 0, x, z;
// -20 for order/size/meta
uint32_t bound = rans_compress_bound_4x16(in_size,0)-20;
if (!out) {
*out_size = bound;
out = malloc(*out_size);
}
if (!out || bound > *out_size)
return NULL;
// If "out" isn't word aligned, tweak out_end/ptr to ensure it is.
// We already added more room in bound to allow for this.
if (((size_t)out)&1)
bound--;
ptr = out_end = out + bound;
if (in_size == 0)
goto empty;
// Compute statistics
if (hist8(in, in_size, F) < 0)
return NULL;
// Normalise so frequencies sum to a power of 2
uint32_t fsum = in_size;
uint32_t max_val = round2(fsum);
if (max_val > TOTFREQ)
max_val = TOTFREQ;
if (normalise_freq(F, fsum, max_val) < 0)
return NULL;
fsum=max_val;
cp = out;
cp += encode_freq(cp, F);
tab_size = cp-out;
//write(2, out+4, cp-(out+4));
if (normalise_freq(F, fsum, TOTFREQ) < 0)
return NULL;
// Encode statistics and build lookup tables for SIMD encoding.
uint32_t SB[256], SA[256], SD[256], SC[256];
for (x = j = 0; j < 256; j++) {
if (F[j]) {
RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT);
SB[j] = syms[j].x_max;
SA[j] = syms[j].rcp_freq;
SD[j] = (syms[j].cmpl_freq<<0) | ((syms[j].rcp_shift-32)<<16);
SC[j] = syms[j].bias;
x += F[j];
}
}
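    // A scalar sketch of the update these tables drive for symbol j
    // (cf. RansEncPutSymbol in rANS_word.h):
    //     if (R > SB[j]) {                    // SB = x_max: renormalise
    //         *--ptr16 = R & 0xffff;
    //         R >>= 16;
    //     }
    //     // q ~ R / freq, via the precomputed reciprocal SA = rcp_freq
    //     uint32_t q = ((uint64_t)R * SA[j]) >> syms[j].rcp_shift;
    //     R += SC[j] + q * (SD[j] & 0xffff);  // bias + q * cmpl_freq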
for (z = 0; z < 32; z++)
RansEncInit(&ransN[z]);
z = i = in_size&(32-1);
while (z-- > 0)
RansEncPutSymbol(&ransN[z], &ptr, &syms[in[in_size-(i-z)]]);
#define LOAD512(a,b) \
__m512i a##1 = _mm512_load_si512((__m512i *)&b[0]); \
__m512i a##2 = _mm512_load_si512((__m512i *)&b[16]);
#define STORE512(a,b) \
    _mm512_store_si512((__m512i *)&b[0], a##1); \
    _mm512_store_si512((__m512i *)&b[16], a##2);
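// LOAD512/STORE512 move the 32 interleaved rANS states between memory
// and a pair of 512-bit registers (lanes 0-15 and 16-31).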
LOAD512(Rv, ransN);
uint16_t *ptr16 = (uint16_t *)ptr;
for (i=(in_size &~(32-1)); i>0; i-=32) {
uint8_t *c = &in[i-32];
// GATHER versions
// Much faster now we have an efficient loadu mechanism in place,
// BUT...
// Try this for avx2 variant too? Better way to populate the mm256
// regs for mix of avx2 and avx512 opcodes.
__m256i c12 = _mm256_loadu_si256((__m256i const *)c);
__m512i c1 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(c12,0));
__m512i c2 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(c12,1));
#define SET512(a,b) \
__m512i a##1 = _mm512_i32gather_epi32x(c1, b, 4); \
__m512i a##2 = _mm512_i32gather_epi32x(c2, b, 4)
SET512(xmax, SB);
uint16_t gt_mask1 = _mm512_cmpgt_epi32_mask(Rv1, xmax1);
int pc1 = _mm_popcnt_u32(gt_mask1);
__m512i Rp1 = _mm512_and_si512(Rv1, _mm512_set1_epi32(0xffff));
__m512i Rp2 = _mm512_and_si512(Rv2, _mm512_set1_epi32(0xffff));
uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask(Rv2, xmax2);
SET512(SDv, SD);
int pc2 = _mm_popcnt_u32(gt_mask2);
Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1);
Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2);
_mm512_mask_cvtepi32_storeu_epi16(ptr16-pc2, (1<<pc2)-1, Rp2);
ptr16 -= pc2;
_mm512_mask_cvtepi32_storeu_epi16(ptr16-pc1, (1<<pc1)-1, Rp1);
ptr16 -= pc1;
SET512(rfv, SA);
Rv1 = _mm512_mask_srli_epi32(Rv1, gt_mask1, Rv1, 16);
Rv2 = _mm512_mask_srli_epi32(Rv2, gt_mask2, Rv2, 16);
// interleaved form of this, helps on icc a bit
//rfv1 = _mm512_mulhi_epu32(Rv1, rfv1);
//rfv2 = _mm512_mulhi_epu32(Rv2, rfv2);
// Alternatives here:
// SHIFT right/left instead of AND: (very marginally slower)
// rf1_hm = _mm512_and_epi32(
// rf1_hm, _mm512_set1_epi64((uint64_t)0xffffffff00000000));
// vs
// rf1_hm = _mm512_srli_epi64(rf1_hm, 32);
// rf1_hm = _mm512_slli_epi64(rf1_hm, 32);
__m512i rf1_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv1, 32),
_mm512_srli_epi64(rfv1, 32));
__m512i rf2_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv2, 32),
_mm512_srli_epi64(rfv2, 32));
__m512i rf1_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv1, rfv1), 32);
__m512i rf2_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv2, rfv2), 32);
rf1_hm = _mm512_and_epi32(rf1_hm,
_mm512_set1_epi64((uint64_t)0xffffffff<<32));
rf2_hm = _mm512_and_epi32(rf2_hm,
_mm512_set1_epi64((uint64_t)0xffffffff<<32));
rfv1 = _mm512_or_epi32(rf1_lm, rf1_hm);
rfv2 = _mm512_or_epi32(rf2_lm, rf2_hm);
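        // Net effect: rfv = mulhi_epu32(Rv, rfv).  AVX-512F has no
        // 32-bit mulhi instruction, so the high halves of the products
        // are formed separately for the even and odd lanes with
        // _mm512_mul_epu32 and then recombined.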
// Or a pure masked blend approach, sadly slower.
// rfv1 = _mm512_mask_blend_epi32(0x5555,
// _mm512_mul_epu32(
// _mm512_srli_epi64(Rv1, 32),
// _mm512_srli_epi64(rfv1, 32)),
// _mm512_srli_epi64(
// _mm512_mul_epu32(Rv1, rfv1),
// 32));
// rfv2 = _mm512_mask_blend_epi32(0xaaaa,
// _mm512_srli_epi64(
// _mm512_mul_epu32(Rv2, rfv2),
// 32),
// _mm512_mul_epu32(
// _mm512_srli_epi64(Rv2, 32),
// _mm512_srli_epi64(rfv2, 32)));
SET512(biasv, SC);
__m512i shiftv1 = _mm512_srli_epi32(SDv1, 16);
__m512i shiftv2 = _mm512_srli_epi32(SDv2, 16);
__m512i qv1 = _mm512_srlv_epi32(rfv1, shiftv1);
__m512i qv2 = _mm512_srlv_epi32(rfv2, shiftv2);
qv1 = _mm512_mullo_epi32(qv1,
_mm512_and_si512(SDv1,_mm512_set1_epi32(0xffff)));
qv1 = _mm512_add_epi32(qv1, biasv1);
Rv1 = _mm512_add_epi32(Rv1, qv1);
qv2 = _mm512_mullo_epi32(qv2,
_mm512_and_si512(SDv2, _mm512_set1_epi32(0xffff)));
qv2 = _mm512_add_epi32(qv2, biasv2);
Rv2 = _mm512_add_epi32(Rv2, qv2);
}
ptr = (uint8_t *)ptr16;
STORE512(Rv, ransN);
for (z = 32-1; z >= 0; z--)
RansEncFlush(&ransN[z], &ptr);
empty:
// Finalise block size and return it
*out_size = (out_end - ptr) + tab_size;
memmove(out + tab_size, ptr, out_end-ptr);
return out;
}
unsigned char *rans_uncompress_O0_32x16_avx512(unsigned char *in,
unsigned int in_size,
unsigned char *out,
unsigned int out_sz) {
if (in_size < 32*4) // 32-states at least
return NULL;
if (out_sz >= INT_MAX)
return NULL; // protect against some overflow cases
/* Load in the static tables */
unsigned char *cp = in, *out_free = NULL;
unsigned char *cp_end = in + in_size;
int i;
uint32_t s3[TOTFREQ] __attribute__((aligned(64))); // For TF_SHIFT <= 12
if (!out)
out_free = out = malloc(out_sz);
if (!out)
return NULL;
// Precompute reverse lookup of frequency.
uint32_t F[256] = {0}, fsum;
int fsz = decode_freq(cp, cp_end, F, &fsum);
if (!fsz)
goto err;
cp += fsz;
normalise_freq_shift(F, fsum, TOTFREQ);
// Build symbols; fixme, do as part of decode, see the _d variant
if (rans_F_to_s3(F, TF_SHIFT, s3))
goto err;
if (cp_end - cp < 32 * 4)
goto err;
int z;
RansState Rv[32] __attribute__((aligned(64)));
for (z = 0; z < 32; z++) {
RansDecInit(&Rv[z], &cp);
if (Rv[z] < RANS_BYTE_L)
goto err;
}
uint16_t *sp = (uint16_t *)cp;
int out_end = (out_sz&~(32-1));
const uint32_t mask = (1u << TF_SHIFT)-1;
__m512i maskv = _mm512_set1_epi32(mask); // set mask in all lanes
__m512i R1 = _mm512_load_epi32(&Rv[0]);
__m512i R2 = _mm512_load_epi32(&Rv[16]);
    // Start of the first loop iteration; for subsequent cycles this work
    // is moved to the end of the loop body so some of the instruction
    // latency can be hidden.
__m512i masked1 = _mm512_and_epi32(R1, maskv);
__m512i masked2 = _mm512_and_epi32(R2, maskv);
__m512i S1 = _mm512_i32gather_epi32x(masked1, (int *)s3, sizeof(*s3));
__m512i S2 = _mm512_i32gather_epi32x(masked2, (int *)s3, sizeof(*s3));
uint8_t overflow[64+64] = {0};
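    // Each iteration may read up to 64 bytes at sp (two 32-byte vector
    // loads), so once sp comes within 64 bytes of the end of the input
    // we copy the remainder into this oversized scratch buffer and
    // continue reading from there, keeping all loads in bounds.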
for (i=0; i < out_end; i+=32) {
//for (z = 0; z < 16; z++) {
// Protect against running off the end of in buffer.
// We copy it to a worst-case local buffer when near the end.
if ((uint8_t *)sp+64 > cp_end) {
memmove(overflow, sp, cp_end - (uint8_t *)sp);
sp = (uint16_t *)overflow;
cp_end = overflow + sizeof(overflow);
}
//uint32_t S = s3[R[z] & mask];
__m512i renorm_words1 = _mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)sp)); // next 16 words
//uint16_t f = S>>(TF_SHIFT+8), b = (S>>8) & mask;
__m512i f1 = _mm512_srli_epi32(S1, TF_SHIFT+8);
__m512i f2 = _mm512_srli_epi32(S2, TF_SHIFT+8);
__m512i b1 = _mm512_and_epi32(_mm512_srli_epi32(S1, 8), maskv);
__m512i b2 = _mm512_and_epi32(_mm512_srli_epi32(S2, 8), maskv);
//R[z] = f * (R[z] >> TF_SHIFT) + b;
// approx 10 cycle latency on mullo.
R1 = _mm512_add_epi32(
_mm512_mullo_epi32(
_mm512_srli_epi32(R1, TF_SHIFT), f1), b1);
__mmask16 renorm_mask1, renorm_mask2;
renorm_mask1=_mm512_cmplt_epu32_mask(R1, _mm512_set1_epi32(RANS_BYTE_L));
R2 = _mm512_add_epi32(
_mm512_mullo_epi32(
_mm512_srli_epi32(R2, TF_SHIFT), f2), b2);
// renorm. this is the interesting part:
renorm_mask2=_mm512_cmplt_epu32_mask(R2, _mm512_set1_epi32(RANS_BYTE_L));
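        // Scalar equivalent per lane z:
        //     if (R[z] < RANS_BYTE_L)
        //         R[z] = (R[z] << 16) | *sp++;
        // The masks mark which lanes need a new 16-bit word; the expand
        // below scatters the sequentially read words to just those lanes.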
// advance by however many words we actually read
sp += _mm_popcnt_u32(renorm_mask1);
__m512i renorm_words2 = _mm512_cvtepu16_epi32(_mm256_loadu_si256(
(const __m256i *)sp));
// select masked only
__m512i renorm_vals1, renorm_vals2;
renorm_vals1 = _mm512_mask_expand_epi32(R1, renorm_mask1, renorm_words1);
renorm_vals2 = _mm512_mask_expand_epi32(R2, renorm_mask2, renorm_words2);
        // For the start of the next loop iteration. This has been moved
        // here (and duplicated before the loop starts) so that the gather
        // latency can be overlapped with useful work, such as finishing
        // the renorm offsets and writing the results.
__m512i S1_ = S1; // temporary copy for use in out[]=S later
__m512i S2_ = S2;
masked1 = _mm512_and_epi32(renorm_vals1, maskv);
S1 = _mm512_i32gather_epi32x(masked1, (int *)s3, sizeof(*s3));
masked2 = _mm512_and_epi32(renorm_vals2, maskv);
S2 = _mm512_i32gather_epi32x(masked2, (int *)s3, sizeof(*s3));
R1 = _mm512_mask_slli_epi32(R1, renorm_mask1, R1, 16);
R2 = _mm512_mask_slli_epi32(R2, renorm_mask2, R2, 16);
__m512i m16 = _mm512_set1_epi32(0xffff);
renorm_vals1 = _mm512_maskz_and_epi32(renorm_mask1, renorm_vals1, m16);
renorm_vals2 = _mm512_maskz_and_epi32(renorm_mask2, renorm_vals2, m16);
// advance by however many words we actually read
sp += _mm_popcnt_u32(renorm_mask2);
R1 = _mm512_add_epi32(R1, renorm_vals1);
R2 = _mm512_add_epi32(R2, renorm_vals2);
//out[i+z] = S;
_mm_storeu_si128((__m128i *)(out+i), _mm512_cvtepi32_epi8(S1_));
_mm_storeu_si128((__m128i *)(out+i+16), _mm512_cvtepi32_epi8(S2_));
}
_mm512_store_epi32(&Rv[ 0], R1);
_mm512_store_epi32(&Rv[16], R2);
for (z = out_sz & (32-1); z-- > 0; )
out[out_end + z] = s3[Rv[z] & mask];
return out;
err:
free(out_free);
return NULL;
}
#define TBUF8
#ifdef TBUF8
// 15% quicker overall O1 decode now due to rot32_simd below.
// NB: This uses AVX2 though and we could rewrite using AVX512 for
// further speed gains.
static inline void transpose_and_copy(uint8_t *out, int iN[32],
uint8_t t[32][32]) {
// int z;
// for (z = 0; z < 32; z++) {
// int k;
// for (k = 0; k < 32; k++)
// out[iN[z]+k] = t[k][z];
// iN[z] += 32;
// }
rot32_simd(t, out, iN);
}
#else
// Implemented using AVX512 gathers.
// This is faster than a naive scalar implementation, but doesn't beat the
// AVX2 vectorised 32x32 transpose function.
static inline void transpose_and_copy_avx512(uint8_t *out, int iN[32],
uint32_t t32[32][32]) {
int z;
// for (z = 0; z < 32; z++) {
// int k;
// for (k = 0; k < 32; k++)
// out[iN[z]+k] = t32[k][z];
// iN[z] += 32;
// }
__m512i v1 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8,
7, 6, 5, 4, 3, 2, 1, 0);
v1 = _mm512_slli_epi32(v1, 5);
for (z = 0; z < 32; z++) {
__m512i t1 = _mm512_i32gather_epi32x(v1, &t32[ 0][z], 4);
__m512i t2 = _mm512_i32gather_epi32x(v1, &t32[16][z], 4);
_mm_storeu_si128((__m128i*)(&out[iN[z] ]), _mm512_cvtepi32_epi8(t1));
_mm_storeu_si128((__m128i*)(&out[iN[z]+16]), _mm512_cvtepi32_epi8(t2));
iN[z] += 32;
}
}
#endif // TBUF8
unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
unsigned int in_size,
unsigned char *out,
unsigned int *out_size) {
unsigned char *cp, *out_end, *out_free = NULL;
unsigned int tab_size;
uint32_t bound = rans_compress_bound_4x16(in_size,1)-20;
int z;
RansState ransN[32] __attribute__((aligned(64)));
if (in_size < 32) // force O0 instead
return NULL;
    if (!out) {
        *out_size = bound;
        out_free = out = malloc(*out_size);
    }
if (!out || bound > *out_size)
return NULL;
if (((size_t)out)&1)
bound--;
out_end = out + bound;
RansEncSymbol (*syms)[256] = htscodecs_tls_alloc(256 * (sizeof(*syms)));
if (!syms) {
free(out_free);
return NULL;
}
cp = out;
int shift = encode_freq1(in, in_size, 32, syms, &cp);
if (shift < 0) {
free(out_free);
htscodecs_tls_free(syms);
return NULL;
}
tab_size = cp - out;
for (z = 0; z < 32; z++)
RansEncInit(&ransN[z]);
uint8_t* ptr = out_end;
int iN[32] __attribute__((aligned(64)));
int isz4 = in_size/32;
for (z = 0; z < 32; z++)
iN[z] = (z+1)*isz4-2;
uint32_t lN[32] __attribute__((aligned(64)));
for (z = 0; z < 32; z++)
lN[z] = in[iN[z]+1];
// Deal with the remainder
z = 32-1;
lN[z] = in[in_size-1];
for (iN[z] = in_size-2; iN[z] > 32*isz4-2; iN[z]--) {
unsigned char c = in[iN[z]];
RansEncPutSymbol(&ransN[z], &ptr, &syms[c][lN[z]]);
lN[z] = c;
}
LOAD512(Rv, ransN);
uint16_t *ptr16 = (uint16_t *)ptr;
LOAD512(iN, iN);
LOAD512(last, lN);
__m512i c1 = _mm512_i32gather_epi32x1(iN1, in);
__m512i c2 = _mm512_i32gather_epi32x1(iN2, in);
    // We cache the next 64 bytes locally and transpose.
    // This means we can load 32 ints from t32[x] with plain load
    // instructions instead of gathers. The copy, transpose and expand
    // are easier done in scalar code.
#define BATCH 64
uint8_t t32[BATCH][32] __attribute__((aligned(64)));
int next_batch;
if (iN[0] > BATCH) {
int i, j;
for (i = 0; i < BATCH; i++)
// memcpy(c[i], &in[iN[i]-32], 32); fast mode
for (j = 0; j < 32; j++)
t32[BATCH-1-i][j] = in[iN[j]-1-i];
next_batch = BATCH;
} else {
next_batch = -1;
}
for (; iN[0] >= 0; iN[0]--) {
// Note, consider doing the same approach for the AVX2 encoder.
// Maybe we can also get gather working well there?
// Gather here is still a major latency bottleneck, consuming
// around 40% of CPU cycles overall.
        // FIXME: maybe we need to cope with in[31] read overflow
        // on loop cycles 0, 1, 2 where gather reads 32-bits instead of
        // 8 bits. Use set instead there on c2?
// index into syms[0][0] array, used for x_max, rcp_freq, and bias
__m512i vidx1 = _mm512_slli_epi32(c1, 8);
__m512i vidx2 = _mm512_slli_epi32(c2, 8);
vidx1 = _mm512_add_epi32(vidx1, last1);
vidx2 = _mm512_add_epi32(vidx2, last2);
vidx1 = _mm512_slli_epi32(vidx1, 2);
vidx2 = _mm512_slli_epi32(vidx2, 2);
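        // vidx = ((c<<8) + last) << 2, combined with the gather scale
        // of 4, gives a 16-byte stride (sizeof(RansEncSymbol)), so each
        // lane addresses the chosen member of syms[c][last].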
// ------------------------------------------------------------
// for (z = NX-1; z >= 0; z--) {
// if (ransN[z] >= x_max[z]) {
// *--ptr16 = ransN[z] & 0xffff;
// ransN[z] >>= 16;
// }
// }
#define SET512x(a,x) \
__m512i a##1 = _mm512_i32gather_epi32x(vidx1, &syms[0][0].x, 4); \
__m512i a##2 = _mm512_i32gather_epi32x(vidx2, &syms[0][0].x, 4)
// Start of next loop, moved here to remove latency.
// last[z] = c[z]
// iN[z]--
// c[z] = in[iN[z]]
last1 = c1;
last2 = c2;
iN1 = _mm512_sub_epi32(iN1, _mm512_set1_epi32(1));
iN2 = _mm512_sub_epi32(iN2, _mm512_set1_epi32(1));
// Code below is equivalent to this:
// c1 = _mm512_i32gather_epi32(iN1, in, 1);
// c2 = _mm512_i32gather_epi32(iN2, in, 1);
// Better when we have a power of 2
if (next_batch >= 0) {
if (--next_batch < 0 && iN[0] > BATCH) {
                // Load 32 BATCH-byte blocks of data.
                // Executed once every BATCH cycles.
int i, j;
uint8_t c[32][BATCH];
iN[0] += BATCH;
for (j = 0; j < 32; j++) {
iN[j] -= BATCH;
memcpy(c[j], &in[iN[j]-BATCH], BATCH);
}
// transpose matrix from 32xBATCH to BATCHx32
for (j = 0; j < 32; j++) {
for (i = 0; i < BATCH; i+=16) {
int z;
for (z = 0; z < 16; z++)
t32[i+z][j] = c[j][i+z];
}
}
next_batch = BATCH-1;
}
if (next_batch >= 0) {
            // Copy from one of our pre-loaded BATCHx32 tables.
            // Executed every cycle.
__m128i c1_ = _mm_load_si128((__m128i *)&t32[next_batch][0]);
__m128i c2_ = _mm_load_si128((__m128i *)&t32[next_batch][16]);
c1 = _mm512_cvtepu8_epi32(c1_);
c2 = _mm512_cvtepu8_epi32(c2_);
}
}
if (next_batch < 0 && iN[0]) {
            // No pre-loaded data, as we are within BATCHx32 of the input end
c1 = _mm512_i32gather_epi32x1(iN1, in);
c2 = _mm512_i32gather_epi32x1(iN2, in);
// Speeds up clang, even though not needed any more.
// Harmless to leave here.
c1 = _mm512_and_si512(c1, _mm512_set1_epi32(0xff));
c2 = _mm512_and_si512(c2, _mm512_set1_epi32(0xff));
}
// End of "equivalent to" code block
SET512x(xmax, x_max); // high latency
uint16_t gt_mask1 = _mm512_cmpgt_epi32_mask(Rv1, xmax1);
int pc1 = _mm_popcnt_u32(gt_mask1);
__m512i Rp1 = _mm512_and_si512(Rv1, _mm512_set1_epi32(0xffff));
__m512i Rp2 = _mm512_and_si512(Rv2, _mm512_set1_epi32(0xffff));
uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask(Rv2, xmax2);
SET512x(SDv, cmpl_freq); // good
int pc2 = _mm_popcnt_u32(gt_mask2);
Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1);
Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2);
_mm512_mask_cvtepi32_storeu_epi16(ptr16-pc2, (1<<pc2)-1, Rp2);
ptr16 -= pc2;
_mm512_mask_cvtepi32_storeu_epi16(ptr16-pc1, (1<<pc1)-1, Rp1);
ptr16 -= pc1;
Rv1 = _mm512_mask_srli_epi32(Rv1, gt_mask1, Rv1, 16);
Rv2 = _mm512_mask_srli_epi32(Rv2, gt_mask2, Rv2, 16);
// ------------------------------------------------------------
// uint32_t q = (uint32_t) (((uint64_t)ransN[z] * rcp_freq[z])
// >> rcp_shift[z]);
// ransN[z] = ransN[z] + bias[z] + q * cmpl_freq[z];
SET512x(rfv, rcp_freq); // good-ish
__m512i rf1_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv1, 32),
_mm512_srli_epi64(rfv1, 32));
__m512i rf2_hm = _mm512_mul_epu32(_mm512_srli_epi64(Rv2, 32),
_mm512_srli_epi64(rfv2, 32));
__m512i rf1_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv1, rfv1), 32);
__m512i rf2_lm = _mm512_srli_epi64(_mm512_mul_epu32(Rv2, rfv2), 32);
const __m512i top32 = _mm512_set1_epi64((uint64_t)0xffffffff00000000);
rf1_hm = _mm512_and_epi32(rf1_hm, top32);
rf2_hm = _mm512_and_epi32(rf2_hm, top32);
rfv1 = _mm512_or_epi32(rf1_lm, rf1_hm);
rfv2 = _mm512_or_epi32(rf2_lm, rf2_hm);
SET512x(biasv, bias); // good
__m512i shiftv1 = _mm512_srli_epi32(SDv1, 16);
__m512i shiftv2 = _mm512_srli_epi32(SDv2, 16);
shiftv1 = _mm512_sub_epi32(shiftv1, _mm512_set1_epi32(32));
shiftv2 = _mm512_sub_epi32(shiftv2, _mm512_set1_epi32(32));
__m512i qv1 = _mm512_srlv_epi32(rfv1, shiftv1);
__m512i qv2 = _mm512_srlv_epi32(rfv2, shiftv2);
const __m512i bot16 = _mm512_set1_epi32(0xffff);
qv1 = _mm512_mullo_epi32(qv1, _mm512_and_si512(SDv1, bot16));
qv2 = _mm512_mullo_epi32(qv2, _mm512_and_si512(SDv2, bot16));
qv1 = _mm512_add_epi32(qv1, biasv1);
Rv1 = _mm512_add_epi32(Rv1, qv1);
qv2 = _mm512_add_epi32(qv2, biasv2);
Rv2 = _mm512_add_epi32(Rv2, qv2);
}
STORE512(Rv, ransN);
STORE512(last, lN);
ptr = (uint8_t *)ptr16;
for (z = 32-1; z>=0; z--)
RansEncPutSymbol(&ransN[z], &ptr, &syms[0][lN[z]]);
for (z = 32-1; z >= 0; z--)
RansEncFlush(&ransN[z], &ptr);
// Finalise block size and return it
*out_size = (out_end - ptr) + tab_size;
// cp = out;
// *cp++ = (in_size>> 0) & 0xff;
// *cp++ = (in_size>> 8) & 0xff;
// *cp++ = (in_size>>16) & 0xff;
// *cp++ = (in_size>>24) & 0xff;
memmove(out + tab_size, ptr, out_end-ptr);
htscodecs_tls_free(syms);
return out;
}
#define NX 32
unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
unsigned int in_size,
unsigned char *out,
unsigned int out_sz) {
    if (in_size < NX*4) // NX states at least
return NULL;
if (out_sz >= INT_MAX)
return NULL; // protect against some overflow cases
/* Load in the static tables */
unsigned char *cp = in, *cp_end = in+in_size, *out_free = NULL;
unsigned char *c_freq = NULL;
uint32_t (*s3)[TOTFREQ_O1] = htscodecs_tls_alloc(256*TOTFREQ_O1*4);
if (!s3)
return NULL;
uint32_t (*s3F)[TOTFREQ_O1_FAST] = (uint32_t (*)[TOTFREQ_O1_FAST])s3;
if (!out)
out_free = out = malloc(out_sz);
if (!out)
goto err;
//fprintf(stderr, "out_sz=%d\n", out_sz);
// compressed header? If so uncompress it
unsigned char *tab_end = NULL;
unsigned char *c_freq_end = cp_end;
unsigned int shift = *cp >> 4;
if (*cp++ & 1) {
uint32_t u_freq_sz, c_freq_sz;
cp += var_get_u32(cp, cp_end, &u_freq_sz);
cp += var_get_u32(cp, cp_end, &c_freq_sz);
if (c_freq_sz > cp_end - cp)
goto err;
tab_end = cp + c_freq_sz;
if (!(c_freq = rans_uncompress_O0_4x16(cp, c_freq_sz, NULL,
u_freq_sz)))
goto err;
cp = c_freq;
c_freq_end = c_freq + u_freq_sz;
}
    // Decode the order-0 symbol list; this avoids needing it in the
    // order-1 tables.
cp += decode_freq1(cp, c_freq_end, shift, s3, s3F, NULL, NULL);
if (tab_end)
cp = tab_end;
free(c_freq);
c_freq = NULL;
if (cp_end - cp < NX * 4)
goto err;
RansState R[NX] __attribute__((aligned(64)));
uint8_t *ptr = cp, *ptr_end = in + in_size;
int z;
for (z = 0; z < NX; z++) {
RansDecInit(&R[z], &ptr);
if (R[z] < RANS_BYTE_L)
goto err;
}
int isz4 = out_sz/NX;
int iN[NX], lN[NX] __attribute__((aligned(64))) = {0};
for (z = 0; z < NX; z++)
iN[z] = z*isz4;
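    // Lane z decodes bytes z*isz4 .. (z+1)*isz4-1 of the output, so the
    // NX states advance through it in parallel interleaved stripes.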
uint16_t *sp = (uint16_t *)ptr;
const uint32_t mask = (1u << shift)-1;
__m512i _maskv = _mm512_set1_epi32(mask);
LOAD512(_Rv, R);
LOAD512(_Lv, lN);
#ifdef TBUF8
union {
unsigned char tbuf[32][32];
uint64_t tbuf64[32][4];
} u __attribute__((aligned(32)));
#else
uint32_t tbuf[32][32];
#endif
unsigned int tidx = 0;
if (shift == TF_SHIFT_O1) {
isz4 -= 64;
for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) {
// m[z] = R[z] & mask;
__m512i _masked1 = _mm512_and_si512(_Rv1, _maskv);
__m512i _masked2 = _mm512_and_si512(_Rv2, _maskv);
// S[z] = s3[lN[z]][m[z]];
_Lv1 = _mm512_slli_epi32(_Lv1, TF_SHIFT_O1);
_Lv2 = _mm512_slli_epi32(_Lv2, TF_SHIFT_O1);
_masked1 = _mm512_add_epi32(_masked1, _Lv1);
_masked2 = _mm512_add_epi32(_masked2, _Lv2);
// This is the biggest bottleneck
__m512i _Sv1 = _mm512_i32gather_epi32x(_masked1, (int *)&s3F[0][0],
sizeof(s3F[0][0]));
__m512i _Sv2 = _mm512_i32gather_epi32x(_masked2, (int *)&s3F[0][0],
sizeof(s3F[0][0]));
// f[z] = S[z]>>(TF_SHIFT_O1+8);
__m512i _fv1 = _mm512_srli_epi32(_Sv1, TF_SHIFT_O1+8);
__m512i _fv2 = _mm512_srli_epi32(_Sv2, TF_SHIFT_O1+8);
// b[z] = (S[z]>>8) & mask;
__m512i _bv1 = _mm512_and_si512(_mm512_srli_epi32(_Sv1,8), _maskv);
__m512i _bv2 = _mm512_and_si512(_mm512_srli_epi32(_Sv2,8), _maskv);
// s[z] = S[z] & 0xff;
__m512i _sv1 = _mm512_and_si512(_Sv1, _mm512_set1_epi32(0xff));
__m512i _sv2 = _mm512_and_si512(_Sv2, _mm512_set1_epi32(0xff));
            // A maximum frequency of 4096 doesn't fit in our s3 array,
            // as that is 12 bit + 12 bit + 8 bit, so it wraps around
            // to zero.  (We don't have this issue for TOTFREQ_O1_FAST.)
            //
            // Solution 1 is to change the spec to forbid a freq of
            // 4096.  An easy hack is to add an extra symbol so it sums
            // correctly.
            // => 572 MB/s on q40 (deskpro).
            //
            // Solution 2, implemented here, is to look for the wrap
            // around and fix it.  This copes with a max freq of 4096
            // at only a 3% speed hit.
            // => 556 MB/s on q40
__m512i max_freq = _mm512_set1_epi32(TOTFREQ_O1);
__m512i zero = _mm512_setzero_si512();
__mmask16 cmp1 = _mm512_cmpeq_epi32_mask(_fv1, zero);
__mmask16 cmp2 = _mm512_cmpeq_epi32_mask(_fv2, zero);
_fv1 = _mm512_mask_blend_epi32(cmp1, _fv1, max_freq);
_fv2 = _mm512_mask_blend_epi32(cmp2, _fv2, max_freq);
// R[z] = f[z] * (R[z] >> TF_SHIFT_O1) + b[z];
_Rv1 = _mm512_add_epi32(
_mm512_mullo_epi32(
_mm512_srli_epi32(_Rv1,TF_SHIFT_O1), _fv1), _bv1);
_Rv2 = _mm512_add_epi32(
_mm512_mullo_epi32(
_mm512_srli_epi32(_Rv2,TF_SHIFT_O1), _fv2), _bv2);
//for (z = 0; z < NX; z++) lN[z] = c[z];
_Lv1 = _sv1;
_Lv2 = _sv2;
// RansDecRenorm(&R[z], &ptr);
__m512i _renorm_mask1 = _mm512_xor_si512(_Rv1,
_mm512_set1_epi32(0x80000000));
__m512i _renorm_mask2 = _mm512_xor_si512(_Rv2,
_mm512_set1_epi32(0x80000000));
int _imask1 =_mm512_cmpgt_epi32_mask
(_mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask1);
int _imask2 = _mm512_cmpgt_epi32_mask
(_mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask2);
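            // Biasing both sides by 0x80000000 (the XOR on R, the
            // subtraction on the constant) turns the unsigned test
            // R < RANS_BYTE_L into the signed compare
            //     RANS_BYTE_L - 0x80000000 > (int32_t)(R ^ 0x80000000).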
__m512i renorm_words1 = _mm512_cvtepu16_epi32
(_mm256_loadu_si256((const __m256i *)sp));
sp += _mm_popcnt_u32(_imask1);
__m512i renorm_words2 = _mm512_cvtepu16_epi32
(_mm256_loadu_si256((const __m256i *)sp));
sp += _mm_popcnt_u32(_imask2);
__m512i _renorm_vals1 =
_mm512_maskz_expand_epi32(_imask1, renorm_words1);
__m512i _renorm_vals2 =
_mm512_maskz_expand_epi32(_imask2, renorm_words2);
_Rv1 = _mm512_mask_slli_epi32(_Rv1, _imask1, _Rv1, 16);
_Rv2 = _mm512_mask_slli_epi32(_Rv2, _imask2, _Rv2, 16);
_Rv1 = _mm512_add_epi32(_Rv1, _renorm_vals1);
_Rv2 = _mm512_add_epi32(_Rv2, _renorm_vals2);
#ifdef TBUF8
_mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][0]),
_mm512_cvtepi32_epi8(_Sv1)); // or _sv1?
_mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][2]),
_mm512_cvtepi32_epi8(_Sv2));
#else
_mm512_storeu_si512((__m512i *)(&tbuf[tidx][ 0]), _sv1);
_mm512_storeu_si512((__m512i *)(&tbuf[tidx][16]), _sv2);
#endif
iN[0]++;
if (++tidx == 32) {
iN[0]-=32;
// We have tidx[x][y] which we want to store in
// memory in out[y][z] instead. This is an unrolled
// transposition.
#ifdef TBUF8
transpose_and_copy(out, iN, u.tbuf);
#else
transpose_and_copy_avx512(out, iN, tbuf);
#endif
tidx = 0;
}
}
isz4 += 64;
STORE512(_Rv, R);
STORE512(_Lv, lN);
ptr = (uint8_t *)sp;
if (1) {
iN[0]-=tidx;
int T;
for (z = 0; z < NX; z++)
for (T = 0; T < tidx; T++)
#ifdef TBUF8
out[iN[z]++] = u.tbuf[T][z];
#else
out[iN[z]++] = tbuf[T][z];
#endif
}
        // Scalar version for data close to the end of the in[] array, so
        // we don't do SIMD loads beyond the end of the buffer.
for (; iN[0] < isz4;) {
for (z = 0; z < NX; z++) {
uint32_t m = R[z] & ((1u<<TF_SHIFT_O1)-1);
uint32_t S = s3[lN[z]][m];
unsigned char c = S & 0xff;
out[iN[z]++] = c;
uint32_t F = S>>(TF_SHIFT_O1+8);
R[z] = (F?F:4096) * (R[z]>>TF_SHIFT_O1) +
((S>>8) & ((1u<<TF_SHIFT_O1)-1));
RansDecRenormSafe(&R[z], &ptr, ptr_end);
lN[z] = c;
}
}
// Remainder
z = NX-1;
for (; iN[z] < out_sz; ) {
uint32_t m = R[z] & ((1u<<TF_SHIFT_O1)-1);
uint32_t S = s3[lN[z]][m];
unsigned char c = S & 0xff;
out[iN[z]++] = c;
uint32_t F = S>>(TF_SHIFT_O1+8);
R[z] = (F?F:4096) * (R[z]>>TF_SHIFT_O1) +
((S>>8) & ((1u<<TF_SHIFT_O1)-1));
RansDecRenormSafe(&R[z], &ptr, ptr_end);
lN[z] = c;
}
} else {
// TF_SHIFT_O1_FAST. This is the most commonly used variant.
// SIMD version ends decoding early as it reads at most 64 bytes
// from input via 4 vectorised loads.
isz4 -= 64;
for (; iN[0] < isz4 && (uint8_t *)sp+64 < ptr_end; ) {
// m[z] = R[z] & mask;
__m512i _masked1 = _mm512_and_si512(_Rv1, _maskv);
__m512i _masked2 = _mm512_and_si512(_Rv2, _maskv);
// S[z] = s3[lN[z]][m[z]];
_Lv1 = _mm512_slli_epi32(_Lv1, TF_SHIFT_O1_FAST);
_Lv2 = _mm512_slli_epi32(_Lv2, TF_SHIFT_O1_FAST);
_masked1 = _mm512_add_epi32(_masked1, _Lv1);
_masked2 = _mm512_add_epi32(_masked2, _Lv2);
// This is the biggest bottleneck
__m512i _Sv1 = _mm512_i32gather_epi32x(_masked1, (int *)&s3F[0][0],
sizeof(s3F[0][0]));
__m512i _Sv2 = _mm512_i32gather_epi32x(_masked2, (int *)&s3F[0][0],
sizeof(s3F[0][0]));
            // f[z] = S[z]>>(TF_SHIFT_O1_FAST+8);
__m512i _fv1 = _mm512_srli_epi32(_Sv1, TF_SHIFT_O1_FAST+8);
__m512i _fv2 = _mm512_srli_epi32(_Sv2, TF_SHIFT_O1_FAST+8);
// b[z] = (S[z]>>8) & mask;
__m512i _bv1 = _mm512_and_si512(_mm512_srli_epi32(_Sv1,8), _maskv);
__m512i _bv2 = _mm512_and_si512(_mm512_srli_epi32(_Sv2,8), _maskv);
// s[z] = S[z] & 0xff;
__m512i _sv1 = _mm512_and_si512(_Sv1, _mm512_set1_epi32(0xff));
__m512i _sv2 = _mm512_and_si512(_Sv2, _mm512_set1_epi32(0xff));
            // R[z] = f[z] * (R[z] >> TF_SHIFT_O1_FAST) + b[z];
_Rv1 = _mm512_add_epi32(
_mm512_mullo_epi32(
_mm512_srli_epi32(_Rv1,TF_SHIFT_O1_FAST),
_fv1), _bv1);
_Rv2 = _mm512_add_epi32(
_mm512_mullo_epi32(
_mm512_srli_epi32(_Rv2,TF_SHIFT_O1_FAST),
_fv2), _bv2);
//for (z = 0; z < NX; z++) lN[z] = c[z];
_Lv1 = _sv1;
_Lv2 = _sv2;
// RansDecRenorm(&R[z], &ptr);
__m512i _renorm_mask1 = _mm512_xor_si512(_Rv1,
_mm512_set1_epi32(0x80000000));
__m512i _renorm_mask2 = _mm512_xor_si512(_Rv2,
_mm512_set1_epi32(0x80000000));
int _imask1 =_mm512_cmpgt_epi32_mask
(_mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask1);
int _imask2 = _mm512_cmpgt_epi32_mask
(_mm512_set1_epi32(RANS_BYTE_L-0x80000000), _renorm_mask2);
__m512i renorm_words1 = _mm512_cvtepu16_epi32
(_mm256_loadu_si256((const __m256i *)sp));
sp += _mm_popcnt_u32(_imask1);
__m512i renorm_words2 = _mm512_cvtepu16_epi32
(_mm256_loadu_si256((const __m256i *)sp));
sp += _mm_popcnt_u32(_imask2);
__m512i _renorm_vals1 =
_mm512_maskz_expand_epi32(_imask1, renorm_words1);
__m512i _renorm_vals2 =
_mm512_maskz_expand_epi32(_imask2, renorm_words2);
_Rv1 = _mm512_mask_slli_epi32(_Rv1, _imask1, _Rv1, 16);
_Rv2 = _mm512_mask_slli_epi32(_Rv2, _imask2, _Rv2, 16);
_Rv1 = _mm512_add_epi32(_Rv1, _renorm_vals1);
_Rv2 = _mm512_add_epi32(_Rv2, _renorm_vals2);
#ifdef TBUF8
_mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][0]),
_mm512_cvtepi32_epi8(_Sv1)); // or _sv1?
_mm_storeu_si128((__m128i *)(&u.tbuf64[tidx][2]),
_mm512_cvtepi32_epi8(_Sv2));
#else
_mm512_storeu_si512((__m512i *)(&tbuf[tidx][ 0]), _sv1);
_mm512_storeu_si512((__m512i *)(&tbuf[tidx][16]), _sv2);
#endif
iN[0]++;
if (++tidx == 32) {
iN[0]-=32;
#ifdef TBUF8
transpose_and_copy(out, iN, u.tbuf);
#else
transpose_and_copy_avx512(out, iN, tbuf);
#endif
tidx = 0;
}
}
isz4 += 64;
STORE512(_Rv, R);
STORE512(_Lv, lN);
ptr = (uint8_t *)sp;
if (1) {
iN[0]-=tidx;
int T;
for (z = 0; z < NX; z++)
for (T = 0; T < tidx; T++)
#ifdef TBUF8
out[iN[z]++] = u.tbuf[T][z];
#else
out[iN[z]++] = tbuf[T][z];
#endif
}
        // Scalar version for data close to the end of the in[] array, so
        // we don't do SIMD loads beyond the end of the buffer.
for (; iN[0] < isz4;) {
for (z = 0; z < NX; z++) {
uint32_t m = R[z] & ((1u<<TF_SHIFT_O1_FAST)-1);
uint32_t S = s3F[lN[z]][m];
unsigned char c = S & 0xff;
out[iN[z]++] = c;
R[z] = (S>>(TF_SHIFT_O1_FAST+8)) * (R[z]>>TF_SHIFT_O1_FAST) +
((S>>8) & ((1u<<TF_SHIFT_O1_FAST)-1));
RansDecRenormSafe(&R[z], &ptr, ptr_end);
lN[z] = c;
}
}
// Remainder
z = NX-1;
for (; iN[z] < out_sz; ) {
uint32_t m = R[z] & ((1u<<TF_SHIFT_O1_FAST)-1);
uint32_t S = s3F[lN[z]][m];
unsigned char c = S & 0xff;
out[iN[z]++] = c;
R[z] = (S>>(TF_SHIFT_O1_FAST+8)) * (R[z]>>TF_SHIFT_O1_FAST) +
((S>>8) & ((1u<<TF_SHIFT_O1_FAST)-1));
RansDecRenormSafe(&R[z], &ptr, ptr_end);
lN[z] = c;
}
}
htscodecs_tls_free(s3);
return out;
err:
htscodecs_tls_free(s3);
free(out_free);
free(c_freq);
return NULL;
}
#else // HAVE_AVX512
// Prevent "empty translation unit" errors when building without AVX512
const char *rANS_static32x16pr_avx512_disabled = "No AVX512";
#endif // HAVE_AVX512