/*
 * Source path: FastDup/ext/htslib/htscodecs/htscodecs/rANS_static.c
 * Listing metadata: 851 lines, 24 KiB, C
 */

/*
* Copyright (c) 2014-2022 Genome Research Ltd.
* Author(s): James Bonfield
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
* Institute nor the names of its contributors may be used to endorse
* or promote products derived from this software without specific
* prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
* IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
* LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
// Number of bits of precision used for the normalised symbol frequencies.
// Defined before rANS_byte.h is included below.
// Use 11 for order-1?
#define TF_SHIFT 12
// Size of the normalised frequency range: 1<<TF_SHIFT slots.
#define TOTFREQ (1<<TF_SHIFT)
#include "rANS_byte.h"
#include "utils.h"
/*-------------------------------------------------------------------------- */
/*
* Example wrapper to use the rans_byte.h functions included above.
*
* This demonstrates how to use, and unroll, an order-0 and order-1 frequency
* model.
*/
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
#include <string.h>
#include <limits.h>
#include <sys/time.h>
#ifndef NO_THREADS
#include <pthread.h>
#endif
#include "rANS_static.h"
// Absolute value of a. This is a function-like macro that evaluates its
// argument more than once, so do not pass expressions with side effects.
#define ABS(a) ((a)>0?(a):-(a))
/*-----------------------------------------------------------------------------
* Memory to memory compression functions.
*
* These are original versions without any manual loop unrolling. They
* are easier to understand, but can be up to 2x slower.
*/
/*
 * Order-0 rANS encoder: compresses in[0..in_size-1] into a buffer obtained
 * from malloc().
 *
 * Layout of the returned buffer, as written below:
 *   byte 0       order marker (0)
 *   bytes 1-4    little-endian (*out_size - 9)
 *   bytes 5-8    little-endian in_size
 *   bytes 9...   frequency table, terminated by a 0 byte
 *   then         the byte stream flushed from four interleaved rANS states
 *
 * in        Input bytes.
 * in_size   Number of input bytes.
 * out_size  Receives the total number of bytes used in the returned buffer.
 *
 * Returns the malloc()ed buffer, or NULL if the allocation or hist8() fails.
 */
static
unsigned char *rans_compress_O0(unsigned char *in, unsigned int in_size,
unsigned int *out_size) {
// Working buffer: about 5% growth over the input plus a fixed allowance
// (257*257*3 is also the bound the order-1 encoder asserts for its table)
// and the 9 byte header.
unsigned char *out_buf = malloc(1.05*in_size + 257*257*3 + 9);
unsigned char *cp, *out_end;
RansEncSymbol syms[256];
RansState rans0;
RansState rans2;
RansState rans1;
RansState rans3;
uint8_t* ptr;
// NOTE(review): MAGIC is defined outside this file; presumably padding
// required by hist8() - confirm.
int F[256+MAGIC] = {0}, i, j, tab_size, rle, x, fsum = 0;
int m = 0, M = 0;
uint64_t tr;
if (!out_buf)
return NULL;
// ptr starts at the end of the buffer. Once encoding is finished the
// compressed bytes occupy [ptr, out_end) and are moved down to follow the
// frequency table (see the memmove at the bottom).
ptr = out_end = out_buf + (uint32_t)(1.05*in_size) + 257*257*3 + 9;
// Compute statistics
if (hist8(in, in_size, (uint32_t *)F) < 0) {
free(out_buf);
return NULL;
}
// Scale factor with 31 fractional bits, roughly TOTFREQ/in_size; it is
// applied below as (F[j]*tr)>>31.
tr = in_size ? ((uint64_t)TOTFREQ<<31)/in_size + (1<<30)/in_size : 0;
normalise_harder:
// Rescale every non-zero count, never letting one drop to zero, and
// remember the symbol M that held the largest count when visited.
// The adjustment after the loop leaves the frequencies summing to
// TOTFREQ-1.
for (fsum = m = M = j = 0; j < 256; j++) {
if (!F[j])
continue;
if (m < F[j])
m = F[j], M = j;
if ((F[j] = (F[j]*tr)>>31) == 0)
F[j] = 1;
fsum += F[j];
}
// Counting one extra here is what makes the real sum TOTFREQ-1.
fsum++;
if (fsum < TOTFREQ) {
// Give the shortfall to the most frequent symbol.
F[M] += TOTFREQ-fsum;
} else if (fsum-TOTFREQ > F[M]/2) {
// Corner case to avoid excessive frequency reduction
tr = 2104533975; goto normalise_harder; // equiv to *0.98.
} else {
// Take the excess from the most frequent symbol.
F[M] -= fsum-TOTFREQ;
}
//printf("F[%d]=%d\n", M, F[M]);
assert(F[M]>0);
// Encode statistics.
// Table format, per symbol with a non-zero frequency:
//   [symbol byte [run length byte]] freq
// The symbol byte is omitted while inside a run of consecutive symbols.
// A run length byte follows the symbol byte when the previous symbol is
// also present. freq is one byte if < 128, otherwise two bytes with the
// top bit of the first set. A 0 byte terminates the table.
cp = out_buf+9;
for (x = rle = j = 0; j < 256; j++) {
if (F[j]) {
// Symbol value j (implicit while a run is in progress)
if (rle) {
rle--;
} else {
*cp++ = j;
if (!rle && j && F[j-1]) {
// Count how many symbols directly after j are also present.
for(rle=j+1; rle<256 && F[rle]; rle++)
;
rle -= j+1;
*cp++ = rle;
}
//fprintf(stderr, "%d: %d %d\n", j, rle, N[j]);
}
// Frequency F[j], in 1 or 2 bytes
if (F[j]<128) {
*cp++ = F[j];
} else {
*cp++ = 128 | (F[j]>>8);
*cp++ = F[j]&0xff;
}
// x is the cumulative frequency of all earlier symbols.
RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT);
x += F[j];
}
}
*cp++ = 0;
//write(2, out_buf+4, cp-(out_buf+4));
// Header plus table length; the rANS payload is placed right after it.
tab_size = cp-out_buf;
RansEncInit(&rans0);
RansEncInit(&rans1);
RansEncInit(&rans2);
RansEncInit(&rans3);
// The input is consumed from its last byte towards its first. Start with
// the in_size&3 trailing bytes: the byte at offset 4k+n within the input
// always goes to state n.
switch (i=(in_size&3)) {
case 3: RansEncPutSymbol(&rans2, &ptr, &syms[in[in_size-(i-2)]]);
// fall-through
case 2: RansEncPutSymbol(&rans1, &ptr, &syms[in[in_size-(i-1)]]);
// fall-through
case 1: RansEncPutSymbol(&rans0, &ptr, &syms[in[in_size-(i-0)]]);
// fall-through
case 0:
break;
}
// Remaining bytes in groups of four, still walking backwards.
for (i=(in_size &~3); likely(i>0); i-=4) {
RansEncSymbol *s3 = &syms[in[i-1]];
RansEncSymbol *s2 = &syms[in[i-2]];
RansEncSymbol *s1 = &syms[in[i-3]];
RansEncSymbol *s0 = &syms[in[i-4]];
RansEncPutSymbol(&rans3, &ptr, s3);
RansEncPutSymbol(&rans2, &ptr, s2);
RansEncPutSymbol(&rans1, &ptr, s1);
RansEncPutSymbol(&rans0, &ptr, s0);
}
// Flush in the order 3,2,1,0; the decoder initialises its states in the
// order 0,1,2,3.
RansEncFlush(&rans3, &ptr);
RansEncFlush(&rans2, &ptr);
RansEncFlush(&rans1, &ptr);
RansEncFlush(&rans0, &ptr);
// Finalise block size and return it
*out_size = (out_end - ptr) + tab_size;
cp = out_buf;
*cp++ = 0; // order
// Compressed size excluding the 9 byte header, little-endian
*cp++ = ((*out_size-9)>> 0) & 0xff;
*cp++ = ((*out_size-9)>> 8) & 0xff;
*cp++ = ((*out_size-9)>>16) & 0xff;
*cp++ = ((*out_size-9)>>24) & 0xff;
// Uncompressed size, little-endian
*cp++ = (in_size>> 0) & 0xff;
*cp++ = (in_size>> 8) & 0xff;
*cp++ = (in_size>>16) & 0xff;
*cp++ = (in_size>>24) & 0xff;
// Close the gap between the table and the payload.
memmove(out_buf + tab_size, ptr, out_end-ptr);
return out_buf;
}
/*
 * Reverse lookup table used by the order-1 decoder, one per context.
 * R[] is indexed by the low TF_SHIFT bits of an rANS state and holds the
 * symbol whose cumulative frequency range covers that slot.
 */
typedef struct {
unsigned char R[TOTFREQ];
} ari_decoder;
/*
 * Order-0 rANS decoder for a block whose first byte (the order marker) is 0.
 *
 * The 9 byte header is validated first: the stored compressed size must
 * equal in_size-9 and the stored uncompressed size must be below INT_MAX.
 * The frequency table is then parsed into per-slot lookup arrays and the
 * data is decoded with four interleaved rANS states.
 *
 * in        Compressed block, starting at the order byte.
 * in_size   Number of bytes in the block.
 * out_size  Receives the uncompressed size on success; not written on failure.
 *
 * Returns a malloc()ed buffer holding the decoded bytes, or NULL if the
 * input is rejected or the allocation fails.
 */
static
unsigned char *rans_uncompress_O0(unsigned char *in, unsigned int in_size,
unsigned int *out_size) {
/* Load in the static tables */
unsigned char *cp = in + 9;
unsigned char *cp_end = in + in_size;
const uint32_t mask = (1u << TF_SHIFT)-1;
int i, j, rle;
unsigned int x, y;
unsigned int out_sz, in_sz;
char *out_buf;
RansState R[4];
RansState m[4];
// Per-slot lookup tables indexed by (state & mask): the frequency of the
// symbol covering the slot, the symbol itself, and the slot's offset from
// the start of that symbol's range.
uint16_t sfreq[TOTFREQ+32];
uint16_t ssym [TOTFREQ+32]; // faster, but only needs uint8_t
uint32_t sbase[TOTFREQ+16]; // faster, but only needs uint16_t
if (in_size < 26) // Need at least this many bytes just to start
return NULL;
if (*in++ != 0) // Order-0 check
return NULL;
// in now points just past the order byte: 4 bytes of compressed size
// followed by 4 bytes of uncompressed size, both little-endian.
in_sz = ((in[0])<<0) | ((in[1])<<8) | ((in[2])<<16) | (((uint32_t)in[3])<<24);
out_sz = ((in[4])<<0) | ((in[5])<<8) | ((in[6])<<16) | (((uint32_t)in[7])<<24);
if (in_sz != in_size-9)
return NULL;
if (out_sz >= INT_MAX)
return NULL; // protect against some overflow cases
// For speeding up the fuzzer only.
// Small input can lead to large uncompressed data.
// We reject this as it just slows things up instead of testing more code
// paths (once we've verified a few times for large data).
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
if (out_sz > 100000)
return NULL;
#endif
out_buf = malloc(out_sz);
if (!out_buf)
return NULL;
//fprintf(stderr, "out_sz=%d\n", out_sz);
// Precompute reverse lookup of frequency.
// x is the running cumulative frequency; j is the current symbol and rle
// counts the implicit consecutive symbols still to come.
rle = x = y = 0;
j = *cp++;
do {
int F, C;
if (cp > cp_end - 16) goto cleanup; // Not enough input bytes left
// Frequency: one byte if < 128, otherwise the low 7 bits of the first
// byte are the high part of a two byte value.
if ((F = *cp++) >= 128) {
F &= ~128;
F = ((F & 127) << 8) | *cp++;
}
C = x;
// Reject tables whose frequencies overrun the TOTFREQ slots.
if (x + F > TOTFREQ)
goto cleanup;
for (y = 0; y < F; y++) {
ssym [y + C] = j;
sfreq[y + C] = F;
sbase[y + C] = y;
}
x += F;
// Next symbol: either the start of a run (symbol byte equal to j+1,
// followed by a run length), the next implicit symbol of a run in
// progress, or an explicit symbol byte. A 0 symbol ends the table.
if (!rle && j+1 == *cp) {
j = *cp++;
rle = *cp++;
} else if (rle) {
rle--;
j++;
if (j > 255)
goto cleanup;
} else {
j = *cp++;
}
} while(j);
// Frequencies must sum to TOTFREQ-1 or TOTFREQ.
if (x < TOTFREQ-1 || x > TOTFREQ)
goto cleanup;
if (x != TOTFREQ) {
// Protection against accessing uninitialised memory in the case
// where SUM(freqs) == 4095 and not 4096.
ssym [x] = ssym [x-1];
sfreq[x] = sfreq[x-1];
sbase[x] = sbase[x-1]+1;
}
// 16 bytes of cp here. Also why cp - 16 in above loop.
if (cp > cp_end - 16) goto cleanup; // Not enough input bytes left
RansDecInit(&R[0], &cp); if (R[0] < RANS_BYTE_L) goto cleanup;
RansDecInit(&R[1], &cp); if (R[1] < RANS_BYTE_L) goto cleanup;
RansDecInit(&R[2], &cp); if (R[2] < RANS_BYTE_L) goto cleanup;
RansDecInit(&R[3], &cp); if (R[3] < RANS_BYTE_L) goto cleanup;
// Output length rounded down to a multiple of 4.
int out_end = (out_sz&~3);
cp_end -= 8; // within 8 for simplicity of loop below
// 2 x likely() here harms gcc 7.5 by about 8% rate drop, but only in O2
for (i=0; likely(i < out_end); i+=4) {
// /curr code
// gcc7 O2 513/497 562/556++ 556/547 ok
// gcc7 O3 566/552 569/553 581/563+
// gcc10 O2 544/538 563/547 541/537-?
// gcc10 O3 531/519 546/530 575/546+
// gcc11 O2 512/490 588/540 540/535 mid
// gcc11 O3 482/471 553/541 549/535
// gcc12 O2 533/526 544/534 539/535
// gcc12 O3 548/533 502/497-- 553/527 ok
// clang10 555/542 564/549 560/541
// clang13 560/553 572/559 556/559
// For each state: m is the slot, and the state advances to
// freq * (R >> TF_SHIFT) + (slot - start of the symbol's range).
m[0] = R[0] & mask;
R[0] = sfreq[m[0]] * (R[0] >> TF_SHIFT) + sbase[m[0]];
m[1] = R[1] & mask;
R[1] = sfreq[m[1]] * (R[1] >> TF_SHIFT) + sbase[m[1]];
m[2] = R[2] & mask;
R[2] = sfreq[m[2]] * (R[2] >> TF_SHIFT) + sbase[m[2]];
m[3] = R[3] & mask;
R[3] = sfreq[m[3]] * (R[3] >> TF_SHIFT) + sbase[m[3]];
// likely() here harms gcc12 -O3
// While more than 8 input bytes remain use RansDecRenorm2; near the
// end switch to RansDecRenormSafe, which is given the true end of
// input (cp_end+8).
if (cp<cp_end) {
RansDecRenorm2(&R[0], &R[1], &cp);
RansDecRenorm2(&R[2], &R[3], &cp);
} else {
RansDecRenormSafe(&R[0], &cp, cp_end+8);
RansDecRenormSafe(&R[1], &cp, cp_end+8);
RansDecRenormSafe(&R[2], &cp, cp_end+8);
RansDecRenormSafe(&R[3], &cp, cp_end+8);
}
out_buf[i+0] = ssym[m[0]];
out_buf[i+1] = ssym[m[1]];
out_buf[i+2] = ssym[m[2]];
out_buf[i+3] = ssym[m[3]];
}
// The last out_sz&3 symbols are read straight from states 0..2; the
// states do not need advancing afterwards.
switch(out_sz&3) {
case 3:
out_buf[out_end + 2] = ssym[R[2] & mask];
// fall-through
case 2:
out_buf[out_end + 1] = ssym[R[1] & mask];
// fall-through
case 1:
out_buf[out_end] = ssym[R[0] & mask];
// fall-through
default:
break;
}
*out_size = out_sz;
return (unsigned char *)out_buf;
cleanup:
// Error exit: release the output buffer and report failure.
free(out_buf);
return NULL;
}
/*
 * Order-1 rANS encoder: each byte is coded using frequencies conditioned on
 * the byte before it.
 *
 * Inputs shorter than 4 bytes are handed to rans_compress_O0() instead.
 * Otherwise the input is split into four quarters of in_size>>2 bytes, each
 * coded by its own rANS state; any leftover bytes are coded by the fourth
 * state. The first byte of every quarter is coded in context 0.
 *
 * Layout of the returned buffer, as written below:
 *   byte 0       order marker (1)
 *   bytes 1-4    little-endian (*out_size - 9)
 *   bytes 5-8    little-endian in_size
 *   bytes 9...   per-context frequency tables, terminated by a 0 byte
 *   then         the byte stream flushed from the four rANS states
 *
 * in        Input bytes.
 * in_size   Number of input bytes.
 * out_size  Receives the total number of bytes used in the returned buffer.
 *
 * Returns a malloc()ed buffer, or NULL if an allocation or hist1_4() fails.
 */
static
unsigned char *rans_compress_O1(unsigned char *in, unsigned int in_size,
unsigned int *out_size) {
unsigned char *out_buf = NULL, *out_end, *cp;
unsigned int tab_size, rle_i, rle_j;
if (in_size < 4)
return rans_compress_O0(in, in_size, out_size);
// F[ctx][sym] holds counts and later normalised frequencies;
// syms[ctx][sym] holds the matching encoder symbols. Both live in one
// htscodecs_tls_alloc() block: syms first, F after it.
int (*F)[256];
RansEncSymbol (*syms)[256];
uint8_t *mem = htscodecs_tls_alloc(256 * (sizeof(*syms) + sizeof(*F)));
if (!mem)
return NULL;
syms = (RansEncSymbol (*)[256])mem;
F = (int (*)[256])(mem + 256*sizeof(*syms));
memset(F, 0, 256*sizeof(*F));
// mem was checked above, so syms cannot be NULL here.
if (!syms) goto cleanup;
// T[ctx] is the total count seen in context ctx.
// NOTE(review): MAGIC is defined outside this file; presumably padding
// required by hist1_4() - confirm.
int T[256+MAGIC] = {0};
int i, j;
out_buf = malloc(1.05*in_size + 257*257*3 + 9);
if (!out_buf) goto cleanup;
out_end = out_buf + (uint32_t)(1.05*in_size) + 257*257*3 + 9;
cp = out_buf+9;
if (hist1_4(in, in_size, (uint32_t (*)[256])F, (uint32_t *)T) < 0) {
free(out_buf);
out_buf = NULL;
goto cleanup;
}
// The first bytes of quarters 1..3 are coded in context 0 (see the
// syms[0][...] calls after the main loop), so count them there.
// NOTE(review): the first byte of quarter 0 is presumably already counted
// by hist1_4() - confirm.
F[0][in[1*(in_size>>2)]]++;
F[0][in[2*(in_size>>2)]]++;
F[0][in[3*(in_size>>2)]]++;
T[0]+=3;
// Normalise each used context so its frequencies sum to TOTFREQ-1, then
// emit that context's table.
for (rle_i = i = 0; i < 256; i++) {
int t2, m, M;
unsigned int x;
if (T[i] == 0)
continue;
//uint64_t p = (TOTFREQ * TOTFREQ) / t;
double p = ((double)TOTFREQ)/T[i];
normalise_harder:
// Rescale every non-zero count, never letting one drop to zero, and
// remember the symbol M that held the largest count when visited.
for (t2 = m = M = j = 0; j < 256; j++) {
if (!F[i][j])
continue;
if (m < F[i][j])
m = F[i][j], M = j;
//if ((F[i][j] = (F[i][j] * p) / TOTFREQ) == 0)
if ((F[i][j] *= p) == 0)
F[i][j] = 1;
t2 += F[i][j];
}
// Counting one extra here is what makes the real sum TOTFREQ-1.
t2++;
if (t2 < TOTFREQ) {
F[i][M] += TOTFREQ-t2;
} else if (t2-TOTFREQ >= F[i][M]/2) {
// Corner case to avoid excessive frequency reduction
p = .98; goto normalise_harder;
} else {
F[i][M] -= t2-TOTFREQ;
}
// Store frequency table
// Context byte i (implicit while a run of contexts is in progress)
if (rle_i) {
rle_i--;
} else {
*cp++ = i;
// FIXME: could use order-0 statistics to observe which alphabet
// symbols are present and base RLE on that ordering instead.
if (i && T[i-1]) {
// Count how many contexts directly after i are also used.
for(rle_i=i+1; rle_i<256 && T[rle_i]; rle_i++)
;
rle_i -= i+1;
*cp++ = rle_i;
}
}
int *F_i_ = F[i];
x = 0;
rle_j = 0;
// Symbols within context i, written in the same
// [symbol [run length]] freq form as the order-0 table.
for (j = 0; j < 256; j++) {
if (F_i_[j]) {
//fprintf(stderr, "F[%d][%d]=%d, x=%d\n", i, j, F_i_[j], x);
// Symbol value j (implicit while a run is in progress)
if (rle_j) {
rle_j--;
} else {
*cp++ = j;
if (!rle_j && j && F_i_[j-1]) {
for(rle_j=j+1; rle_j<256 && F_i_[rle_j]; rle_j++)
;
rle_j -= j+1;
*cp++ = rle_j;
}
}
// Frequency F_i_[j]: one byte if < 128, otherwise two bytes
// with the top bit of the first set.
if (F_i_[j]<128) {
*cp++ = F_i_[j];
} else {
*cp++ = 128 | (F_i_[j]>>8);
*cp++ = F_i_[j]&0xff;
}
// x is the cumulative frequency of earlier symbols in context i.
RansEncSymbolInit(&syms[i][j], x, F_i_[j], TF_SHIFT);
x += F_i_[j];
}
}
// End of this context's symbol list
*cp++ = 0;
}
// End of the context list
*cp++ = 0;
//write(2, out_buf+4, cp-(out_buf+4));
tab_size = cp - out_buf;
assert(tab_size < 257*257*3);
RansState rans0, rans1, rans2, rans3;
RansEncInit(&rans0);
RansEncInit(&rans1);
RansEncInit(&rans2);
RansEncInit(&rans3);
// The compressed bytes end up in [ptr, out_end); see the memmove below.
uint8_t* ptr = out_end;
// Each quarter is walked from its end towards its start. iN indexes the
// context byte and lN holds the byte that follows it, i.e. the symbol to
// encode.
int isz4 = in_size>>2;
int i0 = 1*isz4-2;
int i1 = 2*isz4-2;
int i2 = 3*isz4-2;
int i3 = 4*isz4-2;
unsigned char l0 = in[i0+1];
unsigned char l1 = in[i1+1];
unsigned char l2 = in[i2+1];
unsigned char l3 = in[i3+1];
// Deal with the remainder
// Bytes beyond 4*isz4 are coded by rans3 only; when this loop finishes
// i3 and l3 are back in step with the other three streams.
l3 = in[in_size-1];
for (i3 = in_size-2; i3 > 4*isz4-2; i3--) {
unsigned char c3 = in[i3];
RansEncPutSymbol(&rans3, &ptr, &syms[c3][l3]);
l3 = c3;
}
for (; likely(i0 >= 0); i0--, i1--, i2--, i3--) {
unsigned char c3 = in[i3];
unsigned char c2 = in[i2];
unsigned char c1 = in[i1];
unsigned char c0 = in[i0];
RansEncSymbol *s3 = &syms[c3][l3];
RansEncSymbol *s2 = &syms[c2][l2];
RansEncSymbol *s1 = &syms[c1][l1];
RansEncSymbol *s0 = &syms[c0][l0];
RansEncPutSymbol4(&rans3, &rans2, &rans1, &rans0, &ptr,
s3, s2, s1, s0);
l3 = c3;
l2 = c2;
l1 = c1;
l0 = c0;
}
// First byte of every quarter, coded in context 0.
RansEncPutSymbol(&rans3, &ptr, &syms[0][l3]);
RansEncPutSymbol(&rans2, &ptr, &syms[0][l2]);
RansEncPutSymbol(&rans1, &ptr, &syms[0][l1]);
RansEncPutSymbol(&rans0, &ptr, &syms[0][l0]);
RansEncFlush(&rans3, &ptr);
RansEncFlush(&rans2, &ptr);
RansEncFlush(&rans1, &ptr);
RansEncFlush(&rans0, &ptr);
*out_size = (out_end - ptr) + tab_size;
cp = out_buf;
*cp++ = 1; // order
// Compressed size excluding the 9 byte header, little-endian
*cp++ = ((*out_size-9)>> 0) & 0xff;
*cp++ = ((*out_size-9)>> 8) & 0xff;
*cp++ = ((*out_size-9)>>16) & 0xff;
*cp++ = ((*out_size-9)>>24) & 0xff;
// Uncompressed size, little-endian
*cp++ = (in_size>> 0) & 0xff;
*cp++ = (in_size>> 8) & 0xff;
*cp++ = (in_size>>16) & 0xff;
*cp++ = (in_size>>24) & 0xff;
// Close the gap between the tables and the payload.
memmove(out_buf + tab_size, ptr, out_end-ptr);
cleanup:
// Shared exit for success and failure: syms is the start of the
// htscodecs_tls_alloc() block; out_buf is NULL on the failure paths.
htscodecs_tls_free(syms);
return out_buf;
}
/*
 * Order-1 rANS decoder for a block whose first byte (the order marker) is 1.
 *
 * The 9 byte header is validated first: the stored compressed size must
 * equal in_size-9 and the stored uncompressed size must be below INT_MAX.
 * The per-context frequency tables are then parsed and the data is decoded
 * as four interleaved streams, one per quarter of the output; any leftover
 * bytes belong to the fourth stream.
 *
 * Context bytes are remapped through map[] to small consecutive row numbers
 * in the order they are first seen, so both lookup tables (D and syms) are
 * indexed by map[context] and not by the raw context byte.
 *
 * in        Compressed block, starting at the order byte.
 * in_size   Number of bytes in the block.
 * out_size  Receives the uncompressed size on success; not written on failure.
 *
 * Returns a malloc()ed buffer holding the decoded bytes, or NULL if the
 * input is rejected or an allocation fails.
 */
static
unsigned char *rans_uncompress_O1(unsigned char *in, unsigned int in_size,
                                  unsigned int *out_size) {
    /* Load in the static tables */
    unsigned char *cp = in + 9;
    unsigned char *ptr_end = in + in_size;
    int i, j = -999, rle_i, rle_j;
    unsigned int x;
    unsigned int out_sz, in_sz;
    char *out_buf = NULL;

    // Sanity checking
    if (in_size < 27) // Need at least this many bytes to start
        return NULL;

    if (*in++ != 1) // Order-1 check
        return NULL;

    // in now points just past the order byte: 4 bytes of compressed size
    // followed by 4 bytes of uncompressed size, both little-endian.
    in_sz  = ((in[0])<<0) | ((in[1])<<8) | ((in[2])<<16) | (((uint32_t)in[3])<<24);
    out_sz = ((in[4])<<0) | ((in[5])<<8) | ((in[6])<<16) | (((uint32_t)in[7])<<24);
    if (in_sz != in_size-9)
        return NULL;

    if (out_sz >= INT_MAX)
        return NULL; // protect against some overflow cases

    // For speeding up the fuzzer only.
    // Small input can lead to large uncompressed data.
    // We reject this as it just slows things up instead of testing more code
    // paths (once we've verified a few times for large data).
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    if (out_sz > 100000)
        return NULL;
#endif

    // Allocate decoding lookup tables: 256 reverse lookup rows (D) followed
    // by 256 rows of decoder symbols (syms) in one zero-filled block.
    RansDecSymbol32 (*syms)[256];
    uint8_t *mem = htscodecs_tls_calloc(256, sizeof(ari_decoder)
                                        + sizeof(*syms));
    if (!mem)
        return NULL;
    ari_decoder *const D = (ari_decoder *)mem;
    syms = (RansDecSymbol32 (*)[256])(mem + 256*sizeof(ari_decoder));

    // map[byte] is the row assigned to that byte, or -1 if not yet seen.
    int16_t map[256], map_i = 0;

    memset(map, -1, 256*sizeof(*map));

    // mem was checked above, so D cannot be NULL here.
    if (!D) goto cleanup;
    /* These memsets prevent illegal memory access in syms due to
       broken compressed data.  As D is calloc'd, all illegal transitions
       will end up in either row or column 0 of syms. */
    memset(&syms[0], 0, sizeof(syms[0]));
    for (i = 0; i < 256; i++)
        memset(&syms[i][0], 0, sizeof(syms[0][0]));

    //fprintf(stderr, "out_sz=%d\n", out_sz);

    // Outer loop: one iteration per context byte i.
    // Inner loop: one iteration per symbol j present in that context.
    // Both lists use the same encoding: an explicit byte, optionally
    // followed by a run length when it continues directly from the previous
    // entry, and a 0 byte as terminator.
    rle_i = 0;
    i = *cp++;
    do {
        // Map arbitrary a,b,c to 0,1,2 to improve cache locality.
        if (map[i] == -1)
            map[i] = map_i++;
        int m_i = map[i];

        // x is the running cumulative frequency within this context.
        rle_j = x = 0;
        j = *cp++;
        do {
            if (map[j] == -1)
                map[j] = map_i++;

            int F, C;
            if (cp > ptr_end - 16) goto cleanup; // Not enough input bytes left
            // Frequency: one byte if < 128, otherwise the low 7 bits of the
            // first byte are the high part of a two byte value.
            if ((F = *cp++) >= 128) {
                F &= ~128;
                F = ((F & 127) << 8) | *cp++;
            }
            C = x;

            //fprintf(stderr, "i=%d j=%d F=%d C=%d\n", i, j, F, C);

            // A stored frequency of 0 stands for TOTFREQ.
            if (unlikely(!F))
                F = TOTFREQ;

            RansDecSymbolInit32(&syms[m_i][j], C, F);

            /* Build reverse lookup table */
            // Reject tables whose frequencies overrun the TOTFREQ slots.
            if (x + F > TOTFREQ)
                goto cleanup;

            memset(&D[m_i].R[x], j, F);
            x += F;

            if (!rle_j && j+1 == *cp) {
                j = *cp++;
                rle_j = *cp++;
            } else if (rle_j) {
                rle_j--;
                j++;
                if (j > 255)
                    goto cleanup;
            } else {
                j = *cp++;
            }
        } while(j);

        // Frequencies must sum to TOTFREQ-1 or TOTFREQ.
        if (x < TOTFREQ-1 || x > TOTFREQ)
            goto cleanup;
        // historically we fill 4095, not 4096
        // This context's table lives in row m_i (its remapped index), so the
        // final slot must be filled there. Using the raw context byte i as
        // the row would leave this row's last slot unset and could overwrite
        // the last slot of whichever context is mapped to row i.
        if (x < TOTFREQ)
            D[m_i].R[x] = D[m_i].R[x-1];

        if (!rle_i && i+1 == *cp) {
            i = *cp++;
            rle_i = *cp++;
        } else if (rle_i) {
            rle_i--;
            i++;
            if (i > 255)
                goto cleanup;
        } else {
            i = *cp++;
        }
    } while (i);

    // Bytes that never appeared in the tables fall back to row 0.
    for (i = 0; i < 256; i++)
        if (map[i] == -1)
            map[i] = 0;

    RansState rans0, rans1, rans2, rans3;
    uint8_t *ptr = cp;
    if (cp > ptr_end - 16) goto cleanup; // Not enough input bytes left
    RansDecInit(&rans0, &ptr); if (rans0 < RANS_BYTE_L) goto cleanup;
    RansDecInit(&rans1, &ptr); if (rans1 < RANS_BYTE_L) goto cleanup;
    RansDecInit(&rans2, &ptr); if (rans2 < RANS_BYTE_L) goto cleanup;
    RansDecInit(&rans3, &ptr); if (rans3 < RANS_BYTE_L) goto cleanup;

    RansState R[4];
    R[0] = rans0;
    R[1] = rans1;
    R[2] = rans2;
    R[3] = rans3;

    // Each stream fills one quarter of the output. lN is the row of the
    // previous symbol in stream N; every stream starts in context byte 0.
    unsigned int isz4 = out_sz>>2;
    uint32_t l0 = 0;
    uint32_t l1 = 0;
    uint32_t l2 = 0;
    uint32_t l3 = 0;

    unsigned int i4[] = {0*isz4, 1*isz4, 2*isz4, 3*isz4};

    /* Allocate output buffer */
    out_buf = malloc(out_sz);
    if (!out_buf) goto cleanup;

    // Look up the first symbol of each stream using the row of context 0.
    uint8_t cc0 = D[map[l0]].R[R[0] & ((1u << TF_SHIFT)-1)];
    uint8_t cc1 = D[map[l1]].R[R[1] & ((1u << TF_SHIFT)-1)];
    uint8_t cc2 = D[map[l2]].R[R[2] & ((1u << TF_SHIFT)-1)];
    uint8_t cc3 = D[map[l3]].R[R[3] & ((1u << TF_SHIFT)-1)];

    // Keep an 8 byte margin so the main loop can choose between the two
    // renormalisation routines with a single pointer comparison.
    ptr_end -= 8;
    for (; likely(i4[0] < isz4); i4[0]++, i4[1]++, i4[2]++, i4[3]++) {
        // seq4-head2: file q40b
        //          O3      O2
        // gcc7     296/291 290/260
        // gcc10    292/292 290/261
        // gcc11    293/293 290/265
        // gcc12    293/290 291/266
        // clang10  293/290 296/272
        // clang13  300/290 290/266
        out_buf[i4[0]] = cc0;
        out_buf[i4[1]] = cc1;
        out_buf[i4[2]] = cc2;
        out_buf[i4[3]] = cc3;

        RansDecSymbol32 s[4] = {
            syms[l0][cc0],
            syms[l1][cc1],
            syms[l2][cc2],
            syms[l3][cc3],
        };
        RansDecAdvanceStep(&R[0], s[0].start, s[0].freq, TF_SHIFT);
        RansDecAdvanceStep(&R[1], s[1].start, s[1].freq, TF_SHIFT);
        RansDecAdvanceStep(&R[2], s[2].start, s[2].freq, TF_SHIFT);
        RansDecAdvanceStep(&R[3], s[3].start, s[3].freq, TF_SHIFT);

        // Likely here helps speed of high-entropy data by 10-11%,
        // but harms low entropy-data speed by 3-4%.
        if ((ptr < ptr_end)) {
            RansDecRenorm2(&R[0], &R[1], &ptr);
            RansDecRenorm2(&R[2], &R[3], &ptr);
        } else {
            // Near the end of the input: pass the true end (ptr_end+8).
            RansDecRenormSafe(&R[0], &ptr, ptr_end+8);
            RansDecRenormSafe(&R[1], &ptr, ptr_end+8);
            RansDecRenormSafe(&R[2], &ptr, ptr_end+8);
            RansDecRenormSafe(&R[3], &ptr, ptr_end+8);
        }

        // The symbol just emitted becomes the context for the next one.
        l0 = map[cc0];
        l1 = map[cc1];
        l2 = map[cc2];
        l3 = map[cc3];

        cc0 = D[l0].R[R[0] & ((1u << TF_SHIFT)-1)];
        cc1 = D[l1].R[R[1] & ((1u << TF_SHIFT)-1)];
        cc2 = D[l2].R[R[2] & ((1u << TF_SHIFT)-1)];
        cc3 = D[l3].R[R[3] & ((1u << TF_SHIFT)-1)];
    }

    // Remainder: the bytes beyond 4*isz4 all come from the fourth stream.
    for (; i4[3] < out_sz; i4[3]++) {
        unsigned char c3 = D[l3].R[RansDecGet(&R[3], TF_SHIFT)];
        out_buf[i4[3]] = c3;

        uint32_t m = R[3] & ((1u << TF_SHIFT)-1);
        R[3] = syms[l3][c3].freq * (R[3]>>TF_SHIFT) + m - syms[l3][c3].start;
        RansDecRenormSafe(&R[3], &ptr, ptr_end+8);
        l3 = map[c3];
    }

    *out_size = out_sz;

 cleanup:
    // Shared exit for success and failure: D is the start of the
    // htscodecs_tls_calloc() block. out_buf is still NULL on every failure
    // path, since nothing jumps here once it has been allocated.
    htscodecs_tls_free(D);

    return (unsigned char *)out_buf;
}
/*-----------------------------------------------------------------------------
* Simple interface to the order-0 vs order-1 encoders and decoders.
*/
/*
 * Public entry point for compression.
 *
 * in        Input bytes.
 * in_size   Number of input bytes; values above INT_MAX are rejected.
 * out_size  Receives the compressed size, or 0 when in_size is rejected.
 * order     0 selects the order-0 encoder, any other value the order-1
 *           encoder.
 *
 * Returns the buffer produced by the selected encoder, or NULL on failure.
 */
unsigned char *rans_compress(unsigned char *in, unsigned int in_size,
                             unsigned int *out_size, int order) {
    if (in_size > INT_MAX) {
        *out_size = 0;
        return NULL;
    }

    if (order != 0)
        return rans_compress_O1(in, in_size, out_size);

    return rans_compress_O0(in, in_size, out_size);
}
/*
 * Public entry point for decompression. The first input byte selects the
 * decoder: 0 dispatches to the order-0 decoder, any other value to the
 * order-1 decoder.
 *
 * in        Compressed block, starting at the order byte.
 * in_size   Number of bytes in the block.
 * out_size  Passed through to the selected decoder.
 *
 * Returns the selected decoder's result, or NULL if in_size is below 9.
 */
unsigned char *rans_uncompress(unsigned char *in, unsigned int in_size,
                               unsigned int *out_size) {
    /* Neither decoder can run without the full 9 byte header. */
    if (in_size < 9)
        return NULL;

    if (in[0] == 0)
        return rans_uncompress_O0(in, in_size, out_size);

    return rans_uncompress_O1(in, in_size, out_size);
}