1729 lines
53 KiB
C
1729 lines
53 KiB
C
/*
|
|
* Copyright (c) 2017-2023 Genome Research Ltd.
|
|
* Author(s): James Bonfield
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above
|
|
* copyright notice, this list of conditions and the following
|
|
* disclaimer in the documentation and/or other materials provided
|
|
* with the distribution.
|
|
*
|
|
* 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
|
|
* Institute nor the names of its contributors may be used to endorse
|
|
* or promote products derived from this software without specific
|
|
* prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
|
|
* IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
|
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
|
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
|
|
* LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
// FIXME Can we get decoder to return the compressed sized read, avoiding
|
|
// us needing to store it? Yes we can. See c-size comments. If we added all these
|
|
// together we could get rans_uncompress_to_4x16 to return the number of bytes
|
|
// consumed, avoiding the calling code from needed to explicitly stored the size.
|
|
// However the effect on name tokeniser is to save 0.1 to 0.2% so not worth it.
|
|
|
|
/*-------------------------------------------------------------------------- */
|
|
/*
|
|
* Example wrapper to use the rans_byte.h functions included above.
|
|
*
|
|
* This demonstrates how to use, and unroll, an order-0 and order-1 frequency
|
|
* model.
|
|
*/
|
|
|
|
#include "config.h"
|
|
|
|
#include <stdint.h>
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <unistd.h>
|
|
#include <assert.h>
|
|
#include <string.h>
|
|
#include <sys/time.h>
|
|
#include <limits.h>
|
|
#include <math.h>
|
|
|
|
#ifndef NO_THREADS
|
|
#include <pthread.h>
|
|
#endif
|
|
|
|
#include "rANS_word.h"
|
|
#include "rANS_static4x16.h"
|
|
#include "rANS_static16_int.h"
|
|
#include "pack.h"
|
|
#include "rle.h"
|
|
#include "utils.h"
|
|
|
|
#define TF_SHIFT 12
|
|
#define TOTFREQ (1<<TF_SHIFT)
|
|
|
|
// 9-11 is considerably faster in the O1 variant due to reduced table size.
|
|
// We auto-tune between 10 and 12 though. Anywhere from 9 to 14 are viable.
|
|
#ifndef TF_SHIFT_O1
|
|
#define TF_SHIFT_O1 12
|
|
#endif
|
|
#ifndef TF_SHIFT_O1_FAST
|
|
#define TF_SHIFT_O1_FAST 10
|
|
#endif
|
|
#define TOTFREQ_O1 (1<<TF_SHIFT_O1)
|
|
#define TOTFREQ_O1_FAST (1<<TF_SHIFT_O1_FAST)
|
|
|
|
|
|
/*-----------------------------------------------------------------------------
|
|
* Memory to memory compression functions.
|
|
*
|
|
* These are original versions without any manual loop unrolling. They
|
|
* are easier to understand, but can be up to 2x slower.
|
|
*/
|
|
|
|
unsigned int rans_compress_bound_4x16(unsigned int size, int order) {
|
|
int N = (order>>8) & 0xff;
|
|
if (!N) N=4;
|
|
|
|
order &= 0xff;
|
|
unsigned int sz = (order == 0
|
|
? 1.05*size + 257*3 + 4
|
|
: 1.05*size + 257*257*3 + 4 + 257*3+4) +
|
|
((order & RANS_ORDER_PACK) ? 1 : 0) +
|
|
((order & RANS_ORDER_RLE) ? 1 + 257*3+4: 0) + 20 +
|
|
((order & RANS_ORDER_X32) ? (32-4)*4 : 0) +
|
|
((order & RANS_ORDER_STRIPE) ? 7 + 5*N: 0);
|
|
return sz + (sz&1) + 2; // make this even so buffers are word aligned
|
|
}
|
|
|
|
// Compresses in_size bytes from 'in' to *out_size bytes in 'out'.
|
|
//
|
|
// NB: The output buffer does not hold the original size, so it is up to
|
|
// the caller to store this.
|
|
unsigned char *rans_compress_O0_4x16(unsigned char *in, unsigned int in_size,
|
|
unsigned char *out, unsigned int *out_size) {
|
|
unsigned char *cp, *out_end;
|
|
RansEncSymbol syms[256];
|
|
RansState rans0;
|
|
RansState rans2;
|
|
RansState rans1;
|
|
RansState rans3;
|
|
uint8_t* ptr;
|
|
uint32_t F[256+MAGIC] = {0};
|
|
int i, j, tab_size = 0, rle, x;
|
|
// -20 for order/size/meta
|
|
uint32_t bound = rans_compress_bound_4x16(in_size,0)-20;
|
|
|
|
if (!out) {
|
|
*out_size = bound;
|
|
out = malloc(*out_size);
|
|
}
|
|
if (!out || bound > *out_size)
|
|
return NULL;
|
|
|
|
// If "out" isn't word aligned, tweak out_end/ptr to ensure it is.
|
|
// We already added more round in bound to allow for this.
|
|
if (((size_t)out)&1)
|
|
bound--;
|
|
ptr = out_end = out + bound;
|
|
|
|
if (in_size == 0)
|
|
goto empty;
|
|
|
|
// Compute statistics
|
|
if (hist8(in, in_size, F) < 0)
|
|
return NULL;
|
|
|
|
// Normalise so frequences sum to power of 2
|
|
uint32_t fsum = in_size;
|
|
uint32_t max_val = round2(fsum);
|
|
if (max_val > TOTFREQ)
|
|
max_val = TOTFREQ;
|
|
|
|
if (normalise_freq(F, fsum, max_val) < 0)
|
|
return NULL;
|
|
fsum=max_val;
|
|
|
|
cp = out;
|
|
cp += encode_freq(cp, F);
|
|
tab_size = cp-out;
|
|
//write(2, out+4, cp-(out+4));
|
|
|
|
if (normalise_freq(F, fsum, TOTFREQ) < 0)
|
|
return NULL;
|
|
|
|
// Encode statistics.
|
|
for (x = rle = j = 0; j < 256; j++) {
|
|
if (F[j]) {
|
|
RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT);
|
|
x += F[j];
|
|
}
|
|
}
|
|
|
|
RansEncInit(&rans0);
|
|
RansEncInit(&rans1);
|
|
RansEncInit(&rans2);
|
|
RansEncInit(&rans3);
|
|
|
|
switch (i=(in_size&3)) {
|
|
case 3: RansEncPutSymbol(&rans2, &ptr, &syms[in[in_size-(i-2)]]);
|
|
// fall-through
|
|
case 2: RansEncPutSymbol(&rans1, &ptr, &syms[in[in_size-(i-1)]]);
|
|
// fall-through
|
|
case 1: RansEncPutSymbol(&rans0, &ptr, &syms[in[in_size-(i-0)]]);
|
|
// fall-through
|
|
case 0:
|
|
break;
|
|
}
|
|
for (i=(in_size &~3); i>0; i-=4) {
|
|
RansEncSymbol *s3 = &syms[in[i-1]];
|
|
RansEncSymbol *s2 = &syms[in[i-2]];
|
|
RansEncSymbol *s1 = &syms[in[i-3]];
|
|
RansEncSymbol *s0 = &syms[in[i-4]];
|
|
|
|
#if 1
|
|
RansEncPutSymbol(&rans3, &ptr, s3);
|
|
RansEncPutSymbol(&rans2, &ptr, s2);
|
|
RansEncPutSymbol(&rans1, &ptr, s1);
|
|
RansEncPutSymbol(&rans0, &ptr, s0);
|
|
#else
|
|
// Slightly beter on gcc, much better on clang
|
|
uint16_t *ptr16 = (uint16_t *)ptr;
|
|
|
|
if (rans3 >= s3->x_max) *--ptr16 = (uint16_t)rans3, rans3 >>= 16;
|
|
if (rans2 >= s2->x_max) *--ptr16 = (uint16_t)rans2, rans2 >>= 16;
|
|
uint32_t q3 = (uint32_t) (((uint64_t)rans3 * s3->rcp_freq) >> s3->rcp_shift);
|
|
uint32_t q2 = (uint32_t) (((uint64_t)rans2 * s2->rcp_freq) >> s2->rcp_shift);
|
|
rans3 += s3->bias + q3 * s3->cmpl_freq;
|
|
rans2 += s2->bias + q2 * s2->cmpl_freq;
|
|
|
|
if (rans1 >= s1->x_max) *--ptr16 = (uint16_t)rans1, rans1 >>= 16;
|
|
if (rans0 >= s0->x_max) *--ptr16 = (uint16_t)rans0, rans0 >>= 16;
|
|
uint32_t q1 = (uint32_t) (((uint64_t)rans1 * s1->rcp_freq) >> s1->rcp_shift);
|
|
uint32_t q0 = (uint32_t) (((uint64_t)rans0 * s0->rcp_freq) >> s0->rcp_shift);
|
|
rans1 += s1->bias + q1 * s1->cmpl_freq;
|
|
rans0 += s0->bias + q0 * s0->cmpl_freq;
|
|
|
|
ptr = (uint8_t *)ptr16;
|
|
#endif
|
|
}
|
|
|
|
RansEncFlush(&rans3, &ptr);
|
|
RansEncFlush(&rans2, &ptr);
|
|
RansEncFlush(&rans1, &ptr);
|
|
RansEncFlush(&rans0, &ptr);
|
|
|
|
empty:
|
|
// Finalise block size and return it
|
|
*out_size = (out_end - ptr) + tab_size;
|
|
|
|
memmove(out + tab_size, ptr, out_end-ptr);
|
|
|
|
return out;
|
|
}
|
|
|
|
unsigned char *rans_uncompress_O0_4x16(unsigned char *in, unsigned int in_size,
|
|
unsigned char *out, unsigned int out_sz) {
|
|
if (in_size < 16) // 4-states at least
|
|
return NULL;
|
|
|
|
if (out_sz >= INT_MAX)
|
|
return NULL; // protect against some overflow cases
|
|
|
|
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
|
|
if (out_sz > 100000)
|
|
return NULL;
|
|
#endif
|
|
|
|
/* Load in the static tables */
|
|
unsigned char *cp = in, *out_free = NULL;
|
|
unsigned char *cp_end = in + in_size - 8; // within 8 => be extra safe
|
|
int i, j;
|
|
unsigned int x, y;
|
|
uint16_t sfreq[TOTFREQ+32];
|
|
uint16_t sbase[TOTFREQ+32]; // faster to use 32-bit on clang
|
|
uint8_t ssym [TOTFREQ+64]; // faster to use 16-bit on clang
|
|
|
|
if (!out)
|
|
out_free = out = malloc(out_sz);
|
|
if (!out)
|
|
return NULL;
|
|
|
|
// Precompute reverse lookup of frequency.
|
|
uint32_t F[256] = {0}, fsum;
|
|
int fsz = decode_freq(cp, cp_end, F, &fsum);
|
|
if (!fsz)
|
|
goto err;
|
|
cp += fsz;
|
|
|
|
normalise_freq_shift(F, fsum, TOTFREQ);
|
|
|
|
// Build symbols; fixme, do as part of decode, see the _d variant
|
|
for (j = x = 0; j < 256; j++) {
|
|
if (F[j]) {
|
|
if (F[j] > TOTFREQ - x)
|
|
goto err;
|
|
for (y = 0; y < F[j]; y++) {
|
|
ssym [y + x] = j;
|
|
sfreq[y + x] = F[j];
|
|
sbase[y + x] = y;
|
|
}
|
|
x += F[j];
|
|
}
|
|
}
|
|
|
|
if (x != TOTFREQ)
|
|
goto err;
|
|
|
|
if (cp+16 > cp_end+8)
|
|
goto err;
|
|
|
|
RansState R[4];
|
|
RansDecInit(&R[0], &cp); if (R[0] < RANS_BYTE_L) goto err;
|
|
RansDecInit(&R[1], &cp); if (R[1] < RANS_BYTE_L) goto err;
|
|
RansDecInit(&R[2], &cp); if (R[2] < RANS_BYTE_L) goto err;
|
|
RansDecInit(&R[3], &cp); if (R[3] < RANS_BYTE_L) goto err;
|
|
|
|
// Simple version is comparable to below, but only with -O3
|
|
//
|
|
// for (i = 0; cp < cp_end-8 && i < (out_sz&~7); i+=8) {
|
|
// for(j=0; j<8;j++) {
|
|
// RansState m = RansDecGet(&R[j%4], TF_SHIFT);
|
|
// R[j%4] = sfreq[m] * (R[j%4] >> TF_SHIFT) + sbase[m];
|
|
// out[i+j] = ssym[m];
|
|
// RansDecRenorm(&R[j%4], &cp);
|
|
// }
|
|
// }
|
|
|
|
for (i = 0; cp < cp_end-8 && i < (out_sz&~7); i+=8) {
|
|
for (j = 0; j < 8; j+=4) {
|
|
RansState m0 = RansDecGet(&R[0], TF_SHIFT);
|
|
RansState m1 = RansDecGet(&R[1], TF_SHIFT);
|
|
out[i+j+0] = ssym[m0];
|
|
out[i+j+1] = ssym[m1];
|
|
|
|
R[0] = sfreq[m0] * (R[0] >> TF_SHIFT) + sbase[m0];
|
|
R[1] = sfreq[m1] * (R[1] >> TF_SHIFT) + sbase[m1];
|
|
|
|
RansState m2 = RansDecGet(&R[2], TF_SHIFT);
|
|
RansState m3 = RansDecGet(&R[3], TF_SHIFT);
|
|
|
|
RansDecRenorm(&R[0], &cp);
|
|
RansDecRenorm(&R[1], &cp);
|
|
|
|
R[2] = sfreq[m2] * (R[2] >> TF_SHIFT) + sbase[m2];
|
|
R[3] = sfreq[m3] * (R[3] >> TF_SHIFT) + sbase[m3];
|
|
|
|
RansDecRenorm(&R[2], &cp);
|
|
RansDecRenorm(&R[3], &cp);
|
|
|
|
out[i+j+2] = ssym[m2];
|
|
out[i+j+3] = ssym[m3];
|
|
}
|
|
}
|
|
|
|
// remainder
|
|
for (; i < out_sz; i++) {
|
|
RansState m = RansDecGet(&R[i%4], TF_SHIFT);
|
|
R[i%4] = sfreq[m] * (R[i%4] >> TF_SHIFT) + sbase[m];
|
|
out[i] = ssym[m];
|
|
RansDecRenormSafe(&R[i%4], &cp, cp_end+8);
|
|
}
|
|
|
|
//fprintf(stderr, " 0 Decoded %d bytes\n", (int)(cp-in)); //c-size
|
|
|
|
return out;
|
|
|
|
err:
|
|
free(out_free);
|
|
return NULL;
|
|
}
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
// Compute the entropy of 12-bit vs 10-bit frequency tables.
|
|
// 10 bit means smaller memory footprint when decoding and
|
|
// more speed due to cache hits, but it *may* be a poor
|
|
// compression fit.
|
|
int rans_compute_shift(uint32_t *F0, uint32_t (*F)[256], uint32_t *T,
|
|
uint32_t *S) {
|
|
int i, j;
|
|
|
|
double e10 = 0, e12 = 0;
|
|
int max_tot = 0;
|
|
for (i = 0; i < 256; i++) {
|
|
if (F0[i] == 0)
|
|
continue;
|
|
unsigned int max_val = round2(T[i]);
|
|
int ns = 0;
|
|
#define MAX(a,b) ((a)>(b)?(a):(b))
|
|
|
|
// Number of samples that get their freq bumped to 1
|
|
int sm10 = 0, sm12 = 0;
|
|
for (j = 0; j < 256; j++) {
|
|
if (F[i][j] && max_val / F[i][j] > TOTFREQ_O1_FAST)
|
|
sm10++;
|
|
if (F[i][j] && max_val / F[i][j] > TOTFREQ_O1)
|
|
sm12++;
|
|
}
|
|
|
|
double l10 = log(TOTFREQ_O1_FAST + sm10);
|
|
double l12 = log(TOTFREQ_O1 + sm12);
|
|
double T_slow = (double)TOTFREQ_O1/T[i];
|
|
double T_fast = (double)TOTFREQ_O1_FAST/T[i];
|
|
|
|
for (j = 0; j < 256; j++) {
|
|
if (F[i][j]) {
|
|
ns++;
|
|
|
|
e10 -= F[i][j] * (fast_log(MAX(F[i][j]*T_fast,1)) - l10);
|
|
e12 -= F[i][j] * (fast_log(MAX(F[i][j]*T_slow,1)) - l12);
|
|
|
|
// Estimation of compressed symbol freq table too.
|
|
e10 += 1.3;
|
|
e12 += 4.7;
|
|
}
|
|
}
|
|
|
|
// Order-1 frequencies often end up totalling under TOTFREQ.
|
|
// In this case it's smaller to output the real frequencies
|
|
// prior to normalisation and normalise after (with an extra
|
|
// normalisation step needed in the decoder too).
|
|
//
|
|
// Thus we normalise to a power of 2 only, store those,
|
|
// and renormalise later here (and in decoder) by bit-shift
|
|
// to get to the fixed size.
|
|
if (ns < 64 && max_val > 128) max_val /= 2;
|
|
if (max_val > 1024) max_val /= 2;
|
|
if (max_val > TOTFREQ_O1) max_val = TOTFREQ_O1;
|
|
S[i] = max_val; // scale to max this
|
|
if (max_tot < max_val)
|
|
max_tot = max_val;
|
|
}
|
|
int shift = e10/e12 < 1.01 || max_tot <= TOTFREQ_O1_FAST
|
|
? TF_SHIFT_O1_FAST
|
|
: TF_SHIFT_O1;
|
|
|
|
// fprintf(stderr, "e10/12 = %f %f %f, shift %d\n",
|
|
// e10/log(256), e12/log(256), e10/e12, shift);
|
|
|
|
return shift;
|
|
}
|
|
|
|
static
|
|
unsigned char *rans_compress_O1_4x16(unsigned char *in, unsigned int in_size,
|
|
unsigned char *out, unsigned int *out_size) {
|
|
unsigned char *cp, *out_end, *out_free = NULL;
|
|
unsigned int tab_size;
|
|
|
|
// -20 for order/size/meta
|
|
uint32_t bound = rans_compress_bound_4x16(in_size,1)-20;
|
|
|
|
if (!out) {
|
|
*out_size = bound;
|
|
out_free = out = malloc(*out_size);
|
|
}
|
|
if (!out || bound > *out_size)
|
|
return NULL;
|
|
|
|
if (((size_t)out)&1)
|
|
bound--;
|
|
out_end = out + bound;
|
|
|
|
RansEncSymbol (*syms)[256] = htscodecs_tls_alloc(256 * (sizeof(*syms)));
|
|
if (!syms) {
|
|
free(out_free);
|
|
return NULL;
|
|
}
|
|
|
|
cp = out;
|
|
int shift = encode_freq1(in, in_size, 4, syms, &cp);
|
|
if (shift < 0) {
|
|
htscodecs_tls_free(syms);
|
|
return NULL;
|
|
}
|
|
tab_size = cp - out;
|
|
|
|
RansState rans0, rans1, rans2, rans3;
|
|
RansEncInit(&rans0);
|
|
RansEncInit(&rans1);
|
|
RansEncInit(&rans2);
|
|
RansEncInit(&rans3);
|
|
|
|
uint8_t* ptr = out_end;
|
|
|
|
int isz4 = in_size>>2;
|
|
int i0 = 1*isz4-2;
|
|
int i1 = 2*isz4-2;
|
|
int i2 = 3*isz4-2;
|
|
int i3 = 4*isz4-2;
|
|
|
|
unsigned char l0 = in[i0+1];
|
|
unsigned char l1 = in[i1+1];
|
|
unsigned char l2 = in[i2+1];
|
|
unsigned char l3 = in[i3+1];
|
|
|
|
// Deal with the remainder
|
|
l3 = in[in_size-1];
|
|
for (i3 = in_size-2; i3 > 4*isz4-2; i3--) {
|
|
unsigned char c3 = in[i3];
|
|
RansEncPutSymbol(&rans3, &ptr, &syms[c3][l3]);
|
|
l3 = c3;
|
|
}
|
|
|
|
for (; i0 >= 0; i0--, i1--, i2--, i3--) {
|
|
unsigned char c0, c1, c2, c3;
|
|
RansEncSymbol *s3 = &syms[c3 = in[i3]][l3];
|
|
RansEncSymbol *s2 = &syms[c2 = in[i2]][l2];
|
|
RansEncSymbol *s1 = &syms[c1 = in[i1]][l1];
|
|
RansEncSymbol *s0 = &syms[c0 = in[i0]][l0];
|
|
|
|
RansEncPutSymbol(&rans3, &ptr, s3);
|
|
RansEncPutSymbol(&rans2, &ptr, s2);
|
|
RansEncPutSymbol(&rans1, &ptr, s1);
|
|
RansEncPutSymbol(&rans0, &ptr, s0);
|
|
|
|
l0 = c0;
|
|
l1 = c1;
|
|
l2 = c2;
|
|
l3 = c3;
|
|
}
|
|
|
|
RansEncPutSymbol(&rans3, &ptr, &syms[0][l3]);
|
|
RansEncPutSymbol(&rans2, &ptr, &syms[0][l2]);
|
|
RansEncPutSymbol(&rans1, &ptr, &syms[0][l1]);
|
|
RansEncPutSymbol(&rans0, &ptr, &syms[0][l0]);
|
|
|
|
RansEncFlush(&rans3, &ptr);
|
|
RansEncFlush(&rans2, &ptr);
|
|
RansEncFlush(&rans1, &ptr);
|
|
RansEncFlush(&rans0, &ptr);
|
|
|
|
*out_size = (out_end - ptr) + tab_size;
|
|
|
|
cp = out;
|
|
memmove(out + tab_size, ptr, out_end-ptr);
|
|
|
|
htscodecs_tls_free(syms);
|
|
return out;
|
|
}
|
|
|
|
//#define MAGIC2 111
|
|
#define MAGIC2 179
|
|
//#define MAGIC2 0
|
|
|
|
static
|
|
unsigned char *rans_uncompress_O1_4x16(unsigned char *in, unsigned int in_size,
|
|
unsigned char *out, unsigned int out_sz) {
|
|
if (in_size < 16) // 4-states at least
|
|
return NULL;
|
|
|
|
if (out_sz >= INT_MAX)
|
|
return NULL; // protect against some overflow cases
|
|
|
|
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
|
|
if (out_sz > 100000)
|
|
return NULL;
|
|
#endif
|
|
|
|
/* Load in the static tables */
|
|
unsigned char *cp = in, *cp_end = in+in_size, *out_free = NULL;
|
|
unsigned char *c_freq = NULL;
|
|
int i, j = -999;
|
|
unsigned int x;
|
|
|
|
uint8_t *sfb_ = htscodecs_tls_alloc(256*(TOTFREQ_O1+MAGIC2)*sizeof(*sfb_));
|
|
uint32_t (*s3)[TOTFREQ_O1_FAST] = (uint32_t (*)[TOTFREQ_O1_FAST])sfb_;
|
|
// reuse the same memory for the fast mode lookup, but this only works
|
|
// if we're on e.g. 12-bit freqs vs 10-bit freqs as needs 4x larger array.
|
|
//uint32_t s3[256][TOTFREQ_O1_FAST];
|
|
|
|
if (!sfb_)
|
|
return NULL;
|
|
fb_t (*fb)[256] = htscodecs_tls_alloc(256 * sizeof(*fb));
|
|
if (!fb)
|
|
goto err;
|
|
uint8_t *sfb[256];
|
|
if ((*cp >> 4) == TF_SHIFT_O1) {
|
|
for (i = 0; i < 256; i++)
|
|
sfb[i]= sfb_ + i*(TOTFREQ_O1+MAGIC2);
|
|
} else {
|
|
for (i = 0; i < 256; i++)
|
|
sfb[i]= sfb_ + i*(TOTFREQ_O1_FAST+MAGIC2);
|
|
}
|
|
|
|
if (!out)
|
|
out_free = out = malloc(out_sz);
|
|
|
|
if (!out)
|
|
goto err;
|
|
|
|
//fprintf(stderr, "out_sz=%d\n", out_sz);
|
|
|
|
// compressed header? If so uncompress it
|
|
unsigned char *tab_end = NULL;
|
|
unsigned char *c_freq_end = cp_end;
|
|
unsigned int shift = *cp >> 4;
|
|
if (*cp++ & 1) {
|
|
uint32_t u_freq_sz, c_freq_sz;
|
|
cp += var_get_u32(cp, cp_end, &u_freq_sz);
|
|
cp += var_get_u32(cp, cp_end, &c_freq_sz);
|
|
if (c_freq_sz > cp_end - cp)
|
|
goto err;
|
|
tab_end = cp + c_freq_sz;
|
|
if (!(c_freq = rans_uncompress_O0_4x16(cp, c_freq_sz, NULL, u_freq_sz)))
|
|
goto err;
|
|
cp = c_freq;
|
|
c_freq_end = c_freq + u_freq_sz;
|
|
}
|
|
|
|
// Decode order-0 symbol list; avoids needing in order-1 tables
|
|
uint32_t F0[256] = {0};
|
|
int fsz = decode_alphabet(cp, c_freq_end, F0);
|
|
if (!fsz)
|
|
goto err;
|
|
cp += fsz;
|
|
|
|
if (cp >= c_freq_end)
|
|
goto err;
|
|
|
|
const int s3_fast_on = in_size >= 100000;
|
|
|
|
for (i = 0; i < 256; i++) {
|
|
if (F0[i] == 0)
|
|
continue;
|
|
|
|
uint32_t F[256] = {0}, T = 0;
|
|
fsz = decode_freq_d(cp, c_freq_end, F0, F, &T);
|
|
if (!fsz)
|
|
goto err;
|
|
cp += fsz;
|
|
|
|
if (!T) {
|
|
//fprintf(stderr, "No freq for F_%d\n", i);
|
|
continue;
|
|
}
|
|
|
|
normalise_freq_shift(F, T, 1<<shift);
|
|
|
|
// Build symbols; fixme, do as part of decode, see the _d variant
|
|
for (j = x = 0; j < 256; j++) {
|
|
if (F[j]) {
|
|
if (F[j] > (1<<shift) - x)
|
|
goto err;
|
|
|
|
if (shift == TF_SHIFT_O1_FAST && s3_fast_on) {
|
|
int y;
|
|
for (y = 0; y < F[j]; y++)
|
|
s3[i][y+x] = (((uint32_t)F[j])<<(shift+8)) |(y<<8) |j;
|
|
} else {
|
|
memset(&sfb[i][x], j, F[j]);
|
|
fb[i][j].f = F[j];
|
|
fb[i][j].b = x;
|
|
}
|
|
x += F[j];
|
|
}
|
|
}
|
|
if (x != (1<<shift))
|
|
goto err;
|
|
}
|
|
|
|
if (tab_end)
|
|
cp = tab_end;
|
|
free(c_freq);
|
|
c_freq = NULL;
|
|
|
|
if (cp+16 > cp_end)
|
|
goto err;
|
|
|
|
RansState rans0, rans1, rans2, rans3;
|
|
uint8_t *ptr = cp, *ptr_end = in + in_size - 8;
|
|
RansDecInit(&rans0, &ptr); if (rans0 < RANS_BYTE_L) goto err;
|
|
RansDecInit(&rans1, &ptr); if (rans1 < RANS_BYTE_L) goto err;
|
|
RansDecInit(&rans2, &ptr); if (rans2 < RANS_BYTE_L) goto err;
|
|
RansDecInit(&rans3, &ptr); if (rans3 < RANS_BYTE_L) goto err;
|
|
|
|
unsigned int isz4 = out_sz>>2;
|
|
int l0 = 0, l1 = 0, l2 = 0, l3 = 0;
|
|
unsigned int i4[] = {0*isz4, 1*isz4, 2*isz4, 3*isz4};
|
|
|
|
RansState R[4];
|
|
R[0] = rans0;
|
|
R[1] = rans1;
|
|
R[2] = rans2;
|
|
R[3] = rans3;
|
|
|
|
// Around 15% faster to specialise for 10/12 than to have one
|
|
// loop with shift as a variable.
|
|
if (shift == TF_SHIFT_O1) {
|
|
// TF_SHIFT_O1 = 12
|
|
|
|
const uint32_t mask = ((1u << TF_SHIFT_O1)-1);
|
|
for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) {
|
|
uint16_t m, c;
|
|
c = sfb[l0][m = R[0] & mask];
|
|
R[0] = fb[l0][c].f * (R[0]>>TF_SHIFT_O1) + m - fb[l0][c].b;
|
|
out[i4[0]] = l0 = c;
|
|
|
|
c = sfb[l1][m = R[1] & mask];
|
|
R[1] = fb[l1][c].f * (R[1]>>TF_SHIFT_O1) + m - fb[l1][c].b;
|
|
out[i4[1]] = l1 = c;
|
|
|
|
c = sfb[l2][m = R[2] & mask];
|
|
R[2] = fb[l2][c].f * (R[2]>>TF_SHIFT_O1) + m - fb[l2][c].b;
|
|
out[i4[2]] = l2 = c;
|
|
|
|
c = sfb[l3][m = R[3] & mask];
|
|
R[3] = fb[l3][c].f * (R[3]>>TF_SHIFT_O1) + m - fb[l3][c].b;
|
|
out[i4[3]] = l3 = c;
|
|
|
|
if (ptr < ptr_end) {
|
|
RansDecRenorm(&R[0], &ptr);
|
|
RansDecRenorm(&R[1], &ptr);
|
|
RansDecRenorm(&R[2], &ptr);
|
|
RansDecRenorm(&R[3], &ptr);
|
|
} else {
|
|
RansDecRenormSafe(&R[0], &ptr, ptr_end+8);
|
|
RansDecRenormSafe(&R[1], &ptr, ptr_end+8);
|
|
RansDecRenormSafe(&R[2], &ptr, ptr_end+8);
|
|
RansDecRenormSafe(&R[3], &ptr, ptr_end+8);
|
|
}
|
|
}
|
|
|
|
// Remainder
|
|
for (; i4[3] < out_sz; i4[3]++) {
|
|
uint32_t m3 = R[3] & ((1u<<TF_SHIFT_O1)-1);
|
|
unsigned char c3 = sfb[l3][m3];
|
|
out[i4[3]] = c3;
|
|
R[3] = fb[l3][c3].f * (R[3]>>TF_SHIFT_O1) + m3 - fb[l3][c3].b;
|
|
RansDecRenormSafe(&R[3], &ptr, ptr_end + 8);
|
|
l3 = c3;
|
|
}
|
|
} else if (!s3_fast_on) {
|
|
// TF_SHIFT_O1 = 10 with sfb[256][1024] & fb[256]256] array lookup
|
|
// Slightly faster for -o193 on q4 (high comp), but also less
|
|
// initialisation cost for smaller data
|
|
const uint32_t mask = ((1u << TF_SHIFT_O1_FAST)-1);
|
|
for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) {
|
|
uint16_t m, c;
|
|
c = sfb[l0][m = R[0] & mask];
|
|
R[0] = fb[l0][c].f * (R[0]>>TF_SHIFT_O1_FAST) + m - fb[l0][c].b;
|
|
out[i4[0]] = l0 = c;
|
|
|
|
c = sfb[l1][m = R[1] & mask];
|
|
R[1] = fb[l1][c].f * (R[1]>>TF_SHIFT_O1_FAST) + m - fb[l1][c].b;
|
|
out[i4[1]] = l1 = c;
|
|
|
|
c = sfb[l2][m = R[2] & mask];
|
|
R[2] = fb[l2][c].f * (R[2]>>TF_SHIFT_O1_FAST) + m - fb[l2][c].b;
|
|
out[i4[2]] = l2 = c;
|
|
|
|
c = sfb[l3][m = R[3] & mask];
|
|
R[3] = fb[l3][c].f * (R[3]>>TF_SHIFT_O1_FAST) + m - fb[l3][c].b;
|
|
out[i4[3]] = l3 = c;
|
|
|
|
if (ptr < ptr_end) {
|
|
RansDecRenorm(&R[0], &ptr);
|
|
RansDecRenorm(&R[1], &ptr);
|
|
RansDecRenorm(&R[2], &ptr);
|
|
RansDecRenorm(&R[3], &ptr);
|
|
} else {
|
|
RansDecRenormSafe(&R[0], &ptr, ptr_end+8);
|
|
RansDecRenormSafe(&R[1], &ptr, ptr_end+8);
|
|
RansDecRenormSafe(&R[2], &ptr, ptr_end+8);
|
|
RansDecRenormSafe(&R[3], &ptr, ptr_end+8);
|
|
}
|
|
}
|
|
|
|
// Remainder
|
|
for (; i4[3] < out_sz; i4[3]++) {
|
|
uint32_t m3 = R[3] & ((1u<<TF_SHIFT_O1_FAST)-1);
|
|
unsigned char c3 = sfb[l3][m3];
|
|
out[i4[3]] = c3;
|
|
R[3] = fb[l3][c3].f * (R[3]>>TF_SHIFT_O1_FAST) + m3 - fb[l3][c3].b;
|
|
RansDecRenormSafe(&R[3], &ptr, ptr_end + 8);
|
|
l3 = c3;
|
|
}
|
|
} else {
|
|
// TF_SHIFT_O1_FAST.
|
|
// Significantly faster for -o1 on q40 (low comp).
|
|
// Higher initialisation cost, so only use if big blocks.
|
|
const uint32_t mask = ((1u << TF_SHIFT_O1_FAST)-1);
|
|
for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) {
|
|
uint32_t S0 = s3[l0][R[0] & mask];
|
|
uint32_t S1 = s3[l1][R[1] & mask];
|
|
l0 = out[i4[0]] = S0;
|
|
l1 = out[i4[1]] = S1;
|
|
uint16_t F0 = S0>>(TF_SHIFT_O1_FAST+8);
|
|
uint16_t F1 = S1>>(TF_SHIFT_O1_FAST+8);
|
|
uint16_t B0 = (S0>>8) & mask;
|
|
uint16_t B1 = (S1>>8) & mask;
|
|
|
|
R[0] = F0 * (R[0]>>TF_SHIFT_O1_FAST) + B0;
|
|
R[1] = F1 * (R[1]>>TF_SHIFT_O1_FAST) + B1;
|
|
|
|
uint32_t S2 = s3[l2][R[2] & mask];
|
|
uint32_t S3 = s3[l3][R[3] & mask];
|
|
l2 = out[i4[2]] = S2;
|
|
l3 = out[i4[3]] = S3;
|
|
uint16_t F2 = S2>>(TF_SHIFT_O1_FAST+8);
|
|
uint16_t F3 = S3>>(TF_SHIFT_O1_FAST+8);
|
|
uint16_t B2 = (S2>>8) & mask;
|
|
uint16_t B3 = (S3>>8) & mask;
|
|
|
|
R[2] = F2 * (R[2]>>TF_SHIFT_O1_FAST) + B2;
|
|
R[3] = F3 * (R[3]>>TF_SHIFT_O1_FAST) + B3;
|
|
|
|
if (ptr < ptr_end) {
|
|
RansDecRenorm(&R[0], &ptr);
|
|
RansDecRenorm(&R[1], &ptr);
|
|
RansDecRenorm(&R[2], &ptr);
|
|
RansDecRenorm(&R[3], &ptr);
|
|
} else {
|
|
RansDecRenormSafe(&R[0], &ptr, ptr_end+8);
|
|
RansDecRenormSafe(&R[1], &ptr, ptr_end+8);
|
|
RansDecRenormSafe(&R[2], &ptr, ptr_end+8);
|
|
RansDecRenormSafe(&R[3], &ptr, ptr_end+8);
|
|
}
|
|
}
|
|
|
|
// Remainder
|
|
for (; i4[3] < out_sz; i4[3]++) {
|
|
uint32_t S = s3[l3][R[3] & ((1u<<TF_SHIFT_O1_FAST)-1)];
|
|
l3 = out[i4[3]] = S;
|
|
R[3] = (S>>(TF_SHIFT_O1_FAST+8)) * (R[3]>>TF_SHIFT_O1_FAST)
|
|
+ ((S>>8) & ((1u<<TF_SHIFT_O1_FAST)-1));
|
|
RansDecRenormSafe(&R[3], &ptr, ptr_end + 8);
|
|
}
|
|
}
|
|
//fprintf(stderr, " 1 Decoded %d bytes\n", (int)(ptr-in)); //c-size
|
|
|
|
htscodecs_tls_free(fb);
|
|
htscodecs_tls_free(sfb_);
|
|
return out;
|
|
|
|
err:
|
|
htscodecs_tls_free(fb);
|
|
htscodecs_tls_free(sfb_);
|
|
free(out_free);
|
|
free(c_freq);
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/*-----------------------------------------------------------------------------
|
|
* r32x16 implementation, included here for now for simplicity
|
|
*/
|
|
#include "rANS_static32x16pr.h"
|
|
|
|
static int rans_cpu = 0xFFFF; // all
|
|
|
|
#if defined(__x86_64__) && \
|
|
defined(HAVE_DECL___CPUID_COUNT) && HAVE_DECL___CPUID_COUNT && \
|
|
defined(HAVE_DECL___GET_CPUID_MAX) && HAVE_DECL___GET_CPUID_MAX
|
|
#include <cpuid.h>
|
|
|
|
#if defined(__clang__) && defined(__has_attribute)
|
|
# if __has_attribute(unused)
|
|
# define UNUSED __attribute__((unused))
|
|
# else
|
|
# define UNUSED
|
|
# endif
|
|
#elif defined(__GNUC__) && __GNUC__ >= 3
|
|
# define UNUSED __attribute__((unused))
|
|
#else
|
|
# define UNUSED
|
|
#endif
|
|
|
|
// CPU detection is performed once. NB this has an assumption that we're
|
|
// not migrating between processes with different instruction stes, but
|
|
// to date the only systems I know of that support this don't have different
|
|
// capabilities (that we use) per core.
|
|
#ifndef NO_THREADS
|
|
static pthread_once_t rans_cpu_once = PTHREAD_ONCE_INIT;
|
|
#endif
|
|
|
|
static int have_ssse3 UNUSED = 0;
|
|
static int have_sse4_1 UNUSED = 0;
|
|
static int have_popcnt UNUSED = 0;
|
|
static int have_avx2 UNUSED = 0;
|
|
static int have_avx512f UNUSED = 0;
|
|
static int is_amd UNUSED = 0;
|
|
|
|
#define HAVE_HTSCODECS_TLS_CPU_INIT
|
|
static void htscodecs_tls_cpu_init(void) {
|
|
unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
|
|
// These may be unused, depending on HAVE_* config.h macros
|
|
|
|
int level = __get_cpuid_max(0, NULL);
|
|
__cpuid_count(0, 0, eax, ebx, ecx, edx);
|
|
is_amd = (ecx == 0x444d4163);
|
|
if (level >= 1) {
|
|
__cpuid_count(1, 0, eax, ebx, ecx, edx);
|
|
#if defined(bit_SSSE3)
|
|
have_ssse3 = ecx & bit_SSSE3;
|
|
#endif
|
|
#if defined(bit_POPCNT)
|
|
have_popcnt = ecx & bit_POPCNT;
|
|
#endif
|
|
#if defined(bit_SSE4_1)
|
|
have_sse4_1 = ecx & bit_SSE4_1;
|
|
#endif
|
|
}
|
|
if (level >= 7) {
|
|
__cpuid_count(7, 0, eax, ebx, ecx, edx);
|
|
#if defined(bit_AVX2)
|
|
have_avx2 = ebx & bit_AVX2;
|
|
#endif
|
|
#if defined(bit_AVX512F)
|
|
have_avx512f = ebx & bit_AVX512F;
|
|
#endif
|
|
}
|
|
|
|
if (!have_popcnt) have_avx512f = have_avx2 = have_sse4_1 = 0;
|
|
if (!have_ssse3) have_sse4_1 = 0;
|
|
}
|
|
|
|
static inline
|
|
unsigned char *(*rans_enc_func(int do_simd, int order))
|
|
(unsigned char *in,
|
|
unsigned int in_size,
|
|
unsigned char *out,
|
|
unsigned int *out_size) {
|
|
|
|
int have_e_sse4_1 = have_sse4_1;
|
|
int have_e_avx2 = have_avx2;
|
|
int have_e_avx512f = have_avx512f;
|
|
|
|
if (!(rans_cpu & RANS_CPU_ENC_AVX512)) have_e_avx512f = 0;
|
|
if (!(rans_cpu & RANS_CPU_ENC_AVX2)) have_e_avx2 = 0;
|
|
if (!(rans_cpu & RANS_CPU_ENC_SSE4)) have_e_sse4_1 = 0;
|
|
|
|
if (!do_simd) { // SIMD disabled
|
|
return order & 1
|
|
? rans_compress_O1_4x16
|
|
: rans_compress_O0_4x16;
|
|
}
|
|
|
|
#ifdef NO_THREADS
|
|
htscodecs_tls_cpu_init();
|
|
#else
|
|
int err = pthread_once(&rans_cpu_once, htscodecs_tls_cpu_init);
|
|
if (err != 0) {
|
|
fprintf(stderr, "Initialising TLS data failed: pthread_once: %s\n",
|
|
strerror(err));
|
|
fprintf(stderr, "Using scalar code only\n");
|
|
}
|
|
#endif
|
|
|
|
if (order & 1) {
|
|
// With simulated gathers, the AVX512 is now slower than AVX2, so
|
|
// we avoid using it unless asking for the real avx512 gather.
|
|
// Note for testing we do -c 0x0404 to enable AVX512 and disable AVX2.
|
|
// We then need to call the avx512 func regardless.
|
|
int use_gather;
|
|
#ifdef USE_GATHER
|
|
use_gather = 1;
|
|
#else
|
|
use_gather = !have_e_avx2;
|
|
#endif
|
|
|
|
#if defined(HAVE_AVX512)
|
|
if (have_e_avx512f && (!is_amd || !have_e_avx2) && use_gather)
|
|
return rans_compress_O1_32x16_avx512;
|
|
#endif
|
|
#if defined(HAVE_AVX2)
|
|
if (have_e_avx2)
|
|
return rans_compress_O1_32x16_avx2;
|
|
#endif
|
|
#if defined(HAVE_SSE4_1) && defined(HAVE_SSSE3) && defined(HAVE_POPCNT)
|
|
if (have_e_sse4_1)
|
|
return rans_compress_O1_32x16;
|
|
#endif
|
|
return rans_compress_O1_32x16;
|
|
} else {
|
|
#if defined(HAVE_AVX512)
|
|
if (have_e_avx512f && (!is_amd || !have_e_avx2))
|
|
return rans_compress_O0_32x16_avx512;
|
|
#endif
|
|
#if defined(HAVE_AVX2)
|
|
if (have_e_avx2)
|
|
return rans_compress_O0_32x16_avx2;
|
|
#endif
|
|
#if defined(HAVE_SSE4_1) && defined(HAVE_SSSE3) && defined(HAVE_POPCNT)
|
|
if (have_e_sse4_1)
|
|
return rans_compress_O0_32x16;
|
|
#endif
|
|
return rans_compress_O0_32x16;
|
|
}
|
|
}
|
|
|
|
static inline
|
|
unsigned char *(*rans_dec_func(int do_simd, int order))
|
|
(unsigned char *in,
|
|
unsigned int in_size,
|
|
unsigned char *out,
|
|
unsigned int out_size) {
|
|
|
|
int have_d_sse4_1 = have_sse4_1;
|
|
int have_d_avx2 = have_avx2;
|
|
int have_d_avx512f = have_avx512f;
|
|
|
|
if (!(rans_cpu & RANS_CPU_DEC_AVX512)) have_d_avx512f = 0;
|
|
if (!(rans_cpu & RANS_CPU_DEC_AVX2)) have_d_avx2 = 0;
|
|
if (!(rans_cpu & RANS_CPU_DEC_SSE4)) have_d_sse4_1 = 0;
|
|
|
|
if (!do_simd) { // SIMD disabled
|
|
return order & 1
|
|
? rans_uncompress_O1_4x16
|
|
: rans_uncompress_O0_4x16;
|
|
}
|
|
|
|
#ifdef NO_THREADS
|
|
htscodecs_tls_cpu_init();
|
|
#else
|
|
int err = pthread_once(&rans_cpu_once, htscodecs_tls_cpu_init);
|
|
if (err != 0) {
|
|
fprintf(stderr, "Initialising TLS data failed: pthread_once: %s\n",
|
|
strerror(err));
|
|
fprintf(stderr, "Using scalar code only\n");
|
|
}
|
|
#endif
|
|
|
|
if (order & 1) {
|
|
#if defined(HAVE_AVX512)
|
|
if (have_d_avx512f)
|
|
return rans_uncompress_O1_32x16_avx512;
|
|
#endif
|
|
#if defined(HAVE_AVX2)
|
|
if (have_d_avx2)
|
|
return rans_uncompress_O1_32x16_avx2;
|
|
#endif
|
|
#if defined(HAVE_SSE4_1) && defined(HAVE_SSSE3) && defined(HAVE_POPCNT)
|
|
if (have_d_sse4_1)
|
|
return rans_uncompress_O1_32x16_sse4;
|
|
#endif
|
|
return rans_uncompress_O1_32x16;
|
|
} else {
|
|
#if defined(HAVE_AVX512)
|
|
if (have_d_avx512f)
|
|
return rans_uncompress_O0_32x16_avx512;
|
|
#endif
|
|
#if defined(HAVE_AVX2)
|
|
if (have_d_avx2)
|
|
return rans_uncompress_O0_32x16_avx2;
|
|
#endif
|
|
#if defined(HAVE_SSE4_1) && defined(HAVE_SSSE3) && defined(HAVE_POPCNT)
|
|
if (have_d_sse4_1)
|
|
return rans_uncompress_O0_32x16_sse4;
|
|
#endif
|
|
return rans_uncompress_O0_32x16;
|
|
}
|
|
}
|
|
|
|
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
|
|
|
#if defined(__linux__) || defined(__FreeBSD__)
|
|
#include <sys/auxv.h>
|
|
#elif defined(_WIN32)
|
|
#include <processthreadsapi.h>
|
|
#endif
|
|
|
|
static inline int have_neon(void) {
|
|
#if defined(__linux__) && defined(__arm__)
|
|
return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0;
|
|
#elif defined(__linux__) && defined(__aarch64__) && defined(HWCAP_ASIMD)
|
|
return (getauxval(AT_HWCAP) & HWCAP_ASIMD) != 0;
|
|
#elif defined(__APPLE__)
|
|
return 1;
|
|
#elif defined(__FreeBSD__) && defined(__arm__)
|
|
unsigned long cap;
|
|
if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0;
|
|
return (cap & HWCAP_NEON) != 0;
|
|
#elif defined(__FreeBSD__) && defined(__aarch64__) && defined(HWCAP_ASIMD)
|
|
unsigned long cap;
|
|
if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0;
|
|
return (cap & HWCAP_ASIMD) != 0;
|
|
#elif defined(_WIN32)
|
|
return IsProcessorFeaturePresent(PF_ARM_V8_INSTRUCTIONS_AVAILABLE) != 0;
|
|
#else
|
|
return 0;
|
|
#endif
|
|
}
|
|
|
|
static inline
|
|
unsigned char *(*rans_enc_func(int do_simd, int order))
|
|
(unsigned char *in,
|
|
unsigned int in_size,
|
|
unsigned char *out,
|
|
unsigned int *out_size) {
|
|
|
|
if (do_simd) {
|
|
if ((rans_cpu & RANS_CPU_ENC_NEON) && have_neon())
|
|
return order & 1
|
|
? rans_compress_O1_32x16_neon
|
|
: rans_compress_O0_32x16_neon;
|
|
else
|
|
return order & 1
|
|
? rans_compress_O1_32x16
|
|
: rans_compress_O0_32x16;
|
|
} else {
|
|
return order & 1
|
|
? rans_compress_O1_4x16
|
|
: rans_compress_O0_4x16;
|
|
}
|
|
}
|
|
|
|
static inline
|
|
unsigned char *(*rans_dec_func(int do_simd, int order))
|
|
(unsigned char *in,
|
|
unsigned int in_size,
|
|
unsigned char *out,
|
|
unsigned int out_size) {
|
|
|
|
if (do_simd) {
|
|
if ((rans_cpu & RANS_CPU_DEC_NEON) && have_neon())
|
|
return order & 1
|
|
? rans_uncompress_O1_32x16_neon
|
|
: rans_uncompress_O0_32x16_neon;
|
|
else
|
|
return order & 1
|
|
? rans_uncompress_O1_32x16
|
|
: rans_uncompress_O0_32x16;
|
|
} else {
|
|
return order & 1
|
|
? rans_uncompress_O1_4x16
|
|
: rans_uncompress_O0_4x16;
|
|
}
|
|
}
|
|
|
|
#else // !(defined(__GNUC__) && defined(__x86_64__)) && !defined(__ARM_NEON)
|
|
|
|
static inline
|
|
unsigned char *(*rans_enc_func(int do_simd, int order))
|
|
(unsigned char *in,
|
|
unsigned int in_size,
|
|
unsigned char *out,
|
|
unsigned int *out_size) {
|
|
|
|
if (do_simd) {
|
|
return order & 1
|
|
? rans_compress_O1_32x16
|
|
: rans_compress_O0_32x16;
|
|
} else {
|
|
return order & 1
|
|
? rans_compress_O1_4x16
|
|
: rans_compress_O0_4x16;
|
|
}
|
|
}
|
|
|
|
static inline
|
|
unsigned char *(*rans_dec_func(int do_simd, int order))
|
|
(unsigned char *in,
|
|
unsigned int in_size,
|
|
unsigned char *out,
|
|
unsigned int out_size) {
|
|
|
|
if (do_simd) {
|
|
return order & 1
|
|
? rans_uncompress_O1_32x16
|
|
: rans_uncompress_O0_32x16;
|
|
} else {
|
|
return order & 1
|
|
? rans_uncompress_O1_4x16
|
|
: rans_uncompress_O0_4x16;
|
|
}
|
|
}
|
|
|
|
#endif
|
|
|
|
// Test interface for restricting the auto-detection methods so we
|
|
// can forcibly compare different implementations on the same machine.
|
|
// See RANS_CPU_ defines in rANS_static4x16.h
|
|
void rans_set_cpu(int opts) {
|
|
rans_cpu = opts;
|
|
#ifdef HAVE_HTSCODECS_TLS_CPU_INIT
|
|
htscodecs_tls_cpu_init();
|
|
#endif
|
|
}
|
|
|
|
/*-----------------------------------------------------------------------------
|
|
* Simple interface to the order-0 vs order-1 encoders and decoders.
|
|
*
|
|
* Smallest is method, <in_size> <input>, so worst case 2 bytes longer.
|
|
*/
|
|
unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size,
|
|
unsigned char *out,unsigned int *out_size,
|
|
int order) {
|
|
if (in_size > INT_MAX) {
|
|
*out_size = 0;
|
|
return NULL;
|
|
}
|
|
|
|
unsigned int c_meta_len;
|
|
uint8_t *meta = NULL, *rle = NULL, *packed = NULL;
|
|
uint8_t *out_free = NULL;
|
|
|
|
if (!out) {
|
|
*out_size = rans_compress_bound_4x16(in_size, order);
|
|
if (*out_size == 0)
|
|
return NULL;
|
|
if (!(out_free = out = malloc(*out_size)))
|
|
return NULL;
|
|
}
|
|
|
|
unsigned char *out_end = out + *out_size;
|
|
|
|
// Permit 32-way unrolling for large blocks, paving the way for
|
|
// AVX2 and AVX512 SIMD variants.
|
|
if ((order & RANS_ORDER_SIMD_AUTO) && in_size >= 50000
|
|
&& !(order & RANS_ORDER_STRIPE))
|
|
order |= X_32;
|
|
|
|
if (in_size <= 20)
|
|
order &= ~RANS_ORDER_STRIPE;
|
|
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
|
|
if (in_size <= 1000)
|
|
order &= ~RANS_ORDER_X32;
|
|
#endif
|
|
if (order & RANS_ORDER_STRIPE) {
|
|
int N = (order>>8) & 0xff;
|
|
if (N == 0) N = 4; // default for compatibility with old tests
|
|
|
|
unsigned char *transposed = malloc(in_size);
|
|
unsigned int part_len[256];
|
|
unsigned int idx[256];
|
|
if (!transposed) {
|
|
free(out_free);
|
|
return NULL;
|
|
}
|
|
int i, j, x;
|
|
|
|
for (i = 0; i < N; i++) {
|
|
part_len[i] = in_size / N + ((in_size % N) > i);
|
|
idx[i] = i ? idx[i-1] + part_len[i-1] : 0; // cumulative index
|
|
}
|
|
|
|
#define KN 8
|
|
i = x = 0;
|
|
if (in_size >= N*KN) {
|
|
for (; i < in_size-N*KN;) {
|
|
int k;
|
|
unsigned char *ink = in+i;
|
|
for (j = 0; j < N; j++)
|
|
for (k = 0; k < KN; k++)
|
|
transposed[idx[j]+x+k] = ink[j+N*k];
|
|
x += KN; i+=N*KN;
|
|
}
|
|
}
|
|
#undef KN
|
|
for (; i < in_size-N; i += N, x++) {
|
|
for (j = 0; j < N; j++)
|
|
transposed[idx[j]+x] = in[i+j];
|
|
}
|
|
|
|
for (; i < in_size; i += N, x++) {
|
|
for (j = 0; i+j < in_size; j++)
|
|
transposed[idx[j]+x] = in[i+j];
|
|
}
|
|
|
|
unsigned int olen2;
|
|
unsigned char *out2, *out2_start;
|
|
c_meta_len = 1;
|
|
*out = order & ~RANS_ORDER_NOSZ;
|
|
c_meta_len += var_put_u32(out+c_meta_len, out_end, in_size);
|
|
out[c_meta_len++] = N;
|
|
|
|
unsigned char *out_best = NULL;
|
|
unsigned int out_best_len = 0;
|
|
|
|
out2_start = out2 = out+7+5*N; // shares a buffer with c_meta
|
|
for (i = 0; i < N; i++) {
|
|
// Brute force try all methods.
|
|
int j, m[] = {1,64,128,0}, best_j = 0, best_sz = in_size+10;
|
|
for (j = 0; j < sizeof(m)/sizeof(*m); j++) {
|
|
if ((order & m[j]) != m[j])
|
|
continue;
|
|
|
|
// order-1 *only*; bit check above cannot elide order-0
|
|
if ((order & RANS_ORDER_STRIPE_NO0) && (m[j]&1) == 0)
|
|
continue;
|
|
olen2 = *out_size - (out2 - out);
|
|
rans_compress_to_4x16(transposed+idx[i], part_len[i],
|
|
out2, &olen2,
|
|
m[j] | RANS_ORDER_NOSZ
|
|
| (order&RANS_ORDER_X32));
|
|
if (best_sz > olen2) {
|
|
best_sz = olen2;
|
|
best_j = j;
|
|
if (j < sizeof(m)/sizeof(*m) && olen2 > out_best_len) {
|
|
unsigned char *tmp = realloc(out_best, olen2);
|
|
if (!tmp) {
|
|
free(out_free);
|
|
return NULL;
|
|
}
|
|
out_best = tmp;
|
|
out_best_len = olen2;
|
|
}
|
|
|
|
// Cache a copy of the best so far
|
|
memcpy(out_best, out2, olen2);
|
|
}
|
|
}
|
|
if (best_j < sizeof(m)/sizeof(*m)) {
|
|
// Copy the best compression to output buffer if not current
|
|
memcpy(out2, out_best, best_sz);
|
|
olen2 = best_sz;
|
|
}
|
|
|
|
out2 += olen2;
|
|
c_meta_len += var_put_u32(out+c_meta_len, out_end, olen2);
|
|
}
|
|
if (out_best)
|
|
free(out_best);
|
|
|
|
memmove(out+c_meta_len, out2_start, out2-out2_start);
|
|
free(transposed);
|
|
*out_size = c_meta_len + out2-out2_start;
|
|
return out;
|
|
}
|
|
|
|
if (order & RANS_ORDER_CAT) {
|
|
out[0] = RANS_ORDER_CAT;
|
|
c_meta_len = 1;
|
|
c_meta_len += var_put_u32(&out[1], out_end, in_size);
|
|
if (in_size)
|
|
memcpy(out+c_meta_len, in, in_size);
|
|
*out_size = c_meta_len + in_size;
|
|
return out;
|
|
}
|
|
|
|
int do_pack = order & RANS_ORDER_PACK;
|
|
int do_rle = order & RANS_ORDER_RLE;
|
|
int no_size = order & RANS_ORDER_NOSZ;
|
|
int do_simd = order & RANS_ORDER_X32;
|
|
|
|
out[0] = order;
|
|
c_meta_len = 1;
|
|
|
|
if (!no_size)
|
|
c_meta_len += var_put_u32(&out[1], out_end, in_size);
|
|
|
|
order &= 3;
|
|
|
|
// Format is compressed meta-data, compressed data.
|
|
// Meta-data can be empty, pack, rle lengths, or pack + rle lengths.
|
|
// Data is either the original data, bit-packed packed, rle literals or
|
|
// packed + rle literals.
|
|
|
|
if (do_pack && in_size) {
|
|
// PACK 2, 4 or 8 symbols into one byte.
|
|
int pmeta_len;
|
|
uint64_t packed_len;
|
|
packed = hts_pack(in, in_size, out+c_meta_len, &pmeta_len, &packed_len);
|
|
if (!packed) {
|
|
out[0] &= ~RANS_ORDER_PACK;
|
|
do_pack = 0;
|
|
free(packed);
|
|
packed = NULL;
|
|
} else {
|
|
in = packed;
|
|
in_size = packed_len;
|
|
c_meta_len += pmeta_len;
|
|
|
|
// Could derive this rather than storing verbatim.
|
|
// Orig size * 8/nbits (+1 if not multiple of 8/n)
|
|
int sz = var_put_u32(out+c_meta_len, out_end, in_size);
|
|
c_meta_len += sz;
|
|
*out_size -= sz;
|
|
}
|
|
} else if (do_pack) {
|
|
out[0] &= ~RANS_ORDER_PACK;
|
|
}
|
|
|
|
if (do_rle && in_size) {
|
|
// RLE 'in' -> rle_length + rle_literals arrays
|
|
unsigned int rmeta_len, c_rmeta_len;
|
|
uint64_t rle_len;
|
|
c_rmeta_len = in_size+257;
|
|
if (!(meta = malloc(c_rmeta_len))) {
|
|
free(out_free);
|
|
return NULL;
|
|
}
|
|
|
|
uint8_t rle_syms[256];
|
|
int rle_nsyms = 0;
|
|
uint64_t rmeta_len64;
|
|
rle = hts_rle_encode(in, in_size, meta, &rmeta_len64,
|
|
rle_syms, &rle_nsyms, NULL, &rle_len);
|
|
memmove(meta+1+rle_nsyms, meta, rmeta_len64);
|
|
meta[0] = rle_nsyms;
|
|
memcpy(meta+1, rle_syms, rle_nsyms);
|
|
rmeta_len = rmeta_len64 + rle_nsyms+1;
|
|
|
|
if (!rle || rle_len + rmeta_len >= .99*in_size) {
|
|
// Not worth the speed hit.
|
|
out[0] &= ~RANS_ORDER_RLE;
|
|
do_rle = 0;
|
|
free(rle);
|
|
rle = NULL;
|
|
} else {
|
|
// Compress lengths with O0 and literals with O0/O1 ("order" param)
|
|
int sz = var_put_u32(out+c_meta_len, out_end, rmeta_len*2), sz2;
|
|
sz += var_put_u32(out+c_meta_len+sz, out_end, rle_len);
|
|
c_rmeta_len = *out_size - (c_meta_len+sz+5);
|
|
rans_enc_func(do_simd, 0)(meta, rmeta_len, out+c_meta_len+sz+5, &c_rmeta_len);
|
|
if (c_rmeta_len < rmeta_len) {
|
|
sz2 = var_put_u32(out+c_meta_len+sz, out_end, c_rmeta_len);
|
|
memmove(out+c_meta_len+sz+sz2, out+c_meta_len+sz+5, c_rmeta_len);
|
|
} else {
|
|
// Uncompressed RLE meta-data as too small
|
|
sz = var_put_u32(out+c_meta_len, out_end, rmeta_len*2+1);
|
|
sz2 = var_put_u32(out+c_meta_len+sz, out_end, rle_len);
|
|
memcpy(out+c_meta_len+sz+sz2, meta, rmeta_len);
|
|
c_rmeta_len = rmeta_len;
|
|
}
|
|
|
|
c_meta_len += sz + sz2 + c_rmeta_len;
|
|
|
|
in = rle;
|
|
in_size = rle_len;
|
|
}
|
|
|
|
free(meta);
|
|
} else if (do_rle) {
|
|
out[0] &= ~RANS_ORDER_RLE;
|
|
}
|
|
|
|
*out_size -= c_meta_len;
|
|
if (order && in_size < 8) {
|
|
out[0] &= ~1;
|
|
order &= ~1;
|
|
}
|
|
|
|
rans_enc_func(do_simd, order)(in, in_size, out+c_meta_len, out_size);
|
|
|
|
if (*out_size >= in_size) {
|
|
out[0] &= ~3;
|
|
out[0] |= RANS_ORDER_CAT | no_size;
|
|
if (in_size)
|
|
memcpy(out+c_meta_len, in, in_size);
|
|
*out_size = in_size;
|
|
}
|
|
|
|
free(rle);
|
|
free(packed);
|
|
|
|
*out_size += c_meta_len;
|
|
|
|
return out;
|
|
}
|
|
|
|
unsigned char *rans_compress_4x16(unsigned char *in, unsigned int in_size,
|
|
unsigned int *out_size, int order) {
|
|
return rans_compress_to_4x16(in, in_size, NULL, out_size, order);
|
|
}
|
|
|
|
unsigned char *rans_uncompress_to_4x16(unsigned char *in, unsigned int in_size,
|
|
unsigned char *out, unsigned int *out_size) {
|
|
unsigned char *in_end = in + in_size;
|
|
unsigned char *out_free = NULL, *tmp_free = NULL, *meta_free = NULL;
|
|
|
|
if (in_size == 0)
|
|
return NULL;
|
|
|
|
if (*in & RANS_ORDER_STRIPE) {
|
|
unsigned int ulen, olen, c_meta_len = 1;
|
|
int i;
|
|
uint64_t clen_tot = 0;
|
|
|
|
// Decode lengths
|
|
c_meta_len += var_get_u32(in+c_meta_len, in_end, &ulen);
|
|
if (c_meta_len >= in_size)
|
|
return NULL;
|
|
unsigned int N = in[c_meta_len++];
|
|
if (N < 1) // Must be at least one stripe
|
|
return NULL;
|
|
unsigned int clenN[256], ulenN[256], idxN[256];
|
|
if (!out) {
|
|
if (ulen >= INT_MAX)
|
|
return NULL;
|
|
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
|
|
if (ulen > 100000)
|
|
return NULL;
|
|
#endif
|
|
if (!(out_free = out = malloc(ulen))) {
|
|
return NULL;
|
|
}
|
|
*out_size = ulen;
|
|
}
|
|
if (ulen != *out_size) {
|
|
free(out_free);
|
|
return NULL;
|
|
}
|
|
|
|
for (i = 0; i < N; i++) {
|
|
ulenN[i] = ulen / N + ((ulen % N) > i);
|
|
idxN[i] = i ? idxN[i-1] + ulenN[i-1] : 0;
|
|
c_meta_len += var_get_u32(in+c_meta_len, in_end, &clenN[i]);
|
|
clen_tot += clenN[i];
|
|
if (c_meta_len > in_size || clenN[i] > in_size || clenN[i] < 1) {
|
|
free(out_free);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
// We can call this with a larger buffer, but once we've determined
|
|
// how much we really use we limit it so the recursion becomes easier
|
|
// to limit.
|
|
if (c_meta_len + clen_tot > in_size) {
|
|
free(out_free);
|
|
return NULL;
|
|
}
|
|
in_size = c_meta_len + clen_tot;
|
|
|
|
//fprintf(stderr, " stripe meta %d\n", c_meta_len); //c-size
|
|
|
|
// Uncompress the N streams
|
|
unsigned char *outN = malloc(ulen);
|
|
if (!outN) {
|
|
free(out_free);
|
|
return NULL;
|
|
}
|
|
for (i = 0; i < N; i++) {
|
|
olen = ulenN[i];
|
|
if (in_size < c_meta_len) {
|
|
free(out_free);
|
|
free(outN);
|
|
return NULL;
|
|
}
|
|
if (!rans_uncompress_to_4x16(in+c_meta_len, in_size-c_meta_len, outN + idxN[i], &olen)
|
|
|| olen != ulenN[i]) {
|
|
free(out_free);
|
|
free(outN);
|
|
return NULL;
|
|
}
|
|
c_meta_len += clenN[i];
|
|
}
|
|
|
|
unstripe(out, outN, ulen, N, idxN);
|
|
|
|
free(outN);
|
|
*out_size = ulen;
|
|
return out;
|
|
}
|
|
|
|
int order = *in++; in_size--;
|
|
int do_pack = order & RANS_ORDER_PACK;
|
|
int do_rle = order & RANS_ORDER_RLE;
|
|
int do_cat = order & RANS_ORDER_CAT;
|
|
int no_size = order & RANS_ORDER_NOSZ;
|
|
int do_simd = order & RANS_ORDER_X32;
|
|
order &= 1;
|
|
|
|
int sz = 0;
|
|
unsigned int osz;
|
|
if (!no_size) {
|
|
sz = var_get_u32(in, in_end, &osz);
|
|
} else
|
|
sz = 0, osz = *out_size;
|
|
in += sz;
|
|
in_size -= sz;
|
|
|
|
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
|
|
if (osz > 100000)
|
|
return NULL;
|
|
#endif
|
|
|
|
if (no_size && !out)
|
|
goto err; // Need one or the other
|
|
|
|
if (!out) {
|
|
*out_size = osz;
|
|
if (!(out = out_free = malloc(*out_size)))
|
|
return NULL;
|
|
} else {
|
|
if (*out_size < osz)
|
|
goto err;
|
|
*out_size = osz;
|
|
}
|
|
|
|
// if (do_pack || do_rle) {
|
|
// in += sz; // size field not needed when pure rANS
|
|
// in_size -= sz;
|
|
// }
|
|
|
|
uint32_t c_meta_size = 0;
|
|
unsigned int tmp1_size = *out_size;
|
|
unsigned int tmp2_size = *out_size;
|
|
unsigned int tmp3_size = *out_size;
|
|
unsigned char *tmp1 = NULL, *tmp2 = NULL, *tmp3 = NULL, *tmp = NULL;
|
|
|
|
// Need In, Out and Tmp buffers with temporary buffer of the same size
|
|
// as output. All use rANS, but with optional transforms (none, RLE,
|
|
// Pack, or both).
|
|
//
|
|
// rans unrle unpack
|
|
// If none: in -> out
|
|
// If RLE: in -> tmp -> out
|
|
// If Pack: in -> tmp -> out
|
|
// If RLE+Pack: in -> out -> tmp -> out
|
|
// tmp1 tmp2 tmp3
|
|
//
|
|
// So rans is in -> tmp1
|
|
// RLE is tmp1 -> tmp2
|
|
// Unpack is tmp2 -> tmp3
|
|
|
|
// Format is meta data (Pack and RLE in that order if present),
|
|
// followed by rANS compressed data.
|
|
|
|
if (do_pack || do_rle) {
|
|
if (!(tmp = tmp_free = malloc(*out_size)))
|
|
goto err;
|
|
if (do_pack && do_rle) {
|
|
tmp1 = out;
|
|
tmp2 = tmp;
|
|
tmp3 = out;
|
|
} else if (do_pack) {
|
|
tmp1 = tmp;
|
|
tmp2 = tmp1;
|
|
tmp3 = out;
|
|
} else if (do_rle) {
|
|
tmp1 = tmp;
|
|
tmp2 = out;
|
|
tmp3 = out;
|
|
}
|
|
} else {
|
|
// neither
|
|
tmp = NULL;
|
|
tmp1 = out;
|
|
tmp2 = out;
|
|
tmp3 = out;
|
|
}
|
|
|
|
// Decode the bit-packing map.
|
|
uint8_t map[16] = {0};
|
|
int npacked_sym = 0;
|
|
uint64_t unpacked_sz = 0; // FIXME: rename to packed_per_byte
|
|
if (do_pack) {
|
|
c_meta_size = hts_unpack_meta(in, in_size, *out_size, map, &npacked_sym);
|
|
if (c_meta_size == 0)
|
|
goto err;
|
|
|
|
unpacked_sz = osz;
|
|
in += c_meta_size;
|
|
in_size -= c_meta_size;
|
|
|
|
// New unpacked size. We could derive this bit from *out_size
|
|
// and npacked_sym.
|
|
unsigned int osz;
|
|
sz = var_get_u32(in, in_end, &osz);
|
|
in += sz;
|
|
in_size -= sz;
|
|
if (osz > tmp1_size)
|
|
goto err;
|
|
tmp1_size = osz;
|
|
}
|
|
|
|
uint8_t *meta = NULL;
|
|
uint32_t u_meta_size = 0;
|
|
if (do_rle) {
|
|
// Uncompress meta data
|
|
uint32_t c_meta_size, rle_len, sz;
|
|
sz = var_get_u32(in, in_end, &u_meta_size);
|
|
sz += var_get_u32(in+sz, in_end, &rle_len);
|
|
if (rle_len > tmp1_size) // should never grow
|
|
goto err;
|
|
if (u_meta_size & 1) {
|
|
meta = in + sz;
|
|
u_meta_size = u_meta_size/2 > (in_end-meta) ? (in_end-meta) : u_meta_size/2;
|
|
c_meta_size = u_meta_size;
|
|
} else {
|
|
sz += var_get_u32(in+sz, in_end, &c_meta_size);
|
|
u_meta_size /= 2;
|
|
|
|
meta_free = meta = rans_dec_func(do_simd, 0)(in+sz, in_size-sz, NULL, u_meta_size);
|
|
if (!meta)
|
|
goto err;
|
|
}
|
|
if (c_meta_size+sz > in_size)
|
|
goto err;
|
|
in += c_meta_size+sz;
|
|
in_size -= c_meta_size+sz;
|
|
tmp1_size = rle_len;
|
|
}
|
|
//fprintf(stderr, " meta_size %d bytes\n", (int)(in - orig_in)); //c-size
|
|
|
|
// uncompress RLE data. in -> tmp1
|
|
if (in_size) {
|
|
if (do_cat) {
|
|
//fprintf(stderr, " CAT %d\n", tmp1_size); //c-size
|
|
if (tmp1_size > in_size)
|
|
goto err;
|
|
if (tmp1_size > *out_size)
|
|
goto err;
|
|
memcpy(tmp1, in, tmp1_size);
|
|
} else {
|
|
tmp1 = rans_dec_func(do_simd, order)(in, in_size, tmp1, tmp1_size);
|
|
if (!tmp1)
|
|
goto err;
|
|
}
|
|
} else {
|
|
tmp1_size = 0;
|
|
}
|
|
tmp2_size = tmp3_size = tmp1_size;
|
|
|
|
if (do_rle) {
|
|
// Unpack RLE. tmp1 -> tmp2.
|
|
if (u_meta_size == 0)
|
|
goto err;
|
|
uint64_t unrle_size = *out_size;
|
|
int rle_nsyms = *meta ? *meta : 256;
|
|
if (u_meta_size < 1+rle_nsyms)
|
|
goto err;
|
|
if (!hts_rle_decode(tmp1, tmp1_size,
|
|
meta+1+rle_nsyms, u_meta_size-(1+rle_nsyms),
|
|
meta+1, rle_nsyms, tmp2, &unrle_size))
|
|
goto err;
|
|
tmp3_size = tmp2_size = unrle_size;
|
|
free(meta_free);
|
|
meta_free = NULL;
|
|
}
|
|
if (do_pack) {
|
|
// Unpack bits via pack-map. tmp2 -> tmp3
|
|
if (npacked_sym == 1)
|
|
unpacked_sz = tmp2_size;
|
|
//uint8_t *porig = unpack(tmp2, tmp2_size, unpacked_sz, npacked_sym, map);
|
|
//memcpy(tmp3, porig, unpacked_sz);
|
|
if (!hts_unpack(tmp2, tmp2_size, tmp3, unpacked_sz, npacked_sym, map))
|
|
goto err;
|
|
tmp3_size = unpacked_sz;
|
|
}
|
|
|
|
if (tmp)
|
|
free(tmp);
|
|
|
|
*out_size = tmp3_size;
|
|
return tmp3;
|
|
|
|
err:
|
|
free(meta_free);
|
|
free(out_free);
|
|
free(tmp_free);
|
|
return NULL;
|
|
}
|
|
|
|
unsigned char *rans_uncompress_4x16(unsigned char *in, unsigned int in_size,
|
|
unsigned int *out_size) {
|
|
return rans_uncompress_to_4x16(in, in_size, NULL, out_size);
|
|
}
|