208 lines
6.6 KiB
C
208 lines
6.6 KiB
C
/*
|
|
* Copyright (c) 2019-2021 Genome Research Ltd.
|
|
* Author(s): James Bonfield
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above
|
|
* copyright notice, this list of conditions and the following
|
|
* disclaimer in the documentation and/or other materials provided
|
|
* with the distribution.
|
|
*
|
|
* 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
|
|
* Institute nor the names of its contributors may be used to endorse
|
|
* or promote products derived from this software without specific
|
|
* prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
|
|
* IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
|
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
|
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
|
|
* LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#include "config.h"
|
|
|
|
#include <stdint.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <stdio.h>
|
|
|
|
#include "varint.h"
|
|
#include "rle.h"
|
|
|
|
#define MAGIC 8
|
|
|
|
//-----------------------------------------------------------------------------
|
|
// Auto compute rle_syms / rle_nsyms
|
|
static void rle_find_syms(uint8_t *data, uint64_t data_len,
|
|
int64_t *saved, // dim >= 256
|
|
uint8_t *rle_syms, int *rle_nsyms) {
|
|
int last = -1, n;
|
|
uint64_t i;
|
|
|
|
if (data_len > 256) {
|
|
// 186/450
|
|
// Interleaved buffers to avoid cache collisions
|
|
int64_t saved2[256+MAGIC] = {0};
|
|
int64_t saved3[256+MAGIC] = {0};
|
|
int64_t saved4[256+MAGIC] = {0};
|
|
int64_t len4 = data_len&~3;
|
|
for (i = 0; i < len4; i+=4) {
|
|
int d1 = (data[i+0] == last) <<1;
|
|
int d2 = (data[i+1] == data[i+0])<<1;
|
|
int d3 = (data[i+2] == data[i+1])<<1;
|
|
int d4 = (data[i+3] == data[i+2])<<1;
|
|
last = data[i+3];
|
|
saved [data[i+0]] += d1-1;
|
|
saved2[data[i+1]] += d2-1;
|
|
saved3[data[i+2]] += d3-1;
|
|
saved4[data[i+3]] += d4-1;
|
|
}
|
|
while (i < data_len) {
|
|
int d = (data[i] == last)<<1;
|
|
saved[data[i]] += d - 1;
|
|
last = data[i];
|
|
i++;
|
|
}
|
|
for (i = 0; i < 256; i++)
|
|
saved[i] += saved2[i] + saved3[i] + saved4[i];
|
|
} else {
|
|
// 163/391
|
|
for (i = 0; i < data_len; i++) {
|
|
if (data[i] == last) {
|
|
saved[data[i]]++;
|
|
} else {
|
|
saved[data[i]]--;
|
|
last = data[i];
|
|
}
|
|
}
|
|
}
|
|
|
|
// Map back to a list
|
|
for (i = n = 0; i < 256; i++) {
|
|
if (saved[i] > 0)
|
|
rle_syms[n++] = i;
|
|
}
|
|
*rle_nsyms = n;
|
|
}
|
|
|
|
uint8_t *hts_rle_encode(uint8_t *data, uint64_t data_len,
|
|
uint8_t *run, uint64_t *run_len,
|
|
uint8_t *rle_syms, int *rle_nsyms,
|
|
uint8_t *out, uint64_t *out_len) {
|
|
uint64_t i, j, k;
|
|
if (!out)
|
|
if (!(out = malloc(data_len*2)))
|
|
return NULL;
|
|
|
|
// Two pass: Firstly compute which symbols are worth using RLE on.
|
|
int64_t saved[256+MAGIC] = {0};
|
|
|
|
if (*rle_nsyms) {
|
|
for (i = 0; i < *rle_nsyms; i++)
|
|
saved[rle_syms[i]] = 1;
|
|
} else {
|
|
// Writes back to rle_syms and rle_nsyms
|
|
rle_find_syms(data, data_len, saved, rle_syms, rle_nsyms);
|
|
}
|
|
|
|
// 2nd pass: perform RLE itself to out[] and run[] arrays.
|
|
for (i = j = k = 0; i < data_len; i++) {
|
|
out[k++] = data[i];
|
|
if (saved[data[i]] > 0) {
|
|
int rlen = i;
|
|
int last = data[i];
|
|
while (i < data_len && data[i] == last)
|
|
i++;
|
|
i--;
|
|
rlen = i-rlen;
|
|
|
|
j += var_put_u32(&run[j], NULL, rlen);
|
|
}
|
|
}
|
|
|
|
*run_len = j;
|
|
*out_len = k;
|
|
return out;
|
|
}
|
|
|
|
// On input *out_len holds the allocated size of out[].
|
|
// On output it holds the used size of out[].
|
|
uint8_t *hts_rle_decode(uint8_t *lit, uint64_t lit_len,
|
|
uint8_t *run, uint64_t run_len,
|
|
uint8_t *rle_syms, int rle_nsyms,
|
|
uint8_t *out, uint64_t *out_len) {
|
|
uint64_t j;
|
|
uint8_t *run_end = run + run_len;
|
|
|
|
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
|
|
if (*out_len > 100000)
|
|
return NULL;
|
|
#endif
|
|
|
|
int saved[256] = {0};
|
|
for (j = 0; j < rle_nsyms; j++)
|
|
saved[rle_syms[j]] = 1;
|
|
|
|
uint8_t *lit_end = lit + lit_len;
|
|
uint8_t *out_end = out + *out_len;
|
|
uint8_t *outp = out;
|
|
|
|
while (lit < lit_end) {
|
|
if (outp >= out_end)
|
|
goto err;
|
|
|
|
uint8_t b = *lit;
|
|
if (saved[b]) {
|
|
uint32_t rlen;
|
|
run += var_get_u32(run, run_end, &rlen);
|
|
if (rlen) {
|
|
if (outp + rlen >= out_end)
|
|
goto err;
|
|
memset(outp, b, rlen+1);
|
|
outp += rlen+1;
|
|
} else {
|
|
*outp++ = b;
|
|
}
|
|
} else {
|
|
*outp++ = b;
|
|
}
|
|
lit++;
|
|
}
|
|
|
|
*out_len = outp-out;
|
|
return out;
|
|
|
|
err:
|
|
return NULL;
|
|
}
|
|
|
|
// Deprecated interface; to remove when we next to an ABI breakage
|
|
uint8_t *rle_encode(uint8_t *data, uint64_t data_len,
|
|
uint8_t *run, uint64_t *run_len,
|
|
uint8_t *rle_syms, int *rle_nsyms,
|
|
uint8_t *out, uint64_t *out_len) {
|
|
return hts_rle_encode(data, data_len, run, run_len,
|
|
rle_syms, rle_nsyms, out, out_len);
|
|
}
|
|
|
|
// Deprecated interface; to remove when we next to an ABI breakage
|
|
uint8_t *rle_decode(uint8_t *lit, uint64_t lit_len,
|
|
uint8_t *run, uint64_t run_len,
|
|
uint8_t *rle_syms, int rle_nsyms,
|
|
uint8_t *out, uint64_t *out_len) {
|
|
return hts_rle_decode(lit, lit_len, run, run_len,
|
|
rle_syms, rle_nsyms, out, out_len);
|
|
}
|