Merge branch 'dev'
This commit is contained in:
commit
25097a9a88
6
Makefile
6
Makefile
|
|
@ -5,7 +5,7 @@ WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
|||
AR= ar
|
||||
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
|
||||
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o
|
||||
AOBJS= QSufSort.o bwt_gen.o bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
||||
AOBJS= QSufSort.o bwt_gen.o rope.o rle.o bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
||||
is.o bwtindex.o bwape.o kopen.o pemerge.o maxk.o \
|
||||
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
||||
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
|
||||
|
|
@ -63,7 +63,7 @@ bwt_gen.o: QSufSort.h malloc_wrap.h
|
|||
bwt_lite.o: bwt_lite.h malloc_wrap.h
|
||||
bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h malloc_wrap.h
|
||||
bwtgap.o: bwtgap.h bwt.h bwtaln.h malloc_wrap.h
|
||||
bwtindex.o: bntseq.h bwt.h utils.h malloc_wrap.h
|
||||
bwtindex.o: bntseq.h bwt.h utils.h rle.h rope.h malloc_wrap.h
|
||||
bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h
|
||||
bwtsw2_aux.o: malloc_wrap.h bwa.h ksw.h kseq.h ksort.h
|
||||
bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h malloc_wrap.h ksort.h
|
||||
|
|
@ -82,4 +82,6 @@ main.o: kstring.h malloc_wrap.h utils.h
|
|||
malloc_wrap.o: malloc_wrap.h
|
||||
maxk.o: bwa.h bntseq.h bwt.h bwamem.h kseq.h malloc_wrap.h
|
||||
pemerge.o: ksw.h kseq.h malloc_wrap.h kstring.h bwa.h bntseq.h bwt.h utils.h
|
||||
rle.o: rle.h
|
||||
rope.o: rle.h rope.h
|
||||
utils.o: utils.h ksort.h malloc_wrap.h kseq.h
|
||||
|
|
|
|||
7
bwa.1
7
bwa.1
|
|
@ -60,11 +60,12 @@ Index database sequences in the FASTA format.
|
|||
Prefix of the output database [same as db filename]
|
||||
.TP
|
||||
.BI -a \ STR
|
||||
Algorithm for constructing BWT index. BWA implements two algorithms for BWT
|
||||
Algorithm for constructing BWT index. BWA implements three algorithms for BWT
|
||||
construction:
|
||||
.B is
|
||||
.BR is ,
|
||||
.B bwtsw
|
||||
and
|
||||
.BR bwtsw .
|
||||
.BR rb2 .
|
||||
The first algorithm is a little faster for small database but requires large
|
||||
RAM and does not work for databases with total length longer than 2GB. The
|
||||
second algorithm is adapted from the BWT-SW source code. It in theory works
|
||||
|
|
|
|||
19
bwamem.c
19
bwamem.c
|
|
@ -114,7 +114,7 @@ static void smem_aux_destroy(smem_aux_t *a)
|
|||
static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq, smem_aux_t *a)
|
||||
{
|
||||
int i, k, x = 0, old_n;
|
||||
int start_width = (opt->flag & MEM_F_SELF_OVLP)? 2 : 1;
|
||||
int start_width = 1;
|
||||
int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
|
||||
a->mem.n = 0;
|
||||
// first pass: find all SMEMs
|
||||
|
|
@ -488,13 +488,6 @@ int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_
|
|||
return m;
|
||||
}
|
||||
|
||||
int mem_test_and_remove_exact(const mem_opt_t *opt, int n, mem_alnreg_t *a, int qlen)
|
||||
{
|
||||
if (!(opt->flag & MEM_F_SELF_OVLP) || n == 0 || a->truesc != qlen * opt->a) return n;
|
||||
memmove(a, a + 1, (n - 1) * sizeof(mem_alnreg_t));
|
||||
return n - 1;
|
||||
}
|
||||
|
||||
typedef kvec_t(int) int_v;
|
||||
|
||||
static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *a, int_v *z)
|
||||
|
|
@ -1046,8 +1039,6 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse
|
|||
}
|
||||
free(chn.a);
|
||||
regs.n = mem_sort_dedup_patch(opt, bns, pac, (uint8_t*)seq, regs.n, regs.a);
|
||||
if (opt->flag & MEM_F_SELF_OVLP)
|
||||
regs.n = mem_test_and_remove_exact(opt, regs.n, regs.a, l_seq);
|
||||
if (bwa_verbose >= 4) {
|
||||
err_printf("* %ld chains remain after removing duplicated chains\n", regs.n);
|
||||
for (i = 0; i < regs.n; ++i) {
|
||||
|
|
@ -1168,12 +1159,8 @@ static void worker2(void *data, int i, int tid)
|
|||
worker_t *w = (worker_t*)data;
|
||||
if (!(w->opt->flag&MEM_F_PE)) {
|
||||
if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name);
|
||||
if (w->opt->flag & MEM_F_ALN_REG) {
|
||||
mem_reg2ovlp(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i]);
|
||||
} else {
|
||||
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);
|
||||
mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);
|
||||
}
|
||||
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);
|
||||
mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);
|
||||
free(w->regs[i].a);
|
||||
} else {
|
||||
if (bwa_verbose >= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i<<1|0].name);
|
||||
|
|
|
|||
2
bwamem.h
2
bwamem.h
|
|
@ -16,8 +16,6 @@ typedef struct __smem_i smem_i;
|
|||
#define MEM_F_ALL 0x8
|
||||
#define MEM_F_NO_MULTI 0x10
|
||||
#define MEM_F_NO_RESCUE 0x20
|
||||
#define MEM_F_SELF_OVLP 0x40
|
||||
#define MEM_F_ALN_REG 0x80
|
||||
#define MEM_F_REF_HDR 0x100
|
||||
#define MEM_F_SOFTCLIP 0x200
|
||||
#define MEM_F_SMARTPE 0x400
|
||||
|
|
|
|||
|
|
@ -87,31 +87,6 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *
|
|||
return ar;
|
||||
}
|
||||
|
||||
void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a)
|
||||
{
|
||||
int i;
|
||||
kstring_t str = {0,0,0};
|
||||
for (i = 0; i < a->n; ++i) {
|
||||
const mem_alnreg_t *p = &a->a[i];
|
||||
int is_rev, rid, qb = p->qb, qe = p->qe;
|
||||
int64_t pos, rb = p->rb, re = p->re;
|
||||
pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
|
||||
rid = bns_pos2rid(bns, pos);
|
||||
assert(rid == p->rid);
|
||||
pos -= bns->anns[rid].offset;
|
||||
kputs(s->name, &str); kputc('\t', &str);
|
||||
kputw(s->l_seq, &str); kputc('\t', &str);
|
||||
if (is_rev) qb ^= qe, qe ^= qb, qb ^= qe; // swap
|
||||
kputw(qb, &str); kputc('\t', &str); kputw(qe, &str); kputc('\t', &str);
|
||||
kputs(bns->anns[rid].name, &str); kputc('\t', &str);
|
||||
kputw(bns->anns[rid].len, &str); kputc('\t', &str);
|
||||
kputw(pos, &str); kputc('\t', &str); kputw(pos + (re - rb), &str); kputc('\t', &str);
|
||||
ksprintf(&str, "%.3f", (double)p->truesc / opt->a / (qe - qb > re - rb? qe - qb : re - rb));
|
||||
kputc('\n', &str);
|
||||
}
|
||||
s->sam = str.s;
|
||||
}
|
||||
|
||||
static inline int get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i)
|
||||
{
|
||||
int k = a[i].secondary_all;
|
||||
|
|
|
|||
38
bwtindex.c
38
bwtindex.c
|
|
@ -34,6 +34,8 @@
|
|||
#include "bntseq.h"
|
||||
#include "bwt.h"
|
||||
#include "utils.h"
|
||||
#include "rle.h"
|
||||
#include "rope.h"
|
||||
|
||||
#ifdef _DIVBWT
|
||||
#include "divsufsort.h"
|
||||
|
|
@ -63,7 +65,7 @@ bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
|
|||
{
|
||||
bwt_t *bwt;
|
||||
ubyte_t *buf, *buf2;
|
||||
int i, pac_size;
|
||||
int64_t i, pac_size;
|
||||
FILE *fp;
|
||||
|
||||
// initialization
|
||||
|
|
@ -90,11 +92,31 @@ bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
|
|||
if (use_is) {
|
||||
bwt->primary = is_bwt(buf, bwt->seq_len);
|
||||
} else {
|
||||
#ifdef _DIVBWT
|
||||
bwt->primary = divbwt(buf, buf, 0, bwt->seq_len);
|
||||
#else
|
||||
err_fatal_simple("libdivsufsort is not compiled in.");
|
||||
#endif
|
||||
rope_t *r;
|
||||
int64_t x;
|
||||
rpitr_t itr;
|
||||
const uint8_t *blk;
|
||||
|
||||
r = rope_init(ROPE_DEF_MAX_NODES, ROPE_DEF_BLOCK_LEN);
|
||||
for (i = bwt->seq_len - 1, x = 0; i >= 0; --i) {
|
||||
int c = buf[i] + 1;
|
||||
x = rope_insert_run(r, x, c, 1, 0) + 1;
|
||||
while (--c >= 0) x += r->c[c];
|
||||
}
|
||||
bwt->primary = x;
|
||||
rope_itr_first(r, &itr);
|
||||
x = 0;
|
||||
while ((blk = rope_itr_next_block(&itr)) != 0) {
|
||||
const uint8_t *q = blk + 2, *end = blk + 2 + *rle_nptr(blk);
|
||||
while (q < end) {
|
||||
int c = 0;
|
||||
int64_t l;
|
||||
rle_dec1(q, c, l);
|
||||
for (i = 0; i < l; ++i)
|
||||
buf[x++] = c - 1;
|
||||
}
|
||||
}
|
||||
rope_destroy(r);
|
||||
}
|
||||
bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4);
|
||||
for (i = 0; i < bwt->seq_len; ++i)
|
||||
|
|
@ -196,7 +218,7 @@ int bwa_index(int argc, char *argv[]) // the "index" command
|
|||
while ((c = getopt(argc, argv, "6a:p:b:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'a': // if -a is not set, algo_type will be determined later
|
||||
if (strcmp(optarg, "div") == 0) algo_type = 1;
|
||||
if (strcmp(optarg, "rb2") == 0) algo_type = 1;
|
||||
else if (strcmp(optarg, "bwtsw") == 0) algo_type = 2;
|
||||
else if (strcmp(optarg, "is") == 0) algo_type = 3;
|
||||
else err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
|
||||
|
|
@ -216,7 +238,7 @@ int bwa_index(int argc, char *argv[]) // the "index" command
|
|||
if (optind + 1 > argc) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Usage: bwa index [options] <in.fasta>\n\n");
|
||||
fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n");
|
||||
fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw, is or rb2 [auto]\n");
|
||||
fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
|
||||
fprintf(stderr, " -b INT block size for the bwtsw algorithm (effective with -a bwtsw) [%d]\n", block_size);
|
||||
fprintf(stderr, " -6 index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
|
||||
|
|
|
|||
20
fastmap.c
20
fastmap.c
|
|
@ -130,7 +130,7 @@ int main_mem(int argc, char *argv[])
|
|||
|
||||
aux.opt = opt = mem_opt_init();
|
||||
memset(&opt0, 0, sizeof(mem_opt_t));
|
||||
while ((c = getopt(argc, argv, "1epaFMCSPVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:X:H:")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "1paMCSPVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:X:H:")) >= 0) {
|
||||
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
|
||||
else if (c == '1') no_mt_io = 1;
|
||||
else if (c == 'x') mode = optarg;
|
||||
|
|
@ -145,8 +145,6 @@ int main_mem(int argc, char *argv[])
|
|||
else if (c == 'p') opt->flag |= MEM_F_PE | MEM_F_SMARTPE;
|
||||
else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
|
||||
else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE;
|
||||
else if (c == 'e') opt->flag |= MEM_F_SELF_OVLP;
|
||||
else if (c == 'F') opt->flag |= MEM_F_ALN_REG;
|
||||
else if (c == 'Y') opt->flag |= MEM_F_SOFTCLIP;
|
||||
else if (c == 'V') opt->flag |= MEM_F_REF_HDR;
|
||||
else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1;
|
||||
|
|
@ -251,7 +249,6 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw);
|
||||
fprintf(stderr, " -S skip mate rescue\n");
|
||||
fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n");
|
||||
fprintf(stderr, " -e discard full-length exact matches\n");
|
||||
fprintf(stderr, "\nScoring options:\n\n");
|
||||
fprintf(stderr, " -A INT score for a sequence match, which scales options -TdBOELU unless overridden [%d]\n", opt->a);
|
||||
fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
|
||||
|
|
@ -263,7 +260,6 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " pacbio: -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0 (PacBio reads to ref)\n");
|
||||
fprintf(stderr, " ont2d: -k14 -W20 -r10 -A1 -B1 -O1 -E1 -L0 (Oxford Nanopore 2D-reads to ref)\n");
|
||||
fprintf(stderr, " intractg: -B9 -O16 -L5 (intra-species contigs to ref)\n");
|
||||
// fprintf(stderr, " pbread: -k13 -W40 -c1000 -r10 -A1 -B1 -O1 -E1 -N25 -FeaD.001\n");
|
||||
fprintf(stderr, "\nInput/output options:\n\n");
|
||||
fprintf(stderr, " -p smart pairing (ignoring in2.fq)\n");
|
||||
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
|
||||
|
|
@ -296,21 +292,14 @@ int main_mem(int argc, char *argv[])
|
|||
if (!opt0.b) opt->b = 9;
|
||||
if (!opt0.pen_clip5) opt->pen_clip5 = 5;
|
||||
if (!opt0.pen_clip3) opt->pen_clip3 = 5;
|
||||
} else if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "pbread") == 0 || strcmp(mode, "ont2d") == 0) {
|
||||
} else if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "ont2d") == 0) {
|
||||
if (!opt0.o_del) opt->o_del = 1;
|
||||
if (!opt0.e_del) opt->e_del = 1;
|
||||
if (!opt0.o_ins) opt->o_ins = 1;
|
||||
if (!opt0.e_ins) opt->e_ins = 1;
|
||||
if (!opt0.b) opt->b = 1;
|
||||
if (opt0.split_factor == 0.) opt->split_factor = 10.;
|
||||
if (strcmp(mode, "pbread") == 0) { // pacbio read-to-read setting; NOT working well!
|
||||
opt->flag |= MEM_F_ALL | MEM_F_SELF_OVLP | MEM_F_ALN_REG;
|
||||
if (!opt0.min_chain_weight) opt->min_chain_weight = 40;
|
||||
if (!opt0.max_occ) opt->max_occ = 1000;
|
||||
if (!opt0.min_seed_len) opt->min_seed_len = 13;
|
||||
if (!opt0.max_chain_extend) opt->max_chain_extend = 25;
|
||||
if (opt0.drop_ratio == 0.) opt->drop_ratio = .001;
|
||||
} else if (strcmp(mode, "ont2d") == 0) {
|
||||
if (strcmp(mode, "ont2d") == 0) {
|
||||
if (!opt0.min_chain_weight) opt->min_chain_weight = 20;
|
||||
if (!opt0.min_seed_len) opt->min_seed_len = 14;
|
||||
if (!opt0.pen_clip5) opt->pen_clip5 = 0;
|
||||
|
|
@ -359,8 +348,7 @@ int main_mem(int argc, char *argv[])
|
|||
opt->flag |= MEM_F_PE;
|
||||
}
|
||||
}
|
||||
if (!(opt->flag & MEM_F_ALN_REG))
|
||||
bwa_print_sam_hdr(aux.idx->bns, hdr_line);
|
||||
bwa_print_sam_hdr(aux.idx->bns, hdr_line);
|
||||
aux.actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads;
|
||||
kt_pipeline(no_mt_io? 1 : 2, process, &aux, 3);
|
||||
free(hdr_line);
|
||||
|
|
|
|||
|
|
@ -130,14 +130,14 @@ void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_d
|
|||
pthread_mutex_init(&aux.mutex, 0);
|
||||
pthread_cond_init(&aux.cv, 0);
|
||||
|
||||
aux.workers = alloca(n_threads * sizeof(ktp_worker_t));
|
||||
aux.workers = (ktp_worker_t*)alloca(n_threads * sizeof(ktp_worker_t));
|
||||
for (i = 0; i < n_threads; ++i) {
|
||||
ktp_worker_t *w = &aux.workers[i];
|
||||
w->step = 0; w->pl = &aux; w->data = 0;
|
||||
w->index = aux.index++;
|
||||
}
|
||||
|
||||
tid = alloca(n_threads * sizeof(pthread_t));
|
||||
tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t));
|
||||
for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktp_worker, &aux.workers[i]);
|
||||
for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
|
||||
|
||||
|
|
|
|||
1
maxk.c
1
maxk.c
|
|
@ -3,6 +3,7 @@
|
|||
#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include "bwa.h"
|
||||
#include "bwamem.h"
|
||||
#include "kseq.h"
|
||||
|
|
|
|||
|
|
@ -0,0 +1,191 @@
|
|||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include "rle.h"
|
||||
|
||||
const uint8_t rle_auxtab[8] = { 0x01, 0x11, 0x21, 0x31, 0x03, 0x13, 0x07, 0x17 };
|
||||
|
||||
// insert symbol $a after $x symbols in $str; marginal counts added to $cnt; returns the size increase
|
||||
int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6])
|
||||
{
|
||||
uint16_t *nptr = (uint16_t*)block;
|
||||
int diff;
|
||||
|
||||
block += 2; // skip the first 2 counting bytes
|
||||
if (*nptr == 0) {
|
||||
memset(cnt, 0, 48);
|
||||
diff = rle_enc1(block, a, rl);
|
||||
} else {
|
||||
uint8_t *p, *end = block + *nptr, *q;
|
||||
int64_t pre, z, l = 0, tot, beg_l;
|
||||
int c = -1, n_bytes = 0, n_bytes2, t = 0;
|
||||
uint8_t tmp[24];
|
||||
beg_l = bc[0] + bc[1] + bc[2] + bc[3] + bc[4] + bc[5];
|
||||
tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5];
|
||||
if (x < beg_l) {
|
||||
beg_l = 0, *beg = 0;
|
||||
memset(bc, 0, 48);
|
||||
}
|
||||
if (x == beg_l) {
|
||||
p = q = block + (*beg); z = beg_l;
|
||||
memcpy(cnt, bc, 48);
|
||||
} else if (x - beg_l <= ((tot-beg_l)>>1) + ((tot-beg_l)>>3)) { // forward
|
||||
z = beg_l; p = block + (*beg);
|
||||
memcpy(cnt, bc, 48);
|
||||
while (z < x) {
|
||||
rle_dec1(p, c, l);
|
||||
z += l; cnt[c] += l;
|
||||
}
|
||||
for (q = p - 1; *q>>6 == 2; --q);
|
||||
} else { // backward
|
||||
memcpy(cnt, ec, 48);
|
||||
z = tot; p = end;
|
||||
while (z >= x) {
|
||||
--p;
|
||||
if (*p>>6 != 2) {
|
||||
l |= *p>>7? (int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3;
|
||||
z -= l; cnt[*p&7] -= l;
|
||||
l = 0; t = 0;
|
||||
} else {
|
||||
l |= (*p&0x3fL) << t;
|
||||
t += 6;
|
||||
}
|
||||
}
|
||||
q = p;
|
||||
rle_dec1(p, c, l);
|
||||
z += l; cnt[c] += l;
|
||||
}
|
||||
*beg = q - block;
|
||||
memcpy(bc, cnt, 48);
|
||||
bc[c] -= l;
|
||||
n_bytes = p - q;
|
||||
if (x == z && a != c && p < end) { // then try the next run
|
||||
int tc;
|
||||
int64_t tl;
|
||||
q = p;
|
||||
rle_dec1(q, tc, tl);
|
||||
if (a == tc)
|
||||
c = tc, n_bytes = q - p, l = tl, z += l, p = q, cnt[tc] += tl;
|
||||
}
|
||||
if (z != x) cnt[c] -= z - x;
|
||||
pre = x - (z - l); p -= n_bytes;
|
||||
if (a == c) { // insert to the same run
|
||||
n_bytes2 = rle_enc1(tmp, c, l + rl);
|
||||
} else if (x == z) { // at the end; append to the existing run
|
||||
p += n_bytes; n_bytes = 0;
|
||||
n_bytes2 = rle_enc1(tmp, a, rl);
|
||||
} else { // break the current run
|
||||
n_bytes2 = rle_enc1(tmp, c, pre);
|
||||
n_bytes2 += rle_enc1(tmp + n_bytes2, a, rl);
|
||||
n_bytes2 += rle_enc1(tmp + n_bytes2, c, l - pre);
|
||||
}
|
||||
if (n_bytes != n_bytes2 && end != p + n_bytes) // size changed
|
||||
memmove(p + n_bytes2, p + n_bytes, end - p - n_bytes);
|
||||
memcpy(p, tmp, n_bytes2);
|
||||
diff = n_bytes2 - n_bytes;
|
||||
}
|
||||
return (*nptr += diff);
|
||||
}
|
||||
|
||||
int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6])
|
||||
{
|
||||
int beg = 0;
|
||||
int64_t bc[6];
|
||||
memset(bc, 0, 48);
|
||||
return rle_insert_cached(block, x, a, rl, cnt, ec, &beg, bc);
|
||||
}
|
||||
|
||||
void rle_split(uint8_t *block, uint8_t *new_block)
|
||||
{
|
||||
int n = *(uint16_t*)block;
|
||||
uint8_t *end = block + 2 + n, *q = block + 2 + (n>>1);
|
||||
while (*q>>6 == 2) --q;
|
||||
memcpy(new_block + 2, q, end - q);
|
||||
*(uint16_t*)new_block = end - q;
|
||||
*(uint16_t*)block = q - block - 2;
|
||||
}
|
||||
|
||||
void rle_count(const uint8_t *block, int64_t cnt[6])
|
||||
{
|
||||
const uint8_t *q = block + 2, *end = q + *(uint16_t*)block;
|
||||
while (q < end) {
|
||||
int c;
|
||||
int64_t l;
|
||||
rle_dec1(q, c, l);
|
||||
cnt[c] += l;
|
||||
}
|
||||
}
|
||||
|
||||
void rle_print(const uint8_t *block, int expand)
|
||||
{
|
||||
const uint16_t *p = (const uint16_t*)block;
|
||||
const uint8_t *q = block + 2, *end = block + 2 + *p;
|
||||
while (q < end) {
|
||||
int c;
|
||||
int64_t l, x;
|
||||
rle_dec1(q, c, l);
|
||||
if (expand) for (x = 0; x < l; ++x) putchar("$ACGTN"[c]);
|
||||
else printf("%c%ld", "$ACGTN"[c], (long)l);
|
||||
}
|
||||
putchar('\n');
|
||||
}
|
||||
|
||||
void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy, const int64_t ec[6])
|
||||
{
|
||||
int a;
|
||||
int64_t tot, cnt[6];
|
||||
const uint8_t *p;
|
||||
|
||||
y = y >= x? y : x;
|
||||
tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5];
|
||||
if (tot == 0) return;
|
||||
if (x <= (tot - y) + (tot>>3)) {
|
||||
int c = 0;
|
||||
int64_t l, z = 0;
|
||||
memset(cnt, 0, 48);
|
||||
p = block + 2;
|
||||
while (z < x) {
|
||||
rle_dec1(p, c, l);
|
||||
z += l; cnt[c] += l;
|
||||
}
|
||||
for (a = 0; a != 6; ++a) cx[a] += cnt[a];
|
||||
cx[c] -= z - x;
|
||||
if (cy) {
|
||||
while (z < y) {
|
||||
rle_dec1(p, c, l);
|
||||
z += l; cnt[c] += l;
|
||||
}
|
||||
for (a = 0; a != 6; ++a) cy[a] += cnt[a];
|
||||
cy[c] -= z - y;
|
||||
}
|
||||
} else {
|
||||
#define move_backward(_x) \
|
||||
while (z >= (_x)) { \
|
||||
--p; \
|
||||
if (*p>>6 != 2) { \
|
||||
l |= *p>>7? (int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3; \
|
||||
z -= l; cnt[*p&7] -= l; \
|
||||
l = 0; t = 0; \
|
||||
} else { \
|
||||
l |= (*p&0x3fL) << t; \
|
||||
t += 6; \
|
||||
} \
|
||||
} \
|
||||
|
||||
int t = 0;
|
||||
int64_t l = 0, z = tot;
|
||||
memcpy(cnt, ec, 48);
|
||||
p = block + 2 + *(const uint16_t*)block;
|
||||
if (cy) {
|
||||
move_backward(y)
|
||||
for (a = 0; a != 6; ++a) cy[a] += cnt[a];
|
||||
cy[*p&7] += y - z;
|
||||
}
|
||||
move_backward(x)
|
||||
for (a = 0; a != 6; ++a) cx[a] += cnt[a];
|
||||
cx[*p&7] += x - z;
|
||||
|
||||
#undef move_backward
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,77 @@
|
|||
#ifndef RLE6_H_
|
||||
#define RLE6_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __GNUC__
|
||||
#define LIKELY(x) __builtin_expect((x),1)
|
||||
#else
|
||||
#define LIKELY(x) (x)
|
||||
#endif
|
||||
#ifdef __cplusplus
|
||||
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6]);
|
||||
int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t end_cnt[6]);
|
||||
void rle_split(uint8_t *block, uint8_t *new_block);
|
||||
void rle_count(const uint8_t *block, int64_t cnt[6]);
|
||||
void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy, const int64_t ec[6]);
|
||||
#define rle_rank1a(block, x, cx, ec) rle_rank2a(block, x, -1, cx, 0, ec)
|
||||
|
||||
void rle_print(const uint8_t *block, int expand);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
/******************
|
||||
*** 43+3 codec ***
|
||||
******************/
|
||||
|
||||
const uint8_t rle_auxtab[8];
|
||||
|
||||
#define RLE_MIN_SPACE 18
|
||||
#define rle_nptr(block) ((uint16_t*)(block))
|
||||
|
||||
// decode one run (c,l) and move the pointer p
|
||||
#define rle_dec1(p, c, l) do { \
|
||||
(c) = *(p) & 7; \
|
||||
if (LIKELY((*(p)&0x80) == 0)) { \
|
||||
(l) = *(p)++ >> 3; \
|
||||
} else if (LIKELY(*(p)>>5 == 6)) { \
|
||||
(l) = (*(p)&0x18L)<<3L | ((p)[1]&0x3fL); \
|
||||
(p) += 2; \
|
||||
} else { \
|
||||
int n = ((*(p)&0x10) >> 2) + 4; \
|
||||
(l) = *(p)++ >> 3 & 1; \
|
||||
while (--n) (l) = ((l)<<6) | (*(p)++&0x3fL); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
static inline int rle_enc1(uint8_t *p, int c, int64_t l)
|
||||
{
|
||||
if (l < 1LL<<4) {
|
||||
*p = l << 3 | c;
|
||||
return 1;
|
||||
} else if (l < 1LL<<8) {
|
||||
*p = 0xC0 | l >> 6 << 3 | c;
|
||||
p[1] = 0x80 | (l & 0x3f);
|
||||
return 2;
|
||||
} else if (l < 1LL<<19) {
|
||||
*p = 0xE0 | l >> 18 << 3 | c;
|
||||
p[1] = 0x80 | (l >> 12 & 0x3f);
|
||||
p[2] = 0x80 | (l >> 6 & 0x3f);
|
||||
p[3] = 0x80 | (l & 0x3f);
|
||||
return 4;
|
||||
} else {
|
||||
int i, shift = 36;
|
||||
*p = 0xF0 | l >> 42 << 3 | c;
|
||||
for (i = 1; i < 8; ++i, shift -= 6)
|
||||
p[i] = 0x80 | (l>>shift & 0x3f);
|
||||
return 8;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,318 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <zlib.h>
|
||||
#include "rle.h"
|
||||
#include "rope.h"
|
||||
|
||||
/*******************
|
||||
*** Memory Pool ***
|
||||
*******************/
|
||||
|
||||
#define MP_CHUNK_SIZE 0x100000 // 1MB per chunk
|
||||
|
||||
typedef struct { // memory pool for fast and compact memory allocation (no free)
|
||||
int size, i, n_elems;
|
||||
int64_t top, max;
|
||||
uint8_t **mem;
|
||||
} mempool_t;
|
||||
|
||||
static mempool_t *mp_init(int size)
|
||||
{
|
||||
mempool_t *mp;
|
||||
mp = calloc(1, sizeof(mempool_t));
|
||||
mp->size = size;
|
||||
mp->i = mp->n_elems = MP_CHUNK_SIZE / size;
|
||||
mp->top = -1;
|
||||
return mp;
|
||||
}
|
||||
|
||||
static void mp_destroy(mempool_t *mp)
|
||||
{
|
||||
int64_t i;
|
||||
for (i = 0; i <= mp->top; ++i) free(mp->mem[i]);
|
||||
free(mp->mem); free(mp);
|
||||
}
|
||||
|
||||
static inline void *mp_alloc(mempool_t *mp)
|
||||
{
|
||||
if (mp->i == mp->n_elems) {
|
||||
if (++mp->top == mp->max) {
|
||||
mp->max = mp->max? mp->max<<1 : 1;
|
||||
mp->mem = realloc(mp->mem, sizeof(void*) * mp->max);
|
||||
}
|
||||
mp->mem[mp->top] = calloc(mp->n_elems, mp->size);
|
||||
mp->i = 0;
|
||||
}
|
||||
return mp->mem[mp->top] + (mp->i++) * mp->size;
|
||||
}
|
||||
|
||||
/***************
|
||||
*** B+ rope ***
|
||||
***************/
|
||||
|
||||
rope_t *rope_init(int max_nodes, int block_len)
|
||||
{
|
||||
rope_t *rope;
|
||||
rope = calloc(1, sizeof(rope_t));
|
||||
if (block_len < 32) block_len = 32;
|
||||
rope->max_nodes = (max_nodes+ 1)>>1<<1;
|
||||
rope->block_len = (block_len + 7) >> 3 << 3;
|
||||
rope->node = mp_init(sizeof(rpnode_t) * rope->max_nodes);
|
||||
rope->leaf = mp_init(rope->block_len);
|
||||
rope->root = mp_alloc(rope->node);
|
||||
rope->root->n = 1;
|
||||
rope->root->is_bottom = 1;
|
||||
rope->root->p = mp_alloc(rope->leaf);
|
||||
return rope;
|
||||
}
|
||||
|
||||
void rope_destroy(rope_t *rope)
|
||||
{
|
||||
mp_destroy(rope->node);
|
||||
mp_destroy(rope->leaf);
|
||||
free(rope);
|
||||
}
|
||||
|
||||
static inline rpnode_t *split_node(rope_t *rope, rpnode_t *u, rpnode_t *v)
|
||||
{ // split $v's child. $u is the first node in the bucket. $v and $u are in the same bucket. IMPORTANT: there is always enough room in $u
|
||||
int j, i = v - u;
|
||||
rpnode_t *w; // $w is the sibling of $v
|
||||
if (u == 0) { // only happens at the root; add a new root
|
||||
u = v = mp_alloc(rope->node);
|
||||
v->n = 1; v->p = rope->root; // the new root has the old root as the only child
|
||||
memcpy(v->c, rope->c, 48);
|
||||
for (j = 0; j < 6; ++j) v->l += v->c[j];
|
||||
rope->root = v;
|
||||
}
|
||||
if (i != u->n - 1) // then make room for a new node
|
||||
memmove(v + 2, v + 1, sizeof(rpnode_t) * (u->n - i - 1));
|
||||
++u->n; w = v + 1;
|
||||
memset(w, 0, sizeof(rpnode_t));
|
||||
w->p = mp_alloc(u->is_bottom? rope->leaf : rope->node);
|
||||
if (u->is_bottom) { // we are at the bottom level; $v->p is a string instead of a node
|
||||
uint8_t *p = (uint8_t*)v->p, *q = (uint8_t*)w->p;
|
||||
rle_split(p, q);
|
||||
rle_count(q, w->c);
|
||||
} else { // $v->p is a node, not a string
|
||||
rpnode_t *p = v->p, *q = w->p; // $v and $w are siblings and thus $p and $q are cousins
|
||||
p->n -= rope->max_nodes>>1;
|
||||
memcpy(q, p + p->n, sizeof(rpnode_t) * (rope->max_nodes>>1));
|
||||
q->n = rope->max_nodes>>1; // NB: this line must below memcpy() as $q->n and $q->is_bottom are modified by memcpy()
|
||||
q->is_bottom = p->is_bottom;
|
||||
for (i = 0; i < q->n; ++i)
|
||||
for (j = 0; j < 6; ++j)
|
||||
w->c[j] += q[i].c[j];
|
||||
}
|
||||
for (j = 0; j < 6; ++j) // compute $w->l and update $v->c
|
||||
w->l += w->c[j], v->c[j] -= w->c[j];
|
||||
v->l -= w->l; // update $v->c
|
||||
return v;
|
||||
}
|
||||
|
||||
int64_t rope_insert_run(rope_t *rope, int64_t x, int a, int64_t rl, rpcache_t *cache)
|
||||
{ // insert $a after $x symbols in $rope and the returns rank(a, x)
|
||||
rpnode_t *u = 0, *v = 0, *p = rope->root; // $v is the parent of $p; $u and $v are at the same level and $u is the first node in the bucket
|
||||
int64_t y = 0, z = 0, cnt[6];
|
||||
int n_runs;
|
||||
do { // top-down update. Searching and node splitting are done together in one pass.
|
||||
if (p->n == rope->max_nodes) { // node is full; split
|
||||
v = split_node(rope, u, v); // $v points to the parent of $p; when a new root is added, $v points to the root
|
||||
if (y + v->l < x) // if $v is not long enough after the split, we need to move both $p and its parent $v
|
||||
y += v->l, z += v->c[a], ++v, p = v->p;
|
||||
}
|
||||
u = p;
|
||||
if (v && x - y > v->l>>1) { // then search backwardly for the right node to descend
|
||||
p += p->n - 1; y += v->l; z += v->c[a];
|
||||
for (; y >= x; --p) y -= p->l, z -= p->c[a];
|
||||
++p;
|
||||
} else for (; y + p->l < x; ++p) y += p->l, z += p->c[a]; // then search forwardly
|
||||
assert(p - u < u->n);
|
||||
if (v) v->c[a] += rl, v->l += rl; // we should not change p->c[a] because this may cause troubles when p's child is split
|
||||
v = p; p = p->p; // descend
|
||||
} while (!u->is_bottom);
|
||||
rope->c[a] += rl; // $rope->c should be updated after the loop as adding a new root needs the old $rope->c counts
|
||||
if (cache) {
|
||||
if (cache->p != (uint8_t*)p) memset(cache, 0, sizeof(rpcache_t));
|
||||
n_runs = rle_insert_cached((uint8_t*)p, x - y, a, rl, cnt, v->c, &cache->beg, cache->bc);
|
||||
cache->p = (uint8_t*)p;
|
||||
} else n_runs = rle_insert((uint8_t*)p, x - y, a, rl, cnt, v->c);
|
||||
z += cnt[a];
|
||||
v->c[a] += rl; v->l += rl; // this should be after rle_insert(); otherwise rle_insert() won't work
|
||||
if (n_runs + RLE_MIN_SPACE > rope->block_len) {
|
||||
split_node(rope, u, v);
|
||||
if (cache) memset(cache, 0, sizeof(rpcache_t));
|
||||
}
|
||||
return z;
|
||||
}
|
||||
|
||||
static rpnode_t *rope_count_to_leaf(const rope_t *rope, int64_t x, int64_t cx[6], int64_t *rest)
|
||||
{
|
||||
rpnode_t *u, *v = 0, *p = rope->root;
|
||||
int64_t y = 0;
|
||||
int a;
|
||||
|
||||
memset(cx, 0, 48);
|
||||
do {
|
||||
u = p;
|
||||
if (v && x - y > v->l>>1) {
|
||||
p += p->n - 1; y += v->l;
|
||||
for (a = 0; a != 6; ++a) cx[a] += v->c[a];
|
||||
for (; y >= x; --p) {
|
||||
y -= p->l;
|
||||
for (a = 0; a != 6; ++a) cx[a] -= p->c[a];
|
||||
}
|
||||
++p;
|
||||
} else {
|
||||
for (; y + p->l < x; ++p) {
|
||||
y += p->l;
|
||||
for (a = 0; a != 6; ++a) cx[a] += p->c[a];
|
||||
}
|
||||
}
|
||||
v = p; p = p->p;
|
||||
} while (!u->is_bottom);
|
||||
*rest = x - y;
|
||||
return v;
|
||||
}
|
||||
|
||||
void rope_rank2a(const rope_t *rope, int64_t x, int64_t y, int64_t *cx, int64_t *cy)
|
||||
{
|
||||
rpnode_t *v;
|
||||
int64_t rest;
|
||||
v = rope_count_to_leaf(rope, x, cx, &rest);
|
||||
if (y < x || cy == 0) {
|
||||
rle_rank1a((const uint8_t*)v->p, rest, cx, v->c);
|
||||
} else if (rest + (y - x) <= v->l) {
|
||||
memcpy(cy, cx, 48);
|
||||
rle_rank2a((const uint8_t*)v->p, rest, rest + (y - x), cx, cy, v->c);
|
||||
} else {
|
||||
rle_rank1a((const uint8_t*)v->p, rest, cx, v->c);
|
||||
v = rope_count_to_leaf(rope, y, cy, &rest);
|
||||
rle_rank1a((const uint8_t*)v->p, rest, cy, v->c);
|
||||
}
|
||||
}
|
||||
|
||||
/*********************
|
||||
*** Rope iterator ***
|
||||
*********************/
|
||||
|
||||
void rope_itr_first(const rope_t *rope, rpitr_t *i)
|
||||
{
|
||||
memset(i, 0, sizeof(rpitr_t));
|
||||
i->rope = rope;
|
||||
for (i->pa[i->d] = rope->root; !i->pa[i->d]->is_bottom;) // descend to the leftmost leaf
|
||||
++i->d, i->pa[i->d] = i->pa[i->d - 1]->p;
|
||||
}
|
||||
|
||||
const uint8_t *rope_itr_next_block(rpitr_t *i)
|
||||
{
|
||||
const uint8_t *ret;
|
||||
assert(i->d < ROPE_MAX_DEPTH); // a B+ tree should not be that tall
|
||||
if (i->d < 0) return 0;
|
||||
ret = (uint8_t*)i->pa[i->d][i->ia[i->d]].p;
|
||||
while (i->d >= 0 && ++i->ia[i->d] == i->pa[i->d]->n) i->ia[i->d--] = 0; // backtracking
|
||||
if (i->d >= 0)
|
||||
while (!i->pa[i->d]->is_bottom) // descend to the leftmost leaf
|
||||
++i->d, i->pa[i->d] = i->pa[i->d - 1][i->ia[i->d - 1]].p;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/***********
|
||||
*** I/O ***
|
||||
***********/
|
||||
|
||||
void rope_print_node(const rpnode_t *p)
|
||||
{
|
||||
if (p->is_bottom) {
|
||||
int i;
|
||||
putchar('(');
|
||||
for (i = 0; i < p->n; ++i) {
|
||||
uint8_t *block = (uint8_t*)p[i].p;
|
||||
const uint8_t *q = block + 2, *end = block + 2 + *rle_nptr(block);
|
||||
if (i) putchar(',');
|
||||
while (q < end) {
|
||||
int c = 0;
|
||||
int64_t j, l;
|
||||
rle_dec1(q, c, l);
|
||||
for (j = 0; j < l; ++j) putchar("$ACGTN"[c]);
|
||||
}
|
||||
}
|
||||
putchar(')');
|
||||
} else {
|
||||
int i;
|
||||
putchar('(');
|
||||
for (i = 0; i < p->n; ++i) {
|
||||
if (i) putchar(',');
|
||||
rope_print_node(p[i].p);
|
||||
}
|
||||
putchar(')');
|
||||
}
|
||||
}
|
||||
|
||||
void rope_dump_node(const rpnode_t *p, FILE *fp)
|
||||
{
|
||||
int16_t i, n = p->n;
|
||||
uint8_t is_bottom = p->is_bottom;
|
||||
fwrite(&is_bottom, 1, 1, fp);
|
||||
fwrite(&n, 2, 1, fp);
|
||||
if (is_bottom) {
|
||||
for (i = 0; i < n; ++i) {
|
||||
fwrite(p[i].c, 8, 6, fp);
|
||||
fwrite(p[i].p, 1, *rle_nptr(p[i].p) + 2, fp);
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < p->n; ++i)
|
||||
rope_dump_node(p[i].p, fp);
|
||||
}
|
||||
}
|
||||
|
||||
void rope_dump(const rope_t *r, FILE *fp)
|
||||
{
|
||||
fwrite(&r->max_nodes, 4, 1, fp);
|
||||
fwrite(&r->block_len, 4, 1, fp);
|
||||
rope_dump_node(r->root, fp);
|
||||
}
|
||||
|
||||
rpnode_t *rope_restore_node(const rope_t *r, FILE *fp, int64_t c[6])
|
||||
{
|
||||
uint8_t is_bottom, a;
|
||||
int16_t i, n;
|
||||
rpnode_t *p;
|
||||
fread(&is_bottom, 1, 1, fp);
|
||||
fread(&n, 2, 1, fp);
|
||||
p = mp_alloc(r->node);
|
||||
p->is_bottom = is_bottom, p->n = n;
|
||||
if (is_bottom) {
|
||||
for (i = 0; i < n; ++i) {
|
||||
uint16_t *q;
|
||||
p[i].p = mp_alloc(r->leaf);
|
||||
q = rle_nptr(p[i].p);
|
||||
fread(p[i].c, 8, 6, fp);
|
||||
fread(q, 2, 1, fp);
|
||||
fread(q + 1, 1, *q, fp);
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < n; ++i)
|
||||
p[i].p = rope_restore_node(r, fp, p[i].c);
|
||||
}
|
||||
memset(c, 0, 48);
|
||||
for (i = 0; i < n; ++i) {
|
||||
p[i].l = 0;
|
||||
for (a = 0; a < 6; ++a)
|
||||
c[a] += p[i].c[a], p[i].l += p[i].c[a];
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
rope_t *rope_restore(FILE *fp)
|
||||
{
|
||||
rope_t *r;
|
||||
r = calloc(1, sizeof(rope_t));
|
||||
fread(&r->max_nodes, 4, 1, fp);
|
||||
fread(&r->block_len, 4, 1, fp);
|
||||
r->node = mp_init(sizeof(rpnode_t) * r->max_nodes);
|
||||
r->leaf = mp_init(r->block_len);
|
||||
r->root = rope_restore_node(r, fp, r->c);
|
||||
return r;
|
||||
}
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
#ifndef ROPE_H_
|
||||
#define ROPE_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#define ROPE_MAX_DEPTH 80
|
||||
#define ROPE_DEF_MAX_NODES 64
|
||||
#define ROPE_DEF_BLOCK_LEN 512
|
||||
|
||||
typedef struct rpnode_s {
|
||||
struct rpnode_s *p; // child; at the bottom level, $p points to a string with the first 2 bytes giving the number of runs (#runs)
|
||||
uint64_t l:54, n:9, is_bottom:1; // $n and $is_bottom are only set for the first node in a bucket
|
||||
int64_t c[6]; // marginal counts
|
||||
} rpnode_t;
|
||||
|
||||
typedef struct {
|
||||
int32_t max_nodes, block_len; // both MUST BE even numbers
|
||||
int64_t c[6]; // marginal counts
|
||||
rpnode_t *root;
|
||||
void *node, *leaf; // memory pool
|
||||
} rope_t;
|
||||
|
||||
typedef struct {
|
||||
const rope_t *rope; // the rope
|
||||
const rpnode_t *pa[ROPE_MAX_DEPTH]; // parent nodes
|
||||
int ia[ROPE_MAX_DEPTH]; // index in the parent nodes
|
||||
int d; // the current depth in the B+-tree
|
||||
} rpitr_t;
|
||||
|
||||
typedef struct {
|
||||
int beg;
|
||||
int64_t bc[6];
|
||||
uint8_t *p;
|
||||
} rpcache_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
rope_t *rope_init(int max_nodes, int block_len);
|
||||
void rope_destroy(rope_t *rope);
|
||||
int64_t rope_insert_run(rope_t *rope, int64_t x, int a, int64_t rl, rpcache_t *cache);
|
||||
void rope_rank2a(const rope_t *rope, int64_t x, int64_t y, int64_t *cx, int64_t *cy);
|
||||
#define rope_rank1a(rope, x, cx) rope_rank2a(rope, x, -1, cx, 0)
|
||||
|
||||
void rope_itr_first(const rope_t *rope, rpitr_t *i);
|
||||
const uint8_t *rope_itr_next_block(rpitr_t *i);
|
||||
|
||||
void rope_print_node(const rpnode_t *p);
|
||||
void rope_dump(const rope_t *r, FILE *fp);
|
||||
rope_t *rope_restore(FILE *fp);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
Loading…
Reference in New Issue