index can be compiled; not tested yet
This commit is contained in:
parent
0f160151c7
commit
b3bc4911ba
|
|
@ -0,0 +1,41 @@
|
|||
CC= gcc
|
||||
CFLAGS= -g -Wall -O2 -Wc++-compat -Wno-unused-function
|
||||
CPPFLAGS=
|
||||
INCLUDES= -I.
|
||||
OBJS= kalloc.o kthread.o bseq.o sketch.o sdust.o index.o
|
||||
PROG= minimap2
|
||||
PROG_EXTRA= sdust
|
||||
LIBS= -lm -lz -lpthread
|
||||
|
||||
.SUFFIXES:.c .o
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) $(CPPFLAGS) $(INCLUDES) $< -o $@
|
||||
|
||||
all:$(PROG)
|
||||
|
||||
extra:all $(PROG_EXTRA)
|
||||
|
||||
minimap2:libminimap2.a
|
||||
$(CC) $(CFLAGS) $< -o $@ -L. -lminimap2 $(LIBS)
|
||||
|
||||
libminimap2.a:$(OBJS)
|
||||
$(AR) -csru $@ $(OBJS)
|
||||
|
||||
sdust:sdust.c kalloc.o kdq.h kvec.h kseq.h sdust.h
|
||||
$(CC) -D_SDUST_MAIN $(CFLAGS) $< kalloc.o -o $@ -lz
|
||||
|
||||
clean:
|
||||
rm -fr gmon.out *.o a.out $(PROG) $(PROG_EXTRA) *~ *.a *.dSYM session*
|
||||
|
||||
depend:
|
||||
(LC_ALL=C; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c)
|
||||
|
||||
# DO NOT DELETE
|
||||
|
||||
bseq.o: bseq.h kseq.h
|
||||
index.o: kthread.h bseq.h minimap.h kvec.h kalloc.h khash.h
|
||||
kalloc.o: kalloc.h
|
||||
misc.o: minimap.h ksort.h
|
||||
sdust.o: kalloc.h kdq.h kvec.h sdust.h
|
||||
sketch.o: kvec.h kalloc.h minimap.h
|
||||
2
bseq.c
2
bseq.c
|
|
@ -7,8 +7,6 @@
|
|||
#include "kseq.h"
|
||||
KSEQ_INIT(gzFile, gzread)
|
||||
|
||||
extern unsigned char seq_nt4_table[256];
|
||||
|
||||
struct bseq_file_s {
|
||||
int is_eof;
|
||||
gzFile fp;
|
||||
|
|
|
|||
2
bseq.h
2
bseq.h
|
|
@ -16,4 +16,6 @@ void bseq_close(bseq_file_t *fp);
|
|||
bseq1_t *bseq_read(bseq_file_t *fp, int chunk_size, int *n_);
|
||||
int bseq_eof(bseq_file_t *fp);
|
||||
|
||||
extern unsigned char seq_nt4_table[256];
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -0,0 +1,355 @@
|
|||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include "kthread.h"
|
||||
#include "bseq.h"
|
||||
#include "minimap.h"
|
||||
#include "kvec.h"
|
||||
#include "khash.h"
|
||||
|
||||
#define idx_hash(a) ((a)>>1)
|
||||
#define idx_eq(a, b) ((a)>>1 == (b)>>1)
|
||||
KHASH_INIT(idx, uint64_t, uint64_t, 1, idx_hash, idx_eq)
|
||||
typedef khash_t(idx) idxhash_t;
|
||||
|
||||
#define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x))
|
||||
|
||||
mm_idx_t *mm_idx_init(int w, int k, int b)
|
||||
{
|
||||
mm_idx_t *mi;
|
||||
if (k*2 < b) b = k * 2;
|
||||
if (w < 1) w = 1;
|
||||
mi = (mm_idx_t*)calloc(1, sizeof(mm_idx_t));
|
||||
mi->w = w, mi->k = k, mi->b = b;
|
||||
mi->B = (mm_idx_bucket_t*)calloc(1<<b, sizeof(mm_idx_bucket_t));
|
||||
return mi;
|
||||
}
|
||||
|
||||
void mm_idx_destroy(mm_idx_t *mi)
|
||||
{
|
||||
int i;
|
||||
if (mi == 0) return;
|
||||
for (i = 0; i < 1<<mi->b; ++i) {
|
||||
free(mi->B[i].p);
|
||||
free(mi->B[i].a.a);
|
||||
kh_destroy(idx, (idxhash_t*)mi->B[i].h);
|
||||
}
|
||||
for (i = 0; i < mi->n_seq; ++i)
|
||||
free(mi->seq[i].name);
|
||||
free(mi->seq); free(mi->B); free(mi);
|
||||
}
|
||||
|
||||
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n)
|
||||
{
|
||||
int mask = (1<<mi->b) - 1;
|
||||
khint_t k;
|
||||
mm_idx_bucket_t *b = &mi->B[minier&mask];
|
||||
idxhash_t *h = (idxhash_t*)b->h;
|
||||
*n = 0;
|
||||
if (h == 0) return 0;
|
||||
k = kh_get(idx, h, minier>>mi->b<<1);
|
||||
if (k == kh_end(h)) return 0;
|
||||
if (kh_key(h, k)&1) {
|
||||
*n = 1;
|
||||
return &kh_val(h, k);
|
||||
} else {
|
||||
*n = (uint32_t)kh_val(h, k);
|
||||
return &b->p[kh_val(h, k)>>32];
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f)
|
||||
{
|
||||
int i;
|
||||
size_t n = 0;
|
||||
uint32_t thres;
|
||||
khint_t *a, k;
|
||||
if (f <= 0.) return UINT32_MAX;
|
||||
for (i = 0; i < 1<<mi->b; ++i)
|
||||
if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h);
|
||||
a = (uint32_t*)malloc(n * 4);
|
||||
for (i = n = 0; i < 1<<mi->b; ++i) {
|
||||
idxhash_t *h = (idxhash_t*)mi->B[i].h;
|
||||
if (h == 0) continue;
|
||||
for (k = 0; k < kh_end(h); ++k) {
|
||||
if (!kh_exist(h, k)) continue;
|
||||
a[n++] = kh_key(h, k)&1? 1 : (uint32_t)kh_val(h, k);
|
||||
}
|
||||
}
|
||||
thres = ks_ksmall_uint32_t(n, a, (uint32_t)((1. - f) * n)) + 1;
|
||||
free(a);
|
||||
return thres;
|
||||
}
|
||||
|
||||
/*********************************
|
||||
* Sort and generate hash tables *
|
||||
*********************************/
|
||||
|
||||
static void worker_post(void *g, long i, int tid)
|
||||
{
|
||||
int j, start_a, start_p, n, n_keys;
|
||||
idxhash_t *h;
|
||||
mm_idx_t *mi = (mm_idx_t*)g;
|
||||
mm_idx_bucket_t *b = &mi->B[i];
|
||||
if (b->a.n == 0) return;
|
||||
|
||||
// sort by minimizer
|
||||
radix_sort_128x(b->a.a, b->a.a + b->a.n);
|
||||
|
||||
// count and preallocate
|
||||
for (j = 1, n = 1, n_keys = 0, b->n = 0; j <= b->a.n; ++j) {
|
||||
if (j == b->a.n || b->a.a[j].x>>8 != b->a.a[j-1].x>>8) {
|
||||
++n_keys;
|
||||
if (n > 1) b->n += n;
|
||||
n = 1;
|
||||
} else ++n;
|
||||
}
|
||||
h = kh_init(idx);
|
||||
kh_resize(idx, h, n_keys);
|
||||
b->p = (uint64_t*)calloc(b->n, 8);
|
||||
|
||||
// create the hash table
|
||||
for (j = 1, n = 1, start_a = start_p = 0; j <= b->a.n; ++j) {
|
||||
if (j == b->a.n || b->a.a[j].x>>8 != b->a.a[j-1].x>>8) {
|
||||
khint_t itr;
|
||||
int absent;
|
||||
mm128_t *p = &b->a.a[j-1];
|
||||
itr = kh_put(idx, h, p->x>>8>>mi->b<<1, &absent);
|
||||
assert(absent && j - start_a == n);
|
||||
if (n == 1) {
|
||||
kh_key(h, itr) |= 1;
|
||||
kh_val(h, itr) = p->y;
|
||||
} else {
|
||||
int k;
|
||||
for (k = 0; k < n; ++k)
|
||||
b->p[start_p + k] = b->a.a[start_a + k].y;
|
||||
kh_val(h, itr) = (uint64_t)start_p<<32 | n;
|
||||
start_p += n;
|
||||
}
|
||||
start_a = j, n = 1;
|
||||
} else ++n;
|
||||
}
|
||||
b->h = h;
|
||||
assert(b->n == start_p);
|
||||
|
||||
// deallocate and clear b->a
|
||||
free(b->a.a);
|
||||
b->a.n = b->a.m = 0, b->a.a = 0;
|
||||
}
|
||||
|
||||
static void mm_idx_post(mm_idx_t *mi, int n_threads)
|
||||
{
|
||||
kt_for(n_threads, worker_post, mi, 1<<mi->b);
|
||||
}
|
||||
|
||||
/******************
|
||||
* Generate index *
|
||||
******************/
|
||||
|
||||
#include <string.h>
|
||||
#include <zlib.h>
|
||||
#include "bseq.h"
|
||||
|
||||
typedef struct {
|
||||
int mini_batch_size, keep_name, is_hpc;
|
||||
uint64_t batch_size;
|
||||
bseq_file_t *fp;
|
||||
mm_idx_t *mi;
|
||||
} pipeline_t;
|
||||
|
||||
typedef struct {
|
||||
int n_seq;
|
||||
bseq1_t *seq;
|
||||
mm128_v a;
|
||||
} step_t;
|
||||
|
||||
static void mm_idx_add(mm_idx_t *mi, int n, const mm128_t *a)
|
||||
{
|
||||
int i, mask = (1<<mi->b) - 1;
|
||||
for (i = 0; i < n; ++i) {
|
||||
mm128_v *p = &mi->B[a[i].x&mask].a;
|
||||
kv_push(mm128_t, 0, *p, a[i]);
|
||||
}
|
||||
}
|
||||
|
||||
static void *worker_pipeline(void *shared, int step, void *in)
|
||||
{
|
||||
int i;
|
||||
pipeline_t *p = (pipeline_t*)shared;
|
||||
if (step == 0) { // step 0: read sequences
|
||||
step_t *s;
|
||||
if (p->mi->sum_len > p->batch_size) return 0;
|
||||
s = (step_t*)calloc(1, sizeof(step_t));
|
||||
s->seq = bseq_read(p->fp, p->mini_batch_size, &s->n_seq); // read a mini-batch
|
||||
if (s->seq) {
|
||||
uint32_t old_m, m;
|
||||
uint64_t sum_len, old_max_len, max_len;
|
||||
assert((uint64_t)p->mi->n_seq + s->n_seq <= UINT32_MAX); // to prevent integer overflow
|
||||
// make room for p->mi->seq
|
||||
old_m = p->mi->n_seq, m = p->mi->n_seq + s->n_seq;
|
||||
kroundup32(m); kroundup32(old_m);
|
||||
if (old_m != m)
|
||||
p->mi->seq = (mm_idx_seq_t*)realloc(p->mi->seq, m * sizeof(mm_idx_seq_t));
|
||||
// make room for p->mi->S
|
||||
for (i = 0, sum_len = 0; i < s->n_seq; ++i) sum_len += s->seq[i].l_seq;
|
||||
old_max_len = (p->mi->sum_len + 7) / 8;
|
||||
max_len = (p->mi->sum_len + sum_len + 7) / 8;
|
||||
kroundup64(old_max_len); kroundup64(max_len);
|
||||
if (old_max_len != max_len) {
|
||||
p->mi->S = (uint32_t*)realloc(p->mi->S, max_len * 4);
|
||||
memset(&p->mi->S[old_max_len], 0, 4 * (max_len - old_max_len));
|
||||
}
|
||||
// populate p->mi->seq
|
||||
for (i = 0; i < s->n_seq; ++i) {
|
||||
mm_idx_seq_t *seq = &p->mi->seq[p->mi->n_seq++];
|
||||
uint32_t j;
|
||||
if (p->keep_name) {
|
||||
assert(strlen(s->seq[i].name) <= 254); // a long query name breaks BAM
|
||||
seq->name = strdup(s->seq[i].name);
|
||||
} else seq->name = 0;
|
||||
seq->len = s->seq[i].l_seq;
|
||||
seq->offset = p->mi->sum_len;
|
||||
// copy the sequence
|
||||
for (j = 0; j < seq->len; ++j) { // TODO: this is not the fastest way, but let's first see if speed matters here
|
||||
uint64_t o = p->mi->sum_len + j;
|
||||
int c = seq_nt4_table[(uint8_t)s->seq[i].seq[j]];
|
||||
mm_seq4_set(p->mi->S, o, c);
|
||||
}
|
||||
// update p->mi->{sum_len,n_seq}
|
||||
p->mi->sum_len += seq->len;
|
||||
s->seq[i].rid = p->mi->n_seq++;
|
||||
}
|
||||
return s;
|
||||
} else free(s);
|
||||
} else if (step == 1) { // step 1: compute sketch
|
||||
step_t *s = (step_t*)in;
|
||||
for (i = 0; i < s->n_seq; ++i) {
|
||||
bseq1_t *t = &s->seq[i];
|
||||
mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->is_hpc, &s->a);
|
||||
free(t->seq); free(t->name);
|
||||
}
|
||||
free(s->seq); s->seq = 0;
|
||||
return s;
|
||||
} else if (step == 2) { // dispatch sketch to buckets
|
||||
step_t *s = (step_t*)in;
|
||||
mm_idx_add(p->mi, s->a.n, s->a.a);
|
||||
free(s->a.a); free(s);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
mm_idx_t *mm_idx_gen(bseq_file_t *fp, int w, int k, int b, int is_hpc, int mini_batch_size, int n_threads, uint64_t batch_size, int keep_name)
|
||||
{
|
||||
pipeline_t pl;
|
||||
memset(&pl, 0, sizeof(pipeline_t));
|
||||
pl.mini_batch_size = mini_batch_size;
|
||||
pl.keep_name = keep_name;
|
||||
pl.is_hpc = is_hpc;
|
||||
pl.batch_size = batch_size;
|
||||
pl.fp = fp;
|
||||
if (pl.fp == 0) return 0;
|
||||
pl.mi = mm_idx_init(w, k, b);
|
||||
|
||||
kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3);
|
||||
if (mm_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s::%.3f*%.2f] collected minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
|
||||
|
||||
mm_idx_post(pl.mi, n_threads);
|
||||
if (mm_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s::%.3f*%.2f] sorted minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
|
||||
|
||||
return pl.mi;
|
||||
}
|
||||
|
||||
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads) // a simpler interface
|
||||
{
|
||||
bseq_file_t *fp;
|
||||
mm_idx_t *mi;
|
||||
fp = bseq_open(fn);
|
||||
if (fp == 0) return 0;
|
||||
mi = mm_idx_gen(fp, w, k, MM_IDX_DEF_B, is_hpc, 1<<18, n_threads, UINT64_MAX, 1);
|
||||
bseq_close(fp);
|
||||
return mi;
|
||||
}
|
||||
|
||||
/*************
|
||||
* index I/O *
|
||||
*************/
|
||||
|
||||
#define MM_IDX_MAGIC "MMI\2"
|
||||
|
||||
void mm_idx_dump(FILE *fp, const mm_idx_t *mi)
|
||||
{
|
||||
uint32_t x[4];
|
||||
int i;
|
||||
x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n_seq;
|
||||
fwrite(MM_IDX_MAGIC, 1, 4, fp);
|
||||
fwrite(x, 4, 4, fp);
|
||||
for (i = 0; i < mi->n_seq; ++i) {
|
||||
uint8_t l;
|
||||
l = strlen(mi->seq[i].name);
|
||||
fwrite(&l, 1, 1, fp);
|
||||
fwrite(mi->seq[i].name, 1, l, fp);
|
||||
fwrite(&mi->seq[i].len, 4, 1, fp);
|
||||
}
|
||||
for (i = 0; i < 1<<mi->b; ++i) {
|
||||
mm_idx_bucket_t *b = &mi->B[i];
|
||||
khint_t k;
|
||||
idxhash_t *h = (idxhash_t*)b->h;
|
||||
uint32_t size = h? h->size : 0;
|
||||
fwrite(&b->n, 4, 1, fp);
|
||||
fwrite(b->p, 8, b->n, fp);
|
||||
fwrite(&size, 4, 1, fp);
|
||||
if (size == 0) continue;
|
||||
for (k = 0; k < kh_end(h); ++k) {
|
||||
uint64_t x[2];
|
||||
if (!kh_exist(h, k)) continue;
|
||||
x[0] = kh_key(h, k), x[1] = kh_val(h, k);
|
||||
fwrite(x, 8, 2, fp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mm_idx_t *mm_idx_load(FILE *fp)
|
||||
{
|
||||
int i;
|
||||
char magic[4];
|
||||
uint32_t x[4];
|
||||
mm_idx_t *mi;
|
||||
if (fread(magic, 1, 4, fp) != 4) return 0;
|
||||
if (strncmp(magic, MM_IDX_MAGIC, 4) != 0) return 0;
|
||||
if (fread(x, 4, 4, fp) != 6) return 0;
|
||||
mi = mm_idx_init(x[0], x[1], x[2]);
|
||||
mi->n_seq = x[3];
|
||||
mi->seq = (mm_idx_seq_t*)calloc(mi->n_seq, sizeof(mm_idx_seq_t));
|
||||
for (i = 0; i < mi->n_seq; ++i) {
|
||||
uint8_t l;
|
||||
fread(&l, 1, 1, fp);
|
||||
mi->seq[i].name = (char*)malloc(l + 1);
|
||||
fread(mi->seq[i].name, 1, l, fp);
|
||||
mi->seq[i].name[l] = 0;
|
||||
fread(&mi->seq[i].len, 4, 1, fp);
|
||||
}
|
||||
for (i = 0; i < 1<<mi->b; ++i) {
|
||||
mm_idx_bucket_t *b = &mi->B[i];
|
||||
uint32_t j, size;
|
||||
khint_t k;
|
||||
idxhash_t *h;
|
||||
fread(&b->n, 4, 1, fp);
|
||||
b->p = (uint64_t*)malloc(b->n * 8);
|
||||
fread(b->p, 8, b->n, fp);
|
||||
fread(&size, 4, 1, fp);
|
||||
if (size == 0) continue;
|
||||
b->h = h = kh_init(idx);
|
||||
kh_resize(idx, h, size);
|
||||
for (j = 0; j < size; ++j) {
|
||||
uint64_t x[2];
|
||||
int absent;
|
||||
fread(x, 8, 2, fp);
|
||||
k = kh_put(idx, h, x[0], &absent);
|
||||
assert(absent);
|
||||
kh_val(h, k) = x[1];
|
||||
}
|
||||
}
|
||||
return mi;
|
||||
}
|
||||
|
|
@ -0,0 +1,615 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
An example:
|
||||
|
||||
#include "khash.h"
|
||||
KHASH_MAP_INIT_INT(32, char)
|
||||
int main() {
|
||||
int ret, is_missing;
|
||||
khiter_t k;
|
||||
khash_t(32) *h = kh_init(32);
|
||||
k = kh_put(32, h, 5, &ret);
|
||||
kh_value(h, k) = 10;
|
||||
k = kh_get(32, h, 10);
|
||||
is_missing = (k == kh_end(h));
|
||||
k = kh_get(32, h, 5);
|
||||
kh_del(32, h, k);
|
||||
for (k = kh_begin(h); k != kh_end(h); ++k)
|
||||
if (kh_exist(h, k)) kh_value(h, k) = 1;
|
||||
kh_destroy(32, h);
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
2013-05-02 (0.2.8):
|
||||
|
||||
* Use quadratic probing. When the capacity is power of 2, stepping function
|
||||
i*(i+1)/2 guarantees to traverse each bucket. It is better than double
|
||||
hashing on cache performance and is more robust than linear probing.
|
||||
|
||||
In theory, double hashing should be more robust than quadratic probing.
|
||||
However, my implementation is probably not for large hash tables, because
|
||||
the second hash function is closely tied to the first hash function,
|
||||
which reduce the effectiveness of double hashing.
|
||||
|
||||
Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php
|
||||
|
||||
2011-12-29 (0.2.7):
|
||||
|
||||
* Minor code clean up; no actual effect.
|
||||
|
||||
2011-09-16 (0.2.6):
|
||||
|
||||
* The capacity is a power of 2. This seems to dramatically improve the
|
||||
speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
|
||||
|
||||
- http://code.google.com/p/ulib/
|
||||
- http://nothings.org/computer/judy/
|
||||
|
||||
* Allow to optionally use linear probing which usually has better
|
||||
performance for random input. Double hashing is still the default as it
|
||||
is more robust to certain non-random input.
|
||||
|
||||
* Added Wang's integer hash function (not used by default). This hash
|
||||
function is more robust to certain non-random input.
|
||||
|
||||
2011-02-14 (0.2.5):
|
||||
|
||||
* Allow to declare global functions.
|
||||
|
||||
2009-09-26 (0.2.4):
|
||||
|
||||
* Improve portability
|
||||
|
||||
2008-09-19 (0.2.3):
|
||||
|
||||
* Corrected the example
|
||||
* Improved interfaces
|
||||
|
||||
2008-09-11 (0.2.2):
|
||||
|
||||
* Improved speed a little in kh_put()
|
||||
|
||||
2008-09-10 (0.2.1):
|
||||
|
||||
* Added kh_clear()
|
||||
* Fixed a compiling error
|
||||
|
||||
2008-09-02 (0.2.0):
|
||||
|
||||
* Changed to token concatenation which increases flexibility.
|
||||
|
||||
2008-08-31 (0.1.2):
|
||||
|
||||
* Fixed a bug in kh_get(), which has not been tested previously.
|
||||
|
||||
2008-08-31 (0.1.1):
|
||||
|
||||
* Added destructor
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __AC_KHASH_H
|
||||
#define __AC_KHASH_H
|
||||
|
||||
/*!
|
||||
@header
|
||||
|
||||
Generic hash table library.
|
||||
*/
|
||||
|
||||
#define AC_VERSION_KHASH_H "0.2.8"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
#include "kalloc.h"
|
||||
|
||||
/* compiler specific configuration */
|
||||
|
||||
#if UINT_MAX == 0xffffffffu
|
||||
typedef unsigned int khint32_t;
|
||||
#elif ULONG_MAX == 0xffffffffu
|
||||
typedef unsigned long khint32_t;
|
||||
#endif
|
||||
|
||||
#if ULONG_MAX == ULLONG_MAX
|
||||
typedef unsigned long khint64_t;
|
||||
#else
|
||||
typedef unsigned long long khint64_t;
|
||||
#endif
|
||||
|
||||
#ifndef kh_inline
|
||||
#ifdef _MSC_VER
|
||||
#define kh_inline __inline
|
||||
#else
|
||||
#define kh_inline inline
|
||||
#endif
|
||||
#endif /* kh_inline */
|
||||
|
||||
#ifndef klib_unused
|
||||
#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
|
||||
#define klib_unused __attribute__ ((__unused__))
|
||||
#else
|
||||
#define klib_unused
|
||||
#endif
|
||||
#endif /* klib_unused */
|
||||
|
||||
typedef khint32_t khint_t;
|
||||
typedef khint_t khiter_t;
|
||||
|
||||
#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
|
||||
#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
|
||||
#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
|
||||
#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
|
||||
#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
|
||||
#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
|
||||
#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
|
||||
|
||||
#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
|
||||
|
||||
#ifndef kroundup32
|
||||
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
|
||||
#endif
|
||||
|
||||
static const double __ac_HASH_UPPER = 0.77;
|
||||
|
||||
#define __KHASH_TYPE(name, khkey_t, khval_t) \
|
||||
typedef struct kh_##name##_s { \
|
||||
khint_t n_buckets, size, n_occupied, upper_bound; \
|
||||
khint32_t *flags; \
|
||||
khkey_t *keys; \
|
||||
khval_t *vals; \
|
||||
} kh_##name##_t;
|
||||
|
||||
#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
|
||||
extern kh_##name##_t *kh_init_##name(void); \
|
||||
extern void kh_destroy_##name(kh_##name##_t *h); \
|
||||
extern void kh_clear_##name(kh_##name##_t *h); \
|
||||
extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
|
||||
extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
|
||||
extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
|
||||
extern void kh_del_##name(kh_##name##_t *h, khint_t x);
|
||||
|
||||
#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
|
||||
SCOPE kh_##name##_t *kh_init_##name(void) { \
|
||||
return (kh_##name##_t*)kcalloc(0, 1, sizeof(kh_##name##_t)); \
|
||||
} \
|
||||
SCOPE void kh_destroy_##name(kh_##name##_t *h) \
|
||||
{ \
|
||||
if (h) { \
|
||||
kfree(0, (void *)h->keys); kfree(0, h->flags); \
|
||||
kfree(0, (void *)h->vals); \
|
||||
kfree(0, h); \
|
||||
} \
|
||||
} \
|
||||
SCOPE void kh_clear_##name(kh_##name##_t *h) \
|
||||
{ \
|
||||
if (h && h->flags) { \
|
||||
memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
|
||||
h->size = h->n_occupied = 0; \
|
||||
} \
|
||||
} \
|
||||
SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
|
||||
{ \
|
||||
if (h->n_buckets) { \
|
||||
khint_t k, i, last, mask, step = 0; \
|
||||
mask = h->n_buckets - 1; \
|
||||
k = __hash_func(key); i = k & mask; \
|
||||
last = i; \
|
||||
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
|
||||
i = (i + (++step)) & mask; \
|
||||
if (i == last) return h->n_buckets; \
|
||||
} \
|
||||
return __ac_iseither(h->flags, i)? h->n_buckets : i; \
|
||||
} else return 0; \
|
||||
} \
|
||||
SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
|
||||
{ /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
|
||||
khint32_t *new_flags = 0; \
|
||||
khint_t j = 1; \
|
||||
{ \
|
||||
kroundup32(new_n_buckets); \
|
||||
if (new_n_buckets < 4) new_n_buckets = 4; \
|
||||
if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
|
||||
else { /* hash table size to be changed (shrink or expand); rehash */ \
|
||||
new_flags = (khint32_t*)kmalloc(0, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
|
||||
if (!new_flags) return -1; \
|
||||
memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
|
||||
if (h->n_buckets < new_n_buckets) { /* expand */ \
|
||||
khkey_t *new_keys = (khkey_t*)krealloc(0, (void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
|
||||
if (!new_keys) { kfree(0, new_flags); return -1; } \
|
||||
h->keys = new_keys; \
|
||||
if (kh_is_map) { \
|
||||
khval_t *new_vals = (khval_t*)krealloc(0, (void *)h->vals, new_n_buckets * sizeof(khval_t)); \
|
||||
if (!new_vals) { kfree(0, new_flags); return -1; } \
|
||||
h->vals = new_vals; \
|
||||
} \
|
||||
} /* otherwise shrink */ \
|
||||
} \
|
||||
} \
|
||||
if (j) { /* rehashing is needed */ \
|
||||
for (j = 0; j != h->n_buckets; ++j) { \
|
||||
if (__ac_iseither(h->flags, j) == 0) { \
|
||||
khkey_t key = h->keys[j]; \
|
||||
khval_t val; \
|
||||
khint_t new_mask; \
|
||||
new_mask = new_n_buckets - 1; \
|
||||
if (kh_is_map) val = h->vals[j]; \
|
||||
__ac_set_isdel_true(h->flags, j); \
|
||||
while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
|
||||
khint_t k, i, step = 0; \
|
||||
k = __hash_func(key); \
|
||||
i = k & new_mask; \
|
||||
while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
|
||||
__ac_set_isempty_false(new_flags, i); \
|
||||
if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
|
||||
{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
|
||||
if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
|
||||
__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
|
||||
} else { /* write the element and jump out of the loop */ \
|
||||
h->keys[i] = key; \
|
||||
if (kh_is_map) h->vals[i] = val; \
|
||||
break; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
|
||||
h->keys = (khkey_t*)krealloc(0, (void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
|
||||
if (kh_is_map) h->vals = (khval_t*)krealloc(0, (void *)h->vals, new_n_buckets * sizeof(khval_t)); \
|
||||
} \
|
||||
kfree(0, h->flags); /* free the working space */ \
|
||||
h->flags = new_flags; \
|
||||
h->n_buckets = new_n_buckets; \
|
||||
h->n_occupied = h->size; \
|
||||
h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
|
||||
} \
|
||||
return 0; \
|
||||
} \
|
||||
SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
|
||||
{ \
|
||||
khint_t x; \
|
||||
if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
|
||||
if (h->n_buckets > (h->size<<1)) { \
|
||||
if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
|
||||
*ret = -1; return h->n_buckets; \
|
||||
} \
|
||||
} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
|
||||
*ret = -1; return h->n_buckets; \
|
||||
} \
|
||||
} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
|
||||
{ \
|
||||
khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
|
||||
x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
|
||||
if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
|
||||
else { \
|
||||
last = i; \
|
||||
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
|
||||
if (__ac_isdel(h->flags, i)) site = i; \
|
||||
i = (i + (++step)) & mask; \
|
||||
if (i == last) { x = site; break; } \
|
||||
} \
|
||||
if (x == h->n_buckets) { \
|
||||
if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
|
||||
else x = i; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
if (__ac_isempty(h->flags, x)) { /* not present at all */ \
|
||||
h->keys[x] = key; \
|
||||
__ac_set_isboth_false(h->flags, x); \
|
||||
++h->size; ++h->n_occupied; \
|
||||
*ret = 1; \
|
||||
} else if (__ac_isdel(h->flags, x)) { /* deleted */ \
|
||||
h->keys[x] = key; \
|
||||
__ac_set_isboth_false(h->flags, x); \
|
||||
++h->size; \
|
||||
*ret = 2; \
|
||||
} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
|
||||
return x; \
|
||||
} \
|
||||
SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
|
||||
{ \
|
||||
if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
|
||||
__ac_set_isdel_true(h->flags, x); \
|
||||
--h->size; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define KHASH_DECLARE(name, khkey_t, khval_t) \
|
||||
__KHASH_TYPE(name, khkey_t, khval_t) \
|
||||
__KHASH_PROTOTYPES(name, khkey_t, khval_t)
|
||||
|
||||
#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
|
||||
__KHASH_TYPE(name, khkey_t, khval_t) \
|
||||
__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
|
||||
|
||||
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
|
||||
KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
|
||||
|
||||
/* --- BEGIN OF HASH FUNCTIONS --- */
|
||||
|
||||
/*! @function
|
||||
@abstract Integer hash function
|
||||
@param key The integer [khint32_t]
|
||||
@return The hash value [khint_t]
|
||||
*/
|
||||
#define kh_int_hash_func(key) (khint32_t)(key)
|
||||
/*! @function
|
||||
@abstract Integer comparison function
|
||||
*/
|
||||
#define kh_int_hash_equal(a, b) ((a) == (b))
|
||||
/*! @function
|
||||
@abstract 64-bit integer hash function
|
||||
@param key The integer [khint64_t]
|
||||
@return The hash value [khint_t]
|
||||
*/
|
||||
#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
|
||||
/*! @function
|
||||
@abstract 64-bit integer comparison function
|
||||
*/
|
||||
#define kh_int64_hash_equal(a, b) ((a) == (b))
|
||||
/*! @function
|
||||
@abstract const char* hash function
|
||||
@param s Pointer to a null terminated string
|
||||
@return The hash value
|
||||
*/
|
||||
static kh_inline khint_t __ac_X31_hash_string(const char *s)
|
||||
{
|
||||
khint_t h = (khint_t)*s;
|
||||
if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
|
||||
return h;
|
||||
}
|
||||
/*! @function
|
||||
@abstract Another interface to const char* hash function
|
||||
@param key Pointer to a null terminated string [const char*]
|
||||
@return The hash value [khint_t]
|
||||
*/
|
||||
#define kh_str_hash_func(key) __ac_X31_hash_string(key)
|
||||
/*! @function
|
||||
@abstract Const char* comparison function
|
||||
*/
|
||||
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
|
||||
|
||||
static kh_inline khint_t __ac_Wang_hash(khint_t key)
|
||||
{
|
||||
key += ~(key << 15);
|
||||
key ^= (key >> 10);
|
||||
key += (key << 3);
|
||||
key ^= (key >> 6);
|
||||
key += ~(key << 11);
|
||||
key ^= (key >> 16);
|
||||
return key;
|
||||
}
|
||||
#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key)
|
||||
|
||||
/* --- END OF HASH FUNCTIONS --- */
|
||||
|
||||
/* Other convenient macros... */
|
||||
|
||||
/*!
|
||||
@abstract Type of the hash table.
|
||||
@param name Name of the hash table [symbol]
|
||||
*/
|
||||
#define khash_t(name) kh_##name##_t
|
||||
|
||||
/*! @function
|
||||
@abstract Initiate a hash table.
|
||||
@param name Name of the hash table [symbol]
|
||||
@return Pointer to the hash table [khash_t(name)*]
|
||||
*/
|
||||
#define kh_init(name) kh_init_##name()
|
||||
|
||||
/*! @function
|
||||
@abstract Destroy a hash table.
|
||||
@param name Name of the hash table [symbol]
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
*/
|
||||
#define kh_destroy(name, h) kh_destroy_##name(h)
|
||||
|
||||
/*! @function
|
||||
@abstract Reset a hash table without deallocating memory.
|
||||
@param name Name of the hash table [symbol]
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
*/
|
||||
#define kh_clear(name, h) kh_clear_##name(h)
|
||||
|
||||
/*! @function
|
||||
@abstract Resize a hash table.
|
||||
@param name Name of the hash table [symbol]
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param s New size [khint_t]
|
||||
*/
|
||||
#define kh_resize(name, h, s) kh_resize_##name(h, s)
|
||||
|
||||
/*! @function
|
||||
@abstract Insert a key to the hash table.
|
||||
@param name Name of the hash table [symbol]
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param k Key [type of keys]
|
||||
@param r Extra return code: -1 if the operation failed;
|
||||
0 if the key is present in the hash table;
|
||||
1 if the bucket is empty (never used); 2 if the element in
|
||||
the bucket has been deleted [int*]
|
||||
@return Iterator to the inserted element [khint_t]
|
||||
*/
|
||||
#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
|
||||
|
||||
/*! @function
|
||||
@abstract Retrieve a key from the hash table.
|
||||
@param name Name of the hash table [symbol]
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param k Key [type of keys]
|
||||
@return Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
|
||||
*/
|
||||
#define kh_get(name, h, k) kh_get_##name(h, k)
|
||||
|
||||
/*! @function
|
||||
@abstract Remove a key from the hash table.
|
||||
@param name Name of the hash table [symbol]
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param k Iterator to the element to be deleted [khint_t]
|
||||
*/
|
||||
#define kh_del(name, h, k) kh_del_##name(h, k)
|
||||
|
||||
/*! @function
|
||||
@abstract Test whether a bucket contains data.
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param x Iterator to the bucket [khint_t]
|
||||
@return 1 if containing data; 0 otherwise [int]
|
||||
*/
|
||||
#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
|
||||
|
||||
/*! @function
|
||||
@abstract Get key given an iterator
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param x Iterator to the bucket [khint_t]
|
||||
@return Key [type of keys]
|
||||
*/
|
||||
#define kh_key(h, x) ((h)->keys[x])
|
||||
|
||||
/*! @function
|
||||
@abstract Get value given an iterator
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param x Iterator to the bucket [khint_t]
|
||||
@return Value [type of values]
|
||||
@discussion For hash sets, calling this results in segfault.
|
||||
*/
|
||||
#define kh_val(h, x) ((h)->vals[x])
|
||||
|
||||
/*! @function
|
||||
@abstract Alias of kh_val()
|
||||
*/
|
||||
#define kh_value(h, x) ((h)->vals[x])
|
||||
|
||||
/*! @function
|
||||
@abstract Get the start iterator
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@return The start iterator [khint_t]
|
||||
*/
|
||||
#define kh_begin(h) (khint_t)(0)
|
||||
|
||||
/*! @function
|
||||
@abstract Get the end iterator
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@return The end iterator [khint_t]
|
||||
*/
|
||||
#define kh_end(h) ((h)->n_buckets)
|
||||
|
||||
/*! @function
|
||||
@abstract Get the number of elements in the hash table
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@return Number of elements in the hash table [khint_t]
|
||||
*/
|
||||
#define kh_size(h) ((h)->size)
|
||||
|
||||
/*! @function
|
||||
@abstract Get the number of buckets in the hash table
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@return Number of buckets in the hash table [khint_t]
|
||||
*/
|
||||
#define kh_n_buckets(h) ((h)->n_buckets)
|
||||
|
||||
/*! @function
|
||||
@abstract Iterate over the entries in the hash table
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param kvar Variable to which key will be assigned
|
||||
@param vvar Variable to which value will be assigned
|
||||
@param code Block of code to execute
|
||||
*/
|
||||
#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
|
||||
for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
|
||||
if (!kh_exist(h,__i)) continue; \
|
||||
(kvar) = kh_key(h,__i); \
|
||||
(vvar) = kh_val(h,__i); \
|
||||
code; \
|
||||
} }
|
||||
|
||||
/*! @function
|
||||
@abstract Iterate over the values in the hash table
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param vvar Variable to which value will be assigned
|
||||
@param code Block of code to execute
|
||||
*/
|
||||
#define kh_foreach_value(h, vvar, code) { khint_t __i; \
|
||||
for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
|
||||
if (!kh_exist(h,__i)) continue; \
|
||||
(vvar) = kh_val(h,__i); \
|
||||
code; \
|
||||
} }
|
||||
|
||||
/* More conenient interfaces */
|
||||
|
||||
/*! @function
|
||||
@abstract Instantiate a hash set containing integer keys
|
||||
@param name Name of the hash table [symbol]
|
||||
*/
|
||||
#define KHASH_SET_INIT_INT(name) \
|
||||
KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
|
||||
|
||||
/*! @function
|
||||
@abstract Instantiate a hash map containing integer keys
|
||||
@param name Name of the hash table [symbol]
|
||||
@param khval_t Type of values [type]
|
||||
*/
|
||||
#define KHASH_MAP_INIT_INT(name, khval_t) \
|
||||
KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
|
||||
|
||||
/*! @function
|
||||
@abstract Instantiate a hash map containing 64-bit integer keys
|
||||
@param name Name of the hash table [symbol]
|
||||
*/
|
||||
#define KHASH_SET_INIT_INT64(name) \
|
||||
KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
|
||||
|
||||
/*! @function
|
||||
@abstract Instantiate a hash map containing 64-bit integer keys
|
||||
@param name Name of the hash table [symbol]
|
||||
@param khval_t Type of values [type]
|
||||
*/
|
||||
#define KHASH_MAP_INIT_INT64(name, khval_t) \
|
||||
KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
|
||||
|
||||
typedef const char *kh_cstr_t;
|
||||
/*! @function
|
||||
@abstract Instantiate a hash map containing const char* keys
|
||||
@param name Name of the hash table [symbol]
|
||||
*/
|
||||
#define KHASH_SET_INIT_STR(name) \
|
||||
KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
|
||||
|
||||
/*! @function
|
||||
@abstract Instantiate a hash map containing const char* keys
|
||||
@param name Name of the hash table [symbol]
|
||||
@param khval_t Type of values [type]
|
||||
*/
|
||||
#define KHASH_MAP_INIT_STR(name, khval_t) \
|
||||
KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
|
||||
|
||||
#endif /* __AC_KHASH_H */
|
||||
|
|
@ -0,0 +1,159 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2008, 2011 Attractive Chaos <attractor@live.co.uk>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
// This is a simplified version of ksort.h
|
||||
|
||||
#ifndef AC_KSORT_H
|
||||
#define AC_KSORT_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
typedef struct {
|
||||
void *left, *right;
|
||||
int depth;
|
||||
} ks_isort_stack_t;
|
||||
|
||||
#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; }
|
||||
|
||||
#define KSORT_INIT(name, type_t, __sort_lt) \
|
||||
size_t ks_lis_##name(size_t n, const type_t *a, size_t *b, size_t *_p) \
|
||||
{ /* translated from: http://www.algorithmist.com/index.php/Longest_Increasing_Subsequence.cpp */ \
|
||||
size_t i, u, v, *top = b, *p; \
|
||||
if (n == 0) return 0; \
|
||||
p = _p? _p : (size_t*)calloc(n, sizeof(size_t)); \
|
||||
*top++ = 0; \
|
||||
for (i = 1; i < n; i++) { \
|
||||
if (__sort_lt(a[*(top-1)], a[i])) { \
|
||||
p[i] = *(top-1); \
|
||||
*top++ = i; \
|
||||
continue; \
|
||||
} \
|
||||
for (u = 0, v = top - b - 1; u < v;) { \
|
||||
size_t c = (u + v) >> 1; \
|
||||
if (__sort_lt(a[b[c]], a[i])) u = c + 1; \
|
||||
else v = c; \
|
||||
} \
|
||||
if (__sort_lt(a[i], a[b[u]])) { \
|
||||
if (u > 0) p[i] = b[u-1]; \
|
||||
b[u] = i; \
|
||||
} \
|
||||
} \
|
||||
for (u = top - b, v = *(top-1); u--; v = p[v]) b[u] = v; \
|
||||
if (!_p) free(p); \
|
||||
return top - b; \
|
||||
} \
|
||||
type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \
|
||||
{ \
|
||||
type_t *low, *high, *k, *ll, *hh, *mid; \
|
||||
low = arr; high = arr + n - 1; k = arr + kk; \
|
||||
for (;;) { \
|
||||
if (high <= low) return *k; \
|
||||
if (high == low + 1) { \
|
||||
if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
|
||||
return *k; \
|
||||
} \
|
||||
mid = low + (high - low) / 2; \
|
||||
if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
|
||||
if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
|
||||
if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \
|
||||
KSORT_SWAP(type_t, *mid, *(low+1)); \
|
||||
ll = low + 1; hh = high; \
|
||||
for (;;) { \
|
||||
do ++ll; while (__sort_lt(*ll, *low)); \
|
||||
do --hh; while (__sort_lt(*low, *hh)); \
|
||||
if (hh < ll) break; \
|
||||
KSORT_SWAP(type_t, *ll, *hh); \
|
||||
} \
|
||||
KSORT_SWAP(type_t, *low, *hh); \
|
||||
if (hh <= k) low = ll; \
|
||||
if (hh >= k) high = hh - 1; \
|
||||
} \
|
||||
} \
|
||||
|
||||
#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
|
||||
|
||||
#define ks_lt_generic(a, b) ((a) < (b))
|
||||
#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
|
||||
|
||||
typedef const char *ksstr_t;
|
||||
|
||||
#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
|
||||
#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
|
||||
|
||||
#define RS_MIN_SIZE 64
|
||||
|
||||
#define KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key) \
|
||||
typedef struct { \
|
||||
rstype_t *b, *e; \
|
||||
} rsbucket_##name##_t; \
|
||||
void rs_insertsort_##name(rstype_t *beg, rstype_t *end) \
|
||||
{ \
|
||||
rstype_t *i; \
|
||||
for (i = beg + 1; i < end; ++i) \
|
||||
if (rskey(*i) < rskey(*(i - 1))) { \
|
||||
rstype_t *j, tmp = *i; \
|
||||
for (j = i; j > beg && rskey(tmp) < rskey(*(j-1)); --j) \
|
||||
*j = *(j - 1); \
|
||||
*j = tmp; \
|
||||
} \
|
||||
} \
|
||||
void rs_sort_##name(rstype_t *beg, rstype_t *end, int n_bits, int s) \
|
||||
{ \
|
||||
rstype_t *i; \
|
||||
int size = 1<<n_bits, m = size - 1; \
|
||||
rsbucket_##name##_t *k, b[size], *be = b + size; \
|
||||
for (k = b; k != be; ++k) k->b = k->e = beg; \
|
||||
for (i = beg; i != end; ++i) ++b[rskey(*i)>>s&m].e; \
|
||||
for (k = b + 1; k != be; ++k) \
|
||||
k->e += (k-1)->e - beg, k->b = (k-1)->e; \
|
||||
for (k = b; k != be;) { \
|
||||
if (k->b != k->e) { \
|
||||
rsbucket_##name##_t *l; \
|
||||
if ((l = b + (rskey(*k->b)>>s&m)) != k) { \
|
||||
rstype_t tmp = *k->b, swap; \
|
||||
do { \
|
||||
swap = tmp; tmp = *l->b; *l->b++ = swap; \
|
||||
l = b + (rskey(tmp)>>s&m); \
|
||||
} while (l != k); \
|
||||
*k->b++ = tmp; \
|
||||
} else ++k->b; \
|
||||
} else ++k; \
|
||||
} \
|
||||
for (b->b = beg, k = b + 1; k != be; ++k) k->b = (k-1)->e; \
|
||||
if (s) { \
|
||||
s = s > n_bits? s - n_bits : 0; \
|
||||
for (k = b; k != be; ++k) \
|
||||
if (k->e - k->b > RS_MIN_SIZE) rs_sort_##name(k->b, k->e, n_bits, s); \
|
||||
else if (k->e - k->b > 1) rs_insertsort_##name(k->b, k->e); \
|
||||
} \
|
||||
} \
|
||||
void radix_sort_##name(rstype_t *beg, rstype_t *end) \
|
||||
{ \
|
||||
if (end - beg <= RS_MIN_SIZE) rs_insertsort_##name(beg, end); \
|
||||
else rs_sort_##name(beg, end, 8, sizeof_key * 8 - 8); \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,151 @@
|
|||
#include <pthread.h>
|
||||
#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
|
||||
/************
|
||||
* kt_for() *
|
||||
************/
|
||||
|
||||
struct kt_for_t;
|
||||
|
||||
typedef struct {
|
||||
struct kt_for_t *t;
|
||||
long i;
|
||||
} ktf_worker_t;
|
||||
|
||||
typedef struct kt_for_t {
|
||||
int n_threads;
|
||||
long n;
|
||||
ktf_worker_t *w;
|
||||
void (*func)(void*,long,int);
|
||||
void *data;
|
||||
} kt_for_t;
|
||||
|
||||
static inline long steal_work(kt_for_t *t)
|
||||
{
|
||||
int i, min_i = -1;
|
||||
long k, min = LONG_MAX;
|
||||
for (i = 0; i < t->n_threads; ++i)
|
||||
if (min > t->w[i].i) min = t->w[i].i, min_i = i;
|
||||
k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads);
|
||||
return k >= t->n? -1 : k;
|
||||
}
|
||||
|
||||
static void *ktf_worker(void *data)
|
||||
{
|
||||
ktf_worker_t *w = (ktf_worker_t*)data;
|
||||
long i;
|
||||
for (;;) {
|
||||
i = __sync_fetch_and_add(&w->i, w->t->n_threads);
|
||||
if (i >= w->t->n) break;
|
||||
w->t->func(w->t->data, i, w - w->t->w);
|
||||
}
|
||||
while ((i = steal_work(w->t)) >= 0)
|
||||
w->t->func(w->t->data, i, w - w->t->w);
|
||||
pthread_exit(0);
|
||||
}
|
||||
|
||||
void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n)
|
||||
{
|
||||
if (n_threads > 1) {
|
||||
int i;
|
||||
kt_for_t t;
|
||||
pthread_t *tid;
|
||||
t.func = func, t.data = data, t.n_threads = n_threads, t.n = n;
|
||||
t.w = (ktf_worker_t*)alloca(n_threads * sizeof(ktf_worker_t));
|
||||
tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t));
|
||||
for (i = 0; i < n_threads; ++i)
|
||||
t.w[i].t = &t, t.w[i].i = i;
|
||||
for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]);
|
||||
for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
|
||||
} else {
|
||||
long j;
|
||||
for (j = 0; j < n; ++j) func(data, j, 0);
|
||||
}
|
||||
}
|
||||
|
||||
/*****************
|
||||
* kt_pipeline() *
|
||||
*****************/
|
||||
|
||||
struct ktp_t;
|
||||
|
||||
typedef struct {
|
||||
struct ktp_t *pl;
|
||||
int64_t index;
|
||||
int step;
|
||||
void *data;
|
||||
} ktp_worker_t;
|
||||
|
||||
typedef struct ktp_t {
|
||||
void *shared;
|
||||
void *(*func)(void*, int, void*);
|
||||
int64_t index;
|
||||
int n_workers, n_steps;
|
||||
ktp_worker_t *workers;
|
||||
pthread_mutex_t mutex;
|
||||
pthread_cond_t cv;
|
||||
} ktp_t;
|
||||
|
||||
static void *ktp_worker(void *data)
|
||||
{
|
||||
ktp_worker_t *w = (ktp_worker_t*)data;
|
||||
ktp_t *p = w->pl;
|
||||
while (w->step < p->n_steps) {
|
||||
// test whether we can kick off the job with this worker
|
||||
pthread_mutex_lock(&p->mutex);
|
||||
for (;;) {
|
||||
int i;
|
||||
// test whether another worker is doing the same step
|
||||
for (i = 0; i < p->n_workers; ++i) {
|
||||
if (w == &p->workers[i]) continue; // ignore itself
|
||||
if (p->workers[i].step <= w->step && p->workers[i].index < w->index)
|
||||
break;
|
||||
}
|
||||
if (i == p->n_workers) break; // no workers with smaller indices are doing w->step or the previous steps
|
||||
pthread_cond_wait(&p->cv, &p->mutex);
|
||||
}
|
||||
pthread_mutex_unlock(&p->mutex);
|
||||
|
||||
// working on w->step
|
||||
w->data = p->func(p->shared, w->step, w->step? w->data : 0); // for the first step, input is NULL
|
||||
|
||||
// update step and let other workers know
|
||||
pthread_mutex_lock(&p->mutex);
|
||||
w->step = w->step == p->n_steps - 1 || w->data? (w->step + 1) % p->n_steps : p->n_steps;
|
||||
if (w->step == 0) w->index = p->index++;
|
||||
pthread_cond_broadcast(&p->cv);
|
||||
pthread_mutex_unlock(&p->mutex);
|
||||
}
|
||||
pthread_exit(0);
|
||||
}
|
||||
|
||||
void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps)
|
||||
{
|
||||
ktp_t aux;
|
||||
pthread_t *tid;
|
||||
int i;
|
||||
|
||||
if (n_threads < 1) n_threads = 1;
|
||||
aux.n_workers = n_threads;
|
||||
aux.n_steps = n_steps;
|
||||
aux.func = func;
|
||||
aux.shared = shared_data;
|
||||
aux.index = 0;
|
||||
pthread_mutex_init(&aux.mutex, 0);
|
||||
pthread_cond_init(&aux.cv, 0);
|
||||
|
||||
aux.workers = (ktp_worker_t*)alloca(n_threads * sizeof(ktp_worker_t));
|
||||
for (i = 0; i < n_threads; ++i) {
|
||||
ktp_worker_t *w = &aux.workers[i];
|
||||
w->step = 0; w->pl = &aux; w->data = 0;
|
||||
w->index = aux.index++;
|
||||
}
|
||||
|
||||
tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t));
|
||||
for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktp_worker, &aux.workers[i]);
|
||||
for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
|
||||
|
||||
pthread_mutex_destroy(&aux.mutex);
|
||||
pthread_cond_destroy(&aux.cv);
|
||||
}
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
#ifndef KTHREAD_H
|
||||
#define KTHREAD_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n);
|
||||
void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
29
minimap.h
29
minimap.h
|
|
@ -4,7 +4,6 @@
|
|||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/types.h>
|
||||
#include "bseq.h"
|
||||
|
||||
#define MM_IDX_DEF_B 14
|
||||
#define MM_DEREP_Q50 5.0
|
||||
|
|
@ -30,13 +29,18 @@ typedef struct {
|
|||
} mm_idx_bucket_t;
|
||||
|
||||
typedef struct {
|
||||
int b, w, k;
|
||||
uint32_t n; // number of reference sequences
|
||||
mm_idx_bucket_t *B;
|
||||
uint32_t max_occ;
|
||||
float freq_thres;
|
||||
int32_t *len; // length of each reference sequence
|
||||
char **name; // TODO: if this uses too much RAM, switch to one concatenated string
|
||||
char *name; // name of the db sequence
|
||||
uint64_t offset; // offset in mm_idx_t::seq16
|
||||
uint32_t len; // length
|
||||
} mm_idx_seq_t;
|
||||
|
||||
typedef struct {
|
||||
int32_t b, w, k;
|
||||
uint64_t sum_len; // sum of lengths
|
||||
uint32_t n_seq; // number of reference sequences
|
||||
mm_idx_seq_t *seq; // sequence name, length and offset
|
||||
uint32_t *S; // 4-bit packed sequence
|
||||
mm_idx_bucket_t *B; // index
|
||||
} mm_idx_t;
|
||||
|
||||
typedef struct {
|
||||
|
|
@ -62,6 +66,11 @@ extern double mm_realtime0;
|
|||
struct mm_tbuf_s;
|
||||
typedef struct mm_tbuf_s mm_tbuf_t;
|
||||
|
||||
struct bseq_file_s;
|
||||
|
||||
#define mm_seq4_set(s, i, c) ((s)[(i)>>8] |= (uint32_t)(c) << (((i)&7)<<2))
|
||||
#define mm_seq4_get(s, i) ((s)[(i)>>8] >> (((i)&7)<<2) & 0xf)
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
|
@ -72,11 +81,11 @@ void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, i
|
|||
// minimizer indexing
|
||||
mm_idx_t *mm_idx_init(int w, int k, int b);
|
||||
void mm_idx_destroy(mm_idx_t *mi);
|
||||
mm_idx_t *mm_idx_gen(bseq_file_t *fp, int w, int k, int b, int tbatch_size, int n_threads, uint64_t ibatch_size, int keep_name);
|
||||
mm_idx_t *mm_idx_gen(struct bseq_file_s *fp, int w, int k, int b, int is_hpc, int mini_batch_size, int n_threads, uint64_t batch_size, int keep_name);
|
||||
void mm_idx_set_max_occ(mm_idx_t *mi, float f);
|
||||
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
|
||||
|
||||
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int n_threads);
|
||||
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads);
|
||||
|
||||
// minimizer index I/O
|
||||
void mm_idx_dump(FILE *fp, const mm_idx_t *mi);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,26 @@
|
|||
#include <sys/resource.h>
|
||||
#include <sys/time.h>
|
||||
#include "minimap.h"
|
||||
|
||||
int mm_verbose = 3;
|
||||
double mm_realtime0;
|
||||
|
||||
double cputime()
|
||||
{
|
||||
struct rusage r;
|
||||
getrusage(RUSAGE_SELF, &r);
|
||||
return r.ru_utime.tv_sec + r.ru_stime.tv_sec + 1e-6 * (r.ru_utime.tv_usec + r.ru_stime.tv_usec);
|
||||
}
|
||||
|
||||
double realtime()
|
||||
{
|
||||
struct timeval tp;
|
||||
struct timezone tzp;
|
||||
gettimeofday(&tp, &tzp);
|
||||
return tp.tv_sec + tp.tv_usec * 1e-6;
|
||||
}
|
||||
|
||||
#include "ksort.h"
|
||||
#define sort_key_128x(a) ((a).x)
|
||||
KRADIX_SORT_INIT(128x, mm128_t, sort_key_128x, 8)
|
||||
KSORT_INIT_GENERIC(uint32_t)
|
||||
Loading…
Reference in New Issue