Multiple changes:

1. Removed bwa.{h,c}. I am not going to finish them anyway.
2. Updated to the latest khash.h, which should be faster.
3. Define 64-bit vector and 128-bit integer/vector in utils.h.
This commit is contained in:
Heng Li 2013-02-12 09:50:28 -05:00
parent 13288e2dcd
commit e5ab59db53
9 changed files with 261 additions and 549 deletions

View File

@ -1,10 +1,9 @@
CC= gcc CC= clang
CXX= g++
CFLAGS= -g -Wall -O2 CFLAGS= -g -Wall -O2
CXXFLAGS= $(CFLAGS) CXXFLAGS= $(CFLAGS)
AR= ar AR= ar
DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64
LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o bwamem_pair.o stdaln.o \ LOBJS= bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o bwamem_pair.o stdaln.o \
bseq.o bwaseqio.o bwase.o kstring.o bseq.o bwaseqio.o bwase.o kstring.o
AOBJS= QSufSort.o bwt_gen.o \ AOBJS= QSufSort.o bwt_gen.o \
is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \ is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \

272
bwa.c
View File

@ -1,272 +0,0 @@
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "bwa.h"
#include "bwt.h"
#include "bwtgap.h"
#include "bntseq.h"
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
extern unsigned char nst_nt4_table[256];
extern void seq_reverse(int len, uint8_t *seq, int is_comp);
bwa_opt_t bwa_def_opt = { 11, 4, -1, 1, 6, 32, 2, 0.04 };
struct bwa_idx_t {
bwt_t *bwt;
bntseq_t *bns;
uint8_t *pac;
};
struct bwa_buf_t {
int max_buf;
bwa_pestat_t pes;
gap_stack_t *stack;
gap_opt_t *opt;
int *diff_tab;
uint8_t *buf;
int *logn;
};
bwa_idx_t *bwa_idx_load(const char *prefix)
{
bwa_idx_t *p;
int l;
char *str;
l = strlen(prefix);
p = calloc(1, sizeof(bwa_idx_t));
str = malloc(l + 10);
strcpy(str, prefix);
p->bns = bns_restore(str);
strcpy(str + l, ".bwt");
p->bwt = bwt_restore_bwt(str);
str[l] = 0;
strcpy(str + l, ".sa");
bwt_restore_sa(str, p->bwt);
free(str);
p->pac = calloc(p->bns->l_pac/4+1, 1);
fread(p->pac, 1, p->bns->l_pac/4+1, p->bns->fp_pac);
fclose(p->bns->fp_pac);
p->bns->fp_pac = 0;
return p;
}
void bwa_idx_destroy(bwa_idx_t *p)
{
bns_destroy(p->bns);
bwt_destroy(p->bwt);
free(p->pac);
free(p);
}
bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score)
{
extern gap_opt_t *gap_init_opt(void);
extern int bwa_cal_maxdiff(int l, double err, double thres);
int i;
bwa_buf_t *p;
p = malloc(sizeof(bwa_buf_t));
p->stack = gap_init_stack2(max_score);
p->opt = gap_init_opt();
p->opt->s_gapo = opt->s_gapo;
p->opt->s_gape = opt->s_gape;
p->opt->max_diff = opt->max_diff;
p->opt->max_gapo = opt->max_gapo;
p->opt->max_gape = opt->max_gape;
p->opt->seed_len = opt->seed_len;
p->opt->max_seed_diff = opt->max_seed_diff;
p->opt->fnr = opt->fnr;
p->diff_tab = calloc(BWA_MAX_QUERY_LEN, sizeof(int));
for (i = 1; i < BWA_MAX_QUERY_LEN; ++i)
p->diff_tab[i] = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr);
p->logn = calloc(256, sizeof(int));
for (i = 1; i != 256; ++i)
p->logn[i] = (int)(4.343 * log(i) + 0.499);
return p;
}
void bwa_buf_destroy(bwa_buf_t *p)
{
gap_destroy_stack(p->stack);
free(p->diff_tab); free(p->logn); free(p->opt);
free(p);
}
bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq)
{
extern int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width);
int i, seq_len, buf_len;
bwt_width_t *w, *seed_w;
uint8_t *s;
gap_opt_t opt2 = *buf->opt;
bwa_sai_t sai;
seq_len = strlen(seq);
// estimate the buffer length
buf_len = (buf->opt->seed_len + seq_len + 1) * sizeof(bwt_width_t) + seq_len;
if (buf_len > buf->max_buf) {
buf->max_buf = buf_len;
kroundup32(buf->max_buf);
buf->buf = realloc(buf->buf, buf->max_buf);
}
memset(buf->buf, 0, buf_len);
seed_w = (bwt_width_t*)buf->buf;
w = seed_w + buf->opt->seed_len;
s = (uint8_t*)(w + seq_len + 1);
if (opt2.fnr > 0.) opt2.max_diff = buf->diff_tab[seq_len];
// copy the sequence
for (i = 0; i < seq_len; ++i)
s[i] = nst_nt4_table[(int)seq[i]];
seq_reverse(seq_len, s, 0);
// mapping
bwt_cal_width(idx->bwt, seq_len, s, w);
if (opt2.seed_len >= seq_len) opt2.seed_len = 0x7fffffff;
if (seq_len > buf->opt->seed_len)
bwt_cal_width(idx->bwt, buf->opt->seed_len, s + (seq_len - buf->opt->seed_len), seed_w);
for (i = 0; i < seq_len; ++i) // complement; I forgot why...
s[i] = s[i] > 3? 4 : 3 - s[i];
sai.sai = (bwa_sai1_t*)bwt_match_gap(idx->bwt, seq_len, s, w, seq_len <= buf->opt->seed_len? 0 : seed_w, &opt2, &sai.n, buf->stack);
return sai;
}
static void compute_NM(const uint8_t *pac, uint64_t l_pac, uint8_t *seq, int64_t pos, int n_cigar, uint32_t *cigar, int *n_mm, int *n_gaps)
{
uint64_t x = pos, z;
int k, y = 0;
*n_mm = *n_gaps = 0;
for (k = 0; k < n_cigar; ++k) {
int l = cigar[k]>>4;
int op = cigar[k]&0xf;
if (op == 0) { // match/mismatch
for (z = 0; z < l && x + z < l_pac; ++z) {
int c = pac[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) ++(*n_mm);
}
}
if (op == 1 || op == 2) (*n_gaps) += l;
if (op == 0 || op == 2) x += l;
if (op == 0 || op == 1 || op == 4) y += l;
}
}
void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln)
{
extern bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand);
extern bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const uint8_t *seq, bwtint_t *_pos, int ext, int *n_cigar, int is_end_correct);
int strand, seq_len, i, n_gap, n_mm;
uint64_t pos3, pac_pos;
uint8_t *s[2];
memset(aln, 0, sizeof(bwa_aln_t));
seq_len = strlen(seq);
if (seq_len<<1 > buf->max_buf) {
buf->max_buf = seq_len<<1;
kroundup32(buf->max_buf);
buf->buf = realloc(buf->buf, buf->max_buf);
}
s[0] = buf->buf;
s[1] = s[0] + seq_len;
for (i = 0; i < seq_len; ++i)
s[0][i] = s[1][i] = nst_nt4_table[(int)seq[i]];
seq_reverse(seq_len, s[1], 1);
pac_pos = bwa_sa2pos(idx->bns, idx->bwt, sa, seq_len, &strand);
if (strand) aln->flag |= 16;
if (n_gaps) { // only for gapped alignment
int n_cigar;
bwa_cigar_t *cigar16;
cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, seq_len, s[strand], &pac_pos, strand? n_gaps : -n_gaps, &n_cigar, 1);
aln->n_cigar = n_cigar;
aln->cigar = malloc(n_cigar * 4);
for (i = 0, pos3 = pac_pos; i < n_cigar; ++i) {
int op = cigar16[i]>>14;
int len = cigar16[i]&0x3fff;
if (op == 3) op = 4; // the 16-bit CIGAR is different from the 32-bit CIGAR
aln->cigar[i] = len<<4 | op;
if (op == 0 || op == 2) pos3 += len;
}
free(cigar16);
} else { // ungapped
aln->n_cigar = 1;
aln->cigar = malloc(4);
aln->cigar[0] = seq_len<<4 | 0;
pos3 = pac_pos + seq_len;
}
aln->n_n = bns_cnt_ambi(idx->bns, pac_pos, pos3 - pac_pos, &aln->ref_id);
aln->offset = pac_pos - idx->bns->anns[aln->ref_id].offset;
if (pos3 - idx->bns->anns[aln->ref_id].offset > idx->bns->anns[aln->ref_id].len) // read mapped beyond the end of a sequence
aln->flag |= 4; // read unmapped
compute_NM(idx->pac, idx->bns->l_pac, s[strand], pac_pos, aln->n_cigar, aln->cigar, &n_mm, &n_gap);
aln->n_mm = n_mm;
aln->n_gap = n_gap;
}
/************************
* Single-end alignment *
************************/
bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar)
{
bwa_one_t *one;
int best, cnt, i, seq_len;
seq_len = strlen(seq);
one = calloc(1, sizeof(bwa_one_t));
one->sai = bwa_sai(idx, buf, seq);
if (one->sai.n == 0) return one;
// count number of hits; randomly select one alignment
best = one->sai.sai[0].score;
for (i = cnt = 0; i < one->sai.n; ++i) {
bwa_sai1_t *p = &one->sai.sai[i];
if (p->score > best) break;
if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) {
one->which = p;
one->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48());
}
cnt += p->l - p->k + 1;
}
one->c1 = cnt;
for (; i < one->sai.n; ++i)
cnt += one->sai.sai[i].l - one->sai.sai[i].k + 1;
one->c2 = cnt - one->c1;
// estimate single-end mapping quality
one->mapQs = -1;
if (one->c1 == 0) one->mapQs = 23; // FIXME: is it possible?
else if (one->c1 > 1) one->mapQs = 0;
else {
int diff = one->which->n_mm + one->which->n_gapo + one->which->n_gape;
if (diff >= buf->diff_tab[seq_len]) one->mapQs = 25;
else if (one->c2 == 0) one->mapQs = 37;
}
if (one->mapQs < 0) {
cnt = (one->c2 >= 255)? 255 : one->c2;
one->mapQs = 23 < buf->logn[cnt]? 0 : 23 - buf->logn[cnt];
}
one->mapQ = one->mapQs;
// compute CIGAR on request
one->one.ref_id = -1;
if (gen_cigar) bwa_sa2aln(idx, buf, seq, one->sa, one->which->n_gapo + one->which->n_gape, &one->one);
return one;
}
void bwa_one_destroy(bwa_one_t *one)
{
free(one->sai.sai);
free(one->one.cigar);
free(one);
}
/************************
* Paired-end alignment *
************************/
void bwa_pestat(bwa_buf_t *buf, int n, bwa_one_t **o[2])
{
}
void bwa_pe(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq[2], bwa_one_t *o[2])
{
}

107
bwa.h
View File

@ -1,107 +0,0 @@
#ifndef BWA_H_
#define BWA_H_
#include <stdint.h>
#define BWA_DEF_MAX_SCORE 2048
#define BWA_MAX_QUERY_LEN 1024
// BWA index
struct bwa_idx_t;
typedef struct bwa_idx_t bwa_idx_t;
// Buffer for BWA alignment
struct bwa_buf_t;
typedef struct bwa_buf_t bwa_buf_t;
// BWA alignment options
typedef struct {
int s_gapo, s_gape; // gap open and extension penalties; the mismatch penalty is fixed at 3
int max_diff, max_gapo, max_gape; // max differences (-1 to use fnr for length-adjusted max diff), gap opens and gap extensions
int seed_len, max_seed_diff; // seed length and max differences allowed in the seed
float fnr; // parameter for automatic length-adjusted max differences
} bwa_opt_t;
// default BWA alignment options
extern bwa_opt_t bwa_def_opt; // = { 11, 4, -1, 1, 6, 32, 2, 0.04 }
// an interval hit in the SA coordinate; basic unit in .sai files
typedef struct {
uint32_t n_mm:16, n_gapo:8, n_gape:8;
int score;
uint64_t k, l; // [k,l] is the SA interval; each interval has l-k+1 hits
} bwa_sai1_t;
// all interval hits in the SA coordinate
typedef struct {
int n; // number of interval hits
bwa_sai1_t *sai;
} bwa_sai_t;
// an alignment
typedef struct {
uint32_t n_n:8, n_gap:12, n_mm:12; // number of ambiguous bases, gaps and mismatches in the alignment
int32_t ref_id; // referece sequence index (the first seq is indexed by 0)
uint32_t offset; // coordinate on the reference; zero-based
uint32_t n_cigar:16, flag:16; // number of CIGAR operations; SAM flag
uint32_t *cigar; // CIGAR in the BAM 28+4 encoding; having n_cigar operations
} bwa_aln_t;
typedef struct {
int mapQs, mapQ, c1, c2;
uint64_t sa;
bwa_sai1_t *which;
bwa_sai_t sai;
bwa_aln_t one;
} bwa_one_t;
typedef struct {
double avg, std, ap_prior;
uint64_t low, high, high_bayesian;
} bwa_pestat_t;
#ifdef __cplusplus
extern "C" {
#endif
// load a BWA index
bwa_idx_t *bwa_idx_load(const char *prefix);
void bwa_idx_destroy(bwa_idx_t *p);
// allocate a BWA alignment buffer; if unsure, set opt to &bwa_def_opt and max_score to BWA_DEF_MAX_SCORE
bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score);
void bwa_buf_destroy(bwa_buf_t *p);
/**
* Find all the SA intervals
*
* @param idx BWA index; multiple threads can share the same index
* @param buf BWA alignment buffer; each thread should have its own buffer
* @param seq NULL terminated C string, consisting of A/C/G/T/N only
*
* @return SA intervals seq is matched to
*/
bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq);
/**
* Construct an alignment in the base-pair coordinate
*
* @param idx BWA index
* @param buf BWA alignment buffer
* @param seq NULL terinated C string
* @param sa Suffix array value
* @param n_gaps Number of gaps (typically equal to bwa_sai1_t::n_gapo + bwa_sai1_t::n_gape
*
* @return An alignment
*/
void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln);
bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar);
void bwa_one_destroy(bwa_one_t *one);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -4,6 +4,7 @@
#include "kstring.h" #include "kstring.h"
#include "bwamem.h" #include "bwamem.h"
#include "kvec.h" #include "kvec.h"
#include "utils.h"
#define MIN_RATIO 0.8 #define MIN_RATIO 0.8
#define MIN_DIR_CNT 10 #define MIN_DIR_CNT 10
@ -13,9 +14,6 @@
#define MAX_STDDEV 4.0 #define MAX_STDDEV 4.0
#define EXT_STDDEV 4.0 #define EXT_STDDEV 4.0
typedef kvec_t(uint64_t) vec64_t;
extern void ks_introsort_uint64_t(size_t n, uint64_t *a);
void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, bwahit_t *p, int is_hard); void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, bwahit_t *p, int is_hard);
static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r)
@ -34,9 +32,8 @@ static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r)
void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]) void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4])
{ {
extern void ks_introsort_uint64_t(size_t n, uint64_t *a);
int i, d, max; int i, d, max;
vec64_t isize[4]; uint64_v isize[4];
memset(pes, 0, 4 * sizeof(mem_pestat_t)); memset(pes, 0, 4 * sizeof(mem_pestat_t));
memset(isize, 0, sizeof(kvec_t(int)) * 4); memset(isize, 0, sizeof(kvec_t(int)) * 4);
for (i = 0; i < n>>1; ++i) { for (i = 0; i < n>>1; ++i) {
@ -58,14 +55,14 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *
if (mem_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n); if (mem_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n);
for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two. for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two.
mem_pestat_t *r = &pes[d]; mem_pestat_t *r = &pes[d];
vec64_t *q = &isize[d]; uint64_v *q = &isize[d];
int p25, p50, p75, x; int p25, p50, p75, x;
if (q->n < MIN_DIR_CNT) { if (q->n < MIN_DIR_CNT) {
fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]); fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
r->failed = 1; r->failed = 1;
continue; continue;
} else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]); } else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
ks_introsort_uint64_t(q->n, q->a); ks_introsort_64(q->n, q->a);
p25 = q->a[(int)(.25 * q->n + .499)]; p25 = q->a[(int)(.25 * q->n + .499)];
p50 = q->a[(int)(.50 * q->n + .499)]; p50 = q->a[(int)(.50 * q->n + .499)];
p75 = q->a[(int)(.75 * q->n + .499)]; p75 = q->a[(int)(.75 * q->n + .499)];
@ -102,7 +99,7 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *
void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2])
{ {
vec64_t v; uint64_v v;
int r, i, y[4]; // y[] keeps the last hit int r, i, y[4]; // y[] keeps the last hit
kv_init(v); kv_init(v);
for (r = 0; r < 2; ++r) { for (r = 0; r < 2; ++r) {
@ -113,7 +110,7 @@ void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, con
kv_push(uint64_t, v, key); kv_push(uint64_t, v, key);
} }
} }
ks_introsort_uint64_t(v.n, v.a); ks_introsort_64(v.n, v.a);
y[0] = y[1] = y[2] = y[3] = -1; y[0] = y[1] = y[2] = y[3] = -1;
printf("**** %ld\n", v.n); printf("**** %ld\n", v.n);
for (i = 0; i < v.n; ++i) { for (i = 0; i < v.n; ++i) {

33
bwape.c
View File

@ -21,24 +21,15 @@ typedef struct {
bwtint_t low, high, high_bayesian; bwtint_t low, high, high_bayesian;
} isize_info_t; } isize_info_t;
typedef struct {
uint64_t x, y;
} b128_t;
#define b128_lt(a, b) ((a).x < (b).x)
#define b128_eq(a, b) ((a).x == (b).x && (a).y == (b).y) #define b128_eq(a, b) ((a).x == (b).x && (a).y == (b).y)
#define b128_hash(a) ((uint32_t)(a).x) #define b128_hash(a) ((uint32_t)(a).x)
#include "khash.h" #include "khash.h"
KHASH_INIT(b128, b128_t, poslist_t, 1, b128_hash, b128_eq) KHASH_INIT(b128, pair64_t, poslist_t, 1, b128_hash, b128_eq)
#include "ksort.h"
KSORT_INIT(b128, b128_t, b128_lt)
KSORT_INIT_GENERIC(uint64_t)
typedef struct { typedef struct {
kvec_t(b128_t) arr; pair64_v arr;
kvec_t(b128_t) pos[2]; pair64_v pos[2];
kvec_t(bwt_aln1_t) aln[2]; kvec_t(bwt_aln1_t) aln[2];
} pe_data_t; } pe_data_t;
@ -120,7 +111,7 @@ static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double
free(isizes); free(isizes);
return -1; return -1;
} }
ks_introsort(uint64_t, tot, isizes); ks_introsort_64(tot, isizes);
p25 = isizes[(int)(tot*0.25 + 0.5)]; p25 = isizes[(int)(tot*0.25 + 0.5)];
p50 = isizes[(int)(tot*0.50 + 0.5)]; p50 = isizes[(int)(tot*0.50 + 0.5)];
p75 = isizes[(int)(tot*0.75 + 0.5)]; p75 = isizes[(int)(tot*0.75 + 0.5)];
@ -170,7 +161,7 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm,
{ {
int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len; int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len;
uint64_t o_score, subo_score; uint64_t o_score, subo_score;
b128_t last_pos[2][2], o_pos[2]; pair64_t last_pos[2][2], o_pos[2];
max_len = p[0]->full_len; max_len = p[0]->full_len;
if (max_len < p[1]->full_len) max_len = p[1]->full_len; if (max_len < p[1]->full_len) max_len = p[1]->full_len;
if (low_bound < max_len) low_bound = max_len; if (low_bound < max_len) low_bound = max_len;
@ -206,11 +197,11 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm,
o_score = subo_score = (uint64_t)-1; o_score = subo_score = (uint64_t)-1;
o_n = subo_n = 0; o_n = subo_n = 0;
ks_introsort(b128, d->arr.n, d->arr.a); ks_introsort_128(d->arr.n, d->arr.a);
for (j = 0; j < 2; ++j) last_pos[j][0].x = last_pos[j][0].y = last_pos[j][1].x = last_pos[j][1].y = (uint64_t)-1; for (j = 0; j < 2; ++j) last_pos[j][0].x = last_pos[j][0].y = last_pos[j][1].x = last_pos[j][1].y = (uint64_t)-1;
if (opt->type == BWA_PET_STD) { if (opt->type == BWA_PET_STD) {
for (i = 0; i < d->arr.n; ++i) { for (i = 0; i < d->arr.n; ++i) {
b128_t x = d->arr.a[i]; pair64_t x = d->arr.a[i];
int strand = x.y>>1&1; int strand = x.y>>1&1;
if (strand == 1) { // reverse strand, then check if (strand == 1) { // reverse strand, then check
int y = 1 - (x.y&1); int y = 1 - (x.y&1);
@ -223,7 +214,7 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm,
} }
} else if (opt->type == BWA_PET_SOLID) { } else if (opt->type == BWA_PET_SOLID) {
for (i = 0; i < d->arr.n; ++i) { for (i = 0; i < d->arr.n; ++i) {
b128_t x = d->arr.a[i]; pair64_t x = d->arr.a[i];
int strand = x.y>>1&1; int strand = x.y>>1&1;
if ((strand^x.y)&1) { // push if ((strand^x.y)&1) { // push
int y = 1 - (x.y&1); int y = 1 - (x.y&1);
@ -345,7 +336,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT) if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT)
&& (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT)) && (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT))
{ // only when both ends mapped { // only when both ends mapped
b128_t x; pair64_t x;
int j, k; int j, k;
long long n_occ[2]; long long n_occ[2];
for (j = 0; j < 2; ++j) { for (j = 0; j < 2; ++j) {
@ -360,7 +351,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
bwt_aln1_t *r = d->aln[j].a + k; bwt_aln1_t *r = d->aln[j].a + k;
bwtint_t l; bwtint_t l;
if (0 && r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table if (0 && r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table
b128_t key; pair64_t key;
int ret; int ret;
key.x = r->k; key.y = r->l; key.x = r->k; key.y = r->l;
khint_t iter = kh_put(b128, g_hash, key, &ret); khint_t iter = kh_put(b128, g_hash, key, &ret);
@ -377,14 +368,14 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
for (l = 0; l < kh_val(g_hash, iter).n; ++l) { for (l = 0; l < kh_val(g_hash, iter).n; ++l) {
x.x = kh_val(g_hash, iter).a[l]>>1; x.x = kh_val(g_hash, iter).a[l]>>1;
x.y = k<<2 | (kh_val(g_hash, iter).a[l]&1)<<1 | j; x.y = k<<2 | (kh_val(g_hash, iter).a[l]&1)<<1 | j;
kv_push(b128_t, d->arr, x); kv_push(pair64_t, d->arr, x);
} }
} else { // then calculate on the fly } else { // then calculate on the fly
for (l = r->k; l <= r->l; ++l) { for (l = r->k; l <= r->l; ++l) {
int strand; int strand;
x.x = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand); x.x = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand);
x.y = k<<2 | strand<<1 | j; x.y = k<<2 | strand<<1 | j;
kv_push(b128_t, d->arr, x); kv_push(pair64_t, d->arr, x);
} }
} }
} }

View File

@ -6,6 +6,7 @@
#include "bntseq.h" #include "bntseq.h"
#include "bwtsw2.h" #include "bwtsw2.h"
#include "kstring.h" #include "kstring.h"
#include "utils.h"
#ifndef _NO_SSE2 #ifndef _NO_SSE2
#include "ksw.h" #include "ksw.h"
#else #else
@ -24,7 +25,6 @@ typedef struct {
bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins)
{ {
extern void ks_introsort_uint64_t(size_t n, uint64_t *a);
int i, k, x, p25, p50, p75, tmp, max_len = 0; int i, k, x, p25, p50, p75, tmp, max_len = 0;
uint64_t *isize; uint64_t *isize;
bsw2pestat_t r; bsw2pestat_t r;
@ -44,7 +44,7 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins)
max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg; max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg;
isize[k++] = l; isize[k++] = l;
} }
ks_introsort_uint64_t(k, isize); ks_introsort_64(k, isize);
p25 = isize[(int)(.25 * k + .499)]; p25 = isize[(int)(.25 * k + .499)];
p50 = isize[(int)(.50 * k + .499)]; p50 = isize[(int)(.50 * k + .499)];
p75 = isize[(int)(.75 * k + .499)]; p75 = isize[(int)(.75 * k + .499)];

282
khash.h
View File

@ -1,6 +1,6 @@
/* The MIT License /* The MIT License
Copyright (c) 2008, 2009 by attractor <attractor@live.co.uk> Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the a copy of this software and associated documentation files (the
@ -33,7 +33,6 @@ int main() {
khiter_t k; khiter_t k;
khash_t(32) *h = kh_init(32); khash_t(32) *h = kh_init(32);
k = kh_put(32, h, 5, &ret); k = kh_put(32, h, 5, &ret);
if (!ret) kh_del(32, h, k);
kh_value(h, k) = 10; kh_value(h, k) = 10;
k = kh_get(32, h, 10); k = kh_get(32, h, 10);
is_missing = (k == kh_end(h)); is_missing = (k == kh_end(h));
@ -47,6 +46,29 @@ int main() {
*/ */
/* /*
2011-12-29 (0.2.7):
* Minor code clean up; no actual effect.
2011-09-16 (0.2.6):
* The capacity is a power of 2. This seems to dramatically improve the
speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
- http://code.google.com/p/ulib/
- http://nothings.org/computer/judy/
* Allow to optionally use linear probing which usually has better
performance for random input. Double hashing is still the default as it
is more robust to certain non-random input.
* Added Wang's integer hash function (not used by default). This hash
function is more robust to certain non-random input.
2011-02-14 (0.2.5):
* Allow to declare global functions.
2009-09-26 (0.2.4): 2009-09-26 (0.2.4):
* Improve portability * Improve portability
@ -86,11 +108,9 @@ int main() {
@header @header
Generic hash table library. Generic hash table library.
@copyright Heng Li
*/ */
#define AC_VERSION_KHASH_H "0.2.4" #define AC_VERSION_KHASH_H "0.2.6"
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
@ -111,24 +131,14 @@ typedef unsigned long long khint64_t;
#endif #endif
#ifdef _MSC_VER #ifdef _MSC_VER
#define inline __inline #define kh_inline __inline
#else
#define kh_inline inline
#endif #endif
typedef khint32_t khint_t; typedef khint32_t khint_t;
typedef khint_t khiter_t; typedef khint_t khiter_t;
#define __ac_HASH_PRIME_SIZE 32
static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
{
0ul, 3ul, 11ul, 23ul, 53ul,
97ul, 193ul, 389ul, 769ul, 1543ul,
3079ul, 6151ul, 12289ul, 24593ul, 49157ul,
98317ul, 196613ul, 393241ul, 786433ul, 1572869ul,
3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul,
100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul,
3221225473ul, 4294967291ul
};
#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) #define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) #define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) #define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
@ -137,88 +147,128 @@ static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) #define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) #define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
#ifdef KHASH_LINEAR
#define __ac_inc(k, m) 1
#else
#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m)
#endif
#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
#ifndef kcalloc
#define kcalloc(N,Z) calloc(N,Z)
#endif
#ifndef kmalloc
#define kmalloc(Z) malloc(Z)
#endif
#ifndef krealloc
#define krealloc(P,Z) realloc(P,Z)
#endif
#ifndef kfree
#define kfree(P) free(P)
#endif
static const double __ac_HASH_UPPER = 0.77; static const double __ac_HASH_UPPER = 0.77;
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ #define __KHASH_TYPE(name, khkey_t, khval_t) \
typedef struct { \ typedef struct { \
khint_t n_buckets, size, n_occupied, upper_bound; \ khint_t n_buckets, size, n_occupied, upper_bound; \
khint32_t *flags; \ khint32_t *flags; \
khkey_t *keys; \ khkey_t *keys; \
khval_t *vals; \ khval_t *vals; \
} kh_##name##_t; \ } kh_##name##_t;
static inline kh_##name##_t *kh_init_##name() { \
return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ #define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
extern kh_##name##_t *kh_init_##name(void); \
extern void kh_destroy_##name(kh_##name##_t *h); \
extern void kh_clear_##name(kh_##name##_t *h); \
extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
extern void kh_del_##name(kh_##name##_t *h, khint_t x);
#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
SCOPE kh_##name##_t *kh_init_##name(void) { \
return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \
} \ } \
static inline void kh_destroy_##name(kh_##name##_t *h) \ SCOPE void kh_destroy_##name(kh_##name##_t *h) \
{ \ { \
if (h) { \ if (h) { \
free(h->keys); free(h->flags); \ kfree((void *)h->keys); kfree(h->flags); \
free(h->vals); \ kfree((void *)h->vals); \
free(h); \ kfree(h); \
} \ } \
} \ } \
static inline void kh_clear_##name(kh_##name##_t *h) \ SCOPE void kh_clear_##name(kh_##name##_t *h) \
{ \ { \
if (h && h->flags) { \ if (h && h->flags) { \
memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \ memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
h->size = h->n_occupied = 0; \ h->size = h->n_occupied = 0; \
} \ } \
} \ } \
static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
{ \ { \
if (h->n_buckets) { \ if (h->n_buckets) { \
khint_t inc, k, i, last; \ khint_t inc, k, i, last, mask; \
k = __hash_func(key); i = k % h->n_buckets; \ mask = h->n_buckets - 1; \
inc = 1 + k % (h->n_buckets - 1); last = i; \ k = __hash_func(key); i = k & mask; \
inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ i = (i + inc) & mask; \
else i += inc; \
if (i == last) return h->n_buckets; \ if (i == last) return h->n_buckets; \
} \ } \
return __ac_iseither(h->flags, i)? h->n_buckets : i; \ return __ac_iseither(h->flags, i)? h->n_buckets : i; \
} else return 0; \ } else return 0; \
} \ } \
static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
{ \ { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
khint32_t *new_flags = 0; \ khint32_t *new_flags = 0; \
khint_t j = 1; \ khint_t j = 1; \
{ \ { \
khint_t t = __ac_HASH_PRIME_SIZE - 1; \ kroundup32(new_n_buckets); \
while (__ac_prime_list[t] > new_n_buckets) --t; \ if (new_n_buckets < 4) new_n_buckets = 4; \
new_n_buckets = __ac_prime_list[t+1]; \ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ else { /* hash table size to be changed (shrink or expand); rehash */ \
else { \ new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ if (!new_flags) return -1; \
memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
if (h->n_buckets < new_n_buckets) { \ if (h->n_buckets < new_n_buckets) { /* expand */ \
h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
if (kh_is_map) \ if (!new_keys) return -1; \
h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ h->keys = new_keys; \
} \ if (kh_is_map) { \
khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
if (!new_vals) return -1; \
h->vals = new_vals; \
} \
} /* otherwise shrink */ \
} \ } \
} \ } \
if (j) { \ if (j) { /* rehashing is needed */ \
for (j = 0; j != h->n_buckets; ++j) { \ for (j = 0; j != h->n_buckets; ++j) { \
if (__ac_iseither(h->flags, j) == 0) { \ if (__ac_iseither(h->flags, j) == 0) { \
khkey_t key = h->keys[j]; \ khkey_t key = h->keys[j]; \
khval_t val; \ khval_t val; \
khint_t new_mask; \
new_mask = new_n_buckets - 1; \
if (kh_is_map) val = h->vals[j]; \ if (kh_is_map) val = h->vals[j]; \
__ac_set_isdel_true(h->flags, j); \ __ac_set_isdel_true(h->flags, j); \
while (1) { \ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
khint_t inc, k, i; \ khint_t inc, k, i; \
k = __hash_func(key); \ k = __hash_func(key); \
i = k % new_n_buckets; \ i = k & new_mask; \
inc = 1 + k % (new_n_buckets - 1); \ inc = __ac_inc(k, new_mask); \
while (!__ac_isempty(new_flags, i)) { \ while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \
if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \
else i += inc; \
} \
__ac_set_isempty_false(new_flags, i); \ __ac_set_isempty_false(new_flags, i); \
if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
__ac_set_isdel_true(h->flags, i); \ __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
} else { \ } else { /* write the element and jump out of the loop */ \
h->keys[i] = key; \ h->keys[i] = key; \
if (kh_is_map) h->vals[i] = val; \ if (kh_is_map) h->vals[i] = val; \
break; \ break; \
@ -226,35 +276,39 @@ static const double __ac_HASH_UPPER = 0.77;
} \ } \
} \ } \
} \ } \
if (h->n_buckets > new_n_buckets) { \ if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
if (kh_is_map) \ if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
} \ } \
free(h->flags); \ kfree(h->flags); /* free the working space */ \
h->flags = new_flags; \ h->flags = new_flags; \
h->n_buckets = new_n_buckets; \ h->n_buckets = new_n_buckets; \
h->n_occupied = h->size; \ h->n_occupied = h->size; \
h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
} \ } \
return 0; \
} \ } \
static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
{ \ { \
khint_t x; \ khint_t x; \
if (h->n_occupied >= h->upper_bound) { \ if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \ if (h->n_buckets > (h->size<<1)) { \
else kh_resize_##name(h, h->n_buckets + 1); \ if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
} \ *ret = -1; return h->n_buckets; \
} \
} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
*ret = -1; return h->n_buckets; \
} \
} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
{ \ { \
khint_t inc, k, i, site, last; \ khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \
x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \ x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
if (__ac_isempty(h->flags, i)) x = i; \ if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
else { \ else { \
inc = 1 + k % (h->n_buckets - 1); last = i; \ inc = __ac_inc(k, mask); last = i; \
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
if (__ac_isdel(h->flags, i)) site = i; \ if (__ac_isdel(h->flags, i)) site = i; \
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ i = (i + inc) & mask; \
else i += inc; \
if (i == last) { x = site; break; } \ if (i == last) { x = site; break; } \
} \ } \
if (x == h->n_buckets) { \ if (x == h->n_buckets) { \
@ -263,20 +317,20 @@ static const double __ac_HASH_UPPER = 0.77;
} \ } \
} \ } \
} \ } \
if (__ac_isempty(h->flags, x)) { \ if (__ac_isempty(h->flags, x)) { /* not present at all */ \
h->keys[x] = key; \ h->keys[x] = key; \
__ac_set_isboth_false(h->flags, x); \ __ac_set_isboth_false(h->flags, x); \
++h->size; ++h->n_occupied; \ ++h->size; ++h->n_occupied; \
*ret = 1; \ *ret = 1; \
} else if (__ac_isdel(h->flags, x)) { \ } else if (__ac_isdel(h->flags, x)) { /* deleted */ \
h->keys[x] = key; \ h->keys[x] = key; \
__ac_set_isboth_false(h->flags, x); \ __ac_set_isboth_false(h->flags, x); \
++h->size; \ ++h->size; \
*ret = 2; \ *ret = 2; \
} else *ret = 0; \ } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
return x; \ return x; \
} \ } \
static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \ SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
{ \ { \
if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
__ac_set_isdel_true(h->flags, x); \ __ac_set_isdel_true(h->flags, x); \
@ -284,6 +338,17 @@ static const double __ac_HASH_UPPER = 0.77;
} \ } \
} }
#define KHASH_DECLARE(name, khkey_t, khval_t) \
__KHASH_TYPE(name, khkey_t, khval_t) \
__KHASH_PROTOTYPES(name, khkey_t, khval_t)
#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
__KHASH_TYPE(name, khkey_t, khval_t) \
__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
/* --- BEGIN OF HASH FUNCTIONS --- */ /* --- BEGIN OF HASH FUNCTIONS --- */
/*! @function /*! @function
@ -311,10 +376,10 @@ static const double __ac_HASH_UPPER = 0.77;
@param s Pointer to a null terminated string @param s Pointer to a null terminated string
@return The hash value @return The hash value
*/ */
static inline khint_t __ac_X31_hash_string(const char *s) static kh_inline khint_t __ac_X31_hash_string(const char *s)
{ {
khint_t h = *s; khint_t h = (khint_t)*s;
if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
return h; return h;
} }
/*! @function /*! @function
@ -328,9 +393,21 @@ static inline khint_t __ac_X31_hash_string(const char *s)
*/ */
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
static kh_inline khint_t __ac_Wang_hash(khint_t key)
{
key += ~(key << 15);
key ^= (key >> 10);
key += (key << 3);
key ^= (key >> 6);
key += ~(key << 11);
key ^= (key >> 16);
return key;
}
#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key)
/* --- END OF HASH FUNCTIONS --- */ /* --- END OF HASH FUNCTIONS --- */
/* Other necessary macros... */ /* Other convenient macros... */
/*! /*!
@abstract Type of the hash table. @abstract Type of the hash table.
@ -396,7 +473,6 @@ static inline khint_t __ac_X31_hash_string(const char *s)
*/ */
#define kh_del(name, h, k) kh_del_##name(h, k) #define kh_del(name, h, k) kh_del_##name(h, k)
/*! @function /*! @function
@abstract Test whether a bucket contains data. @abstract Test whether a bucket contains data.
@param h Pointer to the hash table [khash_t(name)*] @param h Pointer to the hash table [khash_t(name)*]
@ -455,6 +531,34 @@ static inline khint_t __ac_X31_hash_string(const char *s)
*/ */
#define kh_n_buckets(h) ((h)->n_buckets) #define kh_n_buckets(h) ((h)->n_buckets)
/*! @function
@abstract Iterate over the entries in the hash table
@param h Pointer to the hash table [khash_t(name)*]
@param kvar Variable to which key will be assigned
@param vvar Variable to which value will be assigned
@param code Block of code to execute
*/
#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
if (!kh_exist(h,__i)) continue; \
(kvar) = kh_key(h,__i); \
(vvar) = kh_val(h,__i); \
code; \
} }
/*! @function
@abstract Iterate over the values in the hash table
@param h Pointer to the hash table [khash_t(name)*]
@param vvar Variable to which value will be assigned
@param code Block of code to execute
*/
#define kh_foreach_value(h, vvar, code) { khint_t __i; \
for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
if (!kh_exist(h,__i)) continue; \
(vvar) = kh_val(h,__i); \
code; \
} }
/* More conenient interfaces */ /* More conenient interfaces */
/*! @function /*! @function

79
utils.c
View File

@ -35,6 +35,12 @@
#include <sys/time.h> #include <sys/time.h>
#include "utils.h" #include "utils.h"
#define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y))
#include "ksort.h"
KSORT_INIT(128, pair64_t, pair64_lt)
KSORT_INIT(64, uint64_t, ks_lt_generic)
FILE *err_xopen_core(const char *func, const char *fn, const char *mode) FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
{ {
FILE *fp = 0; FILE *fp = 0;
@ -46,6 +52,7 @@ FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
} }
return fp; return fp;
} }
FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp) FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp)
{ {
if (freopen(fn, mode, fp) == 0) { if (freopen(fn, mode, fp) == 0) {
@ -56,6 +63,7 @@ FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE
} }
return fp; return fp;
} }
gzFile err_xzopen_core(const char *func, const char *fn, const char *mode) gzFile err_xzopen_core(const char *func, const char *fn, const char *mode)
{ {
gzFile fp; gzFile fp;
@ -67,6 +75,7 @@ gzFile err_xzopen_core(const char *func, const char *fn, const char *mode)
} }
return fp; return fp;
} }
void err_fatal(const char *header, const char *fmt, ...) void err_fatal(const char *header, const char *fmt, ...)
{ {
va_list args; va_list args;
@ -86,66 +95,48 @@ void err_fatal_simple_core(const char *func, const char *msg)
size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
{ {
size_t ret = fwrite(ptr, size, nmemb, stream); size_t ret = fwrite(ptr, size, nmemb, stream);
if (ret != nmemb) if (ret != nmemb)
{ err_fatal_simple_core("fwrite", strerror(errno));
err_fatal_simple_core("fwrite", strerror(errno)); return ret;
}
return ret;
} }
int err_printf(const char *format, ...) int err_printf(const char *format, ...)
{ {
va_list arg; va_list arg;
int done; int done;
va_start(arg, format);
va_start(arg, format); done = vfprintf(stdout, format, arg);
done = vfprintf(stdout, format, arg); int saveErrno = errno;
int saveErrno = errno; va_end(arg);
va_end(arg); if (done < 0) err_fatal_simple_core("vfprintf(stdout)", strerror(saveErrno));
return done;
if (done < 0)
{
err_fatal_simple_core("vfprintf(stdout)", strerror(saveErrno));
}
return done;
} }
int err_fprintf(FILE *stream, const char *format, ...) int err_fprintf(FILE *stream, const char *format, ...)
{ {
va_list arg; va_list arg;
int done; int done;
va_start(arg, format);
va_start(arg, format); done = vfprintf(stream, format, arg);
done = vfprintf(stream, format, arg); int saveErrno = errno;
int saveErrno = errno; va_end(arg);
va_end(arg); if (done < 0) err_fatal_simple_core("vfprintf", strerror(saveErrno));
return done;
if (done < 0)
{
err_fatal_simple_core("vfprintf", strerror(saveErrno));
}
return done;
} }
int err_fflush(FILE *stream) int err_fflush(FILE *stream)
{ {
int ret = fflush(stream); int ret = fflush(stream);
if (ret != 0) if (ret != 0) err_fatal_simple_core("fflush", strerror(errno));
{ return ret;
err_fatal_simple_core("fflush", strerror(errno));
}
return ret;
} }
int err_fclose(FILE *stream) int err_fclose(FILE *stream)
{ {
int ret = fclose(stream); int ret = fclose(stream);
if (ret != 0) if (ret != 0) err_fatal_simple_core("fclose", strerror(errno));
{ return ret;
err_fatal_simple_core("fclose", strerror(errno));
}
return ret;
} }
double cputime() double cputime()

13
utils.h
View File

@ -28,6 +28,7 @@
#ifndef LH3_UTILS_H #ifndef LH3_UTILS_H
#define LH3_UTILS_H #define LH3_UTILS_H
#include <stdint.h>
#include <stdio.h> #include <stdio.h>
#include <zlib.h> #include <zlib.h>
@ -38,14 +39,19 @@
#define ATTRIBUTE(list) #define ATTRIBUTE(list)
#endif #endif
#define err_fatal_simple(msg) err_fatal_simple_core(__func__, msg) #define err_fatal_simple(msg) err_fatal_simple_core(__func__, msg)
#define xopen(fn, mode) err_xopen_core(__func__, fn, mode) #define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
#define xreopen(fn, mode, fp) err_xreopen_core(__func__, fn, mode, fp) #define xreopen(fn, mode, fp) err_xreopen_core(__func__, fn, mode, fp)
#define xzopen(fn, mode) err_xzopen_core(__func__, fn, mode) #define xzopen(fn, mode) err_xzopen_core(__func__, fn, mode)
#define xassert(cond, msg) if ((cond) == 0) err_fatal_simple_core(__func__, msg) #define xassert(cond, msg) if ((cond) == 0) err_fatal_simple_core(__func__, msg)
typedef struct {
uint64_t x, y;
} pair64_t;
typedef struct { size_t n, m; uint64_t *a; } uint64_v;
typedef struct { size_t n, m; pair64_t *a; } pair64_v;
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
@ -66,6 +72,9 @@ extern "C" {
double cputime(); double cputime();
double realtime(); double realtime();
void ks_introsort_64 (size_t n, uint64_t *a);
void ks_introsort_128(size_t n, pair64_t *a);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif