From bacb7534d2e66014ccd3e6145e9a6ad7270ca7a3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 6 Apr 2017 15:51:36 -0400 Subject: [PATCH] make sdust working with kalloc --- .gitignore | 4 + kdq.h | 131 ++++++++++++++++++++++++++++++++ sdust.c | 214 +++++++++++++++++++++++++++++++++++++++++++++++++++++ sdust.h | 23 ++++++ 4 files changed, 372 insertions(+) create mode 100644 .gitignore create mode 100644 kdq.h create mode 100644 sdust.c create mode 100644 sdust.h diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2a8ecdc --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.*.swp +*.a +*.o +*.dSYM diff --git a/kdq.h b/kdq.h new file mode 100644 index 0000000..4a67c0b --- /dev/null +++ b/kdq.h @@ -0,0 +1,131 @@ +#ifndef __AC_KDQ_H +#define __AC_KDQ_H + +#include +#include +#include "kalloc.h" + +#define __KDQ_TYPE(type) \ + typedef struct { \ + size_t front:58, bits:6, count, mask; \ + type *a; \ + void *km; \ + } kdq_##type##_t; + +#define kdq_t(type) kdq_##type##_t +#define kdq_size(q) ((q)->count) +#define kdq_first(q) ((q)->a[(q)->front]) +#define kdq_last(q) ((q)->a[((q)->front + (q)->count - 1) & (q)->mask]) +#define kdq_at(q, i) ((q)->a[((q)->front + (i)) & (q)->mask]) + +#define __KDQ_IMPL(type, SCOPE) \ + SCOPE kdq_##type##_t *kdq_init_##type(void *km) \ + { \ + kdq_##type##_t *q; \ + q = (kdq_##type##_t*)kcalloc(km, 1, sizeof(kdq_##type##_t)); \ + q->bits = 2, q->mask = (1ULL<bits) - 1; \ + q->a = (type*)kmalloc(km, (1<bits) * sizeof(type)); \ + q->km = km; \ + return q; \ + } \ + SCOPE void kdq_destroy_##type(kdq_##type##_t *q) \ + { \ + if (q == 0) return; \ + kfree(q->km, q->a); kfree(q->km, q); \ + } \ + SCOPE int kdq_resize_##type(kdq_##type##_t *q, int new_bits) \ + { \ + size_t new_size = 1ULL<bits; \ + if (new_size < q->count) { /* not big enough */ \ + int i; \ + for (i = 0; i < 64; ++i) \ + if (1ULL< q->count) break; \ + new_bits = i, new_size = 1ULL<bits) return q->bits; /* unchanged */ \ + if (new_bits > q->bits) q->a = (type*)krealloc(q->km, q->a, (1ULL<front + q->count <= old_size) { /* unwrapped */ \ + if (q->front + q->count > new_size) /* only happens for shrinking */ \ + memmove(q->a, q->a + new_size, (q->front + q->count - new_size) * sizeof(type)); \ + } else { /* wrapped */ \ + memmove(q->a + (new_size - (old_size - q->front)), q->a + q->front, (old_size - q->front) * sizeof(type)); \ + q->front = new_size - (old_size - q->front); \ + } \ + q->bits = new_bits, q->mask = (1ULL<bits) - 1; \ + if (new_bits < q->bits) q->a = (type*)krealloc(q->km, q->a, (1ULL<bits; \ + } \ + SCOPE type *kdq_pushp_##type(kdq_##type##_t *q) \ + { \ + if (q->count == 1ULL<bits) kdq_resize_##type(q, q->bits + 1); \ + return &q->a[((q->count++) + q->front) & (q)->mask]; \ + } \ + SCOPE void kdq_push_##type(kdq_##type##_t *q, type v) \ + { \ + if (q->count == 1ULL<bits) kdq_resize_##type(q, q->bits + 1); \ + q->a[((q->count++) + q->front) & (q)->mask] = v; \ + } \ + SCOPE type *kdq_unshiftp_##type(kdq_##type##_t *q) \ + { \ + if (q->count == 1ULL<bits) kdq_resize_##type(q, q->bits + 1); \ + ++q->count; \ + q->front = q->front? q->front - 1 : (1ULL<bits) - 1; \ + return &q->a[q->front]; \ + } \ + SCOPE void kdq_unshift_##type(kdq_##type##_t *q, type v) \ + { \ + type *p; \ + p = kdq_unshiftp_##type(q); \ + *p = v; \ + } \ + SCOPE type *kdq_pop_##type(kdq_##type##_t *q) \ + { \ + return q->count? &q->a[((--q->count) + q->front) & q->mask] : 0; \ + } \ + SCOPE type *kdq_shift_##type(kdq_##type##_t *q) \ + { \ + type *d = 0; \ + if (q->count == 0) return 0; \ + d = &q->a[q->front++]; \ + q->front &= q->mask; \ + --q->count; \ + return d; \ + } + +#define KDQ_INIT2(type, SCOPE) \ + __KDQ_TYPE(type) \ + __KDQ_IMPL(type, SCOPE) + +#ifndef klib_unused +#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) +#define klib_unused __attribute__ ((__unused__)) +#else +#define klib_unused +#endif +#endif /* klib_unused */ + +#define KDQ_INIT(type) KDQ_INIT2(type, static inline klib_unused) + +#define KDQ_DECLARE(type) \ + __KDQ_TYPE(type) \ + kdq_##type##_t *kdq_init_##type(); \ + void kdq_destroy_##type(kdq_##type##_t *q); \ + int kdq_resize_##type(kdq_##type##_t *q, int new_bits); \ + type *kdq_pushp_##type(kdq_##type##_t *q); \ + void kdq_push_##type(kdq_##type##_t *q, type v); \ + type *kdq_unshiftp_##type(kdq_##type##_t *q); \ + void kdq_unshift_##type(kdq_##type##_t *q, type v); \ + type *kdq_pop_##type(kdq_##type##_t *q); \ + type *kdq_shift_##type(kdq_##type##_t *q); + +#define kdq_init(type, km) kdq_init_##type(km) +#define kdq_destroy(type, q) kdq_destroy_##type(q) +#define kdq_resize(type, q, new_bits) kdq_resize_##type(q, new_bits) +#define kdq_pushp(type, q) kdq_pushp_##type(q) +#define kdq_push(type, q, v) kdq_push_##type(q, v) +#define kdq_pop(type, q) kdq_pop_##type(q) +#define kdq_unshiftp(type, q) kdq_unshiftp_##type(q) +#define kdq_unshift(type, q, v) kdq_unshift_##type(q, v) +#define kdq_shift(type, q) kdq_shift_##type(q) + +#endif diff --git a/sdust.c b/sdust.c new file mode 100644 index 0000000..15b74b7 --- /dev/null +++ b/sdust.c @@ -0,0 +1,214 @@ +#include +#include +#include +#include "kalloc.h" +#include "kdq.h" +#include "kvec.h" +#include "sdust.h" + +#define SD_WLEN 3 +#define SD_WTOT (1<<(SD_WLEN<<1)) +#define SD_WMSK (SD_WTOT - 1) + +typedef struct { + int start, finish; + int r, l; +} perf_intv_t; + +typedef kvec_t(perf_intv_t) perf_intv_v; +typedef kvec_t(uint64_t) uint64_v; + +KDQ_INIT(int) + +#if defined(_NO_NT4_TBL) || defined(_SDUST_MAIN) +unsigned char seq_nt4_table[256] = { + 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; +#else +extern unsigned char seq_nt4_table[256]; +#endif + +struct sdust_buf_s { + kdq_t(int) *w; + perf_intv_v P; // the list of perfect intervals for the current window, sorted by descending start and then by ascending finish + uint64_v res; // the result + void *km; // memory pool +}; + +sdust_buf_t *sdust_buf_init(void) +{ + sdust_buf_t *buf; + void *km; + km = km_init(); + buf = (sdust_buf_t*)kcalloc(km, 1, sizeof(sdust_buf_t)); + buf->km = km; + buf->w = kdq_init(int, buf->km); + return buf; +} + +void sdust_buf_destroy(sdust_buf_t *buf) +{ + if (buf == 0) return; + if (buf->km == 0) { // then fall back to the default allocator for which we need free all arrays manually + kfree(buf->km, buf->P.a); kfree(buf->km, buf->res.a); kfree(buf->km, buf); + } else km_destroy(buf->km); // This deallocate all memory allocated from buf->km +} + +static inline void shift_window(int t, kdq_t(int) *w, int T, int W, int *L, int *rw, int *rv, int *cw, int *cv) +{ + int s; + if (kdq_size(w) >= W - SD_WLEN + 1) { // TODO: is this right for SD_WLEN!=3? + s = *kdq_shift(int, w); + *rw -= --cw[s]; + if (*L > kdq_size(w)) + --*L, *rv -= --cv[s]; + } + kdq_push(int, w, t); + ++*L; + *rw += cw[t]++; + *rv += cv[t]++; + if (cv[t] * 10 > T<<1) { + do { + s = kdq_at(w, kdq_size(w) - *L); + *rv -= --cv[s]; + --*L; + } while (s != t); + } +} + +static inline void save_masked_regions(void *km, uint64_v *res, perf_intv_v *P, int start) +{ + int i, saved = 0; + perf_intv_t *p; + if (P->n == 0 || P->a[P->n - 1].start >= start) return; + p = &P->a[P->n - 1]; + if (res->n) { + int s = res->a[res->n - 1]>>32, f = (uint32_t)res->a[res->n - 1]; + if (p->start <= f) // if overlapping with or adjacent to the previous interval + saved = 1, res->a[res->n - 1] = (uint64_t)s<<32 | (f > p->finish? f : p->finish); + } + if (!saved) kv_push(uint64_t, km, *res, (uint64_t)p->start<<32|p->finish); + for (i = P->n - 1; i >= 0 && P->a[i].start < start; --i); // remove perfect intervals that have falled out of the window + P->n = i + 1; +} + +static void find_perfect(void *km, perf_intv_v *P, const kdq_t(int) *w, int T, int start, int L, int rv, const int *cv) +{ + int c[SD_WTOT], r = rv, i, max_r = 0, max_l = 0; + memcpy(c, cv, SD_WTOT * sizeof(int)); + for (i = (long)kdq_size(w) - L - 1; i >= 0; --i) { + int j, t = kdq_at(w, i), new_r, new_l; + r += c[t]++; + new_r = r, new_l = kdq_size(w) - i - 1; + if (new_r * 10 > T * new_l) { + for (j = 0; j < P->n && P->a[j].start >= i + start; ++j) { // find insertion position + perf_intv_t *p = &P->a[j]; + if (max_r == 0 || p->r * max_l > max_r * p->l) + max_r = p->r, max_l = p->l; + } + if (max_r == 0 || new_r * max_l >= max_r * new_l) { // then insert + max_r = new_r, max_l = new_l; + if (P->n == P->m) kv_resize(perf_intv_t, km, *P, P->n + 1); + memmove(&P->a[j+1], &P->a[j], (P->n - j) * sizeof(perf_intv_t)); // make room + ++P->n; + P->a[j].start = i + start, P->a[j].finish = kdq_size(w) + (SD_WLEN - 1) + start; + P->a[j].r = new_r, P->a[j].l = new_l; + } + } + } +} + +const uint64_t *sdust_core(const uint8_t *seq, int l_seq, int T, int W, int *n, sdust_buf_t *buf) +{ + int rv = 0, rw = 0, L = 0, cv[SD_WTOT], cw[SD_WTOT]; + int i, start, l; // _start_: start of the current window; _l_: length of a contiguous A/C/G/T (sub)sequence + unsigned t; // current word + + buf->P.n = buf->res.n = 0; + buf->w->front = buf->w->count = 0; + memset(cv, 0, SD_WTOT * sizeof(int)); + memset(cw, 0, SD_WTOT * sizeof(int)); + if (l_seq < 0) l_seq = strlen((const char*)seq); + for (i = l = t = 0; i <= l_seq; ++i) { + int b = i < l_seq? seq_nt4_table[seq[i]] : 4; + if (b < 4) { // an A/C/G/T base + ++l, t = (t<<2 | b) & SD_WMSK; + if (l >= SD_WLEN) { // we have seen a word + start = (l - W > 0? l - W : 0) + (i + 1 - l); // set the start of the current window + save_masked_regions(buf->km, &buf->res, &buf->P, start); // save intervals falling out of the current window? + shift_window(t, buf->w, T, W, &L, &rw, &rv, cw, cv); + if (rw * 10 > L * T) + find_perfect(buf->km, &buf->P, buf->w, T, start, L, rv, cv); + } + } else { // N or the end of sequence; N effectively breaks input into pieces of independent sequences + start = (l - W + 1 > 0? l - W + 1 : 0) + (i + 1 - l); + while (buf->P.n) save_masked_regions(buf->km, &buf->res, &buf->P, start++); // clear up unsaved perfect intervals + l = t = 0; + } + } + *n = buf->res.n; + return buf->res.a; +} + +uint64_t *sdust(const uint8_t *seq, int l_seq, int T, int W, int *n) +{ + uint64_t *ret; + sdust_buf_t *buf; + buf = sdust_buf_init(); + ret = (uint64_t*)sdust_core(seq, l_seq, T, W, n, buf); + buf->res.a = 0; + sdust_buf_destroy(buf); + return ret; +} + +#ifdef _SDUST_MAIN +#include +#include +#include +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +int main(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *ks; + int W = 64, T = 20, c; + + while ((c = getopt(argc, argv, "w:t:")) >= 0) { + if (c == 'w') W = atoi(optarg); + else if (c == 't') T = atoi(optarg); + } + if (optind == argc) { + fprintf(stderr, "Usage: sdust [-w %d] [-t %d] \n", W, T); + return 1; + } + fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + ks = kseq_init(fp); + while (kseq_read(ks) >= 0) { + uint64_t *r; + int i, n; + r = sdust((uint8_t*)ks->seq.s, -1, T, W, &n); + for (i = 0; i < n; ++i) + printf("%s\t%d\t%d\n", ks->name.s, (int)(r[i]>>32), (int)r[i]); + free(r); + } + kseq_destroy(ks); + gzclose(fp); + return 0; +} +#endif diff --git a/sdust.h b/sdust.h new file mode 100644 index 0000000..52a13ff --- /dev/null +++ b/sdust.h @@ -0,0 +1,23 @@ +#ifndef SDUST_H +#define SDUST_H + +struct sdust_buf_s; +typedef struct sdust_buf_s sdust_buf_t; + +#ifdef __cplusplus +extern "C" { +#endif + +// the simple interface +uint64_t *sdust(const uint8_t *seq, int l_seq, int T, int W, int *n); + +// the following interface dramatically reduce heap allocations when sdust is frequently called. +sdust_buf_t *sdust_buf_init(void); +void sdust_buf_destroy(sdust_buf_t *buf); +const uint64_t *sdust_core(const uint8_t *seq, int l_seq, int T, int W, int *n, sdust_buf_t *buf); + +#ifdef __cplusplus +} +#endif + +#endif