diff --git a/bseq.c b/bseq.c new file mode 100644 index 0000000..3cde6c8 --- /dev/null +++ b/bseq.c @@ -0,0 +1,65 @@ +#include +#include +#include +#include +#include +#include "bseq.h" +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +extern unsigned char seq_nt4_table[256]; + +struct bseq_file_s { + int is_eof; + gzFile fp; + kseq_t *ks; +}; + +bseq_file_t *bseq_open(const char *fn) +{ + bseq_file_t *fp; + gzFile f; + f = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + if (f == 0) return 0; + fp = (bseq_file_t*)calloc(1, sizeof(bseq_file_t)); + fp->fp = f; + fp->ks = kseq_init(fp->fp); + return fp; +} + +void bseq_close(bseq_file_t *fp) +{ + kseq_destroy(fp->ks); + gzclose(fp->fp); + free(fp); +} + +bseq1_t *bseq_read(bseq_file_t *fp, int chunk_size, int *n_) +{ + int size = 0, m, n; + bseq1_t *seqs; + kseq_t *ks = fp->ks; + m = n = 0; seqs = 0; + while (kseq_read(ks) >= 0) { + bseq1_t *s; + assert(ks->seq.l <= INT32_MAX); + if (n >= m) { + m = m? m<<1 : 256; + seqs = (bseq1_t*)realloc(seqs, m * sizeof(bseq1_t)); + } + s = &seqs[n]; + s->name = strdup(ks->name.s); + s->seq = strdup(ks->seq.s); + s->l_seq = ks->seq.l; + size += seqs[n++].l_seq; + if (size >= chunk_size) break; + } + if (n == 0) fp->is_eof = 1; + *n_ = n; + return seqs; +} + +int bseq_eof(bseq_file_t *fp) +{ + return fp->is_eof; +} diff --git a/bseq.h b/bseq.h new file mode 100644 index 0000000..daf1fc5 --- /dev/null +++ b/bseq.h @@ -0,0 +1,19 @@ +#ifndef MM_BSEQ_H +#define MM_BSEQ_H + +#include + +struct bseq_file_s; +typedef struct bseq_file_s bseq_file_t; + +typedef struct { + int l_seq, rid; + char *name, *seq; +} bseq1_t; + +bseq_file_t *bseq_open(const char *fn); +void bseq_close(bseq_file_t *fp); +bseq1_t *bseq_read(bseq_file_t *fp, int chunk_size, int *n_); +int bseq_eof(bseq_file_t *fp); + +#endif diff --git a/kalloc.c b/kalloc.c new file mode 100644 index 0000000..2def1bb --- /dev/null +++ b/kalloc.c @@ -0,0 +1,216 @@ +#include +#include +#include +#include +#include "kalloc.h" + +/* The whole thing is: ("@" for the kheader_t of the block, "-" for free + * memory, and "+" for allocated memory. One char for one unit.) + * + * This region is core 1. This region is core 2. + * + * @-------@++++++@++++++++++++@------------ @----------@++++++++++++@+++++++@------------ + * | | | | + * p=p->ptr->ptr->ptr->ptr p->ptr p->ptr->ptr p->ptr->ptr->ptr + */ + +#define PTR(p) ((size_t*)((size_t*)p)[1]) + +typedef struct _allocated_t { + struct _allocated_t *next; + size_t *ptr; +} allocated_t; + +typedef struct { + size_t base[2], *loop_head; + allocated_t list_head, *list_tail; + size_t total_allocated; +} kmem_t; + +void *km_init() +{ + return calloc(1, sizeof(kmem_t)); +} + +static void kerror(const char *s) +{ + fprintf(stderr, "%s\n", s); + exit(1); +} + +static size_t *morecore(kmem_t *km, size_t nu) +{ + size_t rnu, *up; + + rnu = (nu + 0xfffff) & (~(size_t)0xfffff); + up = (size_t*)malloc(rnu * sizeof(size_t)); + if (!up) { /* fail to allocate memory */ + km_stat(km); + fprintf(stderr, "[morecore] %lu bytes requested but not available.\n", rnu * sizeof(size_t)); + exit(1); + } + /* put the pointer in km->list_head */ + if (km->list_tail == 0) km->list_tail = &km->list_head; + km->list_tail->ptr = up; + km->list_tail->next = (allocated_t*)calloc(1, sizeof(allocated_t)); + km->list_tail = km->list_tail->next; + + km->total_allocated += rnu * sizeof(size_t); + *up = rnu; /* the size of the current block, and in this case the block is the same as the new core */ + kfree(km, up + 1); /* initialize the new "core" */ + return km->loop_head; +} + +void km_destroy(void *_km) +{ + kmem_t *km = (kmem_t*)_km; + allocated_t *p, *q; + if (km == 0) return; + p = &km->list_head; + do { + q = p->next; + free(p->ptr); + if (p != &km->list_head) free(p); + p = q; + } while (p && p->next); + if (p != &km->list_head) free(p); + free(km); +} + +void kfree(void *_km, void *ap) +{ + size_t *p, *q; + kmem_t *km = (kmem_t*)_km; + + if (!ap) return; + if (km == 0) { + free(ap); + return; + } + p = (size_t*)ap - 1; /* *p is the size of the current block */ + /* Find the pointer that points to the block to be freed. The following loop can stop on two conditions: + * + * a) "p>q && pptr": @------@++++++++@+++++++@------- @---------------@+++++++@------- + * (can also be in | | | -> | | + * two cores) q p q->ptr q q->ptr + * + * @-------- @+++++++++@-------- @-------- @------------------ + * | | | -> | | + * q p q->ptr q q->ptr + * + * b) "q>=q->ptr && (p>q || pptr)": @-------@+++++ @--------@+++++++ @-------@+++++ @---------------- + * | | | -> | | + * q->ptr q p q->ptr q + * + * @+++++++@----- @++++++++@------- @------------- @++++++++@------- + * | | | -> | | + * p q->ptr q q->ptr q + */ + for (q = km->loop_head; !(p > q && p < PTR(q)); q = PTR(q)) + if (q >= PTR(q) && (p > q || p < PTR(q))) break; + if (p + (*p) == PTR(q)) { /* two adjacent blocks, merge p and q->ptr (the 2nd and 4th cases) */ + *p += *PTR(q); /* this is the new q->ptr size */ + p[1] = (size_t)PTR(PTR(q)); /* this is the new q->ptr->ptr */ + /* p is actually the new q->ptr. The actual change happens a few lines below. */ + } else if (p + (*p) > PTR(q) && PTR(q) >= p) { /* the end of the allocated block is in the next free block */ + kerror("[kfree] The end of the allocated block enters a free block."); + } else p[1] = (size_t)PTR(q); /* backup q->ptr */ + + if (q + (*q) == p) { /* two adjacent blocks, merge q and p (the other two cases) */ + *q += *p; + q[1] = (size_t)PTR(p); + km->loop_head = q; + } else if (q + (*q) > p && p >= q) { /* the end of a free block in the allocated block */ + kerror("[kfree] The end of a free block enters the allocated block."); + } else km->loop_head = p, q[1] = (size_t)p; /* in two cores, cannot be merged */ +} + +#define round_up(v) (--(v), (v)|=(v)>>1, (v)|=(v)>>2, (v)|=(v)>>4, (v)|=(v)>>8, (v)|=(v)>>16, (v)|=(v)>>32, ++(v)) + +void *krealloc(void *_km, void *ap, size_t n_bytes) +{ + kmem_t *km = (kmem_t*)_km; + size_t n_units, *p, *q; + + if (n_bytes == 0) { + kfree(km, ap); return 0; + } + if (km == 0) return realloc(ap, n_bytes); + if (!ap) return kmalloc(km, n_bytes); + n_units = 1 + (n_bytes + sizeof(size_t) - 1) / sizeof(size_t); + p = (size_t*)ap - 1; + if (*p >= n_units) return ap; + q = kmalloc(km, n_bytes); + if (q != ap) memcpy(q, ap, (*p - 1) * sizeof(size_t)); + kfree(km, ap); + return q; +} + +void *kmalloc(void *_km, size_t n_bytes) +{ + kmem_t *km = (kmem_t*)_km; + size_t n_units, *p, *q; + + if (n_bytes == 0) return 0; + if (km == 0) return malloc(n_bytes); + /* "n_units" means the number of units. The size of one unit equals to sizeof(kheader_t). + * "1" is the kheader_t of a block, which is always required. */ + n_units = 1 + (n_bytes + sizeof(size_t) - 1) / sizeof(size_t); + round_up(n_units); + + if (!(q = km->loop_head)) { /* the first time when kmalloc() is called, intialization */ + km->base[1] = (size_t)(km->loop_head = q = km->base); *q = 0; + } + for (p = PTR(q);; q = p, p = PTR(p)) { /* search for a suitable block */ + if (*p >= n_units) { /* p->size if the size of current block. This line means the current block is large enough. */ + if (*p == n_units) q[1] = (size_t)PTR(p); /* no need to split the block */ + else { /* split the block */ + /* memory is allocated at the end of the block */ + *p -= n_units; /* reduce the size of the free block */ + p += *p; /* skip to the kheader_t of the allocated block */ + *p = n_units; /* set the size */ + } + km->loop_head = q; /* set the end of chain */ + return p + 1; /* skip the kheader_t */ + } + if (p == km->loop_head) { /* then ask for more "cores" */ + if ((p = morecore(km, n_units)) == 0) return 0; + } + } +} + +void *kcalloc(void *_km, size_t count, size_t size) +{ + kmem_t *km = (kmem_t*)_km; + char *p; + if (size == 0 || count == 0) return 0; + if (km == 0) return calloc(count, size); + p = kmalloc(km, count * size); + memset(p, 0, count * size); + return p; +} + +void km_stat(const void *_km) +{ + kmem_t *km = (kmem_t*)_km; + unsigned n_blocks, n_units; + size_t max_block = 0, *p, *q; + float frag; + + if (km == 0 || !(p = km->loop_head)) return; + n_blocks = n_units = 0; + do { + q = PTR(p); + if (*p > max_block) max_block = *p; + n_units += *p; + if (p + (*p) > q && q > p) + kerror("[kr_stat] The end of a free block enters another free block."); + p = q; + ++n_blocks; + } while (p != km->loop_head); + + --n_blocks; + frag = 1.0/1024.0 * n_units * sizeof(size_t) / n_blocks; + fprintf(stderr, "[kr_stat] tot=%lu, free=%lu, n_block=%u, max_block=%lu, frag_len=%.3fK\n", + km->total_allocated, n_units * sizeof(size_t), n_blocks, max_block * sizeof(size_t), frag); +} diff --git a/kalloc.h b/kalloc.h new file mode 100644 index 0000000..ec683d7 --- /dev/null +++ b/kalloc.h @@ -0,0 +1,26 @@ +#ifndef _KALLOC_H_ +#define _KALLOC_H_ + +#include + +#define km_size(x) (*(((size_t*)(x))-1) * sizeof(size_t)) + +#ifdef __cplusplus +extern "C" { +#endif + +void *kmalloc(void *km, size_t size); +void *krealloc(void *km, void *ptr, size_t size); +void *kcalloc(void *km, size_t count, size_t size); +void kfree(void *km, void *ptr); + +void *km_init(void); +void km_destroy(void *km); + +void km_stat(const void *km); // TODO: return numbers instead of print to stderr + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/kseq.h b/kseq.h new file mode 100644 index 0000000..d301ddc --- /dev/null +++ b/kseq.h @@ -0,0 +1,248 @@ +/* The MIT License + + Copyright (c) 2008, 2009, 2011 Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Last Modified: 05MAR2012 */ + +#ifndef AC_KSEQ_H +#define AC_KSEQ_H + +#include +#include +#include + +#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r +#define KS_SEP_TAB 1 // isspace() && !' ' +#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) +#define KS_SEP_MAX 2 + +#define __KS_TYPE(type_t) \ + typedef struct __kstream_t { \ + int begin, end; \ + int is_eof:2, bufsize:30; \ + type_t f; \ + unsigned char *buf; \ + } kstream_t; + +#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) +#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) + +#define __KS_BASIC(SCOPE, type_t, __bufsize) \ + SCOPE kstream_t *ks_init(type_t f) \ + { \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + ks->f = f; ks->bufsize = __bufsize; \ + ks->buf = (unsigned char*)malloc(__bufsize); \ + return ks; \ + } \ + SCOPE void ks_destroy(kstream_t *ks) \ + { \ + if (!ks) return; \ + free(ks->buf); \ + free(ks); \ + } + +#define __KS_INLINED(__read) \ + static inline int ks_getc(kstream_t *ks) \ + { \ + if (ks->is_eof && ks->begin >= ks->end) return -1; \ + if (ks->begin >= ks->end) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, ks->bufsize); \ + if (ks->end < ks->bufsize) ks->is_eof = 1; \ + if (ks->end == 0) return -1; \ + } \ + return (int)ks->buf[ks->begin++]; \ + } \ + static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { return ks_getuntil2(ks, delimiter, str, dret, 0); } + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + unsigned l, m; + char *s; +} kstring_t; +#endif + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#define __KS_GETUNTIL(SCOPE, __read) \ + SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ + { \ + if (dret) *dret = 0; \ + str->l = append? str->l : 0; \ + if (ks->begin >= ks->end && ks->is_eof) return -1; \ + for (;;) { \ + int i; \ + if (ks->begin >= ks->end) { \ + if (!ks->is_eof) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, ks->bufsize); \ + if (ks->end < ks->bufsize) ks->is_eof = 1; \ + if (ks->end == 0) break; \ + } else break; \ + } \ + if (delimiter == KS_SEP_LINE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == '\n') break; \ + } else if (delimiter > KS_SEP_MAX) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == delimiter) break; \ + } else if (delimiter == KS_SEP_SPACE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i])) break; \ + } else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + } else i = 0; /* never come to here! */ \ + if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ + str->m = str->l + (i - ks->begin) + 1; \ + kroundup32(str->m); \ + str->s = (char*)realloc(str->s, str->m); \ + } \ + memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ + str->l = str->l + (i - ks->begin); \ + ks->begin = i + 1; \ + if (i < ks->end) { \ + if (dret) *dret = ks->buf[i]; \ + break; \ + } \ + } \ + if (str->s == 0) { \ + str->m = 1; \ + str->s = (char*)calloc(1, 1); \ + } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ + str->s[str->l] = '\0'; \ + return str->l; \ + } + +#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \ + __KS_TYPE(type_t) \ + __KS_BASIC(SCOPE, type_t, __bufsize) \ + __KS_GETUNTIL(SCOPE, __read) \ + __KS_INLINED(__read) + +#define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize) + +#define KSTREAM_DECLARE(type_t, __read) \ + __KS_TYPE(type_t) \ + extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \ + extern kstream_t *ks_init(type_t f); \ + extern void ks_destroy(kstream_t *ks); \ + __KS_INLINED(__read) + +/****************** + * FASTA/Q parser * + ******************/ + +#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) + +#define __KSEQ_BASIC(SCOPE, type_t) \ + SCOPE kseq_t *kseq_init(type_t fd) \ + { \ + kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ + s->f = ks_init(fd); \ + return s; \ + } \ + SCOPE void kseq_destroy(kseq_t *ks) \ + { \ + if (!ks) return; \ + free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ + ks_destroy(ks->f); \ + free(ks); \ + } + +/* Return value: + >=0 length of the sequence (normal) + -1 end-of-file + -2 truncated quality string + */ +#define __KSEQ_READ(SCOPE) \ + SCOPE int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ + if (seq->last_char == 0) { /* then jump to the next header line */ \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ + seq->last_char = c; \ + } /* else: the first header char has been read in the previous call */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ + if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ + if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ + seq->seq.m = 256; \ + seq->seq.s = (char*)malloc(seq->seq.m); \ + } \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + if (c == '\n') continue; /* skip empty lines */ \ + seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ + ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ + } \ + if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ + if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ + while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ + if (c == -1) return -2; /* error: no quality string */ \ + while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ + seq->last_char = 0; /* we have not come to the next header line */ \ + if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ + return seq->seq.l; \ + } + +#define __KSEQ_TYPE(type_t) \ + typedef struct { \ + kstring_t name, comment, seq, qual; \ + int last_char; \ + kstream_t *f; \ + } kseq_t; + +#define KSEQ_INIT2(SCOPE, type_t, __read) \ + KSTREAM_INIT2(SCOPE, type_t, __read, 16384) \ + __KSEQ_TYPE(type_t) \ + __KSEQ_BASIC(SCOPE, type_t) \ + __KSEQ_READ(SCOPE) + +#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) + +#define KSEQ_DECLARE(type_t) \ + __KS_TYPE(type_t) \ + __KSEQ_TYPE(type_t) \ + extern kseq_t *kseq_init(type_t fd); \ + void kseq_destroy(kseq_t *ks); \ + int kseq_read(kseq_t *seq); + +#endif diff --git a/kvec.h b/kvec.h new file mode 100644 index 0000000..e865173 --- /dev/null +++ b/kvec.h @@ -0,0 +1,105 @@ +/* The MIT License + + Copyright (c) 2008, by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + An example: + +#include "kvec.h" +int main() { + kvec_t(int) array; + kv_init(array); + kv_push(int, array, 10); // append + kv_a(int, array, 20) = 5; // dynamic + kv_A(array, 20) = 4; // static + kv_destroy(array); + return 0; +} +*/ + +/* + 2008-09-22 (0.1.0): + + * The initial version. + +*/ + +#ifndef AC_KVEC_H +#define AC_KVEC_H + +#include +#include "kalloc.h" + +#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) + +#define kvec_t(type) struct { size_t n, m; type *a; } +#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) +#define kv_destroy(v) free((v).a) +#define kv_A(v, i) ((v).a[(i)]) +#define kv_pop(v) ((v).a[--(v).n]) +#define kv_size(v) ((v).n) +#define kv_max(v) ((v).m) + +#define kv_resize(type, km, v, s) do { \ + if ((v).m < (s)) { \ + (v).m = (s); \ + kv_roundup32((v).m); \ + (v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); \ + } \ + } while (0) + +#define kv_copy(type, km, v1, v0) do { \ + if ((v1).m < (v0).n) kv_resize(type, (km), (v1), (v0).n); \ + (v1).n = (v0).n; \ + memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ + } while (0) \ + +#define kv_push(type, km, v, x) do { \ + if ((v).n == (v).m) { \ + (v).m = (v).m? (v).m<<1 : 2; \ + (v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); \ + } \ + (v).a[(v).n++] = (x); \ + } while (0) + +#define kv_pushp(type, km, v, p) do { \ + if ((v).n == (v).m) { \ + (v).m = (v).m? (v).m<<1 : 2; \ + (v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); \ + } \ + *(p) = &(v).a[(v).n++]; \ + } while (0) + +#define kv_reverse(type, v, start) do { \ + if ((v).m > 0 && (v).n > (start)) { \ + size_t __i, __end = (v).n - (start); \ + type *__a = (v).a + (start); \ + for (__i = 0; __i < __end>>1; ++__i) { \ + type __t = __a[__end - 1 - __i]; \ + __a[__end - 1 - __i] = __a[__i]; __a[__i] = __t; \ + } \ + } \ + } while (0) + +#endif diff --git a/minimap.h b/minimap.h new file mode 100644 index 0000000..1376968 --- /dev/null +++ b/minimap.h @@ -0,0 +1,104 @@ +#ifndef MINIMAP_H +#define MINIMAP_H + +#include +#include +#include +#include "bseq.h" + +#define MM_IDX_DEF_B 14 +#define MM_DEREP_Q50 5.0 + +#define MM_F_WITH_REP 0x1 +#define MM_F_NO_SELF 0x2 +#define MM_F_NO_ISO 0x4 +#define MM_F_AVA 0x8 + +typedef struct { + uint64_t x, y; +} mm128_t; + +typedef struct { size_t n, m; mm128_t *a; } mm128_v; +typedef struct { size_t n, m; uint64_t *a; } uint64_v; +typedef struct { size_t n, m; uint32_t *a; } uint32_v; + +typedef struct { + mm128_v a; // (minimizer, position) array + int32_t n; // size of the _p_ array + uint64_t *p; // position array for minimizers appearing >1 times + void *h; // hash table indexing _p_ and minimizers appearing once +} mm_idx_bucket_t; + +typedef struct { + int b, w, k; + uint32_t n; // number of reference sequences + mm_idx_bucket_t *B; + uint32_t max_occ; + float freq_thres; + int32_t *len; // length of each reference sequence + char **name; // TODO: if this uses too much RAM, switch to one concatenated string +} mm_idx_t; + +typedef struct { + uint32_t cnt:31, rev:1; + uint32_t rid:31, rep:1; + uint32_t len; + int32_t qs, qe, rs, re; +} mm_reg1_t; + +typedef struct { + int radius; // bandwidth to cluster hits + int max_gap; // break a chain if there are no minimizers in a max_gap window + int min_cnt; // minimum number of minimizers to start a chain + int min_match; + int sdust_thres; // score threshold for SDUST; 0 to disable + int flag; // see MM_F_* macros + float merge_frac; // merge two chains if merge_frac fraction of minimzers are shared between the chains +} mm_mapopt_t; + +extern int mm_verbose; +extern double mm_realtime0; + +struct mm_tbuf_s; +typedef struct mm_tbuf_s mm_tbuf_t; + +#ifdef __cplusplus +extern "C" { +#endif + +// compute minimizers +void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p); + +// minimizer indexing +mm_idx_t *mm_idx_init(int w, int k, int b); +void mm_idx_destroy(mm_idx_t *mi); +mm_idx_t *mm_idx_gen(bseq_file_t *fp, int w, int k, int b, int tbatch_size, int n_threads, uint64_t ibatch_size, int keep_name); +void mm_idx_set_max_occ(mm_idx_t *mi, float f); +const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n); + +mm_idx_t *mm_idx_build(const char *fn, int w, int k, int n_threads); + +// minimizer index I/O +void mm_idx_dump(FILE *fp, const mm_idx_t *mi); +mm_idx_t *mm_idx_load(FILE *fp); + +// mapping +void mm_mapopt_init(mm_mapopt_t *opt); +mm_tbuf_t *mm_tbuf_init(void); +void mm_tbuf_destroy(mm_tbuf_t *b); +const mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name); + +int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads, int tbatch_size); + +// private functions (may be moved to a "mmpriv.h" in future) +double cputime(void); +double realtime(void); +void radix_sort_128x(mm128_t *beg, mm128_t *end); +void radix_sort_64(uint64_t *beg, uint64_t *end); +uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sketch.c b/sketch.c index 04fbb4c..c5da0d9 100644 --- a/sketch.c +++ b/sketch.c @@ -36,67 +36,108 @@ static inline uint64_t hash64(uint64_t key, uint64_t mask) return key; } +typedef struct { // a simplified version of kdq + int front, count; + int a[32]; +} tiny_queue_t; + +static inline void tq_push(tiny_queue_t *q, int x) +{ + q->a[((q->count++) + q->front) & 0x1f] = x; +} + +static inline int tq_shift(tiny_queue_t *q) +{ + int x; + if (q->count == 0) return -1; + x = q->a[q->front++]; + q->front &= 0x1f; + --q->count; + return x; +} + /** * Find symmetric (w,k)-minimizers on a DNA sequence * + * @param km thread-local memory pool; using NULL falls back to malloc() * @param str DNA sequence * @param len length of $str * @param w find a minimizer for every $w consecutive k-mers * @param k k-mer size * @param rid reference ID; will be copied to the output $p array - * @param p minimizers; p->a[i].x is the 2k-bit hash value; + * @param is_hpc homopolymer-compressed or not + * @param p minimizers + * p->a[i].x = kMer<<8 | kmerSpan * p->a[i].y = rid<<32 | lastPos<<1 | strand * where lastPos is the position of the last base of the i-th minimizer, * and strand indicates whether the minimizer comes from the top or the bottom strand. * Callers may want to set "p->n = 0"; otherwise results are appended to p */ -void mm_sketch(const char *str, int len, int w, int k, uint32_t rid, mm128_v *p) +void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p) { uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, kmer[2] = {0,0}; - int i, j, l, buf_pos, min_pos; + int i, j, l, buf_pos, min_pos, kmer_span = 0; mm128_t *buf, min = { UINT64_MAX, UINT64_MAX }; + tiny_queue_t tq; - assert(len > 0 && w > 0 && k > 0); + assert(len > 0 && w > 0 && k > 0 && k <= 28); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice buf = (mm128_t*)alloca(w * 16); memset(buf, 0xff, w * 16); + memset(&tq, 0, sizeof(tiny_queue_t)); + kv_resize(mm128_t, km, *p, p->n + len/w); for (i = l = buf_pos = min_pos = 0; i < len; ++i) { int c = seq_nt4_table[(uint8_t)str[i]]; mm128_t info = { UINT64_MAX, UINT64_MAX }; if (c < 4) { // not an ambiguous base int z; + if (is_hpc) { + int skip_len = 1; + if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c) { + for (skip_len = 2; i + skip_len < len; ++skip_len) + if (seq_nt4_table[(uint8_t)str[i + skip_len]] != c) + break; + i += skip_len - 1; // put $i at the end of the current homopolymer run + } + tq_push(&tq, skip_len); + kmer_span += skip_len; + if (tq.count > k) kmer_span -= tq_shift(&tq); + if (kmer_span >= 256) continue; // make sure $kmer_span does not take more than 8 bits + } else kmer_span = l + 1 < k? l + 1 : k; kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer kmer[1] = (kmer[1] >> 2) | (3ULL^c) << shift1; // reverse k-mer if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know it strand z = kmer[0] < kmer[1]? 0 : 1; // strand - if (++l >= k) - info.x = hash64(kmer[z], mask), info.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z; - } else l = 0; + if (++l >= k) { + info.x = hash64(kmer[z], mask) << 8 | kmer_span; + info.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z; + } + } else l = 0, tq.count = tq.front = 0; buf[buf_pos] = info; // need to do this here as appropriate buf_pos and buf[buf_pos] are needed below if (l == w + k - 1) { // special case for the first window - because identical k-mers are not stored yet for (j = buf_pos + 1; j < w; ++j) - if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, *p, buf[j]); + if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, km, *p, buf[j]); for (j = 0; j < buf_pos; ++j) - if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, *p, buf[j]); + if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, km, *p, buf[j]); } if (info.x <= min.x) { // a new minimum; then write the old min - if (l >= w + k) kv_push(mm128_t, *p, min); + if (l >= w + k) kv_push(mm128_t, km, *p, min); min = info, min_pos = buf_pos; } else if (buf_pos == min_pos) { // old min has moved outside the window - if (l >= w + k - 1) kv_push(mm128_t, *p, min); + if (l >= w + k - 1) kv_push(mm128_t, km, *p, min); for (j = buf_pos + 1, min.x = UINT64_MAX; j < w; ++j) // the two loops are necessary when there are identical k-mers if (min.x >= buf[j].x) min = buf[j], min_pos = j; // >= is important s.t. min is always the closest k-mer for (j = 0; j <= buf_pos; ++j) if (min.x >= buf[j].x) min = buf[j], min_pos = j; if (l >= w + k - 1) { // write identical k-mers for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted - if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, *p, buf[j]); + if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, km, *p, buf[j]); for (j = 0; j <= buf_pos; ++j) - if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, *p, buf[j]); + if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, km, *p, buf[j]); } } if (++buf_pos == w) buf_pos = 0; } if (min.x != UINT64_MAX) - kv_push(mm128_t, *p, min); + kv_push(mm128_t, km, *p, min); }