Homopolymer-compressed k-mer sketch

This commit is contained in:
Heng Li 2017-04-06 15:37:34 -04:00
parent 2bb0839177
commit 01baa847a1
8 changed files with 838 additions and 14 deletions

65
bseq.c 100644
View File

@ -0,0 +1,65 @@
#include <zlib.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "bseq.h"
#include "kseq.h"
KSEQ_INIT(gzFile, gzread)
extern unsigned char seq_nt4_table[256];
struct bseq_file_s {
int is_eof;
gzFile fp;
kseq_t *ks;
};
bseq_file_t *bseq_open(const char *fn)
{
bseq_file_t *fp;
gzFile f;
f = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
if (f == 0) return 0;
fp = (bseq_file_t*)calloc(1, sizeof(bseq_file_t));
fp->fp = f;
fp->ks = kseq_init(fp->fp);
return fp;
}
void bseq_close(bseq_file_t *fp)
{
kseq_destroy(fp->ks);
gzclose(fp->fp);
free(fp);
}
bseq1_t *bseq_read(bseq_file_t *fp, int chunk_size, int *n_)
{
int size = 0, m, n;
bseq1_t *seqs;
kseq_t *ks = fp->ks;
m = n = 0; seqs = 0;
while (kseq_read(ks) >= 0) {
bseq1_t *s;
assert(ks->seq.l <= INT32_MAX);
if (n >= m) {
m = m? m<<1 : 256;
seqs = (bseq1_t*)realloc(seqs, m * sizeof(bseq1_t));
}
s = &seqs[n];
s->name = strdup(ks->name.s);
s->seq = strdup(ks->seq.s);
s->l_seq = ks->seq.l;
size += seqs[n++].l_seq;
if (size >= chunk_size) break;
}
if (n == 0) fp->is_eof = 1;
*n_ = n;
return seqs;
}
int bseq_eof(bseq_file_t *fp)
{
return fp->is_eof;
}

19
bseq.h 100644
View File

@ -0,0 +1,19 @@
#ifndef MM_BSEQ_H
#define MM_BSEQ_H
#include <stdint.h>
struct bseq_file_s;
typedef struct bseq_file_s bseq_file_t;
typedef struct {
int l_seq, rid;
char *name, *seq;
} bseq1_t;
bseq_file_t *bseq_open(const char *fn);
void bseq_close(bseq_file_t *fp);
bseq1_t *bseq_read(bseq_file_t *fp, int chunk_size, int *n_);
int bseq_eof(bseq_file_t *fp);
#endif

216
kalloc.c 100644
View File

@ -0,0 +1,216 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include "kalloc.h"
/* The whole thing is: ("@" for the kheader_t of the block, "-" for free
* memory, and "+" for allocated memory. One char for one unit.)
*
* This region is core 1. This region is core 2.
*
* @-------@++++++@++++++++++++@------------ @----------@++++++++++++@+++++++@------------
* | | | |
* p=p->ptr->ptr->ptr->ptr p->ptr p->ptr->ptr p->ptr->ptr->ptr
*/
#define PTR(p) ((size_t*)((size_t*)p)[1])
typedef struct _allocated_t {
struct _allocated_t *next;
size_t *ptr;
} allocated_t;
typedef struct {
size_t base[2], *loop_head;
allocated_t list_head, *list_tail;
size_t total_allocated;
} kmem_t;
void *km_init()
{
return calloc(1, sizeof(kmem_t));
}
static void kerror(const char *s)
{
fprintf(stderr, "%s\n", s);
exit(1);
}
static size_t *morecore(kmem_t *km, size_t nu)
{
size_t rnu, *up;
rnu = (nu + 0xfffff) & (~(size_t)0xfffff);
up = (size_t*)malloc(rnu * sizeof(size_t));
if (!up) { /* fail to allocate memory */
km_stat(km);
fprintf(stderr, "[morecore] %lu bytes requested but not available.\n", rnu * sizeof(size_t));
exit(1);
}
/* put the pointer in km->list_head */
if (km->list_tail == 0) km->list_tail = &km->list_head;
km->list_tail->ptr = up;
km->list_tail->next = (allocated_t*)calloc(1, sizeof(allocated_t));
km->list_tail = km->list_tail->next;
km->total_allocated += rnu * sizeof(size_t);
*up = rnu; /* the size of the current block, and in this case the block is the same as the new core */
kfree(km, up + 1); /* initialize the new "core" */
return km->loop_head;
}
void km_destroy(void *_km)
{
kmem_t *km = (kmem_t*)_km;
allocated_t *p, *q;
if (km == 0) return;
p = &km->list_head;
do {
q = p->next;
free(p->ptr);
if (p != &km->list_head) free(p);
p = q;
} while (p && p->next);
if (p != &km->list_head) free(p);
free(km);
}
void kfree(void *_km, void *ap)
{
size_t *p, *q;
kmem_t *km = (kmem_t*)_km;
if (!ap) return;
if (km == 0) {
free(ap);
return;
}
p = (size_t*)ap - 1; /* *p is the size of the current block */
/* Find the pointer that points to the block to be freed. The following loop can stop on two conditions:
*
* a) "p>q && p<q->ptr": @------@++++++++@+++++++@------- @---------------@+++++++@-------
* (can also be in | | | -> | |
* two cores) q p q->ptr q q->ptr
*
* @-------- @+++++++++@-------- @-------- @------------------
* | | | -> | |
* q p q->ptr q q->ptr
*
* b) "q>=q->ptr && (p>q || p<q->ptr)": @-------@+++++ @--------@+++++++ @-------@+++++ @----------------
* | | | -> | |
* q->ptr q p q->ptr q
*
* @+++++++@----- @++++++++@------- @------------- @++++++++@-------
* | | | -> | |
* p q->ptr q q->ptr q
*/
for (q = km->loop_head; !(p > q && p < PTR(q)); q = PTR(q))
if (q >= PTR(q) && (p > q || p < PTR(q))) break;
if (p + (*p) == PTR(q)) { /* two adjacent blocks, merge p and q->ptr (the 2nd and 4th cases) */
*p += *PTR(q); /* this is the new q->ptr size */
p[1] = (size_t)PTR(PTR(q)); /* this is the new q->ptr->ptr */
/* p is actually the new q->ptr. The actual change happens a few lines below. */
} else if (p + (*p) > PTR(q) && PTR(q) >= p) { /* the end of the allocated block is in the next free block */
kerror("[kfree] The end of the allocated block enters a free block.");
} else p[1] = (size_t)PTR(q); /* backup q->ptr */
if (q + (*q) == p) { /* two adjacent blocks, merge q and p (the other two cases) */
*q += *p;
q[1] = (size_t)PTR(p);
km->loop_head = q;
} else if (q + (*q) > p && p >= q) { /* the end of a free block in the allocated block */
kerror("[kfree] The end of a free block enters the allocated block.");
} else km->loop_head = p, q[1] = (size_t)p; /* in two cores, cannot be merged */
}
#define round_up(v) (--(v), (v)|=(v)>>1, (v)|=(v)>>2, (v)|=(v)>>4, (v)|=(v)>>8, (v)|=(v)>>16, (v)|=(v)>>32, ++(v))
void *krealloc(void *_km, void *ap, size_t n_bytes)
{
kmem_t *km = (kmem_t*)_km;
size_t n_units, *p, *q;
if (n_bytes == 0) {
kfree(km, ap); return 0;
}
if (km == 0) return realloc(ap, n_bytes);
if (!ap) return kmalloc(km, n_bytes);
n_units = 1 + (n_bytes + sizeof(size_t) - 1) / sizeof(size_t);
p = (size_t*)ap - 1;
if (*p >= n_units) return ap;
q = kmalloc(km, n_bytes);
if (q != ap) memcpy(q, ap, (*p - 1) * sizeof(size_t));
kfree(km, ap);
return q;
}
void *kmalloc(void *_km, size_t n_bytes)
{
kmem_t *km = (kmem_t*)_km;
size_t n_units, *p, *q;
if (n_bytes == 0) return 0;
if (km == 0) return malloc(n_bytes);
/* "n_units" means the number of units. The size of one unit equals to sizeof(kheader_t).
* "1" is the kheader_t of a block, which is always required. */
n_units = 1 + (n_bytes + sizeof(size_t) - 1) / sizeof(size_t);
round_up(n_units);
if (!(q = km->loop_head)) { /* the first time when kmalloc() is called, intialization */
km->base[1] = (size_t)(km->loop_head = q = km->base); *q = 0;
}
for (p = PTR(q);; q = p, p = PTR(p)) { /* search for a suitable block */
if (*p >= n_units) { /* p->size if the size of current block. This line means the current block is large enough. */
if (*p == n_units) q[1] = (size_t)PTR(p); /* no need to split the block */
else { /* split the block */
/* memory is allocated at the end of the block */
*p -= n_units; /* reduce the size of the free block */
p += *p; /* skip to the kheader_t of the allocated block */
*p = n_units; /* set the size */
}
km->loop_head = q; /* set the end of chain */
return p + 1; /* skip the kheader_t */
}
if (p == km->loop_head) { /* then ask for more "cores" */
if ((p = morecore(km, n_units)) == 0) return 0;
}
}
}
void *kcalloc(void *_km, size_t count, size_t size)
{
kmem_t *km = (kmem_t*)_km;
char *p;
if (size == 0 || count == 0) return 0;
if (km == 0) return calloc(count, size);
p = kmalloc(km, count * size);
memset(p, 0, count * size);
return p;
}
void km_stat(const void *_km)
{
kmem_t *km = (kmem_t*)_km;
unsigned n_blocks, n_units;
size_t max_block = 0, *p, *q;
float frag;
if (km == 0 || !(p = km->loop_head)) return;
n_blocks = n_units = 0;
do {
q = PTR(p);
if (*p > max_block) max_block = *p;
n_units += *p;
if (p + (*p) > q && q > p)
kerror("[kr_stat] The end of a free block enters another free block.");
p = q;
++n_blocks;
} while (p != km->loop_head);
--n_blocks;
frag = 1.0/1024.0 * n_units * sizeof(size_t) / n_blocks;
fprintf(stderr, "[kr_stat] tot=%lu, free=%lu, n_block=%u, max_block=%lu, frag_len=%.3fK\n",
km->total_allocated, n_units * sizeof(size_t), n_blocks, max_block * sizeof(size_t), frag);
}

26
kalloc.h 100644
View File

@ -0,0 +1,26 @@
#ifndef _KALLOC_H_
#define _KALLOC_H_
#include <stdlib.h>
#define km_size(x) (*(((size_t*)(x))-1) * sizeof(size_t))
#ifdef __cplusplus
extern "C" {
#endif
void *kmalloc(void *km, size_t size);
void *krealloc(void *km, void *ptr, size_t size);
void *kcalloc(void *km, size_t count, size_t size);
void kfree(void *km, void *ptr);
void *km_init(void);
void km_destroy(void *km);
void km_stat(const void *km); // TODO: return numbers instead of print to stderr
#ifdef __cplusplus
}
#endif
#endif

248
kseq.h 100644
View File

@ -0,0 +1,248 @@
/* The MIT License
Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/* Last Modified: 05MAR2012 */
#ifndef AC_KSEQ_H
#define AC_KSEQ_H
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
#define KS_SEP_TAB 1 // isspace() && !' '
#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
#define KS_SEP_MAX 2
#define __KS_TYPE(type_t) \
typedef struct __kstream_t { \
int begin, end; \
int is_eof:2, bufsize:30; \
type_t f; \
unsigned char *buf; \
} kstream_t;
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
#define __KS_BASIC(SCOPE, type_t, __bufsize) \
SCOPE kstream_t *ks_init(type_t f) \
{ \
kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
ks->f = f; ks->bufsize = __bufsize; \
ks->buf = (unsigned char*)malloc(__bufsize); \
return ks; \
} \
SCOPE void ks_destroy(kstream_t *ks) \
{ \
if (!ks) return; \
free(ks->buf); \
free(ks); \
}
#define __KS_INLINED(__read) \
static inline int ks_getc(kstream_t *ks) \
{ \
if (ks->is_eof && ks->begin >= ks->end) return -1; \
if (ks->begin >= ks->end) { \
ks->begin = 0; \
ks->end = __read(ks->f, ks->buf, ks->bufsize); \
if (ks->end < ks->bufsize) ks->is_eof = 1; \
if (ks->end == 0) return -1; \
} \
return (int)ks->buf[ks->begin++]; \
} \
static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
unsigned l, m;
char *s;
} kstring_t;
#endif
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
#define __KS_GETUNTIL(SCOPE, __read) \
SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
{ \
if (dret) *dret = 0; \
str->l = append? str->l : 0; \
if (ks->begin >= ks->end && ks->is_eof) return -1; \
for (;;) { \
int i; \
if (ks->begin >= ks->end) { \
if (!ks->is_eof) { \
ks->begin = 0; \
ks->end = __read(ks->f, ks->buf, ks->bufsize); \
if (ks->end < ks->bufsize) ks->is_eof = 1; \
if (ks->end == 0) break; \
} else break; \
} \
if (delimiter == KS_SEP_LINE) { \
for (i = ks->begin; i < ks->end; ++i) \
if (ks->buf[i] == '\n') break; \
} else if (delimiter > KS_SEP_MAX) { \
for (i = ks->begin; i < ks->end; ++i) \
if (ks->buf[i] == delimiter) break; \
} else if (delimiter == KS_SEP_SPACE) { \
for (i = ks->begin; i < ks->end; ++i) \
if (isspace(ks->buf[i])) break; \
} else if (delimiter == KS_SEP_TAB) { \
for (i = ks->begin; i < ks->end; ++i) \
if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
} else i = 0; /* never come to here! */ \
if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
str->m = str->l + (i - ks->begin) + 1; \
kroundup32(str->m); \
str->s = (char*)realloc(str->s, str->m); \
} \
memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
str->l = str->l + (i - ks->begin); \
ks->begin = i + 1; \
if (i < ks->end) { \
if (dret) *dret = ks->buf[i]; \
break; \
} \
} \
if (str->s == 0) { \
str->m = 1; \
str->s = (char*)calloc(1, 1); \
} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
str->s[str->l] = '\0'; \
return str->l; \
}
#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \
__KS_TYPE(type_t) \
__KS_BASIC(SCOPE, type_t, __bufsize) \
__KS_GETUNTIL(SCOPE, __read) \
__KS_INLINED(__read)
#define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize)
#define KSTREAM_DECLARE(type_t, __read) \
__KS_TYPE(type_t) \
extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \
extern kstream_t *ks_init(type_t f); \
extern void ks_destroy(kstream_t *ks); \
__KS_INLINED(__read)
/******************
* FASTA/Q parser *
******************/
#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
#define __KSEQ_BASIC(SCOPE, type_t) \
SCOPE kseq_t *kseq_init(type_t fd) \
{ \
kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
s->f = ks_init(fd); \
return s; \
} \
SCOPE void kseq_destroy(kseq_t *ks) \
{ \
if (!ks) return; \
free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
ks_destroy(ks->f); \
free(ks); \
}
/* Return value:
>=0 length of the sequence (normal)
-1 end-of-file
-2 truncated quality string
*/
#define __KSEQ_READ(SCOPE) \
SCOPE int kseq_read(kseq_t *seq) \
{ \
int c; \
kstream_t *ks = seq->f; \
if (seq->last_char == 0) { /* then jump to the next header line */ \
while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
if (c == -1) return -1; /* end of file */ \
seq->last_char = c; \
} /* else: the first header char has been read in the previous call */ \
seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
seq->seq.m = 256; \
seq->seq.s = (char*)malloc(seq->seq.m); \
} \
while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
if (c == '\n') continue; /* skip empty lines */ \
seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
} \
if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
seq->seq.m = seq->seq.l + 2; \
kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
} \
seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
if (c != '+') return seq->seq.l; /* FASTA */ \
if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
seq->qual.m = seq->seq.m; \
seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
} \
while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
if (c == -1) return -2; /* error: no quality string */ \
while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
seq->last_char = 0; /* we have not come to the next header line */ \
if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
return seq->seq.l; \
}
#define __KSEQ_TYPE(type_t) \
typedef struct { \
kstring_t name, comment, seq, qual; \
int last_char; \
kstream_t *f; \
} kseq_t;
#define KSEQ_INIT2(SCOPE, type_t, __read) \
KSTREAM_INIT2(SCOPE, type_t, __read, 16384) \
__KSEQ_TYPE(type_t) \
__KSEQ_BASIC(SCOPE, type_t) \
__KSEQ_READ(SCOPE)
#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
#define KSEQ_DECLARE(type_t) \
__KS_TYPE(type_t) \
__KSEQ_TYPE(type_t) \
extern kseq_t *kseq_init(type_t fd); \
void kseq_destroy(kseq_t *ks); \
int kseq_read(kseq_t *seq);
#endif

105
kvec.h 100644
View File

@ -0,0 +1,105 @@
/* The MIT License
Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/*
An example:
#include "kvec.h"
int main() {
kvec_t(int) array;
kv_init(array);
kv_push(int, array, 10); // append
kv_a(int, array, 20) = 5; // dynamic
kv_A(array, 20) = 4; // static
kv_destroy(array);
return 0;
}
*/
/*
2008-09-22 (0.1.0):
* The initial version.
*/
#ifndef AC_KVEC_H
#define AC_KVEC_H
#include <stdlib.h>
#include "kalloc.h"
#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#define kvec_t(type) struct { size_t n, m; type *a; }
#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
#define kv_destroy(v) free((v).a)
#define kv_A(v, i) ((v).a[(i)])
#define kv_pop(v) ((v).a[--(v).n])
#define kv_size(v) ((v).n)
#define kv_max(v) ((v).m)
#define kv_resize(type, km, v, s) do { \
if ((v).m < (s)) { \
(v).m = (s); \
kv_roundup32((v).m); \
(v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); \
} \
} while (0)
#define kv_copy(type, km, v1, v0) do { \
if ((v1).m < (v0).n) kv_resize(type, (km), (v1), (v0).n); \
(v1).n = (v0).n; \
memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \
} while (0) \
#define kv_push(type, km, v, x) do { \
if ((v).n == (v).m) { \
(v).m = (v).m? (v).m<<1 : 2; \
(v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); \
} \
(v).a[(v).n++] = (x); \
} while (0)
#define kv_pushp(type, km, v, p) do { \
if ((v).n == (v).m) { \
(v).m = (v).m? (v).m<<1 : 2; \
(v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); \
} \
*(p) = &(v).a[(v).n++]; \
} while (0)
#define kv_reverse(type, v, start) do { \
if ((v).m > 0 && (v).n > (start)) { \
size_t __i, __end = (v).n - (start); \
type *__a = (v).a + (start); \
for (__i = 0; __i < __end>>1; ++__i) { \
type __t = __a[__end - 1 - __i]; \
__a[__end - 1 - __i] = __a[__i]; __a[__i] = __t; \
} \
} \
} while (0)
#endif

104
minimap.h 100644
View File

@ -0,0 +1,104 @@
#ifndef MINIMAP_H
#define MINIMAP_H
#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>
#include "bseq.h"
#define MM_IDX_DEF_B 14
#define MM_DEREP_Q50 5.0
#define MM_F_WITH_REP 0x1
#define MM_F_NO_SELF 0x2
#define MM_F_NO_ISO 0x4
#define MM_F_AVA 0x8
typedef struct {
uint64_t x, y;
} mm128_t;
typedef struct { size_t n, m; mm128_t *a; } mm128_v;
typedef struct { size_t n, m; uint64_t *a; } uint64_v;
typedef struct { size_t n, m; uint32_t *a; } uint32_v;
typedef struct {
mm128_v a; // (minimizer, position) array
int32_t n; // size of the _p_ array
uint64_t *p; // position array for minimizers appearing >1 times
void *h; // hash table indexing _p_ and minimizers appearing once
} mm_idx_bucket_t;
typedef struct {
int b, w, k;
uint32_t n; // number of reference sequences
mm_idx_bucket_t *B;
uint32_t max_occ;
float freq_thres;
int32_t *len; // length of each reference sequence
char **name; // TODO: if this uses too much RAM, switch to one concatenated string
} mm_idx_t;
typedef struct {
uint32_t cnt:31, rev:1;
uint32_t rid:31, rep:1;
uint32_t len;
int32_t qs, qe, rs, re;
} mm_reg1_t;
typedef struct {
int radius; // bandwidth to cluster hits
int max_gap; // break a chain if there are no minimizers in a max_gap window
int min_cnt; // minimum number of minimizers to start a chain
int min_match;
int sdust_thres; // score threshold for SDUST; 0 to disable
int flag; // see MM_F_* macros
float merge_frac; // merge two chains if merge_frac fraction of minimzers are shared between the chains
} mm_mapopt_t;
extern int mm_verbose;
extern double mm_realtime0;
struct mm_tbuf_s;
typedef struct mm_tbuf_s mm_tbuf_t;
#ifdef __cplusplus
extern "C" {
#endif
// compute minimizers
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p);
// minimizer indexing
mm_idx_t *mm_idx_init(int w, int k, int b);
void mm_idx_destroy(mm_idx_t *mi);
mm_idx_t *mm_idx_gen(bseq_file_t *fp, int w, int k, int b, int tbatch_size, int n_threads, uint64_t ibatch_size, int keep_name);
void mm_idx_set_max_occ(mm_idx_t *mi, float f);
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int n_threads);
// minimizer index I/O
void mm_idx_dump(FILE *fp, const mm_idx_t *mi);
mm_idx_t *mm_idx_load(FILE *fp);
// mapping
void mm_mapopt_init(mm_mapopt_t *opt);
mm_tbuf_t *mm_tbuf_init(void);
void mm_tbuf_destroy(mm_tbuf_t *b);
const mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name);
int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads, int tbatch_size);
// private functions (may be moved to a "mmpriv.h" in future)
double cputime(void);
double realtime(void);
void radix_sort_128x(mm128_t *beg, mm128_t *end);
void radix_sort_64(uint64_t *beg, uint64_t *end);
uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -36,67 +36,108 @@ static inline uint64_t hash64(uint64_t key, uint64_t mask)
return key;
}
typedef struct { // a simplified version of kdq
int front, count;
int a[32];
} tiny_queue_t;
static inline void tq_push(tiny_queue_t *q, int x)
{
q->a[((q->count++) + q->front) & 0x1f] = x;
}
static inline int tq_shift(tiny_queue_t *q)
{
int x;
if (q->count == 0) return -1;
x = q->a[q->front++];
q->front &= 0x1f;
--q->count;
return x;
}
/**
* Find symmetric (w,k)-minimizers on a DNA sequence
*
* @param km thread-local memory pool; using NULL falls back to malloc()
* @param str DNA sequence
* @param len length of $str
* @param w find a minimizer for every $w consecutive k-mers
* @param k k-mer size
* @param rid reference ID; will be copied to the output $p array
* @param p minimizers; p->a[i].x is the 2k-bit hash value;
* @param is_hpc homopolymer-compressed or not
* @param p minimizers
* p->a[i].x = kMer<<8 | kmerSpan
* p->a[i].y = rid<<32 | lastPos<<1 | strand
* where lastPos is the position of the last base of the i-th minimizer,
* and strand indicates whether the minimizer comes from the top or the bottom strand.
* Callers may want to set "p->n = 0"; otherwise results are appended to p
*/
void mm_sketch(const char *str, int len, int w, int k, uint32_t rid, mm128_v *p)
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p)
{
uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, kmer[2] = {0,0};
int i, j, l, buf_pos, min_pos;
int i, j, l, buf_pos, min_pos, kmer_span = 0;
mm128_t *buf, min = { UINT64_MAX, UINT64_MAX };
tiny_queue_t tq;
assert(len > 0 && w > 0 && k > 0);
assert(len > 0 && w > 0 && k > 0 && k <= 28); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice
buf = (mm128_t*)alloca(w * 16);
memset(buf, 0xff, w * 16);
memset(&tq, 0, sizeof(tiny_queue_t));
kv_resize(mm128_t, km, *p, p->n + len/w);
for (i = l = buf_pos = min_pos = 0; i < len; ++i) {
int c = seq_nt4_table[(uint8_t)str[i]];
mm128_t info = { UINT64_MAX, UINT64_MAX };
if (c < 4) { // not an ambiguous base
int z;
if (is_hpc) {
int skip_len = 1;
if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c) {
for (skip_len = 2; i + skip_len < len; ++skip_len)
if (seq_nt4_table[(uint8_t)str[i + skip_len]] != c)
break;
i += skip_len - 1; // put $i at the end of the current homopolymer run
}
tq_push(&tq, skip_len);
kmer_span += skip_len;
if (tq.count > k) kmer_span -= tq_shift(&tq);
if (kmer_span >= 256) continue; // make sure $kmer_span does not take more than 8 bits
} else kmer_span = l + 1 < k? l + 1 : k;
kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer
kmer[1] = (kmer[1] >> 2) | (3ULL^c) << shift1; // reverse k-mer
if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know it strand
z = kmer[0] < kmer[1]? 0 : 1; // strand
if (++l >= k)
info.x = hash64(kmer[z], mask), info.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z;
} else l = 0;
if (++l >= k) {
info.x = hash64(kmer[z], mask) << 8 | kmer_span;
info.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z;
}
} else l = 0, tq.count = tq.front = 0;
buf[buf_pos] = info; // need to do this here as appropriate buf_pos and buf[buf_pos] are needed below
if (l == w + k - 1) { // special case for the first window - because identical k-mers are not stored yet
for (j = buf_pos + 1; j < w; ++j)
if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, *p, buf[j]);
if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, km, *p, buf[j]);
for (j = 0; j < buf_pos; ++j)
if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, *p, buf[j]);
if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, km, *p, buf[j]);
}
if (info.x <= min.x) { // a new minimum; then write the old min
if (l >= w + k) kv_push(mm128_t, *p, min);
if (l >= w + k) kv_push(mm128_t, km, *p, min);
min = info, min_pos = buf_pos;
} else if (buf_pos == min_pos) { // old min has moved outside the window
if (l >= w + k - 1) kv_push(mm128_t, *p, min);
if (l >= w + k - 1) kv_push(mm128_t, km, *p, min);
for (j = buf_pos + 1, min.x = UINT64_MAX; j < w; ++j) // the two loops are necessary when there are identical k-mers
if (min.x >= buf[j].x) min = buf[j], min_pos = j; // >= is important s.t. min is always the closest k-mer
for (j = 0; j <= buf_pos; ++j)
if (min.x >= buf[j].x) min = buf[j], min_pos = j;
if (l >= w + k - 1) { // write identical k-mers
for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted
if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, *p, buf[j]);
if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, km, *p, buf[j]);
for (j = 0; j <= buf_pos; ++j)
if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, *p, buf[j]);
if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, km, *p, buf[j]);
}
}
if (++buf_pos == w) buf_pos = 0;
}
if (min.x != UINT64_MAX)
kv_push(mm128_t, *p, min);
kv_push(mm128_t, km, *p, min);
}