Homopolymer-compressed k-mer sketch
This commit is contained in:
parent
2bb0839177
commit
01baa847a1
|
|
@ -0,0 +1,65 @@
|
|||
#include <zlib.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include "bseq.h"
|
||||
#include "kseq.h"
|
||||
KSEQ_INIT(gzFile, gzread)
|
||||
|
||||
extern unsigned char seq_nt4_table[256];
|
||||
|
||||
struct bseq_file_s {
|
||||
int is_eof;
|
||||
gzFile fp;
|
||||
kseq_t *ks;
|
||||
};
|
||||
|
||||
bseq_file_t *bseq_open(const char *fn)
|
||||
{
|
||||
bseq_file_t *fp;
|
||||
gzFile f;
|
||||
f = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
|
||||
if (f == 0) return 0;
|
||||
fp = (bseq_file_t*)calloc(1, sizeof(bseq_file_t));
|
||||
fp->fp = f;
|
||||
fp->ks = kseq_init(fp->fp);
|
||||
return fp;
|
||||
}
|
||||
|
||||
void bseq_close(bseq_file_t *fp)
|
||||
{
|
||||
kseq_destroy(fp->ks);
|
||||
gzclose(fp->fp);
|
||||
free(fp);
|
||||
}
|
||||
|
||||
bseq1_t *bseq_read(bseq_file_t *fp, int chunk_size, int *n_)
|
||||
{
|
||||
int size = 0, m, n;
|
||||
bseq1_t *seqs;
|
||||
kseq_t *ks = fp->ks;
|
||||
m = n = 0; seqs = 0;
|
||||
while (kseq_read(ks) >= 0) {
|
||||
bseq1_t *s;
|
||||
assert(ks->seq.l <= INT32_MAX);
|
||||
if (n >= m) {
|
||||
m = m? m<<1 : 256;
|
||||
seqs = (bseq1_t*)realloc(seqs, m * sizeof(bseq1_t));
|
||||
}
|
||||
s = &seqs[n];
|
||||
s->name = strdup(ks->name.s);
|
||||
s->seq = strdup(ks->seq.s);
|
||||
s->l_seq = ks->seq.l;
|
||||
size += seqs[n++].l_seq;
|
||||
if (size >= chunk_size) break;
|
||||
}
|
||||
if (n == 0) fp->is_eof = 1;
|
||||
*n_ = n;
|
||||
return seqs;
|
||||
}
|
||||
|
||||
int bseq_eof(bseq_file_t *fp)
|
||||
{
|
||||
return fp->is_eof;
|
||||
}
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
#ifndef MM_BSEQ_H
|
||||
#define MM_BSEQ_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
struct bseq_file_s;
|
||||
typedef struct bseq_file_s bseq_file_t;
|
||||
|
||||
typedef struct {
|
||||
int l_seq, rid;
|
||||
char *name, *seq;
|
||||
} bseq1_t;
|
||||
|
||||
bseq_file_t *bseq_open(const char *fn);
|
||||
void bseq_close(bseq_file_t *fp);
|
||||
bseq1_t *bseq_read(bseq_file_t *fp, int chunk_size, int *n_);
|
||||
int bseq_eof(bseq_file_t *fp);
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,216 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
#include "kalloc.h"
|
||||
|
||||
/* The whole thing is: ("@" for the kheader_t of the block, "-" for free
|
||||
* memory, and "+" for allocated memory. One char for one unit.)
|
||||
*
|
||||
* This region is core 1. This region is core 2.
|
||||
*
|
||||
* @-------@++++++@++++++++++++@------------ @----------@++++++++++++@+++++++@------------
|
||||
* | | | |
|
||||
* p=p->ptr->ptr->ptr->ptr p->ptr p->ptr->ptr p->ptr->ptr->ptr
|
||||
*/
|
||||
|
||||
#define PTR(p) ((size_t*)((size_t*)p)[1])
|
||||
|
||||
typedef struct _allocated_t {
|
||||
struct _allocated_t *next;
|
||||
size_t *ptr;
|
||||
} allocated_t;
|
||||
|
||||
typedef struct {
|
||||
size_t base[2], *loop_head;
|
||||
allocated_t list_head, *list_tail;
|
||||
size_t total_allocated;
|
||||
} kmem_t;
|
||||
|
||||
void *km_init()
|
||||
{
|
||||
return calloc(1, sizeof(kmem_t));
|
||||
}
|
||||
|
||||
static void kerror(const char *s)
|
||||
{
|
||||
fprintf(stderr, "%s\n", s);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
static size_t *morecore(kmem_t *km, size_t nu)
|
||||
{
|
||||
size_t rnu, *up;
|
||||
|
||||
rnu = (nu + 0xfffff) & (~(size_t)0xfffff);
|
||||
up = (size_t*)malloc(rnu * sizeof(size_t));
|
||||
if (!up) { /* fail to allocate memory */
|
||||
km_stat(km);
|
||||
fprintf(stderr, "[morecore] %lu bytes requested but not available.\n", rnu * sizeof(size_t));
|
||||
exit(1);
|
||||
}
|
||||
/* put the pointer in km->list_head */
|
||||
if (km->list_tail == 0) km->list_tail = &km->list_head;
|
||||
km->list_tail->ptr = up;
|
||||
km->list_tail->next = (allocated_t*)calloc(1, sizeof(allocated_t));
|
||||
km->list_tail = km->list_tail->next;
|
||||
|
||||
km->total_allocated += rnu * sizeof(size_t);
|
||||
*up = rnu; /* the size of the current block, and in this case the block is the same as the new core */
|
||||
kfree(km, up + 1); /* initialize the new "core" */
|
||||
return km->loop_head;
|
||||
}
|
||||
|
||||
void km_destroy(void *_km)
|
||||
{
|
||||
kmem_t *km = (kmem_t*)_km;
|
||||
allocated_t *p, *q;
|
||||
if (km == 0) return;
|
||||
p = &km->list_head;
|
||||
do {
|
||||
q = p->next;
|
||||
free(p->ptr);
|
||||
if (p != &km->list_head) free(p);
|
||||
p = q;
|
||||
} while (p && p->next);
|
||||
if (p != &km->list_head) free(p);
|
||||
free(km);
|
||||
}
|
||||
|
||||
void kfree(void *_km, void *ap)
|
||||
{
|
||||
size_t *p, *q;
|
||||
kmem_t *km = (kmem_t*)_km;
|
||||
|
||||
if (!ap) return;
|
||||
if (km == 0) {
|
||||
free(ap);
|
||||
return;
|
||||
}
|
||||
p = (size_t*)ap - 1; /* *p is the size of the current block */
|
||||
/* Find the pointer that points to the block to be freed. The following loop can stop on two conditions:
|
||||
*
|
||||
* a) "p>q && p<q->ptr": @------@++++++++@+++++++@------- @---------------@+++++++@-------
|
||||
* (can also be in | | | -> | |
|
||||
* two cores) q p q->ptr q q->ptr
|
||||
*
|
||||
* @-------- @+++++++++@-------- @-------- @------------------
|
||||
* | | | -> | |
|
||||
* q p q->ptr q q->ptr
|
||||
*
|
||||
* b) "q>=q->ptr && (p>q || p<q->ptr)": @-------@+++++ @--------@+++++++ @-------@+++++ @----------------
|
||||
* | | | -> | |
|
||||
* q->ptr q p q->ptr q
|
||||
*
|
||||
* @+++++++@----- @++++++++@------- @------------- @++++++++@-------
|
||||
* | | | -> | |
|
||||
* p q->ptr q q->ptr q
|
||||
*/
|
||||
for (q = km->loop_head; !(p > q && p < PTR(q)); q = PTR(q))
|
||||
if (q >= PTR(q) && (p > q || p < PTR(q))) break;
|
||||
if (p + (*p) == PTR(q)) { /* two adjacent blocks, merge p and q->ptr (the 2nd and 4th cases) */
|
||||
*p += *PTR(q); /* this is the new q->ptr size */
|
||||
p[1] = (size_t)PTR(PTR(q)); /* this is the new q->ptr->ptr */
|
||||
/* p is actually the new q->ptr. The actual change happens a few lines below. */
|
||||
} else if (p + (*p) > PTR(q) && PTR(q) >= p) { /* the end of the allocated block is in the next free block */
|
||||
kerror("[kfree] The end of the allocated block enters a free block.");
|
||||
} else p[1] = (size_t)PTR(q); /* backup q->ptr */
|
||||
|
||||
if (q + (*q) == p) { /* two adjacent blocks, merge q and p (the other two cases) */
|
||||
*q += *p;
|
||||
q[1] = (size_t)PTR(p);
|
||||
km->loop_head = q;
|
||||
} else if (q + (*q) > p && p >= q) { /* the end of a free block in the allocated block */
|
||||
kerror("[kfree] The end of a free block enters the allocated block.");
|
||||
} else km->loop_head = p, q[1] = (size_t)p; /* in two cores, cannot be merged */
|
||||
}
|
||||
|
||||
#define round_up(v) (--(v), (v)|=(v)>>1, (v)|=(v)>>2, (v)|=(v)>>4, (v)|=(v)>>8, (v)|=(v)>>16, (v)|=(v)>>32, ++(v))
|
||||
|
||||
void *krealloc(void *_km, void *ap, size_t n_bytes)
|
||||
{
|
||||
kmem_t *km = (kmem_t*)_km;
|
||||
size_t n_units, *p, *q;
|
||||
|
||||
if (n_bytes == 0) {
|
||||
kfree(km, ap); return 0;
|
||||
}
|
||||
if (km == 0) return realloc(ap, n_bytes);
|
||||
if (!ap) return kmalloc(km, n_bytes);
|
||||
n_units = 1 + (n_bytes + sizeof(size_t) - 1) / sizeof(size_t);
|
||||
p = (size_t*)ap - 1;
|
||||
if (*p >= n_units) return ap;
|
||||
q = kmalloc(km, n_bytes);
|
||||
if (q != ap) memcpy(q, ap, (*p - 1) * sizeof(size_t));
|
||||
kfree(km, ap);
|
||||
return q;
|
||||
}
|
||||
|
||||
void *kmalloc(void *_km, size_t n_bytes)
|
||||
{
|
||||
kmem_t *km = (kmem_t*)_km;
|
||||
size_t n_units, *p, *q;
|
||||
|
||||
if (n_bytes == 0) return 0;
|
||||
if (km == 0) return malloc(n_bytes);
|
||||
/* "n_units" means the number of units. The size of one unit equals to sizeof(kheader_t).
|
||||
* "1" is the kheader_t of a block, which is always required. */
|
||||
n_units = 1 + (n_bytes + sizeof(size_t) - 1) / sizeof(size_t);
|
||||
round_up(n_units);
|
||||
|
||||
if (!(q = km->loop_head)) { /* the first time when kmalloc() is called, intialization */
|
||||
km->base[1] = (size_t)(km->loop_head = q = km->base); *q = 0;
|
||||
}
|
||||
for (p = PTR(q);; q = p, p = PTR(p)) { /* search for a suitable block */
|
||||
if (*p >= n_units) { /* p->size if the size of current block. This line means the current block is large enough. */
|
||||
if (*p == n_units) q[1] = (size_t)PTR(p); /* no need to split the block */
|
||||
else { /* split the block */
|
||||
/* memory is allocated at the end of the block */
|
||||
*p -= n_units; /* reduce the size of the free block */
|
||||
p += *p; /* skip to the kheader_t of the allocated block */
|
||||
*p = n_units; /* set the size */
|
||||
}
|
||||
km->loop_head = q; /* set the end of chain */
|
||||
return p + 1; /* skip the kheader_t */
|
||||
}
|
||||
if (p == km->loop_head) { /* then ask for more "cores" */
|
||||
if ((p = morecore(km, n_units)) == 0) return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void *kcalloc(void *_km, size_t count, size_t size)
|
||||
{
|
||||
kmem_t *km = (kmem_t*)_km;
|
||||
char *p;
|
||||
if (size == 0 || count == 0) return 0;
|
||||
if (km == 0) return calloc(count, size);
|
||||
p = kmalloc(km, count * size);
|
||||
memset(p, 0, count * size);
|
||||
return p;
|
||||
}
|
||||
|
||||
void km_stat(const void *_km)
|
||||
{
|
||||
kmem_t *km = (kmem_t*)_km;
|
||||
unsigned n_blocks, n_units;
|
||||
size_t max_block = 0, *p, *q;
|
||||
float frag;
|
||||
|
||||
if (km == 0 || !(p = km->loop_head)) return;
|
||||
n_blocks = n_units = 0;
|
||||
do {
|
||||
q = PTR(p);
|
||||
if (*p > max_block) max_block = *p;
|
||||
n_units += *p;
|
||||
if (p + (*p) > q && q > p)
|
||||
kerror("[kr_stat] The end of a free block enters another free block.");
|
||||
p = q;
|
||||
++n_blocks;
|
||||
} while (p != km->loop_head);
|
||||
|
||||
--n_blocks;
|
||||
frag = 1.0/1024.0 * n_units * sizeof(size_t) / n_blocks;
|
||||
fprintf(stderr, "[kr_stat] tot=%lu, free=%lu, n_block=%u, max_block=%lu, frag_len=%.3fK\n",
|
||||
km->total_allocated, n_units * sizeof(size_t), n_blocks, max_block * sizeof(size_t), frag);
|
||||
}
|
||||
|
|
@ -0,0 +1,26 @@
|
|||
#ifndef _KALLOC_H_
|
||||
#define _KALLOC_H_
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#define km_size(x) (*(((size_t*)(x))-1) * sizeof(size_t))
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void *kmalloc(void *km, size_t size);
|
||||
void *krealloc(void *km, void *ptr, size_t size);
|
||||
void *kcalloc(void *km, size_t count, size_t size);
|
||||
void kfree(void *km, void *ptr);
|
||||
|
||||
void *km_init(void);
|
||||
void km_destroy(void *km);
|
||||
|
||||
void km_stat(const void *km); // TODO: return numbers instead of print to stderr
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,248 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
/* Last Modified: 05MAR2012 */
|
||||
|
||||
#ifndef AC_KSEQ_H
|
||||
#define AC_KSEQ_H
|
||||
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
|
||||
#define KS_SEP_TAB 1 // isspace() && !' '
|
||||
#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
|
||||
#define KS_SEP_MAX 2
|
||||
|
||||
#define __KS_TYPE(type_t) \
|
||||
typedef struct __kstream_t { \
|
||||
int begin, end; \
|
||||
int is_eof:2, bufsize:30; \
|
||||
type_t f; \
|
||||
unsigned char *buf; \
|
||||
} kstream_t;
|
||||
|
||||
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
|
||||
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
|
||||
|
||||
#define __KS_BASIC(SCOPE, type_t, __bufsize) \
|
||||
SCOPE kstream_t *ks_init(type_t f) \
|
||||
{ \
|
||||
kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
|
||||
ks->f = f; ks->bufsize = __bufsize; \
|
||||
ks->buf = (unsigned char*)malloc(__bufsize); \
|
||||
return ks; \
|
||||
} \
|
||||
SCOPE void ks_destroy(kstream_t *ks) \
|
||||
{ \
|
||||
if (!ks) return; \
|
||||
free(ks->buf); \
|
||||
free(ks); \
|
||||
}
|
||||
|
||||
#define __KS_INLINED(__read) \
|
||||
static inline int ks_getc(kstream_t *ks) \
|
||||
{ \
|
||||
if (ks->is_eof && ks->begin >= ks->end) return -1; \
|
||||
if (ks->begin >= ks->end) { \
|
||||
ks->begin = 0; \
|
||||
ks->end = __read(ks->f, ks->buf, ks->bufsize); \
|
||||
if (ks->end < ks->bufsize) ks->is_eof = 1; \
|
||||
if (ks->end == 0) return -1; \
|
||||
} \
|
||||
return (int)ks->buf[ks->begin++]; \
|
||||
} \
|
||||
static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
|
||||
{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
|
||||
|
||||
#ifndef KSTRING_T
|
||||
#define KSTRING_T kstring_t
|
||||
typedef struct __kstring_t {
|
||||
unsigned l, m;
|
||||
char *s;
|
||||
} kstring_t;
|
||||
#endif
|
||||
|
||||
#ifndef kroundup32
|
||||
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
|
||||
#endif
|
||||
|
||||
#define __KS_GETUNTIL(SCOPE, __read) \
|
||||
SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
|
||||
{ \
|
||||
if (dret) *dret = 0; \
|
||||
str->l = append? str->l : 0; \
|
||||
if (ks->begin >= ks->end && ks->is_eof) return -1; \
|
||||
for (;;) { \
|
||||
int i; \
|
||||
if (ks->begin >= ks->end) { \
|
||||
if (!ks->is_eof) { \
|
||||
ks->begin = 0; \
|
||||
ks->end = __read(ks->f, ks->buf, ks->bufsize); \
|
||||
if (ks->end < ks->bufsize) ks->is_eof = 1; \
|
||||
if (ks->end == 0) break; \
|
||||
} else break; \
|
||||
} \
|
||||
if (delimiter == KS_SEP_LINE) { \
|
||||
for (i = ks->begin; i < ks->end; ++i) \
|
||||
if (ks->buf[i] == '\n') break; \
|
||||
} else if (delimiter > KS_SEP_MAX) { \
|
||||
for (i = ks->begin; i < ks->end; ++i) \
|
||||
if (ks->buf[i] == delimiter) break; \
|
||||
} else if (delimiter == KS_SEP_SPACE) { \
|
||||
for (i = ks->begin; i < ks->end; ++i) \
|
||||
if (isspace(ks->buf[i])) break; \
|
||||
} else if (delimiter == KS_SEP_TAB) { \
|
||||
for (i = ks->begin; i < ks->end; ++i) \
|
||||
if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
|
||||
} else i = 0; /* never come to here! */ \
|
||||
if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
|
||||
str->m = str->l + (i - ks->begin) + 1; \
|
||||
kroundup32(str->m); \
|
||||
str->s = (char*)realloc(str->s, str->m); \
|
||||
} \
|
||||
memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
|
||||
str->l = str->l + (i - ks->begin); \
|
||||
ks->begin = i + 1; \
|
||||
if (i < ks->end) { \
|
||||
if (dret) *dret = ks->buf[i]; \
|
||||
break; \
|
||||
} \
|
||||
} \
|
||||
if (str->s == 0) { \
|
||||
str->m = 1; \
|
||||
str->s = (char*)calloc(1, 1); \
|
||||
} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
|
||||
str->s[str->l] = '\0'; \
|
||||
return str->l; \
|
||||
}
|
||||
|
||||
#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \
|
||||
__KS_TYPE(type_t) \
|
||||
__KS_BASIC(SCOPE, type_t, __bufsize) \
|
||||
__KS_GETUNTIL(SCOPE, __read) \
|
||||
__KS_INLINED(__read)
|
||||
|
||||
#define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize)
|
||||
|
||||
#define KSTREAM_DECLARE(type_t, __read) \
|
||||
__KS_TYPE(type_t) \
|
||||
extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \
|
||||
extern kstream_t *ks_init(type_t f); \
|
||||
extern void ks_destroy(kstream_t *ks); \
|
||||
__KS_INLINED(__read)
|
||||
|
||||
/******************
|
||||
* FASTA/Q parser *
|
||||
******************/
|
||||
|
||||
#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
|
||||
|
||||
#define __KSEQ_BASIC(SCOPE, type_t) \
|
||||
SCOPE kseq_t *kseq_init(type_t fd) \
|
||||
{ \
|
||||
kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
|
||||
s->f = ks_init(fd); \
|
||||
return s; \
|
||||
} \
|
||||
SCOPE void kseq_destroy(kseq_t *ks) \
|
||||
{ \
|
||||
if (!ks) return; \
|
||||
free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
|
||||
ks_destroy(ks->f); \
|
||||
free(ks); \
|
||||
}
|
||||
|
||||
/* Return value:
|
||||
>=0 length of the sequence (normal)
|
||||
-1 end-of-file
|
||||
-2 truncated quality string
|
||||
*/
|
||||
#define __KSEQ_READ(SCOPE) \
|
||||
SCOPE int kseq_read(kseq_t *seq) \
|
||||
{ \
|
||||
int c; \
|
||||
kstream_t *ks = seq->f; \
|
||||
if (seq->last_char == 0) { /* then jump to the next header line */ \
|
||||
while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
|
||||
if (c == -1) return -1; /* end of file */ \
|
||||
seq->last_char = c; \
|
||||
} /* else: the first header char has been read in the previous call */ \
|
||||
seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
|
||||
if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
|
||||
if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
|
||||
if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
|
||||
seq->seq.m = 256; \
|
||||
seq->seq.s = (char*)malloc(seq->seq.m); \
|
||||
} \
|
||||
while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
|
||||
if (c == '\n') continue; /* skip empty lines */ \
|
||||
seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
|
||||
ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
|
||||
} \
|
||||
if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
|
||||
if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
|
||||
seq->seq.m = seq->seq.l + 2; \
|
||||
kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
|
||||
seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
|
||||
} \
|
||||
seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
|
||||
if (c != '+') return seq->seq.l; /* FASTA */ \
|
||||
if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
|
||||
seq->qual.m = seq->seq.m; \
|
||||
seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
|
||||
} \
|
||||
while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
|
||||
if (c == -1) return -2; /* error: no quality string */ \
|
||||
while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
|
||||
seq->last_char = 0; /* we have not come to the next header line */ \
|
||||
if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
|
||||
return seq->seq.l; \
|
||||
}
|
||||
|
||||
#define __KSEQ_TYPE(type_t) \
|
||||
typedef struct { \
|
||||
kstring_t name, comment, seq, qual; \
|
||||
int last_char; \
|
||||
kstream_t *f; \
|
||||
} kseq_t;
|
||||
|
||||
#define KSEQ_INIT2(SCOPE, type_t, __read) \
|
||||
KSTREAM_INIT2(SCOPE, type_t, __read, 16384) \
|
||||
__KSEQ_TYPE(type_t) \
|
||||
__KSEQ_BASIC(SCOPE, type_t) \
|
||||
__KSEQ_READ(SCOPE)
|
||||
|
||||
#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
|
||||
|
||||
#define KSEQ_DECLARE(type_t) \
|
||||
__KS_TYPE(type_t) \
|
||||
__KSEQ_TYPE(type_t) \
|
||||
extern kseq_t *kseq_init(type_t fd); \
|
||||
void kseq_destroy(kseq_t *ks); \
|
||||
int kseq_read(kseq_t *seq);
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,105 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
An example:
|
||||
|
||||
#include "kvec.h"
|
||||
int main() {
|
||||
kvec_t(int) array;
|
||||
kv_init(array);
|
||||
kv_push(int, array, 10); // append
|
||||
kv_a(int, array, 20) = 5; // dynamic
|
||||
kv_A(array, 20) = 4; // static
|
||||
kv_destroy(array);
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
2008-09-22 (0.1.0):
|
||||
|
||||
* The initial version.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef AC_KVEC_H
|
||||
#define AC_KVEC_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "kalloc.h"
|
||||
|
||||
#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
|
||||
|
||||
#define kvec_t(type) struct { size_t n, m; type *a; }
|
||||
#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
|
||||
#define kv_destroy(v) free((v).a)
|
||||
#define kv_A(v, i) ((v).a[(i)])
|
||||
#define kv_pop(v) ((v).a[--(v).n])
|
||||
#define kv_size(v) ((v).n)
|
||||
#define kv_max(v) ((v).m)
|
||||
|
||||
#define kv_resize(type, km, v, s) do { \
|
||||
if ((v).m < (s)) { \
|
||||
(v).m = (s); \
|
||||
kv_roundup32((v).m); \
|
||||
(v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define kv_copy(type, km, v1, v0) do { \
|
||||
if ((v1).m < (v0).n) kv_resize(type, (km), (v1), (v0).n); \
|
||||
(v1).n = (v0).n; \
|
||||
memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \
|
||||
} while (0) \
|
||||
|
||||
#define kv_push(type, km, v, x) do { \
|
||||
if ((v).n == (v).m) { \
|
||||
(v).m = (v).m? (v).m<<1 : 2; \
|
||||
(v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); \
|
||||
} \
|
||||
(v).a[(v).n++] = (x); \
|
||||
} while (0)
|
||||
|
||||
#define kv_pushp(type, km, v, p) do { \
|
||||
if ((v).n == (v).m) { \
|
||||
(v).m = (v).m? (v).m<<1 : 2; \
|
||||
(v).a = (type*)krealloc((km), (v).a, sizeof(type) * (v).m); \
|
||||
} \
|
||||
*(p) = &(v).a[(v).n++]; \
|
||||
} while (0)
|
||||
|
||||
#define kv_reverse(type, v, start) do { \
|
||||
if ((v).m > 0 && (v).n > (start)) { \
|
||||
size_t __i, __end = (v).n - (start); \
|
||||
type *__a = (v).a + (start); \
|
||||
for (__i = 0; __i < __end>>1; ++__i) { \
|
||||
type __t = __a[__end - 1 - __i]; \
|
||||
__a[__end - 1 - __i] = __a[__i]; __a[__i] = __t; \
|
||||
} \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,104 @@
|
|||
#ifndef MINIMAP_H
|
||||
#define MINIMAP_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/types.h>
|
||||
#include "bseq.h"
|
||||
|
||||
#define MM_IDX_DEF_B 14
|
||||
#define MM_DEREP_Q50 5.0
|
||||
|
||||
#define MM_F_WITH_REP 0x1
|
||||
#define MM_F_NO_SELF 0x2
|
||||
#define MM_F_NO_ISO 0x4
|
||||
#define MM_F_AVA 0x8
|
||||
|
||||
typedef struct {
|
||||
uint64_t x, y;
|
||||
} mm128_t;
|
||||
|
||||
typedef struct { size_t n, m; mm128_t *a; } mm128_v;
|
||||
typedef struct { size_t n, m; uint64_t *a; } uint64_v;
|
||||
typedef struct { size_t n, m; uint32_t *a; } uint32_v;
|
||||
|
||||
typedef struct {
|
||||
mm128_v a; // (minimizer, position) array
|
||||
int32_t n; // size of the _p_ array
|
||||
uint64_t *p; // position array for minimizers appearing >1 times
|
||||
void *h; // hash table indexing _p_ and minimizers appearing once
|
||||
} mm_idx_bucket_t;
|
||||
|
||||
typedef struct {
|
||||
int b, w, k;
|
||||
uint32_t n; // number of reference sequences
|
||||
mm_idx_bucket_t *B;
|
||||
uint32_t max_occ;
|
||||
float freq_thres;
|
||||
int32_t *len; // length of each reference sequence
|
||||
char **name; // TODO: if this uses too much RAM, switch to one concatenated string
|
||||
} mm_idx_t;
|
||||
|
||||
typedef struct {
|
||||
uint32_t cnt:31, rev:1;
|
||||
uint32_t rid:31, rep:1;
|
||||
uint32_t len;
|
||||
int32_t qs, qe, rs, re;
|
||||
} mm_reg1_t;
|
||||
|
||||
typedef struct {
|
||||
int radius; // bandwidth to cluster hits
|
||||
int max_gap; // break a chain if there are no minimizers in a max_gap window
|
||||
int min_cnt; // minimum number of minimizers to start a chain
|
||||
int min_match;
|
||||
int sdust_thres; // score threshold for SDUST; 0 to disable
|
||||
int flag; // see MM_F_* macros
|
||||
float merge_frac; // merge two chains if merge_frac fraction of minimzers are shared between the chains
|
||||
} mm_mapopt_t;
|
||||
|
||||
extern int mm_verbose;
|
||||
extern double mm_realtime0;
|
||||
|
||||
struct mm_tbuf_s;
|
||||
typedef struct mm_tbuf_s mm_tbuf_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// compute minimizers
|
||||
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p);
|
||||
|
||||
// minimizer indexing
|
||||
mm_idx_t *mm_idx_init(int w, int k, int b);
|
||||
void mm_idx_destroy(mm_idx_t *mi);
|
||||
mm_idx_t *mm_idx_gen(bseq_file_t *fp, int w, int k, int b, int tbatch_size, int n_threads, uint64_t ibatch_size, int keep_name);
|
||||
void mm_idx_set_max_occ(mm_idx_t *mi, float f);
|
||||
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
|
||||
|
||||
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int n_threads);
|
||||
|
||||
// minimizer index I/O
|
||||
void mm_idx_dump(FILE *fp, const mm_idx_t *mi);
|
||||
mm_idx_t *mm_idx_load(FILE *fp);
|
||||
|
||||
// mapping
|
||||
void mm_mapopt_init(mm_mapopt_t *opt);
|
||||
mm_tbuf_t *mm_tbuf_init(void);
|
||||
void mm_tbuf_destroy(mm_tbuf_t *b);
|
||||
const mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name);
|
||||
|
||||
int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads, int tbatch_size);
|
||||
|
||||
// private functions (may be moved to a "mmpriv.h" in future)
|
||||
double cputime(void);
|
||||
double realtime(void);
|
||||
void radix_sort_128x(mm128_t *beg, mm128_t *end);
|
||||
void radix_sort_64(uint64_t *beg, uint64_t *end);
|
||||
uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
69
sketch.c
69
sketch.c
|
|
@ -36,67 +36,108 @@ static inline uint64_t hash64(uint64_t key, uint64_t mask)
|
|||
return key;
|
||||
}
|
||||
|
||||
typedef struct { // a simplified version of kdq
|
||||
int front, count;
|
||||
int a[32];
|
||||
} tiny_queue_t;
|
||||
|
||||
static inline void tq_push(tiny_queue_t *q, int x)
|
||||
{
|
||||
q->a[((q->count++) + q->front) & 0x1f] = x;
|
||||
}
|
||||
|
||||
static inline int tq_shift(tiny_queue_t *q)
|
||||
{
|
||||
int x;
|
||||
if (q->count == 0) return -1;
|
||||
x = q->a[q->front++];
|
||||
q->front &= 0x1f;
|
||||
--q->count;
|
||||
return x;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find symmetric (w,k)-minimizers on a DNA sequence
|
||||
*
|
||||
* @param km thread-local memory pool; using NULL falls back to malloc()
|
||||
* @param str DNA sequence
|
||||
* @param len length of $str
|
||||
* @param w find a minimizer for every $w consecutive k-mers
|
||||
* @param k k-mer size
|
||||
* @param rid reference ID; will be copied to the output $p array
|
||||
* @param p minimizers; p->a[i].x is the 2k-bit hash value;
|
||||
* @param is_hpc homopolymer-compressed or not
|
||||
* @param p minimizers
|
||||
* p->a[i].x = kMer<<8 | kmerSpan
|
||||
* p->a[i].y = rid<<32 | lastPos<<1 | strand
|
||||
* where lastPos is the position of the last base of the i-th minimizer,
|
||||
* and strand indicates whether the minimizer comes from the top or the bottom strand.
|
||||
* Callers may want to set "p->n = 0"; otherwise results are appended to p
|
||||
*/
|
||||
void mm_sketch(const char *str, int len, int w, int k, uint32_t rid, mm128_v *p)
|
||||
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p)
|
||||
{
|
||||
uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, kmer[2] = {0,0};
|
||||
int i, j, l, buf_pos, min_pos;
|
||||
int i, j, l, buf_pos, min_pos, kmer_span = 0;
|
||||
mm128_t *buf, min = { UINT64_MAX, UINT64_MAX };
|
||||
tiny_queue_t tq;
|
||||
|
||||
assert(len > 0 && w > 0 && k > 0);
|
||||
assert(len > 0 && w > 0 && k > 0 && k <= 28); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice
|
||||
buf = (mm128_t*)alloca(w * 16);
|
||||
memset(buf, 0xff, w * 16);
|
||||
memset(&tq, 0, sizeof(tiny_queue_t));
|
||||
kv_resize(mm128_t, km, *p, p->n + len/w);
|
||||
|
||||
for (i = l = buf_pos = min_pos = 0; i < len; ++i) {
|
||||
int c = seq_nt4_table[(uint8_t)str[i]];
|
||||
mm128_t info = { UINT64_MAX, UINT64_MAX };
|
||||
if (c < 4) { // not an ambiguous base
|
||||
int z;
|
||||
if (is_hpc) {
|
||||
int skip_len = 1;
|
||||
if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c) {
|
||||
for (skip_len = 2; i + skip_len < len; ++skip_len)
|
||||
if (seq_nt4_table[(uint8_t)str[i + skip_len]] != c)
|
||||
break;
|
||||
i += skip_len - 1; // put $i at the end of the current homopolymer run
|
||||
}
|
||||
tq_push(&tq, skip_len);
|
||||
kmer_span += skip_len;
|
||||
if (tq.count > k) kmer_span -= tq_shift(&tq);
|
||||
if (kmer_span >= 256) continue; // make sure $kmer_span does not take more than 8 bits
|
||||
} else kmer_span = l + 1 < k? l + 1 : k;
|
||||
kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer
|
||||
kmer[1] = (kmer[1] >> 2) | (3ULL^c) << shift1; // reverse k-mer
|
||||
if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know it strand
|
||||
z = kmer[0] < kmer[1]? 0 : 1; // strand
|
||||
if (++l >= k)
|
||||
info.x = hash64(kmer[z], mask), info.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z;
|
||||
} else l = 0;
|
||||
if (++l >= k) {
|
||||
info.x = hash64(kmer[z], mask) << 8 | kmer_span;
|
||||
info.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z;
|
||||
}
|
||||
} else l = 0, tq.count = tq.front = 0;
|
||||
buf[buf_pos] = info; // need to do this here as appropriate buf_pos and buf[buf_pos] are needed below
|
||||
if (l == w + k - 1) { // special case for the first window - because identical k-mers are not stored yet
|
||||
for (j = buf_pos + 1; j < w; ++j)
|
||||
if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, *p, buf[j]);
|
||||
if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, km, *p, buf[j]);
|
||||
for (j = 0; j < buf_pos; ++j)
|
||||
if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, *p, buf[j]);
|
||||
if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, km, *p, buf[j]);
|
||||
}
|
||||
if (info.x <= min.x) { // a new minimum; then write the old min
|
||||
if (l >= w + k) kv_push(mm128_t, *p, min);
|
||||
if (l >= w + k) kv_push(mm128_t, km, *p, min);
|
||||
min = info, min_pos = buf_pos;
|
||||
} else if (buf_pos == min_pos) { // old min has moved outside the window
|
||||
if (l >= w + k - 1) kv_push(mm128_t, *p, min);
|
||||
if (l >= w + k - 1) kv_push(mm128_t, km, *p, min);
|
||||
for (j = buf_pos + 1, min.x = UINT64_MAX; j < w; ++j) // the two loops are necessary when there are identical k-mers
|
||||
if (min.x >= buf[j].x) min = buf[j], min_pos = j; // >= is important s.t. min is always the closest k-mer
|
||||
for (j = 0; j <= buf_pos; ++j)
|
||||
if (min.x >= buf[j].x) min = buf[j], min_pos = j;
|
||||
if (l >= w + k - 1) { // write identical k-mers
|
||||
for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted
|
||||
if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, *p, buf[j]);
|
||||
if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, km, *p, buf[j]);
|
||||
for (j = 0; j <= buf_pos; ++j)
|
||||
if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, *p, buf[j]);
|
||||
if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, km, *p, buf[j]);
|
||||
}
|
||||
}
|
||||
if (++buf_pos == w) buf_pos = 0;
|
||||
}
|
||||
if (min.x != UINT64_MAX)
|
||||
kv_push(mm128_t, *p, min);
|
||||
kv_push(mm128_t, km, *p, min);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue