2017-04-06 03:39:55 +08:00
|
|
|
#include <stdio.h>
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
#include <assert.h>
|
|
|
|
|
#include <string.h>
|
2018-01-19 23:40:18 +08:00
|
|
|
#define __STDC_LIMIT_MACROS
|
2017-04-06 03:39:55 +08:00
|
|
|
#include "kvec.h"
|
2018-01-19 23:40:18 +08:00
|
|
|
#include "mmpriv.h"
|
2017-04-06 03:39:55 +08:00
|
|
|
|
|
|
|
|
unsigned char seq_nt4_table[256] = {
|
2017-10-08 06:53:25 +08:00
|
|
|
0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
2017-04-06 03:39:55 +08:00
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
static inline uint64_t hash64(uint64_t key, uint64_t mask)
|
|
|
|
|
{
|
|
|
|
|
key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1;
|
|
|
|
|
key = key ^ key >> 24;
|
|
|
|
|
key = ((key + (key << 3)) + (key << 8)) & mask; // key * 265
|
|
|
|
|
key = key ^ key >> 14;
|
|
|
|
|
key = ((key + (key << 2)) + (key << 4)) & mask; // key * 21
|
|
|
|
|
key = key ^ key >> 28;
|
|
|
|
|
key = (key + (key << 31)) & mask;
|
|
|
|
|
return key;
|
|
|
|
|
}
|
|
|
|
|
|
2017-04-07 03:37:34 +08:00
|
|
|
typedef struct { // a simplified version of kdq
|
|
|
|
|
int front, count;
|
|
|
|
|
int a[32];
|
|
|
|
|
} tiny_queue_t;
|
|
|
|
|
|
|
|
|
|
static inline void tq_push(tiny_queue_t *q, int x)
|
|
|
|
|
{
|
|
|
|
|
q->a[((q->count++) + q->front) & 0x1f] = x;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static inline int tq_shift(tiny_queue_t *q)
|
|
|
|
|
{
|
|
|
|
|
int x;
|
|
|
|
|
if (q->count == 0) return -1;
|
|
|
|
|
x = q->a[q->front++];
|
|
|
|
|
q->front &= 0x1f;
|
|
|
|
|
--q->count;
|
|
|
|
|
return x;
|
|
|
|
|
}
|
|
|
|
|
|
2017-04-06 03:39:55 +08:00
|
|
|
/**
|
|
|
|
|
* Find symmetric (w,k)-minimizers on a DNA sequence
|
|
|
|
|
*
|
2017-04-07 03:37:34 +08:00
|
|
|
* @param km thread-local memory pool; using NULL falls back to malloc()
|
2017-04-06 03:39:55 +08:00
|
|
|
* @param str DNA sequence
|
|
|
|
|
* @param len length of $str
|
|
|
|
|
* @param w find a minimizer for every $w consecutive k-mers
|
|
|
|
|
* @param k k-mer size
|
|
|
|
|
* @param rid reference ID; will be copied to the output $p array
|
2017-04-07 03:37:34 +08:00
|
|
|
* @param is_hpc homopolymer-compressed or not
|
|
|
|
|
* @param p minimizers
|
|
|
|
|
* p->a[i].x = kMer<<8 | kmerSpan
|
2017-04-06 03:39:55 +08:00
|
|
|
* p->a[i].y = rid<<32 | lastPos<<1 | strand
|
|
|
|
|
* where lastPos is the position of the last base of the i-th minimizer,
|
|
|
|
|
* and strand indicates whether the minimizer comes from the top or the bottom strand.
|
|
|
|
|
* Callers may want to set "p->n = 0"; otherwise results are appended to p
|
|
|
|
|
*/
|
2017-04-07 03:37:34 +08:00
|
|
|
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p)
|
2017-04-06 03:39:55 +08:00
|
|
|
{
|
|
|
|
|
uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, kmer[2] = {0,0};
|
2017-04-07 03:37:34 +08:00
|
|
|
int i, j, l, buf_pos, min_pos, kmer_span = 0;
|
2017-09-03 06:23:29 +08:00
|
|
|
mm128_t buf[256], min = { UINT64_MAX, UINT64_MAX };
|
2017-04-07 03:37:34 +08:00
|
|
|
tiny_queue_t tq;
|
2017-04-06 03:39:55 +08:00
|
|
|
|
2017-09-03 06:23:29 +08:00
|
|
|
assert(len > 0 && (w > 0 && w < 256) && (k > 0 && k <= 28)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice
|
2017-04-06 03:39:55 +08:00
|
|
|
memset(buf, 0xff, w * 16);
|
2017-04-07 03:37:34 +08:00
|
|
|
memset(&tq, 0, sizeof(tiny_queue_t));
|
|
|
|
|
kv_resize(mm128_t, km, *p, p->n + len/w);
|
2017-04-06 03:39:55 +08:00
|
|
|
|
|
|
|
|
for (i = l = buf_pos = min_pos = 0; i < len; ++i) {
|
|
|
|
|
int c = seq_nt4_table[(uint8_t)str[i]];
|
|
|
|
|
mm128_t info = { UINT64_MAX, UINT64_MAX };
|
|
|
|
|
if (c < 4) { // not an ambiguous base
|
|
|
|
|
int z;
|
2017-04-07 03:37:34 +08:00
|
|
|
if (is_hpc) {
|
|
|
|
|
int skip_len = 1;
|
|
|
|
|
if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c) {
|
|
|
|
|
for (skip_len = 2; i + skip_len < len; ++skip_len)
|
|
|
|
|
if (seq_nt4_table[(uint8_t)str[i + skip_len]] != c)
|
|
|
|
|
break;
|
|
|
|
|
i += skip_len - 1; // put $i at the end of the current homopolymer run
|
|
|
|
|
}
|
|
|
|
|
tq_push(&tq, skip_len);
|
|
|
|
|
kmer_span += skip_len;
|
|
|
|
|
if (tq.count > k) kmer_span -= tq_shift(&tq);
|
|
|
|
|
} else kmer_span = l + 1 < k? l + 1 : k;
|
2017-04-06 03:39:55 +08:00
|
|
|
kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer
|
|
|
|
|
kmer[1] = (kmer[1] >> 2) | (3ULL^c) << shift1; // reverse k-mer
|
|
|
|
|
if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know it strand
|
|
|
|
|
z = kmer[0] < kmer[1]? 0 : 1; // strand
|
2017-10-29 07:21:10 +08:00
|
|
|
++l;
|
|
|
|
|
if (l >= k && kmer_span < 256) {
|
2017-04-07 03:37:34 +08:00
|
|
|
info.x = hash64(kmer[z], mask) << 8 | kmer_span;
|
|
|
|
|
info.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z;
|
|
|
|
|
}
|
2017-05-04 17:45:19 +08:00
|
|
|
} else l = 0, tq.count = tq.front = 0, kmer_span = 0;
|
2017-04-06 03:39:55 +08:00
|
|
|
buf[buf_pos] = info; // need to do this here as appropriate buf_pos and buf[buf_pos] are needed below
|
2017-12-13 22:40:42 +08:00
|
|
|
if (l == w + k - 1 && min.x != UINT64_MAX) { // special case for the first window - because identical k-mers are not stored yet
|
2017-04-06 03:39:55 +08:00
|
|
|
for (j = buf_pos + 1; j < w; ++j)
|
2017-12-13 22:40:42 +08:00
|
|
|
if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, km, *p, buf[j]);
|
2017-04-06 03:39:55 +08:00
|
|
|
for (j = 0; j < buf_pos; ++j)
|
2017-12-13 22:40:42 +08:00
|
|
|
if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, km, *p, buf[j]);
|
2017-04-06 03:39:55 +08:00
|
|
|
}
|
|
|
|
|
if (info.x <= min.x) { // a new minimum; then write the old min
|
2017-12-13 22:40:42 +08:00
|
|
|
if (l >= w + k && min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min);
|
2017-04-06 03:39:55 +08:00
|
|
|
min = info, min_pos = buf_pos;
|
|
|
|
|
} else if (buf_pos == min_pos) { // old min has moved outside the window
|
2017-12-13 22:40:42 +08:00
|
|
|
if (l >= w + k - 1 && min.x != UINT64_MAX) kv_push(mm128_t, km, *p, min);
|
2017-04-06 03:39:55 +08:00
|
|
|
for (j = buf_pos + 1, min.x = UINT64_MAX; j < w; ++j) // the two loops are necessary when there are identical k-mers
|
|
|
|
|
if (min.x >= buf[j].x) min = buf[j], min_pos = j; // >= is important s.t. min is always the closest k-mer
|
|
|
|
|
for (j = 0; j <= buf_pos; ++j)
|
|
|
|
|
if (min.x >= buf[j].x) min = buf[j], min_pos = j;
|
2017-12-13 22:40:42 +08:00
|
|
|
if (l >= w + k - 1 && min.x != UINT64_MAX) { // write identical k-mers
|
2017-04-06 03:39:55 +08:00
|
|
|
for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted
|
2017-12-13 22:40:42 +08:00
|
|
|
if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, km, *p, buf[j]);
|
2017-04-06 03:39:55 +08:00
|
|
|
for (j = 0; j <= buf_pos; ++j)
|
2017-12-13 22:40:42 +08:00
|
|
|
if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, km, *p, buf[j]);
|
2017-04-06 03:39:55 +08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (++buf_pos == w) buf_pos = 0;
|
|
|
|
|
}
|
|
|
|
|
if (min.x != UINT64_MAX)
|
2017-04-07 03:37:34 +08:00
|
|
|
kv_push(mm128_t, km, *p, min);
|
2017-04-06 03:39:55 +08:00
|
|
|
}
|
2022-10-22 07:07:28 +08:00
|
|
|
|
|
|
|
|
void mm_sketch_syncmer(void *km, const char *str, int len, int smer, int k, uint32_t rid, int is_hpc, mm128_v *p)
|
|
|
|
|
{
|
|
|
|
|
uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, smask = (1ULL<<2*smer) - 1, kmer[2] = {0,0};
|
|
|
|
|
int i, j, l, buf_pos, min_pos, kmer_span = 0;
|
|
|
|
|
tiny_queue_t tq;
|
|
|
|
|
|
|
|
|
|
assert(len > 0 && (smer > 0 && smer <= k) && (k > 0 && k <= 28)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice
|
|
|
|
|
memset(&tq, 0, sizeof(tiny_queue_t));
|
|
|
|
|
kv_resize(mm128_t, km, *p, p->n + len/(k - smer));
|
|
|
|
|
|
|
|
|
|
for (i = l = buf_pos = min_pos = 0; i < len; ++i) {
|
|
|
|
|
int c = seq_nt4_table[(uint8_t)str[i]];
|
|
|
|
|
if (c < 4) { // not an ambiguous base
|
|
|
|
|
int z;
|
|
|
|
|
if (is_hpc) {
|
|
|
|
|
int skip_len = 1;
|
|
|
|
|
if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c) {
|
|
|
|
|
for (skip_len = 2; i + skip_len < len; ++skip_len)
|
|
|
|
|
if (seq_nt4_table[(uint8_t)str[i + skip_len]] != c)
|
|
|
|
|
break;
|
|
|
|
|
i += skip_len - 1; // put $i at the end of the current homopolymer run
|
|
|
|
|
}
|
|
|
|
|
tq_push(&tq, skip_len);
|
|
|
|
|
kmer_span += skip_len;
|
|
|
|
|
if (tq.count > k) kmer_span -= tq_shift(&tq);
|
|
|
|
|
} else kmer_span = l + 1 < k? l + 1 : k;
|
|
|
|
|
kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer
|
|
|
|
|
kmer[1] = (kmer[1] >> 2) | (3ULL^c) << shift1; // reverse k-mer
|
|
|
|
|
if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know it strand
|
|
|
|
|
z = kmer[0] < kmer[1]? 0 : 1; // strand
|
|
|
|
|
++l;
|
|
|
|
|
if (l >= k && kmer_span < 256) {
|
|
|
|
|
uint64_t x, min = UINT64_MAX;
|
|
|
|
|
x = hash64(kmer[z], mask);
|
2022-10-22 09:00:22 +08:00
|
|
|
for (j = 0; j <= k - smer; ++j) {
|
2022-10-22 07:07:28 +08:00
|
|
|
uint64_t y = x >> (j + j) & smask;
|
|
|
|
|
min = min < y? min : y;
|
|
|
|
|
}
|
|
|
|
|
if ((x & smask) == min) {
|
|
|
|
|
mm128_t t;
|
|
|
|
|
t.x = x << 8 | kmer_span;
|
|
|
|
|
t.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z;
|
|
|
|
|
kv_push(mm128_t, km, *p, t);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else l = 0, tq.count = tq.front = 0, kmer_span = 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void mm_sketch2(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, int is_syncmer, mm128_v *p)
|
|
|
|
|
{
|
|
|
|
|
if (is_syncmer) mm_sketch_syncmer(km, str, len, w, k, rid, is_hpc, p);
|
|
|
|
|
else mm_sketch(km, str, len, w, k, rid, is_hpc, p);
|
|
|
|
|
}
|