170 lines
6.0 KiB
C
170 lines
6.0 KiB
C
#include <stdio.h>
|
||
#include <stdlib.h>
|
||
#include <assert.h>
|
||
#include <string.h>
|
||
#define __STDC_LIMIT_MACROS
|
||
#include "kvec.h"
|
||
#include "mmpriv.h"
|
||
|
||
unsigned char seq_nt4_table[256] = {
|
||
0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
|
||
4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
|
||
4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
|
||
|
||
static inline uint64_t hash64(uint64_t key, uint64_t mask)
|
||
{
|
||
key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1;
|
||
key = key ^ key >> 24;
|
||
key = ((key + (key << 3)) + (key << 8)) & mask; // key * 265
|
||
key = key ^ key >> 14;
|
||
key = ((key + (key << 2)) + (key << 4)) & mask; // key * 21
|
||
key = key ^ key >> 28;
|
||
key = (key + (key << 31)) & mask;
|
||
return key;
|
||
}
|
||
|
||
typedef struct
|
||
{ // a simplified version of kdq
|
||
int front, count;
|
||
int a[32];
|
||
} tiny_queue_t;
|
||
|
||
static inline void tq_push(tiny_queue_t *q, int x)
|
||
{
|
||
q->a[((q->count++) + q->front) & 0x1f] = x;
|
||
}
|
||
|
||
static inline int tq_shift(tiny_queue_t *q)
|
||
{
|
||
int x;
|
||
if (q->count == 0)
|
||
return -1;
|
||
x = q->a[q->front++];
|
||
q->front &= 0x1f;
|
||
--q->count;
|
||
return x;
|
||
}
|
||
|
||
/**
|
||
* Find symmetric (w,k)-minimizers on a DNA sequence
|
||
*
|
||
* @param km thread-local memory pool; using NULL falls back to malloc()
|
||
* @param str DNA sequence
|
||
* @param len length of $str
|
||
* @param w find a minimizer for every $w consecutive k-mers
|
||
* @param k k-mer size
|
||
* @param rid reference ID; will be copied to the output $p array
|
||
* @param is_hpc homopolymer-compressed or not
|
||
* @param p minimizers
|
||
* p->a[i].x = kMer<<8 | kmerSpan
|
||
* p->a[i].y = rid<<32 | lastPos<<1 | strand
|
||
* where lastPos is the position of the last base of the i-th minimizer,
|
||
* and strand indicates whether the minimizer comes from the top or the bottom strand.
|
||
* Callers may want to set "p->n = 0"; otherwise results are appended to p
|
||
*/
|
||
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p)
|
||
{
|
||
uint64_t shift1 = 2 * (k - 1), mask = (1ULL << 2 * k) - 1, kmer[2] = {0, 0};
|
||
int i, j, l, buf_pos, min_pos, kmer_span = 0;
|
||
mm128_t buf[256], min = {UINT64_MAX, UINT64_MAX};
|
||
tiny_queue_t tq;
|
||
|
||
assert(len > 0 && (w > 0 && w < 256) && (k > 0 && k <= 28)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice
|
||
memset(buf, 0xff, w * 16);
|
||
memset(&tq, 0, sizeof(tiny_queue_t));
|
||
kv_resize(mm128_t, km, *p, p->n + len / w); // 扩充p,将新生成len/w个minimizer
|
||
|
||
for (i = l = buf_pos = min_pos = 0; i < len; ++i)
|
||
{
|
||
int c = seq_nt4_table[(uint8_t)str[i]];
|
||
mm128_t info = {UINT64_MAX, UINT64_MAX};
|
||
if (c < 4)
|
||
{ // not an ambiguous base
|
||
int z;
|
||
if (is_hpc)
|
||
{
|
||
int skip_len = 1;
|
||
if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c)
|
||
{
|
||
for (skip_len = 2; i + skip_len < len; ++skip_len)
|
||
if (seq_nt4_table[(uint8_t)str[i + skip_len]] != c)
|
||
break;
|
||
i += skip_len - 1; // put $i at the end of the current homopolymer run
|
||
}
|
||
tq_push(&tq, skip_len);
|
||
kmer_span += skip_len;
|
||
if (tq.count > k)
|
||
kmer_span -= tq_shift(&tq);
|
||
}
|
||
else
|
||
kmer_span = l + 1 < k ? l + 1 : k;
|
||
kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer
|
||
kmer[1] = (kmer[1] >> 2) | (3ULL ^ c) << shift1; // reverse k-mer
|
||
if (kmer[0] == kmer[1])
|
||
continue; // skip "symmetric k-mers" as we don't know it strand
|
||
z = kmer[0] < kmer[1] ? 0 : 1; // strand // 选取小的那个kmer,kmer的strand到底是什么意思?为什么通过比较就能确定正反?
|
||
++l;
|
||
if (l >= k && kmer_span < 256)
|
||
{
|
||
info.x = hash64(kmer[z], mask) << 8 | kmer_span;
|
||
info.y = (uint64_t)rid << 32 | (uint32_t)i << 1 | z;
|
||
}
|
||
}
|
||
else
|
||
l = 0, tq.count = tq.front = 0, kmer_span = 0;
|
||
buf[buf_pos] = info; // need to do this here as appropriate buf_pos and buf[buf_pos] are needed below
|
||
if (l == w + k - 1 && min.x != UINT64_MAX)
|
||
{ // special case for the first window - because identical k-mers are not stored yet
|
||
for (j = buf_pos + 1; j < w; ++j)
|
||
if (min.x == buf[j].x && buf[j].y != min.y)
|
||
kv_push(mm128_t, km, *p, buf[j]);
|
||
for (j = 0; j < buf_pos; ++j)
|
||
if (min.x == buf[j].x && buf[j].y != min.y)
|
||
kv_push(mm128_t, km, *p, buf[j]);
|
||
}
|
||
if (info.x <= min.x)
|
||
{ // a new minimum; then write the old min
|
||
if (l >= w + k && min.x != UINT64_MAX)
|
||
kv_push(mm128_t, km, *p, min);
|
||
min = info, min_pos = buf_pos;
|
||
}
|
||
else if (buf_pos == min_pos)
|
||
{ // old min has moved outside the window
|
||
if (l >= w + k - 1 && min.x != UINT64_MAX)
|
||
kv_push(mm128_t, km, *p, min);
|
||
for (j = buf_pos + 1, min.x = UINT64_MAX; j < w; ++j) // the two loops are necessary when there are identical k-mers
|
||
if (min.x >= buf[j].x)
|
||
min = buf[j], min_pos = j; // >= is important s.t. min is always the closest k-mer
|
||
for (j = 0; j <= buf_pos; ++j)
|
||
if (min.x >= buf[j].x)
|
||
min = buf[j], min_pos = j; // 如果有多个min相同,取离当前位置最近的
|
||
if (l >= w + k - 1 && min.x != UINT64_MAX) // 往回找相同值的kmer,放进p里
|
||
{ // write identical k-mers
|
||
for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted
|
||
if (min.x == buf[j].x && min.y != buf[j].y)
|
||
kv_push(mm128_t, km, *p, buf[j]);
|
||
for (j = 0; j <= buf_pos; ++j)
|
||
if (min.x == buf[j].x && min.y != buf[j].y)
|
||
kv_push(mm128_t, km, *p, buf[j]);
|
||
}
|
||
}
|
||
if (++buf_pos == w)
|
||
buf_pos = 0;
|
||
}
|
||
if (min.x != UINT64_MAX)
|
||
kv_push(mm128_t, km, *p, min);
|
||
}
|