minimap2/sketch.c

170 lines
6.0 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#define __STDC_LIMIT_MACROS
#include "kvec.h"
#include "mmpriv.h"
unsigned char seq_nt4_table[256] = {
0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
static inline uint64_t hash64(uint64_t key, uint64_t mask)
{
key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1;
key = key ^ key >> 24;
key = ((key + (key << 3)) + (key << 8)) & mask; // key * 265
key = key ^ key >> 14;
key = ((key + (key << 2)) + (key << 4)) & mask; // key * 21
key = key ^ key >> 28;
key = (key + (key << 31)) & mask;
return key;
}
typedef struct
{ // a simplified version of kdq
int front, count;
int a[32];
} tiny_queue_t;
static inline void tq_push(tiny_queue_t *q, int x)
{
q->a[((q->count++) + q->front) & 0x1f] = x;
}
static inline int tq_shift(tiny_queue_t *q)
{
int x;
if (q->count == 0)
return -1;
x = q->a[q->front++];
q->front &= 0x1f;
--q->count;
return x;
}
/**
* Find symmetric (w,k)-minimizers on a DNA sequence
*
* @param km thread-local memory pool; using NULL falls back to malloc()
* @param str DNA sequence
* @param len length of $str
* @param w find a minimizer for every $w consecutive k-mers
* @param k k-mer size
* @param rid reference ID; will be copied to the output $p array
* @param is_hpc homopolymer-compressed or not
* @param p minimizers
* p->a[i].x = kMer<<8 | kmerSpan
* p->a[i].y = rid<<32 | lastPos<<1 | strand
* where lastPos is the position of the last base of the i-th minimizer,
* and strand indicates whether the minimizer comes from the top or the bottom strand.
* Callers may want to set "p->n = 0"; otherwise results are appended to p
*/
void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p)
{
uint64_t shift1 = 2 * (k - 1), mask = (1ULL << 2 * k) - 1, kmer[2] = {0, 0};
int i, j, l, buf_pos, min_pos, kmer_span = 0;
mm128_t buf[256], min = {UINT64_MAX, UINT64_MAX};
tiny_queue_t tq;
assert(len > 0 && (w > 0 && w < 256) && (k > 0 && k <= 28)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice
memset(buf, 0xff, w * 16);
memset(&tq, 0, sizeof(tiny_queue_t));
kv_resize(mm128_t, km, *p, p->n + len / w); // 扩充p将新生成len/w个minimizer
for (i = l = buf_pos = min_pos = 0; i < len; ++i)
{
int c = seq_nt4_table[(uint8_t)str[i]];
mm128_t info = {UINT64_MAX, UINT64_MAX};
if (c < 4)
{ // not an ambiguous base
int z;
if (is_hpc)
{
int skip_len = 1;
if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c)
{
for (skip_len = 2; i + skip_len < len; ++skip_len)
if (seq_nt4_table[(uint8_t)str[i + skip_len]] != c)
break;
i += skip_len - 1; // put $i at the end of the current homopolymer run
}
tq_push(&tq, skip_len);
kmer_span += skip_len;
if (tq.count > k)
kmer_span -= tq_shift(&tq);
}
else
kmer_span = l + 1 < k ? l + 1 : k;
kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer
kmer[1] = (kmer[1] >> 2) | (3ULL ^ c) << shift1; // reverse k-mer
if (kmer[0] == kmer[1])
continue; // skip "symmetric k-mers" as we don't know it strand
z = kmer[0] < kmer[1] ? 0 : 1; // strand // 选取小的那个kmerkmer的strand到底是什么意思为什么通过比较就能确定正反
++l;
if (l >= k && kmer_span < 256)
{
info.x = hash64(kmer[z], mask) << 8 | kmer_span;
info.y = (uint64_t)rid << 32 | (uint32_t)i << 1 | z;
}
}
else
l = 0, tq.count = tq.front = 0, kmer_span = 0;
buf[buf_pos] = info; // need to do this here as appropriate buf_pos and buf[buf_pos] are needed below
if (l == w + k - 1 && min.x != UINT64_MAX)
{ // special case for the first window - because identical k-mers are not stored yet
for (j = buf_pos + 1; j < w; ++j)
if (min.x == buf[j].x && buf[j].y != min.y)
kv_push(mm128_t, km, *p, buf[j]);
for (j = 0; j < buf_pos; ++j)
if (min.x == buf[j].x && buf[j].y != min.y)
kv_push(mm128_t, km, *p, buf[j]);
}
if (info.x <= min.x)
{ // a new minimum; then write the old min
if (l >= w + k && min.x != UINT64_MAX)
kv_push(mm128_t, km, *p, min);
min = info, min_pos = buf_pos;
}
else if (buf_pos == min_pos)
{ // old min has moved outside the window
if (l >= w + k - 1 && min.x != UINT64_MAX)
kv_push(mm128_t, km, *p, min);
for (j = buf_pos + 1, min.x = UINT64_MAX; j < w; ++j) // the two loops are necessary when there are identical k-mers
if (min.x >= buf[j].x)
min = buf[j], min_pos = j; // >= is important s.t. min is always the closest k-mer
for (j = 0; j <= buf_pos; ++j)
if (min.x >= buf[j].x)
min = buf[j], min_pos = j; // 如果有多个min相同取离当前位置最近的
if (l >= w + k - 1 && min.x != UINT64_MAX) // 往回找相同值的kmer放进p里
{ // write identical k-mers
for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted
if (min.x == buf[j].x && min.y != buf[j].y)
kv_push(mm128_t, km, *p, buf[j]);
for (j = 0; j <= buf_pos; ++j)
if (min.x == buf[j].x && min.y != buf[j].y)
kv_push(mm128_t, km, *p, buf[j]);
}
}
if (++buf_pos == w)
buf_pos = 0;
}
if (min.x != UINT64_MAX)
kv_push(mm128_t, km, *p, min);
}